From: Scott Gasch <scott@gasch.org>
Date: Wed, 15 Sep 2021 16:32:08 +0000 (-0700)
Subject: Adding doctests.  Also added a logging filter.
X-Git-Url: https://wannabe.guru.org/gitweb/?a=commitdiff_plain;h=4c315e387f18010ba0b5661744ad3c792f21d2d1;p=python_utils.git

Adding doctests.  Also added a logging filter.
---

diff --git a/argparse_utils.py b/argparse_utils.py
index 3799a47..0ee2be9 100644
--- a/argparse_utils.py
+++ b/argparse_utils.py
@@ -4,6 +4,7 @@ import argparse
 import datetime
 import logging
 import os
+from typing import Any
 
 # This module is commonly used by others in here and should avoid
 # taking any unnecessary dependencies back on them.
@@ -56,14 +57,54 @@ class ActionNoYes(argparse.Action):
             setattr(namespace, self.dest, True)
 
 
-def valid_bool(v):
+def valid_bool(v: Any) -> bool:
+    """
+    If the string is a valid bool, return its value.
+
+    >>> valid_bool(True)
+    True
+
+    >>> valid_bool("true")
+    True
+
+    >>> valid_bool("yes")
+    True
+
+    >>> valid_bool("on")
+    True
+
+    >>> valid_bool("1")
+    True
+
+    >>> valid_bool(12345)
+    Traceback (most recent call last):
+    ...
+    argparse.ArgumentTypeError: 12345
+
+    """
     if isinstance(v, bool):
         return v
     from string_utils import to_bool
-    return to_bool(v)
+    try:
+        return to_bool(v)
+    except Exception:
+        raise argparse.ArgumentTypeError(v)
 
 
 def valid_ip(ip: str) -> str:
+    """
+    If the string is a valid IPv4 address, return it.  Otherwise raise
+    an ArgumentTypeError.
+
+    >>> valid_ip("1.2.3.4")
+    '1.2.3.4'
+
+    >>> valid_ip("localhost")
+    Traceback (most recent call last):
+    ...
+    argparse.ArgumentTypeError: localhost is an invalid IP address
+
+    """
     from string_utils import extract_ip_v4
     s = extract_ip_v4(ip.strip())
     if s is not None:
@@ -74,6 +115,22 @@ def valid_ip(ip: str) -> str:
 
 
 def valid_mac(mac: str) -> str:
+    """
+    If the string is a valid MAC address, return it.  Otherwise raise
+    an ArgumentTypeError.
+
+    >>> valid_mac('12:23:3A:4F:55:66')
+    '12:23:3A:4F:55:66'
+
+    >>> valid_mac('12-23-3A-4F-55-66')
+    '12-23-3A-4F-55-66'
+
+    >>> valid_mac('big')
+    Traceback (most recent call last):
+    ...
+    argparse.ArgumentTypeError: big is an invalid MAC address
+
+    """
     from string_utils import extract_mac_address
     s = extract_mac_address(mac)
     if s is not None:
@@ -84,6 +141,22 @@ def valid_mac(mac: str) -> str:
 
 
 def valid_percentage(num: str) -> float:
+    """
+    If the string is a valid percentage, return it.  Otherwise raise
+    an ArgumentTypeError.
+
+    >>> valid_percentage("15%")
+    15.0
+
+    >>> valid_percentage('40')
+    40.0
+
+    >>> valid_percentage('115')
+    Traceback (most recent call last):
+    ...
+    argparse.ArgumentTypeError: 115 is an invalid percentage; expected 0 <= n <= 100.0
+
+    """
     num = num.strip('%')
     n = float(num)
     if 0.0 <= n <= 100.0:
@@ -94,6 +167,19 @@ def valid_percentage(num: str) -> float:
 
 
 def valid_filename(filename: str) -> str:
+    """
+    If the string is a valid filename, return it.  Otherwise raise
+    an ArgumentTypeError.
+
+    >>> valid_filename('/tmp')
+    '/tmp'
+
+    >>> valid_filename('wfwefwefwefwefwefwefwefwef')
+    Traceback (most recent call last):
+    ...
+    argparse.ArgumentTypeError: wfwefwefwefwefwefwefwefwef was not found and is therefore invalid.
+
+    """
     s = filename.strip()
     if os.path.exists(s):
         return s
@@ -103,6 +189,18 @@ def valid_filename(filename: str) -> str:
 
 
 def valid_date(txt: str) -> datetime.date:
+    """If the string is a valid date, return it.  Otherwise raise
+    an ArgumentTypeError.
+
+    >>> valid_date('6/5/2021')
+    datetime.date(2021, 6, 5)
+
+    # Note: dates like 'next wednesday' work fine, they are just
+    # hard to test for without knowing when the testcase will be
+    # executed...
+    >>> valid_date('next wednesday') # doctest: +ELLIPSIS
+    -ANYTHING-
+    """
     from string_utils import to_date
     date = to_date(txt)
     if date is not None:
@@ -113,6 +211,18 @@ def valid_date(txt: str) -> datetime.date:
 
 
 def valid_datetime(txt: str) -> datetime.datetime:
+    """If the string is a valid datetime, return it.  Otherwise raise
+    an ArgumentTypeError.
+
+    >>> valid_datetime('6/5/2021 3:01:02')
+    datetime.datetime(2021, 6, 5, 3, 1, 2)
+
+    # Again, these types of expressions work fine but are
+    # difficult to test with doctests because the answer is
+    # relative to the time the doctest is executed.
+    >>> valid_datetime('next christmas at 4:15am') # doctest: +ELLIPSIS
+    -ANYTHING-
+    """
     from string_utils import to_datetime
     dt = to_datetime(txt)
     if dt is not None:
@@ -120,3 +230,9 @@ def valid_datetime(txt: str) -> datetime.datetime:
     msg = f'Cannot parse argument as datetime: {txt}'
     logger.warning(msg)
     raise argparse.ArgumentTypeError(msg)
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.ELLIPSIS_MARKER = '-ANYTHING-'
+    doctest.testmod()
diff --git a/bootstrap.py b/bootstrap.py
index 7f63dbb..bf5d91f 100644
--- a/bootstrap.py
+++ b/bootstrap.py
@@ -42,6 +42,11 @@ original_hook = sys.excepthook
 
 
 def handle_uncaught_exception(exc_type, exc_value, exc_tb):
+    """
+    Top-level exception handler for exceptions that make it past any exception
+    handlers in the python code being run.  Logs the error and stacktrace then
+    maybe attaches a debugger.
+    """
     global original_hook
     msg = f'Unhandled top level exception {exc_type}'
     logger.exception(msg)
@@ -57,11 +62,11 @@ def handle_uncaught_exception(exc_type, exc_value, exc_tb):
             # stdin or stderr is redirected, just do the normal thing
             original_hook(exc_type, exc_value, exc_tb)
         else:
-            # a terminal is attached and stderr is not redirected, debug.
+            # a terminal is attached and stderr is not redirected, maybe debug.
+            import traceback
+            traceback.print_exception(exc_type, exc_value, exc_tb)
             if config.config['debug_unhandled_exceptions']:
-                import traceback
                 import pdb
-                traceback.print_exception(exc_type, exc_value, exc_tb)
                 logger.info("Invoking the debugger...")
                 pdb.pm()
             else:
@@ -69,12 +74,21 @@ def handle_uncaught_exception(exc_type, exc_value, exc_tb):
 
 
 def initialize(entry_point):
+    """
+    Remember to initialize config, initialize logging, set/log a random
+    seed, etc... before running main.
 
-    """Remember to initialize config and logging before running main."""
+    """
     @functools.wraps(entry_point)
     def initialize_wrapper(*args, **kwargs):
+
+        # Hook top level unhandled exceptions, maybe invoke debugger.
         if sys.excepthook == sys.__excepthook__:
             sys.excepthook = handle_uncaught_exception
+
+        # Try to figure out the name of the program entry point.  Then
+        # parse configuration (based on cmdline flags, environment vars
+        # etc...)
         if (
                 '__globals__' in entry_point.__dict__ and
                 '__file__' in entry_point.__globals__
@@ -83,8 +97,9 @@ def initialize(entry_point):
         else:
             config.parse(None)
 
+        # Initialize logging... and log some remembered messages from
+        # config module.
         logging_utils.initialize_logging(logging.getLogger())
-
         config.late_logging()
 
         # Allow programs that don't bother to override the random seed
@@ -102,8 +117,8 @@ def initialize(entry_point):
             logger.debug(msg)
         random.seed(random_seed)
 
+        # Do it, invoke the user's code.  Pay attention to how long it takes.
         logger.debug(f'Starting {entry_point.__name__} (program entry point)')
-
         ret = None
         import stopwatch
         with stopwatch.Timer() as t:
@@ -121,6 +136,8 @@ def initialize(entry_point):
                      f'child system: {cstime}s\n'
                      f'machine uptime: {elapsed_time}s\n'
                      f'walltime: {walltime}s')
+
+        # If it doesn't return cleanly, call attention to the return value.
         if ret is not None and ret != 0:
             logger.error(f'Exit {ret}')
         else:
diff --git a/config.py b/config.py
index 574449f..1e690bc 100644
--- a/config.py
+++ b/config.py
@@ -1,7 +1,9 @@
 #!/usr/bin/env python3
 
-"""Global configuration driven by commandline arguments (even across
-different modules).  Usage:
+"""Global configuration driven by commandline arguments, environment variables
+and saved configuration files.  This works across several modules.
+
+Usage:
 
     module.py:
     ----------
@@ -36,7 +38,8 @@ different modules).  Usage:
         config.parse()   # Very important, this must be invoked!
 
     If you set this up and remember to invoke config.parse(), all commandline
-    arguments will play nicely together:
+    arguments will play nicely together.  This is done automatically for you
+    if you're using the bootstrap module's initialize wrapper.
 
     % main.py -h
     usage: main.py [-h]
diff --git a/dict_utils.py b/dict_utils.py
index 6dd79f3..92fd1e0 100644
--- a/dict_utils.py
+++ b/dict_utils.py
@@ -25,6 +25,7 @@ def init_or_inc(
     False
     >>> d
     {'test': 2, 'ing': 1}
+
     """
     if key in d.keys():
         d[key] = inc_function(d[key])
@@ -71,6 +72,7 @@ def coalesce(
     >>> c = {'c': 1, 'd': 2}
     >>> coalesce([a, b, c])
     {'a': 1, 'b': [1, 2], 'c': [1, 2], 'd': [2, 3]}
+
     """
     out: Dict[Any, Any] = {}
     for d in inputs:
@@ -93,6 +95,7 @@ def item_with_max_value(d: Dict[Any, Any]) -> Tuple[Any, Any]:
     Traceback (most recent call last):
     ...
     ValueError: max() arg is an empty sequence
+
     """
     return max(d.items(), key=lambda _: _[1])
 
@@ -103,6 +106,7 @@ def item_with_min_value(d: Dict[Any, Any]) -> Tuple[Any, Any]:
     >>> d = {'a': 1, 'b': 2, 'c': 3}
     >>> item_with_min_value(d)
     ('a', 1)
+
     """
     return min(d.items(), key=lambda _: _[1])
 
@@ -113,6 +117,7 @@ def key_with_max_value(d: Dict[Any, Any]) -> Any:
     >>> d = {'a': 1, 'b': 2, 'c': 3}
     >>> key_with_max_value(d)
     'c'
+
     """
     return item_with_max_value(d)[0]
 
@@ -123,6 +128,7 @@ def key_with_min_value(d: Dict[Any, Any]) -> Any:
     >>> d = {'a': 1, 'b': 2, 'c': 3}
     >>> key_with_min_value(d)
     'a'
+
     """
     return item_with_min_value(d)[0]
 
@@ -133,6 +139,7 @@ def max_value(d: Dict[Any, Any]) -> Any:
     >>> d = {'a': 1, 'b': 2, 'c': 3}
     >>> max_value(d)
     3
+
     """
     return item_with_max_value(d)[1]
 
@@ -143,6 +150,7 @@ def min_value(d: Dict[Any, Any]) -> Any:
     >>> d = {'a': 1, 'b': 2, 'c': 3}
     >>> min_value(d)
     1
+
     """
     return item_with_min_value(d)[1]
 
@@ -153,6 +161,7 @@ def max_key(d: Dict[Any, Any]) -> Any:
     >>> d = {'a': 3, 'b': 2, 'c': 1}
     >>> max_key(d)
     'c'
+
     """
     return max(d.keys())
 
@@ -163,6 +172,7 @@ def min_key(d: Dict[Any, Any]) -> Any:
     >>> d = {'a': 3, 'b': 2, 'c': 1}
     >>> min_key(d)
     'a'
+
     """
     return min(d.keys())
 
diff --git a/input_utils.py b/input_utils.py
index 648ee30..153641b 100644
--- a/input_utils.py
+++ b/input_utils.py
@@ -17,6 +17,8 @@ def single_keystroke_response(
     default_response: str = None,
     timeout_seconds: int = None,
 ) -> str:
+    """Get a single keystroke response to a prompt."""
+
     def _handle_timeout(signum, frame) -> None:
         raise exceptions.TimeoutError()
 
@@ -56,12 +58,16 @@ def single_keystroke_response(
 
 
 def yn_response(prompt: str = None, *, timeout_seconds=None) -> str:
+    """Get a Y/N response to a prompt."""
+
     return single_keystroke_response(
         ["y", "n", "Y", "N"], prompt=prompt, timeout_seconds=timeout_seconds
     ).lower()
 
 
 def keystroke_helper() -> None:
+    """Misc util to watch keystrokes and report what they were."""
+
     print("Watching for keystrokes; ^C to quit.")
     while True:
         key = readchar.readkey()
diff --git a/logging_utils.py b/logging_utils.py
index 25919a7..034f90c 100644
--- a/logging_utils.py
+++ b/logging_utils.py
@@ -2,6 +2,7 @@
 
 """Utilities related to logging."""
 
+import collections
 import contextlib
 import datetime
 import enum
@@ -94,6 +95,12 @@ cfg.add_argument(
     default=False,
     help='logging.info also prints to stdout.'
 )
+cfg.add_argument(
+    '--logging_max_n_times_per_message',
+    type=int,
+    default=0,
+    help='When set, ignore logged messages from the same site after N.'
+)
 
 # See also: OutputMultiplexer
 cfg.add_argument(
@@ -107,11 +114,37 @@ built_in_print = print
 
 
 class OnlyInfoFilter(logging.Filter):
+    """
+    A filter that only logs messages produced at the INFO logging level.
+    """
     def filter(self, record):
         return record.levelno == logging.INFO
 
 
+class OnlyNTimesFilter(logging.Filter):
+    """
+    A filter that only logs messages from a given site with the same
+    message at the same logging level N times and ignores subsequent
+    attempts to log.
+
+    """
+    def __init__(self, maximum: int) -> None:
+        self.maximum = maximum
+        self.counters = collections.Counter()
+        super().__init__()
+
+    def filter(self, record: logging.LogRecord) -> bool:
+        source = f'{record.pathname}+{record.lineno}+{record.levelno}+{record.msg}'
+        count = self.counters[source]
+        self.counters[source] += 1
+        return count < self.maximum
+
+
 class MillisecondAwareFormatter(logging.Formatter):
+    """
+    A formatter for adding milliseconds to log messages.
+
+    """
     converter = datetime.datetime.fromtimestamp
 
     def formatTime(self, record, datefmt=None):
@@ -199,6 +232,11 @@ def initialize_logging(logger=None) -> logging.Logger:
         handler.addFilter(OnlyInfoFilter())
         logger.addHandler(handler)
 
+    maximum = config.config['logging_max_n_times_per_message']
+    if maximum > 0:
+        for handler in handlers:
+            handler.addFilter(OnlyNTimesFilter(maximum))
+
     logger.setLevel(numeric_level)
     logger.propagate = False
 
diff --git a/logical_search.py b/logical_search.py
index 805ec22..3ebaee5 100644
--- a/logical_search.py
+++ b/logical_search.py
@@ -86,6 +86,16 @@ class Corpus(object):
     ...                    reference=None,
     ...                   )
     ...          )
+    >>> c.add_doc(Document(
+    ...                    docid=3,
+    ...                    tags=set(['urgent']),
+    ...                    properties=[
+    ...                                ('author', 'Scott'),
+    ...                                ('subject', 'car turning in front of you')
+    ...                    ],
+    ...                    reference=None,
+    ...                   )
+    ...          )
     >>> c.query('author:Scott and important')
     {1}
     """
diff --git a/string_utils.py b/string_utils.py
index 0829846..3aaf1d7 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -1059,18 +1059,26 @@ def to_bool(in_str: str) -> bool:
 
     >>> to_bool('True')
     True
+
     >>> to_bool('1')
     True
+
     >>> to_bool('yes')
     True
+
     >>> to_bool('no')
     False
+
     >>> to_bool('huh?')
     False
+
+    >>> to_bool('on')
+    True
+
     """
     if not is_string(in_str):
         raise ValueError(in_str)
-    return in_str.lower() in ("true", "1", "yes", "y", "t")
+    return in_str.lower() in ("true", "1", "yes", "y", "t", "on")
 
 
 def to_date(in_str: str) -> Optional[datetime.date]:
diff --git a/tests/run_tests.sh b/tests/run_tests.sh
index 6e0c30c..9623f1b 100755
--- a/tests/run_tests.sh
+++ b/tests/run_tests.sh
@@ -78,10 +78,11 @@ if [ ${DOCTEST} -eq 1 ]; then
         BASE="${BASE} (doctest)"
         make_header "${BASE}" "${CYAN}"
         OUT=$( python3 ${doctest} 2>&1 )
-        if [ "$OUT" == "" ]; then
+        FAILED=$( echo "${OUT}" | grep '\*\*\*Test Failed\*\*\*' | wc -l )
+        if [ $FAILED == 0 ]; then
             echo "OK"
         else
-            echo -e "${OUT}"
+            echo -e "${FAILED}"
             FAILURES=$((FAILURES+1))
         fi
     done
@@ -112,5 +113,11 @@ if [ ${INTEGRATION} -eq 1 ]; then
 fi
 
 if [ ${FAILURES} -ne 0 ]; then
-    echo -e "${RED}There were ${FAILURES} failure(s).${NC}"
+    if [ ${FAILURES} -eq 1 ]; then
+        echo -e "${RED}There was ${FAILURES} failure.${NC}"
+    else
+        echo -e "${RED}There were ${FAILURES} failures.${NC}"
+    fi
+else
+    echo -e "${GREEN}Everything looks good.${NC}"
 fi
diff --git a/unittest_utils.py b/unittest_utils.py
index 8a0556b..bb1a9b4 100644
--- a/unittest_utils.py
+++ b/unittest_utils.py
@@ -56,13 +56,14 @@ _db = '/home/scott/.python_unittest_performance_db'
 
 
 def check_method_for_perf_regressions(func: Callable) -> Callable:
-    """This is meant to be used on a method in a class that subclasses
+    """
+    This is meant to be used on a method in a class that subclasses
     unittest.TestCase.  When thus decorated it will time the execution
     of the code in the method, compare it with a database of
     historical perfmance, and fail the test with a perf-related
     message if it has become too slow.
-    """
 
+    """
     def load_known_test_performance_characteristics():
         with open(_db, 'rb') as f:
             return pickle.load(f)
@@ -107,7 +108,7 @@ def check_method_for_perf_regressions(func: Callable) -> Callable:
             )
         else:
             stdev = statistics.stdev(hist)
-            limit = hist[-1] + stdev * 3
+            limit = hist[-1] + stdev * 5
             logger.debug(
                 f'Max acceptable performace for {func.__name__} is {limit:f}s'
             )
@@ -117,12 +118,14 @@ def check_method_for_perf_regressions(func: Callable) -> Callable:
             ):
                 msg = f'''{func_id} performance has regressed unacceptably.
 {hist[-1]:f}s is the slowest record in {len(hist)} db perf samples.
-It just ran in {run_time:f}s which is >3 stdevs slower than the slowest sample.
+It just ran in {run_time:f}s which is >5 stdevs slower than the slowest sample.
 Here is the current, full db perf timing distribution:
 
-{hist}'''
-                slf = args[0]
+'''
+                for x in hist:
+                    msg += f'{x:f}\n'
                 logger.error(msg)
+                slf = args[0]
                 slf.fail(msg)
             else:
                 hist.append(run_time)