Add some more examples and a convenience method in config.py for
terminating the program due to bad flags.

author    Scott Gasch <[email protected]>  Tue, 18 Oct 2022 20:03:22 +0000 (13:03 -0700)
committer Scott Gasch <[email protected]>  Tue, 18 Oct 2022 20:03:22 +0000 (13:03 -0700)

examples/README
examples/cron/cron.py [new file with mode: 0755]
examples/dedup_files/dedup_files.py [new file with mode: 0755]
src/pyutils/config.py

diff --git a/examples/README b/examples/README
index c14e8639d481567bd234a96bff174c19138b6187..ff70e01dfd0daca9112b8671b00f92009ce4ba59 100644 (file)
@@ -1,3 +1,37 @@
 Stuff under here is example code that uses pyutils library routines and
 is meant to just be illustrative and fun.  Each should be runnable as-is
 if you have pyutils installed.  Use the --help flag for more info.
+
+cron:
+    Wrapper for running cronjobs with optional locks to ensure that no
+    more than one instance executes at the same time, optional maximum
+    frequencies, an optional timestamp file touched on successful
+    execution to drive monitoring, etc...
+
+dedup_files:
+    Utility that traverses a directory structure, identifies files that
+    are duplicates of each other, and then optionally deletes the
+    duplicates or symlinks them back to an original.
+
+parallelize_config:
+    This is a sample config file (place it in ~/.remote_worker_records or
+    override the location with --remote_worker_records_file) that tells
+    the @parallelize framework how to dispatch work to remote machines.
+
+pyskel:
+    This is a "skeleton" I keep around for when I want to start
+    working on a new script.
+
+reminder:
+    Reminds you of important dates which are stored in the .reminder
+    file.
+
+scrabble:
+    Helps you play the Scrabble word game.
+
+wordle:
+    Plays and helps you cheat at the Wordle word game.  A demo of the
+    @parallelize framework and shared_dict, which it uses to precompute
+    the solution space on several processes at once.
+
+
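
All of these examples share the same command-line scaffolding that pyskel
captures: each module registers its flags with config.add_commandline_args()
and they are parsed before main() runs.  A minimal sketch of that pattern,
assuming pyutils' bootstrap.initialize decorator parses the registered flags
and sets up logging the way the scripts added below rely on (names like
--name are purely illustrative):

    #!/usr/bin/env python3

    """Minimal pyutils script skeleton (illustrative sketch only)."""

    from pyutils import bootstrap, config

    # Register this module's flags; --help shows them along with pyutils' own.
    parser = config.add_commandline_args(
        f'My New Script ({__file__})',
        'One-line description of what this script does.',
    )
    parser.add_argument('--name', type=str, default='world', help='Who to greet.')


    @bootstrap.initialize  # parse flags and set up logging, then call main()
    def main() -> int:
        print(f"Hello, {config.config['name']}!")
        return 0


    if __name__ == '__main__':
        main()
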
diff --git a/examples/cron/cron.py b/examples/cron/cron.py
new file mode 100755 (executable)
index 0000000..2a06770
--- /dev/null
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+
+"""Wrapper that adds exclusive locks, timeouts, timestamp accounting,
+max frequency, logging, etc... to running cron jobs.
+"""
+
+import datetime
+import logging
+import os
+import sys
+from typing import Optional
+
+from pyutils import bootstrap, config, exec_utils, stopwatch
+from pyutils.datetimez import datetime_utils
+from pyutils.files import file_utils, lockfile
+
+logger = logging.getLogger(__name__)
+
+cfg = config.add_commandline_args(
+    f'Python Cron Runner ({__file__})',
+    'Wrapper for cron commands with locking, timeouts, and accounting.',
+)
+cfg.add_argument(
+    '--lockfile',
+    default=None,
+    metavar='LOCKFILE_PATH',
+    help='Path to the lockfile to use to ensure that two instances of a command do not execute contemporaneously.',
+)
+cfg.add_argument(
+    '--timeout',
+    type=str,
+    metavar='TIMEOUT',
+    default=None,
+    help='Maximum time for lock acquisition + command execution.  An undecorated number means seconds, but "3m" or "1h 15m" work too.',
+)
+cfg.add_argument(
+    '--timestamp',
+    type=str,
+    metavar='TIMESTAMP_FILE',
+    default=None,
+    help='The /timestamps/TIMESTAMP_FILE file tracking the work being done; this file\'s mtime will be set to the time of the last successful run of the command for accounting purposes.',
+)
+cfg.add_argument(
+    '--max_frequency',
+    type=str,
+    metavar='FREQUENCY',
+    default=None,
+    help='The maximum frequency with which to do this work; even if the wrapper is invoked more often than this, it will not run the command.  Requires --timestamp.  An undecorated number means seconds, but "3h" or "1h 15m" work too.',
+)
+cfg.add_argument(
+    '--command',
+    nargs='*',
+    required=True,
+    type=str,
+    metavar='COMMANDLINE',
+    help='The commandline to run under a lock.',
+)
+config.overwrite_argparse_epilog(
+    """
+cron.py's exit value:
+
+   -1000 = some internal error occurred (see exception log).
+       0 = we exited early because not enough time has passed since the
+           last successful invocation of --command.
+    1000 = we could not obtain the lockfile; someone else owns it.
+    else = the --command was actually run; cron.py exits with the same
+           code that the subcommand exited with.
+"""
+)
+
+
+def run_command(timeout: Optional[int], timestamp_file: Optional[str]) -> int:
+    """Run cron command"""
+    cmd = ' '.join(config.config['command'])
+    logger.info('cron cmd = "%s"', cmd)
+    logger.debug('shell environment:')
+    for var in os.environ:
+        val = os.environ[var]
+        logger.debug('%s = %s', var, val)
+    logger.debug('____ (↓↓↓ output from the subprocess appears below here ↓↓↓) ____')
+    try:
+        with stopwatch.Timer() as t:
+            ret = exec_utils.cmd_exitcode(cmd, timeout)
+        logger.debug(
+            f'____ (↑↑↑ subprocess finished in {t():.2f}s, exit value was {ret} ↑↑↑) ____'
+        )
+        if timestamp_file is not None and os.path.exists(timestamp_file):
+            logger.debug('Touching %s', timestamp_file)
+            file_utils.touch_file(timestamp_file)
+        return ret
+    except Exception as e:
+        logger.exception(e)
+        print('Cron subprocess failed, giving up.', file=sys.stderr)
+        logger.warning('Cron subprocess failed, giving up')
+        return -1000
+
+
+@bootstrap.initialize
+def main() -> int:
+    """Entry point"""
+    if config.config['timestamp']:
+        timestamp_file = f"/timestamps/{config.config['timestamp']}"
+        if not file_utils.does_file_exist(timestamp_file):
+            logger.error(
+                '--timestamp argument\'s target file (%s) must already exist.',
+                timestamp_file,
+            )
+            sys.exit(-1)
+    else:
+        timestamp_file = None
+        if config.config['max_frequency']:
+            config.error(
+                'The --max_frequency argument requires the --timestamp argument.'
+            )
+
+    now = datetime.datetime.now()
+    if timestamp_file is not None and os.path.exists(timestamp_file):
+        max_frequency = config.config['max_frequency']
+        if max_frequency is not None:
+            max_delta = datetime_utils.parse_duration(max_frequency)
+            if max_delta > 0:
+                mtime = file_utils.get_file_mtime_as_datetime(timestamp_file)
+                delta = now - mtime
+                if delta.total_seconds() < max_delta:
+                    logger.info(
+                        "It's only been %s since we last ran successfully; bailing out.",
+                        datetime_utils.describe_duration_briefly(delta.total_seconds()),
+                    )
+                    sys.exit(0)
+
+    timeout = config.config['timeout']
+    if timeout is not None:
+        timeout = datetime_utils.parse_duration(timeout)
+        assert timeout > 0
+        logger.debug('Timeout is %ss', timeout)
+        lockfile_expiration = datetime.datetime.now().timestamp() + timeout
+    else:
+        logger.debug('Timeout not specified; no lockfile expiration.')
+        lockfile_expiration = None
+
+    lockfile_path = config.config['lockfile']
+    if lockfile_path is not None:
+        logger.debug('Attempting to acquire lockfile %s...', lockfile_path)
+        try:
+            with lockfile.LockFile(
+                lockfile_path,
+                do_signal_cleanup=True,
+                override_command=' '.join(config.config['command']),
+                expiration_timestamp=lockfile_expiration,
+            ):
+                return run_command(timeout, timestamp_file)
+        except lockfile.LockFileException as e:
+            logger.exception(e)
+            msg = f'Failed to acquire {lockfile_path}, giving up.'
+            logger.error(msg)
+            print(msg, file=sys.stderr)
+            return 1000
+    else:
+        logger.debug('No lockfile indicated; not locking anything.')
+        return run_command(timeout, timestamp_file)
+
+
+if __name__ == '__main__':
+    # Insist that our logger.whatever('messages') make their way into
+    # syslog with a facility=LOG_CRON, please.  Yes, this is hacky.
+    sys.argv.append('--logging_syslog')
+    sys.argv.append('--logging_syslog_facility=CRON')
+    main()
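
Both --timeout and --max_frequency above take human-friendly duration
strings that cron.py converts to seconds with datetime_utils.parse_duration()
before comparing against the timestamp file's mtime.  A small sketch of that
conversion, assuming parse_duration() returns an integer number of seconds as
the code above expects:

    from pyutils.datetimez import datetime_utils

    # An undecorated number means seconds; suffixed forms like '3m' work too.
    for spec in ('90', '3m', '1h 15m'):
        seconds = datetime_utils.parse_duration(spec)
        brief = datetime_utils.describe_duration_briefly(seconds)
        print(f'{spec!r} -> {seconds} seconds ({brief})')
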
diff --git a/examples/dedup_files/dedup_files.py b/examples/dedup_files/dedup_files.py
new file mode 100755 (executable)
index 0000000..f01ed8a
--- /dev/null
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+
+"""Find duplicate files (based on hash of contents) in a directory (or
+tree) and deduplicate them by either deleting duplicates or (with -l)
+symlinking duplicates to a canonical original.
+"""
+
+import logging
+import os
+from collections import defaultdict
+
+from pyutils import bootstrap, config, string_utils
+from pyutils.files import file_utils
+
+logger = logging.getLogger(__name__)
+parser = config.add_commandline_args(
+    f'Dedup Files ({__file__})',
+    'Deduplicate files based on content, either in a single directory or recursively',
+)
+parser.add_argument(
+    'start_dirs',
+    type=str,
+    nargs='*',
+    help='Filespec (glob) of starting directory',
+)
+parser.add_argument(
+    '-n',
+    '--dry_run',
+    action='store_true',
+    help='Do nothing, just say what you\'d do',
+)
+parser.add_argument(
+    '-R',
+    '--recursive',
+    action='store_true',
+    help='Traverse recursively',
+)
+parser.add_argument(
+    '-l',
+    '--link',
+    action='store_true',
+    help='Instead of deleting duplicates, create symbolic links',
+)
+
+
+@bootstrap.initialize
+def main() -> int:
+    """Entry point"""
+    sigs = defaultdict(list)
+    sizes = defaultdict(list)
+    dry_size = 0
+
+    for spec in config.config['start_dirs']:
+        if config.config['recursive']:
+            filez = file_utils.get_files_recursive(spec)
+        else:
+            filez = file_utils.get_files(spec)
+
+        for filename in filez:
+            if not file_utils.is_symlink(filename) and file_utils.is_normal_file(
+                filename
+            ):
+                size = file_utils.get_file_size(filename)
+                sizes[size].append(filename)
+                logger.debug('%d => %s', size, sizes[size])
+
+    for size in sizes:
+        files = sizes[size]
+        if len(files) > 1:
+            logger.debug('%s (size=%d) need checksums', files, size)
+            for filename in files:
+                md5 = file_utils.get_file_md5(filename)
+                sigs[md5].append(filename)
+
+    for md5 in sigs:
+        files = sigs[md5]
+        if len(files) > 1:
+            logger.debug('%s are all dupes', files)
+
+            filename = files[0]
+            for dupe in files[1:]:
+                if len(dupe) > len(filename):
+                    filename = dupe
+
+            for dupe in files:
+                if filename == dupe:
+                    continue
+
+                assert not file_utils.is_symlink(dupe)
+                if config.config['dry_run']:
+                    print(f'{filename} == {dupe}.')
+                    dry_size += file_utils.get_file_size(dupe)
+                else:
+                    assert len(filename) >= len(dupe)
+                    saved = filename
+                    killed = dupe
+                    print(f'{killed} == {saved} -- DELETED')
+                    logger.info('Deleting %s', killed)
+                    os.remove(killed)
+                    if config.config['link']:
+                        logger.info('Creating symlink from %s -> %s', saved, killed)
+                        os.symlink(saved, killed)
+                    filename = saved
+
+    if dry_size > 0:
+        print(
+            f'Running w/o -n would have deleted {string_utils.add_thousands_separator(dry_size)} bytes from disk.'
+        )
+    return 0
+
+
+if __name__ == '__main__':
+    main()
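
dedup_files.py reads as few files as possible: files are first grouped by
size, and only groups with more than one member are checksummed, so files
with unique sizes are never hashed at all.  A standalone sketch of the same
size-then-hash strategy using only the standard library (the script above
uses pyutils' file_utils helpers instead; find_dupes and its paths are
illustrative):

    import hashlib
    import os
    from collections import defaultdict
    from typing import Dict, List

    def find_dupes(root: str) -> Dict[str, List[str]]:
        """Group files under root by content hash, hashing only size collisions."""
        by_size: Dict[int, List[str]] = defaultdict(list)
        for dirpath, _dirs, names in os.walk(root):
            for name in names:
                path = os.path.join(dirpath, name)
                if not os.path.islink(path) and os.path.isfile(path):
                    by_size[os.path.getsize(path)].append(path)

        by_hash: Dict[str, List[str]] = defaultdict(list)
        for paths in by_size.values():
            if len(paths) > 1:  # only same-sized files can be byte-identical
                for path in paths:
                    with open(path, 'rb') as f:
                        by_hash[hashlib.md5(f.read()).hexdigest()].append(path)
        return {digest: paths for digest, paths in by_hash.items() if len(paths) > 1}
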
diff --git a/src/pyutils/config.py b/src/pyutils/config.py
index ad97038ed4f4da3f603b84eaedd1d6cde8a77c55..3dfc4ebbc7cbe02d7cfc775636ac9c57f9f3ccda 100644 (file)
@@ -702,6 +702,15 @@ def parse(entry_module: Optional[str]) -> Dict[str, Any]:
     return CONFIG.parse(entry_module)
 
 
+def error(message: str, exit_code: int = 1) -> None:
+    """
+    Convenience method for indicating a configuration error.
+    """
+    logging.error(message)
+    print(message, file=sys.stderr)
+    sys.exit(exit_code)
+
+
 def has_been_parsed() -> bool:
     """Returns True iff the global config has already been parsed"""
     return CONFIG.has_been_parsed()
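
The new config.error() helper gives scripts a one-line way to report a bad
flag combination and terminate, as cron.py does above when --max_frequency is
passed without --timestamp.  A sketch of typical usage, assuming the flags
have already been parsed (check_flags() is just an illustrative wrapper):

    from pyutils import config

    def check_flags() -> None:
        # Bail out with exit code 1 (the default) on an invalid flag combination.
        if config.config['max_frequency'] and not config.config['timestamp']:
            config.error(
                'The --max_frequency argument requires the --timestamp argument.'
            )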