From: Scott Gasch Date: Tue, 18 Oct 2022 20:03:22 +0000 (-0700) Subject: Add some more examples and a convenience method in config.py for X-Git-Url: https://wannabe.guru.org/gitweb/?a=commitdiff_plain;h=0b66f494f430847d4d54fa455fb6264ff05fdf99;p=pyutils.git Add some more examples and a convenience method in config.py for terminating the program due to bad flags. --- diff --git a/examples/README b/examples/README index c14e863..ff70e01 100644 --- a/examples/README +++ b/examples/README @@ -1,3 +1,37 @@ Stuff under here is example code that uses pyutils library routines and is meant to just be illustrative and fun. Each should be runnable as-is if you have pyutils installed. Use the --help flag for more info. + +cron: + Wrapper for running cronjobs with optional locks to ensure that no + more than one instance executes at the same time, optional max + frequencies, optionally touch a file on successful execution to + drive monitoring, etc... + +dedup_files: + Util that traverses a directory structure and identifies files that + are duplicates of each other then optionally deletes duplicates or + symlinks duplicates back to an original. + +parallelize_config: + This is a sample config file (place in ~/.remote_worker_records or + override with --remote_worker_records_file) for the @parallelize + framework to understand how to dispatch work to remote machines. + +pyskel: + This is a "skeleton" I keep around for when I want to start + working on a new script. + +reminder: + Reminds you of important dates which are stored in the .reminder + file. + +scrabble: + Helps you play the Scrabble word game. + +wordle: + Plays and helps you cheat at the Wordle word game. Demo of using + the @parallelize framework and shared_dict which it uses to + precompute the solution space on several processes at once. 
+ + diff --git a/examples/cron/cron.py b/examples/cron/cron.py new file mode 100755 index 0000000..2a06770 --- /dev/null +++ b/examples/cron/cron.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 + +"""Wrapper that adds exclusive locks, timeouts, timestamp accounting, +max frequency, logging, etc... to running cron jobs. +""" + +import datetime +import logging +import os +import sys +from typing import Optional + +from pyutils import bootstrap, config, exec_utils, stopwatch +from pyutils.datetimez import datetime_utils +from pyutils.files import file_utils, lockfile + +logger = logging.getLogger(__name__) + +cfg = config.add_commandline_args( + f'Python Cron Runner ({__file__})', + 'Wrapper for cron commands with locking, timeouts, and accounting.', +) +cfg.add_argument( + '--lockfile', + default=None, + metavar='LOCKFILE_PATH', + help='Path to the lockfile to use to ensure that two instances of a command do not execute contemporaneously.', +) +cfg.add_argument( + '--timeout', + type=str, + metavar='TIMEOUT', + default=None, + help='Maximum time for lock acquisition + command execution. Undecorated for seconds but "3m" or "1h 15m" work too.', +) +cfg.add_argument( + '--timestamp', + type=str, + metavar='TIMESTAMP_FILE', + default=None, + help='The /timestamp/TIMESTAMP_FILE file tracking the work being done; files\' mtimes will be set to the last successful run of a command for accounting purposes.', +) +cfg.add_argument( + '--max_frequency', + type=str, + metavar='FREQUENCY', + default=None, + help='The maximum frequency with which to do this work; even if the wrapper is invoked more often than this it will not run the command. Requires --timestamp. 
Undecorated for seconds but "3h" or "1h 15m" work too.', +) +cfg.add_argument( + '--command', + nargs='*', + required=True, + type=str, + metavar='COMMANDLINE', + help='The commandline to run under a lock.', +) +config.overwrite_argparse_epilog( + """ +cron.py's exit value: + + -1000 = some internal error occurred (see exception log). + 0 = we exited early due to not enough time passage since the last + invocation of --command. + 1000 = we could not obtain the lockfile; someone else owns it. + else = if the --command was run successfully, cron.py will exit with + the same code that the subcommand exited with. +""" +) + + +def run_command(timeout: Optional[int], timestamp_file: Optional[str]) -> int: + """Run cron command""" + cmd = ' '.join(config.config['command']) + logger.info('cron cmd = "%s"', cmd) + logger.debug('shell environment:') + for var in os.environ: + val = os.environ[var] + logger.debug('%s = %s', var, val) + logger.debug('____ (↓↓↓ output from the subprocess appears below here ↓↓↓) ____') + try: + with stopwatch.Timer() as t: + ret = exec_utils.cmd_exitcode(cmd, timeout) + logger.debug( + f'____ (↑↑↑ subprocess finished in {t():.2f}s, exit value was {ret} ↑↑↑) ____' + ) + if timestamp_file is not None and os.path.exists(timestamp_file): + logger.debug('Touching %s', timestamp_file) + file_utils.touch_file(timestamp_file) + return ret + except Exception as e: + logger.exception(e) + print('Cron subprocess failed, giving up.', file=sys.stderr) + logger.warning('Cron subprocess failed, giving up') + return -1000 + + +@bootstrap.initialize +def main() -> int: + """Entry point""" + if config.config['timestamp']: + timestamp_file = f"/timestamps/{config.config['timestamp']}" + if not file_utils.does_file_exist(timestamp_file): + logger.error( + '--timestamp argument\'s target file (%s) must already exist.', + timestamp_file, + ) + sys.exit(-1) + else: + timestamp_file = None + if config.config['max_frequency']: + config.error( + 'The --max_frequency 
argument requires the --timestamp argument.' + ) + + now = datetime.datetime.now() + if timestamp_file is not None and os.path.exists(timestamp_file): + max_frequency = config.config['max_frequency'] + if max_frequency is not None: + max_delta = datetime_utils.parse_duration(max_frequency) + if max_delta > 0: + mtime = file_utils.get_file_mtime_as_datetime(timestamp_file) + delta = now - mtime + if delta.total_seconds() < max_delta: + logger.info( + "It's only been %s since we last ran successfully; bailing out.", + datetime_utils.describe_duration_briefly(delta.total_seconds()), + ) + sys.exit(0) + + timeout = config.config['timeout'] + if timeout is not None: + timeout = datetime_utils.parse_duration(timeout) + assert timeout > 0 + logger.debug('Timeout is %ss', timeout) + lockfile_expiration = datetime.datetime.now().timestamp() + timeout + else: + logger.debug('Timeout not specified; no lockfile expiration.') + lockfile_expiration = None + + lockfile_path = config.config['lockfile'] + if lockfile_path is not None: + logger.debug('Attempting to acquire lockfile %s...', lockfile_path) + try: + with lockfile.LockFile( + lockfile_path, + do_signal_cleanup=True, + override_command=' '.join(config.config['command']), + expiration_timestamp=lockfile_expiration, + ): + return run_command(timeout, timestamp_file) + except lockfile.LockFileException as e: + logger.exception(e) + msg = f'Failed to acquire {lockfile_path}, giving up.' + logger.error(msg) + print(msg, file=sys.stderr) + return 1000 + else: + logger.debug('No lockfile indicated; not locking anything.') + return run_command(timeout, timestamp_file) + + +if __name__ == '__main__': + # Insist that our logger.whatever('messages') make their way into + # syslog with a facility=LOG_CRON, please. Yes, this is hacky. 
+ sys.argv.append('--logging_syslog') + sys.argv.append('--logging_syslog_facility=CRON') + main() diff --git a/examples/dedup_files/dedup_files.py b/examples/dedup_files/dedup_files.py new file mode 100755 index 0000000..f01ed8a --- /dev/null +++ b/examples/dedup_files/dedup_files.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 + +"""Find duplicate files (based on hash of contents) in a directory (or +tree) and deduplicate them by either deleting duplicates or (with -l) +symlinking duplicates to a canonical original. +""" + +import logging +import os +from collections import defaultdict + +from pyutils import bootstrap, config, string_utils +from pyutils.files import file_utils + +logger = logging.getLogger(__name__) +parser = config.add_commandline_args( + f'Dedup Files ({__file__})', + 'Deduplicate files based on content in a directory or recursively', +) +parser.add_argument( + 'start_dirs', + type=str, + nargs='*', + help='Filespec (glob) of starting directory', +) +parser.add_argument( + '-n', + '--dry_run', + action='store_true', + help='Do nothing, just say what you\'d do', +) +parser.add_argument( + '-R', + '--recursive', + action='store_true', + help='Traverse recursively', +) +parser.add_argument( + '-l', + '--link', + action='store_true', + help='Instead of deleting duplicates, create symbolic links', +) + + +@bootstrap.initialize +def main() -> int: + """Entry point""" + sigs = defaultdict(list) + sizes = defaultdict(list) + dry_size = 0 + + for spec in config.config['start_dirs']: + if config.config['recursive']: + filez = file_utils.get_files_recursive(spec) + else: + filez = file_utils.get_files(spec) + + for filename in filez: + if not file_utils.is_symlink(filename) and file_utils.is_normal_file( + filename + ): + size = file_utils.get_file_size(filename) + sizes[size].append(filename) + logging.debug('%d => %s', size, sizes[size]) + + for size in sizes: + files = sizes[size] + if len(files) > 1: + logging.debug('%s (size=%d) need checksums', files, 
size) + for filename in files: + md5 = file_utils.get_file_md5(filename) + sigs[md5].append(filename) + + for md5 in sigs: + files = sigs[md5] + if len(files) > 1: + logging.debug('%s are all dupes', files) + + filename = files[0] + for dupe in files[1:]: + if len(dupe) > len(filename): + filename = dupe + + for dupe in files: + if filename == dupe: + continue + + assert not file_utils.is_symlink(dupe) + if config.config['dry_run']: + print(f'{filename} == {dupe}.') + dry_size += file_utils.get_file_size(dupe) + else: + assert len(filename) >= len(dupe) + saved = filename + killed = dupe + print(f'{killed} == {saved} -- DELETED') + logger.info('Deleting %s', killed) + os.remove(killed) + if config.config['link']: + logger.info('Creating symlink from %s -> %s', saved, killed) + os.symlink(saved, killed) + filename = saved + + if dry_size > 0: + print( + f'Running w/o -n would have deleted {string_utils.add_thousands_separator(dry_size)} bytes from disk.' + ) + return 0 + + +if __name__ == '__main__': + main() diff --git a/src/pyutils/config.py b/src/pyutils/config.py index ad97038..3dfc4eb 100644 --- a/src/pyutils/config.py +++ b/src/pyutils/config.py @@ -702,6 +702,15 @@ def parse(entry_module: Optional[str]) -> Dict[str, Any]: return CONFIG.parse(entry_module) +def error(message: str, exit_code: int = 1) -> None: + """ + Convenience method for indicating a configuration error. + """ + logging.error(message) + print(message, file=sys.stderr) + sys.exit(exit_code) + + def has_been_parsed() -> bool: """Returns True iff the global config has already been parsed""" return CONFIG.has_been_parsed()