Stuff under here is example code that uses pyutils library routines and
is meant to be illustrative and fun. Each example should be runnable
as-is if you have pyutils installed. Use the --help flag for more info.
+
+cron:
+    Wrapper for running cronjobs with optional locks to ensure that no
+    more than one instance executes at the same time, optional maximum
+    frequencies to limit how often a job runs, an optional file that is
+    touched on each successful execution to drive monitoring, etc...
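+
+    For instance, a hypothetical invocation (the paths and names below
+    are made up for illustration; the file /timestamps/backup must
+    already exist) might look like:
+
+        cron.py --lockfile=/tmp/backup.lock --timeout='1h' \
+                --timestamp=backup --max_frequency='12h' \
+                --command /usr/local/bin/backup.sh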
+
+dedup_files:
+    Util that traverses a directory structure, identifies files that
+    are duplicates of each other, and then optionally deletes the
+    duplicates or symlinks them back to a canonical original.
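+
+    For example, to see what would be deleted without changing anything
+    (the directory here is hypothetical):
+
+        dedup_files.py -n -R /home/user/photos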
+
+parallelize_config:
+    This is a sample config file (place it in ~/.remote_worker_records
+    or override with --remote_worker_records_file) that tells the
+    @parallelize framework how to dispatch work to remote machines.
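+
+    As a rough sketch, such a file might look something like the JSON
+    below; the field names follow pyutils' RemoteWorkerRecord but the
+    username, machine, and numbers are invented, so treat the sample
+    file itself as authoritative:
+
+        {
+            "remote_worker_records": [
+                { "username": "user", "machine": "worker1.example.com",
+                  "weight": 24, "count": 5 }
+            ]
+        }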
+
+pyskel:
+ This is a "skeleton" I keep around for when I want to start
+ working on a new script.
+
+reminder:
+ Reminds you of important dates which are stored in the .reminder
+ file.
+
+scrabble:
+    Helps you play the Scrabble word game.
+
+wordle:
+    Plays and helps you cheat at the Wordle word game. A demo of the
+    @parallelize framework and of shared_dict, which it uses to
+    precompute the solution space on several processes at once.
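+
+    As a rough sketch (a simplified illustration, not wordle's actual
+    code; check the script and your installed pyutils for the exact
+    API), code using the framework looks something like:
+
+        from pyutils.parallelize import parallelize as p
+        from pyutils.parallelize import smart_future
+
+        @p.parallelize(method=p.Method.PROCESS)
+        def score_shard(shard):
+            ...  # precompute one shard of the solution space
+
+        futures = [score_shard(s) for s in shards]
+        smart_future.wait_all(futures)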
+
+
--- /dev/null
+#!/usr/bin/env python3
+
+"""Wrapper that adds exclusive locks, timeouts, timestamp accounting,
+max frequency, logging, etc... to running cron jobs.
+"""
+
+import datetime
+import logging
+import os
+import sys
+from typing import Optional
+
+from pyutils import bootstrap, config, exec_utils, stopwatch
+from pyutils.datetimez import datetime_utils
+from pyutils.files import file_utils, lockfile
+
+logger = logging.getLogger(__name__)
+
+cfg = config.add_commandline_args(
+ f'Python Cron Runner ({__file__})',
+ 'Wrapper for cron commands with locking, timeouts, and accounting.',
+)
+cfg.add_argument(
+ '--lockfile',
+ default=None,
+ metavar='LOCKFILE_PATH',
+ help='Path to the lockfile to use to ensure that two instances of a command do not execute contemporaneously.',
+)
+cfg.add_argument(
+ '--timeout',
+ type=str,
+ metavar='TIMEOUT',
+ default=None,
+ help='Maximum time for lock acquisition + command execution. Undecorated for seconds but "3m" or "1h 15m" work too.',
+)
+cfg.add_argument(
+ '--timestamp',
+ type=str,
+ metavar='TIMESTAMP_FILE',
+ default=None,
+    help='The /timestamps/TIMESTAMP_FILE file tracking the work being done; the file\'s mtime will be set to the time of the last successful run of the command for accounting purposes.',
+)
+cfg.add_argument(
+ '--max_frequency',
+ type=str,
+ metavar='FREQUENCY',
+ default=None,
+ help='The maximum frequency with which to do this work; even if the wrapper is invoked more often than this it will not run the command. Requires --timestamp. Undecorated for seconds but "3h" or "1h 15m" work too.',
+)
+cfg.add_argument(
+ '--command',
+ nargs='*',
+ required=True,
+ type=str,
+ metavar='COMMANDLINE',
+ help='The commandline to run under a lock.',
+)
+config.overwrite_argparse_epilog(
+ """
+cron.py's exit value:
+
+ -1000 = some internal error occurred (see exception log).
+    0     = we exited early because not enough time has passed since the
+            last successful run of --command (see --max_frequency).
+    1000  = we could not obtain the lockfile; someone else owns it.
+    else  = if --command was run, cron.py exits with the same exit code
+            that the subcommand exited with.
+"""
+)
+
+
+def run_command(timeout: Optional[int], timestamp_file: Optional[str]) -> int:
+ """Run cron command"""
+ cmd = ' '.join(config.config['command'])
+ logger.info('cron cmd = "%s"', cmd)
+ logger.debug('shell environment:')
+ for var in os.environ:
+ val = os.environ[var]
+ logger.debug('%s = %s', var, val)
+ logger.debug('____ (↓↓↓ output from the subprocess appears below here ↓↓↓) ____')
+ try:
+ with stopwatch.Timer() as t:
+ ret = exec_utils.cmd_exitcode(cmd, timeout)
+        logger.debug(
+            '____ (↑↑↑ subprocess finished in %.2fs, exit value was %d ↑↑↑) ____',
+            t(),
+            ret,
+        )
+        if ret == 0 and timestamp_file is not None and os.path.exists(timestamp_file):
+            # Only touch the timestamp after a successful run; its mtime
+            # is what --max_frequency and external monitoring key off of.
+            logger.debug('Touching %s', timestamp_file)
+            file_utils.touch_file(timestamp_file)
+ return ret
+ except Exception as e:
+ logger.exception(e)
+ print('Cron subprocess failed, giving up.', file=sys.stderr)
+ logger.warning('Cron subprocess failed, giving up')
+ return -1000
+
+
+@bootstrap.initialize
+def main() -> int:
+ """Entry point"""
+ if config.config['timestamp']:
+ timestamp_file = f"/timestamps/{config.config['timestamp']}"
+ if not file_utils.does_file_exist(timestamp_file):
+ logger.error(
+ '--timestamp argument\'s target file (%s) must already exist.',
+ timestamp_file,
+ )
+ sys.exit(-1)
+ else:
+ timestamp_file = None
+ if config.config['max_frequency']:
+ config.error(
+ 'The --max_frequency argument requires the --timestamp argument.'
+ )
+
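+    # If the previous successful run (evidenced by the timestamp file's
+    # mtime) falls within the --max_frequency window, bail out early
+    # with exit code 0.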
+ now = datetime.datetime.now()
+ if timestamp_file is not None and os.path.exists(timestamp_file):
+ max_frequency = config.config['max_frequency']
+ if max_frequency is not None:
+ max_delta = datetime_utils.parse_duration(max_frequency)
+ if max_delta > 0:
+ mtime = file_utils.get_file_mtime_as_datetime(timestamp_file)
+ delta = now - mtime
+ if delta.total_seconds() < max_delta:
+ logger.info(
+ "It's only been %s since we last ran successfully; bailing out.",
+ datetime_utils.describe_duration_briefly(delta.total_seconds()),
+ )
+ sys.exit(0)
+
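+    # parse_duration() accepts undecorated seconds as well as strings
+    # like "3m" or "1h 15m" and returns a count of seconds.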
+ timeout = config.config['timeout']
+ if timeout is not None:
+ timeout = datetime_utils.parse_duration(timeout)
+ assert timeout > 0
+ logger.debug('Timeout is %ss', timeout)
+ lockfile_expiration = datetime.datetime.now().timestamp() + timeout
+ else:
+ logger.debug('Timeout not specified; no lockfile expiration.')
+ lockfile_expiration = None
+
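+    # The expiration timestamp (acquisition time + timeout) bounds how
+    # long this invocation may hold the lock.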
+ lockfile_path = config.config['lockfile']
+ if lockfile_path is not None:
+ logger.debug('Attempting to acquire lockfile %s...', lockfile_path)
+ try:
+ with lockfile.LockFile(
+ lockfile_path,
+ do_signal_cleanup=True,
+ override_command=' '.join(config.config['command']),
+ expiration_timestamp=lockfile_expiration,
+ ):
+ return run_command(timeout, timestamp_file)
+ except lockfile.LockFileException as e:
+ logger.exception(e)
+ msg = f'Failed to acquire {lockfile_path}, giving up.'
+ logger.error(msg)
+ print(msg, file=sys.stderr)
+ return 1000
+ else:
+ logger.debug('No lockfile indicated; not locking anything.')
+ return run_command(timeout, timestamp_file)
+
+
+if __name__ == '__main__':
+ # Insist that our logger.whatever('messages') make their way into
+ # syslog with a facility=LOG_CRON, please. Yes, this is hacky.
+ sys.argv.append('--logging_syslog')
+ sys.argv.append('--logging_syslog_facility=CRON')
+ main()
--- /dev/null
+#!/usr/bin/env python3
+
+"""Find duplicate files (based on hash of contents) in a directory (or
+tree) and deduplicate them by either deleting duplicates or (with -l)
+symlinking duplicates to a canonical original.
+"""
+
+import logging
+import os
+from collections import defaultdict
+
+from pyutils import bootstrap, config, string_utils
+from pyutils.files import file_utils
+
+logger = logging.getLogger(__name__)
+parser = config.add_commandline_args(
+ f'Dedup Files ({__file__})',
+    'Deduplicate files, based on their content, in a directory (or, with -R, recursively)',
+)
+parser.add_argument(
+ 'start_dirs',
+ type=str,
+ nargs='*',
+    help='Filespec (glob) of starting directories',
+)
+parser.add_argument(
+ '-n',
+ '--dry_run',
+ action='store_true',
+ help='Do nothing, just say what you\'d do',
+)
+parser.add_argument(
+ '-R',
+ '--recursive',
+ action='store_true',
+ help='Traverse recursively',
+)
+parser.add_argument(
+ '-l',
+ '--link',
+ action='store_true',
+ help='Instead of deleting duplicates, create symbolic links',
+)
+
+
+@bootstrap.initialize
+def main() -> int:
+ """Entry point"""
+ sigs = defaultdict(list)
+ sizes = defaultdict(list)
+ dry_size = 0
+
+ for spec in config.config['start_dirs']:
+ if config.config['recursive']:
+ filez = file_utils.get_files_recursive(spec)
+ else:
+ filez = file_utils.get_files(spec)
+
+ for filename in filez:
+ if not file_utils.is_symlink(filename) and file_utils.is_normal_file(
+ filename
+ ):
+ size = file_utils.get_file_size(filename)
+ sizes[size].append(filename)
+                logger.debug('%d => %s', size, sizes[size])
+
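+    # Two files can only be duplicates if they are the same size, so
+    # bucket files by size and only checksum files whose size matches
+    # at least one other file's.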
+ for size in sizes:
+ files = sizes[size]
+ if len(files) > 1:
+            logger.debug('%s (size=%d) need checksums', files, size)
+ for filename in files:
+ md5 = file_utils.get_file_md5(filename)
+ sigs[md5].append(filename)
+
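+    # Every hash bucket holding more than one file is a set of true
+    # duplicates: keep one canonical copy and drop the rest.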
+ for md5 in sigs:
+ files = sigs[md5]
+ if len(files) > 1:
+            logger.debug('%s are all dupes', files)
+
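+            # Keep the file with the longest path as the canonical copy.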
+ filename = files[0]
+ for dupe in files[1:]:
+ if len(dupe) > len(filename):
+ filename = dupe
+
+ for dupe in files:
+ if filename == dupe:
+ continue
+
+ assert not file_utils.is_symlink(dupe)
+ if config.config['dry_run']:
+ print(f'{filename} == {dupe}.')
+ dry_size += file_utils.get_file_size(dupe)
+ else:
+ assert len(filename) >= len(dupe)
+ saved = filename
+ killed = dupe
+                    print(f'{killed} == {saved} -- DELETED')
+                    logger.info('Deleting %s', killed)
+                    os.remove(killed)
+                    if config.config['link']:
+                        # Leave a symlink at the dupe's old path pointing
+                        # back at the surviving canonical copy.
+                        logger.info('Creating symlink %s -> %s', killed, saved)
+                        os.symlink(saved, killed)
+
+ if dry_size > 0:
+ print(
+ f'Running w/o -n would have deleted {string_utils.add_thousands_separator(dry_size)} bytes from disk.'
+ )
+ return 0
+
+
+if __name__ == '__main__':
+ main()