X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=remote_worker.py;h=82b80ea3d722090ab7254eb24eac5884a9520172;hb=31c81f6539969a5eba864d3305f9fb7bf716a367;hp=84f8d56fa33318b507f3f11618c663861718182b;hpb=bef486c8c06e8d743a98b89910658a615acc8bbc;p=python_utils.git diff --git a/remote_worker.py b/remote_worker.py index 84f8d56..82b80ea 100755 --- a/remote_worker.py +++ b/remote_worker.py @@ -7,9 +7,10 @@ results. import logging import os import signal -import threading import sys +import threading import time +from typing import Optional import cloudpickle # type: ignore import psutil # type: ignore @@ -17,9 +18,9 @@ import psutil # type: ignore import argparse_utils import bootstrap import config +from stopwatch import Timer from thread_utils import background_thread - logger = logging.getLogger(__file__) cfg = config.add_commandline_args( @@ -31,35 +32,41 @@ cfg.add_argument( type=str, required=True, metavar='FILENAME', - help='The location of the bundle of code to execute.' + help='The location of the bundle of code to execute.', ) cfg.add_argument( '--result_file', type=str, required=True, metavar='FILENAME', - help='The location where we should write the computation results.' + help='The location where we should write the computation results.', ) cfg.add_argument( '--watch_for_cancel', action=argparse_utils.ActionNoYes, - default=False, - help='Should we watch for the cancellation of our parent ssh process?' + default=True, + help='Should we watch for the cancellation of our parent ssh process?', ) @background_thread def watch_for_cancel(terminate_event: threading.Event) -> None: + logger.debug('Starting up background thread...') p = psutil.Process(os.getpid()) while True: saw_sshd = False ancestors = p.parents() for ancestor in ancestors: name = ancestor.name() + pid = ancestor.pid + logger.debug(f'Ancestor process {name} (pid={pid})') if 'ssh' in name.lower(): saw_sshd = True break if not saw_sshd: + logger.error( + 'Did not see sshd in our ancestors list?! Committing suicide.' + ) os.system('pstree') os.kill(os.getpid(), signal.SIGTERM) time.sleep(5.0) @@ -70,11 +77,28 @@ def watch_for_cancel(terminate_event: threading.Event) -> None: time.sleep(1.0) +def cleanup_and_exit( + thread: Optional[threading.Thread], + stop_thread: Optional[threading.Event], + exit_code: int, +) -> None: + if stop_thread is not None: + stop_thread.set() + assert thread is not None + thread.join() + sys.exit(exit_code) + + @bootstrap.initialize def main() -> None: in_file = config.config['code_file'] out_file = config.config['result_file'] + thread = None + stop_thread = None + if config.config['watch_for_cancel']: + (thread, stop_thread) = watch_for_cancel() + logger.debug(f'Reading {in_file}.') try: with open(in_file, 'rb') as rb: @@ -82,7 +106,7 @@ def main() -> None: except Exception as e: logger.exception(e) logger.critical(f'Problem reading {in_file}. Aborting.') - sys.exit(-1) + cleanup_and_exit(thread, stop_thread, 1) logger.debug(f'Deserializing {in_file}.') try: @@ -90,13 +114,12 @@ def main() -> None: except Exception as e: logger.exception(e) logger.critical(f'Problem deserializing {in_file}. Aborting.') - sys.exit(-1) + cleanup_and_exit(thread, stop_thread, 2) logger.debug('Invoking user code...') - start = time.time() - ret = fun(*args, **kwargs) - end = time.time() - logger.debug(f'User code took {end - start:.1f}s') + with Timer() as t: + ret = fun(*args, **kwargs) + logger.debug(f'User code took {t():.1f}s') logger.debug('Serializing results') try: @@ -104,7 +127,7 @@ def main() -> None: except Exception as e: logger.exception(e) logger.critical(f'Could not serialize result ({type(ret)}). Aborting.') - sys.exit(-1) + cleanup_and_exit(thread, stop_thread, 3) logger.debug(f'Writing {out_file}.') try: @@ -113,7 +136,8 @@ def main() -> None: except Exception as e: logger.exception(e) logger.critical(f'Error writing {out_file}. Aborting.') - sys.exit(-1) + cleanup_and_exit(thread, stop_thread, 4) + cleanup_and_exit(thread, stop_thread, 0) if __name__ == '__main__':