X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=remote_worker.py;h=b58c6ba0a66f8d32b2b81af72a66d23493c9b2e5;hb=36fea7f15ed17150691b5b3ead75450e575229ef;hp=ebd510040d15ac377165281a75c20c8ce63a8474;hpb=497fb9e21f45ec08e1486abaee6dfa7b20b8a691;p=python_utils.git diff --git a/remote_worker.py b/remote_worker.py index ebd5100..b58c6ba 100755 --- a/remote_worker.py +++ b/remote_worker.py @@ -4,21 +4,24 @@ results. """ +import logging import os -import platform import signal -import sys import threading +import sys import time import cloudpickle # type: ignore import psutil # type: ignore +import argparse_utils import bootstrap import config from thread_utils import background_thread +logger = logging.getLogger(__file__) + cfg = config.add_commandline_args( f"Remote Worker ({__file__})", "Helper to run pickled code remotely and return results", @@ -28,32 +31,46 @@ cfg.add_argument( type=str, required=True, metavar='FILENAME', - help='The location of the bundle of code to execute.' + help='The location of the bundle of code to execute.', ) cfg.add_argument( '--result_file', type=str, required=True, metavar='FILENAME', - help='The location where we should write the computation results.' + help='The location where we should write the computation results.', +) +cfg.add_argument( + '--watch_for_cancel', + action=argparse_utils.ActionNoYes, + default=True, + help='Should we watch for the cancellation of our parent ssh process?', ) @background_thread def watch_for_cancel(terminate_event: threading.Event) -> None: + logger.debug('Starting up background thread...') p = psutil.Process(os.getpid()) while True: saw_sshd = False ancestors = p.parents() for ancestor in ancestors: name = ancestor.name() - if 'ssh' in name or 'Ssh' in name: + pid = ancestor.pid + logger.debug(f'Ancestor process {name} (pid={pid})') + if 'ssh' in name.lower(): saw_sshd = True break - if not saw_sshd: + logger.error( + 'Did not see sshd in our ancestors list?! Committing suicide.' + ) os.system('pstree') os.kill(os.getpid(), signal.SIGTERM) + time.sleep(5.0) + os.kill(os.getpid(), signal.SIGKILL) + sys.exit(-1) if terminate_event.is_set(): return time.sleep(1.0) @@ -61,30 +78,60 @@ def watch_for_cancel(terminate_event: threading.Event) -> None: @bootstrap.initialize def main() -> None: - hostname = platform.node() - - # Windows-Linux is retarded. - if hostname != 'VIDEO-COMPUTER': - (thread, terminate_event) = watch_for_cancel() - in_file = config.config['code_file'] out_file = config.config['result_file'] - with open(in_file, 'rb') as rb: - serialized = rb.read() - - fun, args, kwargs = cloudpickle.loads(serialized) + stop_thread = None + if config.config['watch_for_cancel']: + (thread, stop_thread) = watch_for_cancel() + + logger.debug(f'Reading {in_file}.') + try: + with open(in_file, 'rb') as rb: + serialized = rb.read() + except Exception as e: + logger.exception(e) + logger.critical(f'Problem reading {in_file}. Aborting.') + stop_thread.set() + sys.exit(-1) + + logger.debug(f'Deserializing {in_file}.') + try: + fun, args, kwargs = cloudpickle.loads(serialized) + except Exception as e: + logger.exception(e) + logger.critical(f'Problem deserializing {in_file}. Aborting.') + stop_thread.set() + sys.exit(-1) + + logger.debug('Invoking user code...') + start = time.time() ret = fun(*args, **kwargs) - - serialized = cloudpickle.dumps(ret) - with open(out_file, 'wb') as wb: - wb.write(serialized) - - # Windows-Linux is retarded. - if hostname != 'VIDEO-COMPUTER': - terminate_event.set() + end = time.time() + logger.debug(f'User code took {end - start:.1f}s') + + logger.debug('Serializing results') + try: + serialized = cloudpickle.dumps(ret) + except Exception as e: + logger.exception(e) + logger.critical(f'Could not serialize result ({type(ret)}). Aborting.') + stop_thread.set() + sys.exit(-1) + + logger.debug(f'Writing {out_file}.') + try: + with open(out_file, 'wb') as wb: + wb.write(serialized) + except Exception as e: + logger.exception(e) + logger.critical(f'Error writing {out_file}. Aborting.') + stop_thread.set() + sys.exit(-1) + + if stop_thread is not None: + stop_thread.set() thread.join() - sys.exit(0) if __name__ == '__main__':