results.
"""
+import logging
import os
-import platform
import signal
import sys
import threading
import time
+from typing import Optional
import cloudpickle # type: ignore
import psutil # type: ignore
+import argparse_utils
import bootstrap
import config
+from stopwatch import Timer
from thread_utils import background_thread
+logger = logging.getLogger(__file__)
cfg = config.add_commandline_args(
f"Remote Worker ({__file__})",
type=str,
required=True,
metavar='FILENAME',
- help='The location of the bundle of code to execute.'
+ help='The location of the bundle of code to execute.',
)
# Where the serialized result of the executed bundle gets written; read
# back in main() as config.config['result_file'].
cfg.add_argument(
    '--result_file',
    metavar='FILENAME',
    type=str,
    required=True,
    help='The location where we should write the computation results.',
)

# Controls whether main() starts the watch_for_cancel background thread
# (read as config.config['watch_for_cancel']); enabled by default.
# NOTE(review): presumably ActionNoYes also provides a negated form of
# the flag -- confirm against argparse_utils.
cfg.add_argument(
    '--watch_for_cancel',
    default=True,
    action=argparse_utils.ActionNoYes,
    help='Should we watch for the cancellation of our parent ssh process?',
)
@background_thread
def watch_for_cancel(terminate_event: threading.Event) -> None:
    """Terminate this process once no ssh process is among its ancestors.

    Roughly once per second the ancestor chain of the current process is
    scanned.  While at least one ancestor's name contains "ssh" (case
    insensitively) the loop keeps polling.  When none does, the process
    tree is dumped with ``pstree`` and the process sends itself SIGTERM,
    followed by SIGKILL five seconds later if it is still running.

    Args:
        terminate_event: Event that asks this loop to return.  main()
            calls ``watch_for_cancel()`` without arguments and receives a
            ``(thread, event)`` pair, so the event is supplied by the
            ``background_thread`` wrapper.
    """
    logger.debug('Starting up background thread...')
    pid_self = os.getpid()
    me = psutil.Process(pid_self)
    while True:
        saw_sshd = False
        for ancestor in me.parents():
            try:
                name = ancestor.name()
            except psutil.Error:
                # The ancestor exited (or became unreadable) between
                # parents() and name().  Skip it instead of letting the
                # exception silently kill the watcher thread.
                continue
            pid = ancestor.pid
            logger.debug(f'Ancestor process {name} (pid={pid})')
            if 'ssh' in name.lower():
                saw_sshd = True
                break

        if not saw_sshd:
            logger.error(
                'Did not see sshd in our ancestors list?! Committing suicide.'
            )
            os.system('pstree')
            os.kill(pid_self, signal.SIGTERM)
            # Give SIGTERM handlers a moment to run, then force the issue.
            time.sleep(5.0)
            os.kill(pid_self, signal.SIGKILL)
            sys.exit(-1)

        # Event.wait() doubles as the one second poll delay but returns
        # immediately once the event is set, so join() in the caller
        # does not have to sit out the remainder of a sleep.
        if terminate_event.wait(1.0):
            return
-def main() -> None:
- hostname = platform.node()
def cleanup_and_exit(
    thread: Optional[threading.Thread],
    stop_thread: Optional[threading.Event],
    exit_code: int,
) -> None:
    """Shut down the watcher thread (if there is one) and exit.

    Args:
        thread: the background watcher thread, or None when no watcher
            was started.
        stop_thread: the event that tells the watcher to stop, or None
            when no watcher was started.
        exit_code: status handed to sys.exit().

    Raises:
        SystemExit: always, carrying exit_code.
        AssertionError: if stop_thread is given without a thread.
    """
    # No stop event means there is nothing to wind down.
    if stop_thread is None:
        sys.exit(exit_code)

    # Signal the watcher, then wait for it to finish before exiting.
    stop_thread.set()
    assert thread is not None
    thread.join()
    sys.exit(exit_code)
- # Windows-Linux is retarded.
- if hostname != 'VIDEO-COMPUTER':
- (thread, terminate_event) = watch_for_cancel()
def main() -> None:
    """Run a pickled (function, args, kwargs) bundle and store its result.

    Reads the bundle named by the ``code_file`` config value, calls the
    function it contains, and writes the cloudpickle-serialized return
    value to the ``result_file`` config value.  When ``watch_for_cancel``
    is enabled a background watcher thread is started first and is
    stopped again on every exit path below.

    Always leaves via cleanup_and_exit(), i.e. by raising SystemExit.
    Exit codes:
        0: success
        1: the code file could not be read
        2: the code file could not be deserialized
        3: the result could not be serialized
        4: the result file could not be written
        5: the user code raised an exception
    """
    in_file = config.config['code_file']
    out_file = config.config['result_file']

    thread = None
    stop_thread = None
    if config.config['watch_for_cancel']:
        (thread, stop_thread) = watch_for_cancel()

    logger.debug(f'Reading {in_file}.')
    try:
        with open(in_file, 'rb') as rb:
            serialized = rb.read()
    except Exception as e:
        logger.exception(e)
        logger.critical(f'Problem reading {in_file}. Aborting.')
        cleanup_and_exit(thread, stop_thread, 1)

    # NOTE(security): unpickling executes arbitrary code.  That is the
    # purpose of this worker, but in_file must only ever come from a
    # trusted source.
    logger.debug(f'Deserializing {in_file}.')
    try:
        fun, args, kwargs = cloudpickle.loads(serialized)
    except Exception as e:
        logger.exception(e)
        logger.critical(f'Problem deserializing {in_file}. Aborting.')
        cleanup_and_exit(thread, stop_thread, 2)

    logger.debug('Invoking user code...')
    try:
        with Timer() as t:
            ret = fun(*args, **kwargs)
    except Exception as e:
        # Without this handler a failure in the user code would leave
        # main() without ever stopping/joining the watcher thread,
        # unlike every other failure path in this function.
        logger.exception(e)
        logger.critical('User code raised an exception. Aborting.')
        cleanup_and_exit(thread, stop_thread, 5)
    logger.debug(f'User code took {t():.1f}s')

    logger.debug('Serializing results')
    try:
        serialized = cloudpickle.dumps(ret)
    except Exception as e:
        logger.exception(e)
        logger.critical(f'Could not serialize result ({type(ret)}). Aborting.')
        cleanup_and_exit(thread, stop_thread, 3)

    logger.debug(f'Writing {out_file}.')
    try:
        with open(out_file, 'wb') as wb:
            wb.write(serialized)
    except Exception as e:
        logger.exception(e)
        logger.critical(f'Error writing {out_file}. Aborting.')
        cleanup_and_exit(thread, stop_thread, 4)

    cleanup_and_exit(thread, stop_thread, 0)
if __name__ == '__main__':