From eedcbd4f64af13ec2098508c3d839a60f7e9ffce Mon Sep 17 00:00:00 2001
From: Scott
Date: Sun, 23 Jan 2022 12:43:46 -0800
Subject: [PATCH] Clean up the remote executor code and create a dedicated
 heartbeat thread. Fix the parallelize integration test. Include other
 changes that depend on these fixes.

---
 executors.py               | 362 ++++++++++++++++++++-----------------
 file_utils.py              |  14 +-
 ml/model_trainer.py        |   3 +
 tests/parallelize_itest.py |  22 +--
 4 files changed, 210 insertions(+), 191 deletions(-)

diff --git a/executors.py b/executors.py
index 2829c69..c91c2a6 100644
--- a/executors.py
+++ b/executors.py
@@ -26,6 +26,8 @@ import config
 from decorator_utils import singleton
 from exec_utils import run_silently, cmd_in_background, cmd_with_timeout
 import histogram as hist
+from thread_utils import background_thread
+
 
 logger = logging.getLogger(__name__)
@@ -64,6 +66,7 @@ parser.add_argument(
 SSH = '/usr/bin/ssh -oForwardX11=no'
 SCP = '/usr/bin/scp'
 
+
 def make_cloud_pickle(fun, *args, **kwargs):
     logger.debug(f"Making cloudpickled bundle at {fun.__name__}")
     return cloudpickle.dumps((fun, args, kwargs))
@@ -556,6 +559,119 @@ class RemoteExecutor(BaseExecutor):
         self.total_bundles_submitted = 0
         self.backup_lock = threading.Lock()
         self.last_backup = None
+        (self.heartbeat_thread, self.heartbeat_stop_event) = self.run_periodic_heartbeat()
+
+    @background_thread
+    def run_periodic_heartbeat(self, stop_event) -> None:
+        while not stop_event.is_set():
+            time.sleep(5.0)
+            logger.debug('Running periodic heartbeat code...')
+            self.heartbeat()
+        logger.debug('Periodic heartbeat thread shutting down.')
+
+    def heartbeat(self) -> None:
+        with self.status.lock:
+            # Dump regular progress report
+            self.status.periodic_dump(self.total_bundles_submitted)
+
+            # Look for bundles to reschedule via executor.submit
+            if config.config['executors_schedule_remote_backups']:
+                self.maybe_schedule_backup_bundles()
+
+    def maybe_schedule_backup_bundles(self):
+        assert self.status.lock.locked()
+        num_done = len(self.status.finished_bundle_timings)
+        num_idle_workers = self.worker_count - self.task_count
+        now = time.time()
+        if (
+            num_done > 2
+            and num_idle_workers > 1
+            and (self.last_backup is None or (now - self.last_backup > 6.0))
+            and self.backup_lock.acquire(blocking=False)
+        ):
+            try:
+                assert self.backup_lock.locked()
+
+                bundle_to_backup = None
+                best_score = None
+                for worker, bundle_uuids in self.status.in_flight_bundles_by_worker.items():
+
+                    # Prefer to schedule backups of bundles running on
+                    # slower machines.
+                    base_score = 0
+                    for record in self.workers:
+                        if worker.machine == record.machine:
+                            base_score = float(record.weight)
+                            base_score = 1.0 / base_score
+                            base_score *= 200.0
+                            base_score = int(base_score)
+                            break
+
+                    for uuid in bundle_uuids:
+                        bundle = self.status.bundle_details_by_uuid.get(uuid, None)
+                        if (
+                            bundle is not None
+                            and bundle.src_bundle is None
+                            and bundle.backup_bundles is not None
+                        ):
+                            score = base_score
+
+                            # Schedule backups of bundles running
+                            # longer; especially those that are
+                            # unexpectedly slow.
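+                            # For example (hypothetical numbers): a
+                            # worker with weight 20 gets base_score =
+                            # int(200.0 / 20) = 10; if its bundle has
+                            # been running for 90s and is over that
+                            # worker's p95 it scores 10 + 90 + 45 =
+                            # 145 before the backup-count adjustment
+                            # below.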
+                            start_ts = self.status.start_per_bundle[uuid]
+                            if start_ts is not None:
+                                runtime = now - start_ts
+                                score += runtime
+                                logger.debug(f'score[{bundle}] => {score}  # latency boost')
+
+                                if bundle.slower_than_local_p95:
+                                    score += runtime / 2
+                                    logger.debug(f'score[{bundle}] => {score}  # >worker p95')
+
+                                if bundle.slower_than_global_p95:
+                                    score += runtime / 4
+                                    logger.debug(f'score[{bundle}] => {score}  # >global p95')
+
+                            # Prefer backups of bundles that don't
+                            # have backups already.
+                            backup_count = len(bundle.backup_bundles)
+                            if backup_count == 0:
+                                score *= 2
+                            elif backup_count == 1:
+                                score /= 2
+                            elif backup_count == 2:
+                                score /= 8
+                            else:
+                                score = 0
+                            logger.debug(
+                                f'score[{bundle}] => {score}  # {backup_count} dup backup factor'
+                            )
+
+                            if (
+                                score != 0
+                                and (best_score is None or score > best_score)
+                            ):
+                                bundle_to_backup = bundle
+                                assert bundle is not None
+                                assert bundle.backup_bundles is not None
+                                assert bundle.src_bundle is None
+                                best_score = score
+
+                # Note: this is all still happening on the heartbeat
+                # runner thread.  That's ok because
+                # schedule_backup_for_bundle uses the executor to
+                # submit the bundle again which will cause it to be
+                # picked up by a worker thread and allow this thread
+                # to return to run future heartbeats.
+                if bundle_to_backup is not None:
+                    self.last_backup = now
+                    logger.info(
+                        f'=====> SCHEDULING BACKUP {bundle_to_backup} (score={best_score:.1f}) <====='
+                    )
+                    self.schedule_backup_for_bundle(bundle_to_backup)
+            finally:
+                self.backup_lock.release()
 
     def is_worker_available(self) -> bool:
         return self.policy.is_worker_available()
@@ -580,97 +696,24 @@ class RemoteExecutor(BaseExecutor):
             logger.critical(msg)
             raise Exception(msg)
 
-    def release_worker(self, worker: RemoteWorkerRecord) -> None:
+    def release_worker(
+        self,
+        bundle: BundleDetails,
+        *,
+        was_cancelled=True
+    ) -> None:
+        worker = bundle.worker
+        assert worker is not None
         logger.debug(f'Released worker {worker}')
+        self.status.record_release_worker(
+            worker,
+            bundle.uuid,
+            was_cancelled,
+        )
         with self.cv:
             worker.count += 1
             self.cv.notify()
-
-    def heartbeat(self) -> None:
-        with self.status.lock:
-            # Regular progress report
-            self.status.periodic_dump(self.total_bundles_submitted)
-
-            # Look for bundles to reschedule.
-            num_done = len(self.status.finished_bundle_timings)
-            num_idle_workers = self.worker_count - self.task_count
-            now = time.time()
-            if (
-                config.config['executors_schedule_remote_backups']
-                and num_done > 2
-                and num_idle_workers > 1
-                and (self.last_backup is None or (now - self.last_backup > 1.0))
-                and self.backup_lock.acquire(blocking=False)
-            ):
-                try:
-                    assert self.backup_lock.locked()
-
-                    bundle_to_backup = None
-                    best_score = None
-                    for worker, bundle_uuids in self.status.in_flight_bundles_by_worker.items():
-                        # Prefer to schedule backups of bundles on slower machines.
-                        base_score = 0
-                        for record in self.workers:
-                            if worker.machine == record.machine:
-                                base_score = float(record.weight)
-                                base_score = 1.0 / base_score
-                                base_score *= 200.0
-                                base_score = int(base_score)
-                                break
-
-                        for uuid in bundle_uuids:
-                            bundle = self.status.bundle_details_by_uuid.get(uuid, None)
-                            if (
-                                bundle is not None
-                                and bundle.src_bundle is None
-                                and bundle.backup_bundles is not None
-                            ):
-                                score = base_score
-
-                                # Schedule backups of bundles running longer; especially those
-                                # that are unexpectedly slow.
-                                start_ts = self.status.start_per_bundle[uuid]
-                                if start_ts is not None:
-                                    runtime = now - start_ts
-                                    score += runtime
-                                    logger.debug(f'score[{bundle}] => {score}  # latency boost')
-
-                                    if bundle.slower_than_local_p95:
-                                        score += runtime / 2
-                                        logger.debug(f'score[{bundle}] => {score}  # >worker p95')
-
-                                    if bundle.slower_than_global_p95:
-                                        score += runtime / 2
-                                        logger.debug(f'score[{bundle}] => {score}  # >global p95')
-
-                                # Prefer backups of bundles that don't have backups already.
-                                backup_count = len(bundle.backup_bundles)
-                                if backup_count == 0:
-                                    score *= 2
-                                elif backup_count == 1:
-                                    score /= 2
-                                elif backup_count == 2:
-                                    score /= 8
-                                else:
-                                    score = 0
-                                logger.debug(f'score[{bundle}] => {score}  # {backup_count} dup backup factor')
-
-                                if (
-                                    score != 0
-                                    and (best_score is None or score > best_score)
-                                ):
-                                    bundle_to_backup = bundle
-                                    assert bundle is not None
-                                    assert bundle.backup_bundles is not None
-                                    assert bundle.src_bundle is None
-                                    best_score = score
-
-                    if bundle_to_backup is not None:
-                        self.last_backup = now
-                        logger.info(f'=====> SCHEDULING BACKUP {bundle_to_backup} (score={best_score:.1f}) <=====')
-                        self.schedule_backup_for_bundle(bundle_to_backup)
-                finally:
-                    self.backup_lock.release()
+        self.adjust_task_count(-1)
 
     def check_if_cancelled(self, bundle: BundleDetails) -> bool:
         with self.status.lock:
@@ -694,32 +737,27 @@ class RemoteExecutor(BaseExecutor):
         worker = None
         while worker is None:
             worker = self.find_available_worker_or_block(avoid_machine)
+        assert worker
 
         # Ok, found a worker.
         bundle.worker = worker
         machine = bundle.machine = worker.machine
         username = bundle.username = worker.username
-
         self.status.record_acquire_worker(worker, uuid)
         logger.debug(f'{bundle}: Running bundle on {worker}...')
 
         # Before we do any work, make sure the bundle is still viable.
+        # It may have been some time between when it was submitted and
+        # now due to lack of worker availability and someone else may
+        # have already finished it.
        if self.check_if_cancelled(bundle):
            try:
-                return self.post_launch_work(bundle)
+                return self.process_work_result(bundle)
            except Exception as e:
-                logger.exception(e)
-                logger.error(
+                logger.warning(
                    f'{bundle}: bundle says it\'s cancelled upfront but no results?!'
                )
-                assert bundle.worker is not None
-                self.status.record_release_worker(
-                    bundle.worker,
-                    bundle.uuid,
-                    True,
-                )
-                self.release_worker(bundle.worker)
-                self.adjust_task_count(-1)
+                self.release_worker(bundle)
                if is_original:
                    # Weird.  We are the original owner of this
                    # bundle.  For it to have been cancelled, a backup
                    # must have already started and completed before
                    # we even got started.  Moreover, the backup says
                    # it is done but we can't find the results it
                    # should have copied over.  Reschedule the whole
                    # thing.
+                    logger.exception(e)
+                    logger.error(
+                        f'{bundle}: We are the original owner thread and yet there are '
+                        + 'no results for this bundle.  This is unexpected and bad.'
+                    )
                    return self.emergency_retry_nasty_bundle(bundle)
                else:
                    # Expected(?).  We're a backup and our bundle is
                    # cancelled before we even got started.  Something
-                    # went bad in post_launch_work (I acutually don't
+                    # went bad in process_work_result (I actually don't
                    # see what?) but probably not worth worrying
-                    # about.
+                    # about.  Let the original thread worry about
+                    # either finding the results or complaining about
+                    # it.
                    return None

        # Send input code / data to worker machine if it's not local.
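+        # The inputs were cloudpickled into bundle.code_file by
+        # create_original_bundle; the scp below ships that file to
+        # username@machine so the remote invocation can read it via
+        # --code_file and leave its output in --result_file.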
@@ -745,24 +790,17 @@ class RemoteExecutor(BaseExecutor):
            logger.info(f"{bundle}: Copying work to {worker} via {cmd}.")
            run_silently(cmd)
            xfer_latency = time.time() - start_ts
-            logger.info(f"{bundle}: Copying done to {worker} in {xfer_latency:.1f}s.")
+            logger.info(f"{bundle}: Copying to {worker} took {xfer_latency:.1f}s.")
        except Exception as e:
-            assert bundle.worker is not None
-            self.status.record_release_worker(
-                bundle.worker,
-                bundle.uuid,
-                True,
-            )
-            self.release_worker(bundle.worker)
-            self.adjust_task_count(-1)
+            self.release_worker(bundle)
            if is_original:
                # Weird.  We tried to copy the code to the worker and it failed...
                # And we're the original bundle.  We have to retry.
                logger.exception(e)
                logger.error(
-                    f'{bundle}: Failed to send instructions to the worker machine?! ' +
-                    'This is not expected; we\'re the original bundle so this shouldn\'t ' +
-                    'be a race condition. Attempting an emergency retry...'
+                    f"{bundle}: Failed to send instructions to the worker machine?! "
+                    + "This is not expected; we're the original bundle so this shouldn't "
+                    + "be a race condition. Attempting an emergency retry..."
                )
                return self.emergency_retry_nasty_bundle(bundle)
            else:
@@ -785,8 +823,8 @@ class RemoteExecutor(BaseExecutor):
                 f' --code_file {bundle.code_file} --result_file {bundle.result_file}"')
        logger.debug(f'{bundle}: Executing {cmd} in the background to kick off work...')
        p = cmd_in_background(cmd, silent=True)
-        bundle.pid = pid = p.pid
-        logger.debug(f'{bundle}: Local ssh process pid={pid}; remote worker is {machine}.')
+        bundle.pid = p.pid
+        logger.debug(f'{bundle}: Local ssh process pid={p.pid}; remote worker is {machine}.')
        return self.wait_for_process(p, bundle, 0)

    def wait_for_process(self, p: subprocess.Popen, bundle: BundleDetails, depth: int) -> Any:
@@ -797,13 +835,7 @@ class RemoteExecutor(BaseExecutor):
                f"I've gotten repeated errors waiting on this bundle; giving up on pid={pid}."
            )
            p.terminate()
-            self.status.record_release_worker(
-                bundle.worker,
-                bundle.uuid,
-                True,
-            )
-            self.release_worker(bundle.worker)
-            self.adjust_task_count(-1)
+            self.release_worker(bundle)
            return self.emergency_retry_nasty_bundle(bundle)

        # Spin until either the ssh job we scheduled finishes the
        # bundle or some backup worker signals that it really did
        # that.
@@ -813,7 +845,6 @@ class RemoteExecutor(BaseExecutor):
        while True:
            try:
                p.wait(timeout=0.25)
            except subprocess.TimeoutExpired:
-                self.heartbeat()
                if self.check_if_cancelled(bundle):
                    logger.info(
                        f'{bundle}: another worker finished bundle, checking it out...'
                    )
                    break
            else:
                logger.info(
-                    f"{bundle}: pid {pid} ({machine}) our ssh finished, checking it out..."
+                    f"{bundle}: pid {pid} ({machine}) is finished!"
                )
                p = None
                break

        # If we get here we believe the bundle is done; either the ssh
        # finished or we noticed that some other worker seems to have
        # completed the bundle and we're bailing out.
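+        # Note: process_work_result may be reached by both the
+        # original bundle and a backup; it takes self.status.lock and
+        # consults was_cancelled, so whichever copy gets here first
+        # can safely claim the result.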
        try:
-            ret = self.post_launch_work(bundle)
+            ret = self.process_work_result(bundle)
            if ret is not None and p is not None:
                p.terminate()
            return ret
@@ -849,16 +880,10 @@ class RemoteExecutor(BaseExecutor):
            logger.warning(msg)
            return self.wait_for_process(p, bundle, depth + 1)
        else:
-            self.status.record_release_worker(
-                bundle.worker,
-                bundle.uuid,
-                True,
-            )
-            self.release_worker(bundle.worker)
-            self.adjust_task_count(-1)
+            self.release_worker(bundle)
            return self.emergency_retry_nasty_bundle(bundle)

-    def post_launch_work(self, bundle: BundleDetails) -> Any:
+    def process_work_result(self, bundle: BundleDetails) -> Any:
        with self.status.lock:
            is_original = bundle.src_bundle is None
            was_cancelled = bundle.was_cancelled
@@ -876,14 +901,25 @@ class RemoteExecutor(BaseExecutor):
        if bundle.hostname not in bundle.machine:
            cmd = f'{SCP} {username}@{machine}:{result_file} {result_file} 2>/dev/null'
            logger.info(
                f"{bundle}: Fetching results back from {username}@{machine} via {cmd}"
            )

            # If either of these throw they are handled in
            # wait_for_process.
-            run_silently(cmd)
+            attempts = 0
+            while True:
+                try:
+                    run_silently(cmd)
+                except Exception as e:
+                    attempts += 1
+                    if attempts >= 3:
+                        raise e
+                else:
+                    break
+
            run_silently(f'{SSH} {username}@{machine}'
                         f' "/bin/rm -f {code_file} {result_file}"')
+            logger.debug(f'Fetching results back took {time.time() - bundle.end_ts:.1f}s.')

        dur = bundle.end_ts - bundle.start_ts
        self.histogram.add_item(dur)
@@ -900,19 +936,14 @@ class RemoteExecutor(BaseExecutor):
                    serialized = rb.read()
                result = cloudpickle.loads(serialized)
            except Exception as e:
-                msg = f'Failed to load {result_file}, this is bad news.'
+                logger.exception(e)
+                msg = f'Failed to load {result_file}... this is bad news.'
                logger.critical(msg)
-                self.status.record_release_worker(
-                    bundle.worker,
-                    bundle.uuid,
-                    True,
-                )
-                self.release_worker(bundle.worker)
+                self.release_worker(bundle)

                # Re-raise the exception; the code in wait_for_process may
                # decide to emergency_retry_nasty_bundle here.
                raise Exception(e)
-
            logger.debug(
                f'Removing local (master) {code_file} and {result_file}.'
            )
@@ -944,15 +975,7 @@ class RemoteExecutor(BaseExecutor):
                    f'{bundle}: Notifying original {bundle.src_bundle.uuid} we beat them to it.'
                )
                bundle.src_bundle.is_cancelled.set()
-
-        assert bundle.worker is not None
-        self.status.record_release_worker(
-            bundle.worker,
-            bundle.uuid,
-            was_cancelled,
-        )
-        self.release_worker(bundle.worker)
-        self.adjust_task_count(-1)
+        self.release_worker(bundle, was_cancelled=was_cancelled)
        return result

    def create_original_bundle(self, pickle, fname: str):
@@ -1076,8 +1099,10 @@ class RemoteExecutor(BaseExecutor):

    @overrides
    def shutdown(self, wait=True) -> None:
-        self._helper_executor.shutdown(wait)
        logging.debug(f'Shutting down RemoteExecutor {self.title}')
+        self.heartbeat_stop_event.set()
+        self.heartbeat_thread.join()
+        self._helper_executor.shutdown(wait)
        print(self.histogram)


@@ -1129,26 +1154,16 @@ class DefaultExecutors(object):
                RemoteWorkerRecord(
                    username = 'scott',
                    machine = 'meerkat.cabin',
-                    weight = 5,
+                    weight = 12,
                    count = 2,
                ),
            )
-        # if self.ping('kiosk.house'):
-        #     logger.info('Found kiosk.house')
-        #     pool.append(
-        #         RemoteWorkerRecord(
-        #             username = 'pi',
-        #             machine = 'kiosk.house',
-        #             weight = 1,
-        #             count = 2,
-        #         ),
-        #     )
-        if self.ping('hero.house'):
-            logger.info('Found hero.house')
+        if self.ping('wannabe.house'):
+            logger.info('Found wannabe.house')
            pool.append(
                RemoteWorkerRecord(
                    username = 'scott',
-                    machine = 'hero.house',
+                    machine = 'wannabe.house',
                    weight = 30,
                    count = 10,
                ),
            )
@@ -1169,7 +1184,7 @@ class DefaultExecutors(object):
                RemoteWorkerRecord(
                    username = 'scott',
                    machine = 'backup.house',
-                    weight = 3,
+                    weight = 7,
                    count = 2,
                ),
            )
@@ -1184,3 +1199,14 @@ class DefaultExecutors(object):
            policy.register_worker_pool(pool)
            self.remote_executor = RemoteExecutor(pool, policy)
        return self.remote_executor
+
+    def shutdown(self) -> None:
+        if self.thread_executor is not None:
+            self.thread_executor.shutdown()
+            self.thread_executor = None
+        if self.process_executor is not None:
+            self.process_executor.shutdown()
+            self.process_executor = None
+        if self.remote_executor is not None:
+            self.remote_executor.shutdown()
+            self.remote_executor = None
diff --git a/file_utils.py b/file_utils.py
index e40a9f5..9fa8fd2 100644
--- a/file_utils.py
+++ b/file_utils.py
@@ -249,9 +249,7 @@ def is_same_file(file1: str, file2: str) -> bool:


 def get_file_raw_timestamps(filename: str) -> Optional[os.stat_result]:
-    """Stats the file and returns an os.stat_result or None on error.
-
-    """
+    """Stats the file and returns an os.stat_result or None on error."""
    try:
        return os.stat(filename)
    except Exception as e:
@@ -279,9 +277,7 @@ def get_file_raw_ctime(filename: str) -> Optional[float]:


 def get_file_md5(filename: str) -> str:
-    """Hashes filename's contents and returns an MD5.
- - """ + """Hashes filename's contents and returns an MD5.""" file_hash = hashlib.md5() with open(filename, "rb") as f: chunk = f.read(8192) @@ -371,10 +367,9 @@ def get_file_mtime_timedelta(filename: str) -> Optional[datetime.timedelta]: return get_file_timestamp_timedelta(filename, lambda x: x.st_mtime) -def describe_file_timestamp( - filename: str, extractor, *, brief=False -) -> Optional[str]: +def describe_file_timestamp(filename: str, extractor, *, brief=False) -> Optional[str]: from datetime_utils import describe_duration, describe_duration_briefly + age = get_file_timestamp_age_seconds(filename, extractor) if age is None: return None @@ -451,4 +446,5 @@ class FileWriter(object): if __name__ == '__main__': import doctest + doctest.testmod() diff --git a/ml/model_trainer.py b/ml/model_trainer.py index 79ce706..041f0f8 100644 --- a/ml/model_trainer.py +++ b/ml/model_trainer.py @@ -22,6 +22,7 @@ from ansi import bold, reset import argparse_utils import config from decorator_utils import timed +import executors import parallelize as par logger = logging.getLogger(__file__) @@ -171,9 +172,11 @@ class TrainingBlueprint(ABC): ) if not self.spec.quiet: + executors.DefaultExecutors().shutdown() msg = f"Done training; best test set score was: {best_test_score:.1f}%" print(msg) logger.info(msg) + scaler_filename, model_filename, model_info_filename = ( self.maybe_persist_scaler_and_model( best_training_score, diff --git a/tests/parallelize_itest.py b/tests/parallelize_itest.py index 9d98710..11c5676 100755 --- a/tests/parallelize_itest.py +++ b/tests/parallelize_itest.py @@ -1,13 +1,11 @@ #!/usr/bin/env python3 -import random import sys import bootstrap import parallelize as p import decorator_utils import executors -import math_utils import smart_future @@ -28,12 +26,11 @@ def compute_factorial_process(n): @p.parallelize(method=p.Method.REMOTE) -def list_primes(n): - """Calculates sum of all primes below given integer n""" - ret = [] +def compute_factorial_remote(n): + total = 1 for x in range(2, n): - ret.append(math_utils.is_prime(x)) - return ret + total *= x + return total @decorator_utils.timed @@ -61,14 +58,11 @@ def test_process_parallelization() -> None: @decorator_utils.timed def test_remote_parallelization() -> None: - results = {} + results = [] for _ in range(50): - n = random.randint(0, 100000) - results[n] = list_primes(n) - tot = 0 - for _ in results[n]: - tot += _ - print(tot) + results.append(compute_factorial_remote(_)) + for result in smart_future.wait_any(results): + print(result) rexecutor = executors.DefaultExecutors().remote_pool() rexecutor.shutdown() -- 2.45.2