ACL uses enums, some more tests, other stuff; in ml_model_trainer.py: defer heavy utility imports to function scope and switch smart_future.wait_many to wait_any.
[python_utils.git] / ml_model_trainer.py
index edddcc0c9f794232a5d2ee6593793518abd2ef12..ab3059f855388d06b8077a359897bb07ef5b2bc9 100644 (file)
@@ -20,14 +20,8 @@ from sklearn.preprocessing import MinMaxScaler  # type: ignore
 from ansi import bold, reset
 import argparse_utils
 import config
-import datetime_utils
-import decorator_utils
-import input_utils
-import list_utils
+from decorator_utils import timed
 import parallelize as par
-import smart_future
-import string_utils
-import text_utils
 
 logger = logging.getLogger(__file__)
 
@@ -101,6 +95,8 @@ class TrainingBlueprint(ABC):
         self.spec = None
 
     def train(self, spec: InputSpec) -> OutputSpec:
+        import smart_future
+
         random.seed()
         self.spec = spec
 
@@ -142,35 +138,36 @@ class TrainingBlueprint(ABC):
         best_test_score = None
         best_training_score = None
         best_params = None
-        for model in smart_future.wait_many(models):
+        for model in smart_future.wait_any(models):
             params = modelid_to_params[model.get_id()]
             if isinstance(model, smart_future.SmartFuture):
                 model = model._resolve()
-            training_score, test_score = self.evaluate_model(
-                model,
-                self.X_train_scaled,
-                self.y_train,
-                self.X_test_scaled,
-                self.y_test,
-            )
-            score = (training_score + test_score * 20) / 21
-            if not self.spec.quiet:
-                print(
-                    f"{bold()}{params}{reset()}: "
-                    f"Training set score={training_score:.2f}%, "
-                    f"test set score={test_score:.2f}%",
-                    file=sys.stderr,
+            if model is not None:
+                training_score, test_score = self.evaluate_model(
+                    model,
+                    self.X_train_scaled,
+                    self.y_train,
+                    self.X_test_scaled,
+                    self.y_test,
                 )
-            if best_score is None or score > best_score:
-                best_score = score
-                best_test_score = test_score
-                best_training_score = training_score
-                best_model = model
-                best_params = params
+                score = (training_score + test_score * 20) / 21
                 if not self.spec.quiet:
                     print(
-                        f"New best score {best_score:.2f}% with params {params}"
+                        f"{bold()}{params}{reset()}: "
+                        f"Training set score={training_score:.2f}%, "
+                        f"test set score={test_score:.2f}%",
+                        file=sys.stderr,
                     )
+                if best_score is None or score > best_score:
+                    best_score = score
+                    best_test_score = test_score
+                    best_training_score = training_score
+                    best_model = model
+                    best_params = params
+                    if not self.spec.quiet:
+                        print(
+                            f"New best score {best_score:.2f}% with params {params}"
+                        )
 
         if not self.spec.quiet:
             msg = f"Done training; best test set score was: {best_test_score:.1f}%"
@@ -261,11 +258,17 @@ class TrainingBlueprint(ABC):
 
     def make_progress_graph(self) -> None:
         if not self.spec.quiet:
-            text_utils.progress_graph(self.file_done_count,
-                                      self.total_file_count)
+            from text_utils import progress_graph
+            progress_graph(
+                self.file_done_count,
+                self.total_file_count
+            )
 
-    @decorator_utils.timed
+    @timed
     def read_input_files(self):
+        import list_utils
+        import smart_future
+
         # All features
         X = []
 
@@ -279,7 +282,7 @@ class TrainingBlueprint(ABC):
             file_list = list(files)
             results.append(self.read_files_from_list(file_list, n))
 
-        for result in smart_future.wait_many(results, callback=self.make_progress_graph):
+        for result in smart_future.wait_any(results, callback=self.make_progress_graph):
             result = result._resolve()
             for z in result[0]:
                 X.append(z)
@@ -349,6 +352,10 @@ class TrainingBlueprint(ABC):
             scaler: Any,
             model: Any) -> Tuple[Optional[str], Optional[str], Optional[str]]:
         if not self.spec.dry_run:
+            import datetime_utils
+            import input_utils
+            import string_utils
+
             if (
                     (self.spec.persist_percentage_threshold is not None and
                      test_score > self.spec.persist_percentage_threshold)