Since this thing is on the innerwebs I suppose it should have a copyright notice.
diff --git a/ml/model_trainer.py b/ml/model_trainer.py
index 041f0f805cc5958fb948a48cc9bc160bc8578956..e3d89c20421619533da6c8fdcddee739ed33ddff 100644
--- a/ml/model_trainer.py
+++ b/ml/model_trainer.py
@@ -1,8 +1,10 @@
 #!/usr/bin/env python3
 
-from __future__ import annotations
+# © Copyright 2021-2022, Scott Gasch
 
-from abc import ABC, abstractmethod
+"""This is a blueprint for training sklearn ML models."""
+
+from __future__ import annotations
 import datetime
 import glob
 import logging
@@ -10,36 +12,38 @@ import os
 import pickle
 import random
 import sys
-from types import SimpleNamespace
-from typing import Any, List, NamedTuple, Optional, Set, Tuple
 import warnings
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from types import SimpleNamespace
+from typing import Any, List, Optional, Set, Tuple
 
 import numpy as np
 from sklearn.model_selection import train_test_split  # type:ignore
 from sklearn.preprocessing import MinMaxScaler  # type: ignore
 
-from ansi import bold, reset
 import argparse_utils
 import config
-from decorator_utils import timed
 import executors
 import parallelize as par
+from ansi import bold, reset
+from decorator_utils import timed
 
-logger = logging.getLogger(__file__)
+logger = logging.getLogger(__name__)
 
 parser = config.add_commandline_args(
     f"ML Model Trainer ({__file__})",
-    "Arguments related to training an ML model"
+    "Arguments related to training an ML model",
 )
 parser.add_argument(
     "--ml_trainer_quiet",
     action="store_true",
-    help="Don't prompt the user for anything."
+    help="Don't prompt the user for anything.",
 )
 parser.add_argument(
     "--ml_trainer_delete",
     action="store_true",
-    help="Delete invalid/incomplete features files in addition to warning."
+    help="Delete invalid/incomplete features files in addition to warning.",
 )
 group = parser.add_mutually_exclusive_group()
 group.add_argument(
@@ -56,6 +60,9 @@ group.add_argument(
 
 
 class InputSpec(SimpleNamespace):
+    """A collection of info needed to train the model provided by the
+    caller."""
+
     file_glob: str
     feature_count: int
     features_to_skip: Set[str]
@@ -71,22 +78,27 @@ class InputSpec(SimpleNamespace):
     @staticmethod
     def populate_from_config() -> InputSpec:
         return InputSpec(
-            dry_run = config.config["ml_trainer_dry_run"],
-            quiet = config.config["ml_trainer_quiet"],
-            persist_percentage_threshold = config.config["ml_trainer_persist_threshold"],
-            delete_bad_inputs = config.config["ml_trainer_delete"],
+            dry_run=config.config["ml_trainer_dry_run"],
+            quiet=config.config["ml_trainer_quiet"],
+            persist_percentage_threshold=config.config["ml_trainer_persist_threshold"],
+            delete_bad_inputs=config.config["ml_trainer_delete"],
         )
 
 
-class OutputSpec(NamedTuple):
-    model_filename: Optional[str]
-    model_info_filename: Optional[str]
-    scaler_filename: Optional[str]
-    training_score: float
-    test_score: float
+@dataclass
+class OutputSpec:
+    """Info about the results of training returned to the caller."""
+
+    model_filename: Optional[str] = None
+    model_info_filename: Optional[str] = None
+    scaler_filename: Optional[str] = None
+    training_score: np.float64 = np.float64(0.0)
+    test_score: np.float64 = np.float64(0.0)
 
 
 class TrainingBlueprint(ABC):
+    """The blueprint for doing the actual training."""
+
     def __init__(self):
         self.y_train = None
         self.y_test = None
@@ -112,13 +124,13 @@ class TrainingBlueprint(ABC):
         y = np.array(y_)
 
         print("Doing random test/train split...")
-        X_train, X_test, self.y_train, self.y_test = self.test_train_split(
+        X_train, X_test, self.y_train, self.y_test = TrainingBlueprint.test_train_split(
             X,
             y,
         )
 
         print("Scaling training data...")
-        scaler, self.X_train_scaled, self.X_test_scaled = self.scale_data(
+        scaler, self.X_train_scaled, self.X_test_scaled = TrainingBlueprint.scale_data(
             X_train,
             X_test,
         )
@@ -127,25 +139,21 @@ class TrainingBlueprint(ABC):
         models = []
         modelid_to_params = {}
         for params in self.spec.training_parameters:
-            model = self.train_model(
-                params,
-                self.X_train_scaled,
-                self.y_train
-            )
+            model = self.train_model(params, self.X_train_scaled, self.y_train)
             models.append(model)
             modelid_to_params[model.get_id()] = str(params)
 
         best_model = None
-        best_score = None
-        best_test_score = None
-        best_training_score = None
+        best_score: Optional[np.float64] = None
+        best_test_score: Optional[np.float64] = None
+        best_training_score: Optional[np.float64] = None
         best_params = None
         for model in smart_future.wait_any(models):
             params = modelid_to_params[model.get_id()]
             if isinstance(model, smart_future.SmartFuture):
                 model = model._resolve()
             if model is not None:
-                training_score, test_score = self.evaluate_model(
+                training_score, test_score = TrainingBlueprint.evaluate_model(
                     model,
                     self.X_train_scaled,
                     self.y_train,
@@ -167,9 +175,7 @@ class TrainingBlueprint(ABC):
                     best_model = model
                     best_params = params
                     if not self.spec.quiet:
-                        print(
-                            f"New best score {best_score:.2f}% with params {params}"
-                        )
+                        print(f"New best score {best_score:.2f}% with params {params}")
 
         if not self.spec.quiet:
             executors.DefaultExecutors().shutdown()
@@ -177,30 +183,31 @@ class TrainingBlueprint(ABC):
             print(msg)
             logger.info(msg)
 
-        scaler_filename, model_filename, model_info_filename = (
-            self.maybe_persist_scaler_and_model(
-                best_training_score,
-                best_test_score,
-                best_params,
-                num_examples,
-                scaler,
-                best_model,
-            )
+        assert best_training_score is not None
+        assert best_test_score is not None
+        assert best_params is not None
+        (
+            scaler_filename,
+            model_filename,
+            model_info_filename,
+        ) = self.maybe_persist_scaler_and_model(
+            best_training_score,
+            best_test_score,
+            best_params,
+            num_examples,
+            scaler,
+            best_model,
         )
         return OutputSpec(
-            model_filename = model_filename,
-            model_info_filename = model_info_filename,
-            scaler_filename = scaler_filename,
-            training_score = best_training_score,
-            test_score = best_test_score,
+            model_filename=model_filename,
+            model_info_filename=model_info_filename,
+            scaler_filename=scaler_filename,
+            training_score=best_training_score,
+            test_score=best_test_score,
         )
 
     @par.parallelize(method=par.Method.THREAD)
-    def read_files_from_list(
-            self,
-            files: List[str],
-            n: int
-    ) -> Tuple[List, List]:
+    def read_files_from_list(self, files: List[str]) -> Tuple[List, List]:
         # All features
         X = []
 
@@ -223,17 +230,16 @@ class TrainingBlueprint(ABC):
                 try:
                     (key, value) = line.split(self.spec.key_value_delimiter)
                 except Exception:
-                    logger.debug(f"WARNING: bad line in file {filename} '{line}', skipped")
+                    logger.debug("WARNING: bad line in file %s '%s', skipped", filename, line)
                     continue
 
                 key = key.strip()
                 value = value.strip()
-                if (self.spec.features_to_skip is not None
-                        and key in self.spec.features_to_skip):
-                    logger.debug(f"Skipping feature {key}")
+                if self.spec.features_to_skip is not None and key in self.spec.features_to_skip:
+                    logger.debug("Skipping feature %s", key)
                     continue
 
-                value = self.normalize_feature(value)
+                value = TrainingBlueprint.normalize_feature(value)
 
                 if key == self.spec.label:
                     y.append(value)
@@ -263,10 +269,8 @@ class TrainingBlueprint(ABC):
     def make_progress_graph(self) -> None:
         if not self.spec.quiet:
             from text_utils import progress_graph
-            progress_graph(
-                self.file_done_count,
-                self.total_file_count
-            )
+
+            progress_graph(self.file_done_count, self.total_file_count)
 
     @timed
     def read_input_files(self):
@@ -282,9 +286,9 @@ class TrainingBlueprint(ABC):
         results = []
         all_files = glob.glob(self.spec.file_glob)
         self.total_file_count = len(all_files)
-        for n, files in enumerate(list_utils.shard(all_files, 500)):
+        for files in list_utils.shard(all_files, 500):
             file_list = list(files)
-            results.append(self.read_files_from_list(file_list, n))
+            results.append(self.read_files_from_list(file_list))
 
         for result in smart_future.wait_any(results, callback=self.make_progress_graph):
             result = result._resolve()
@@ -296,7 +300,8 @@ class TrainingBlueprint(ABC):
             print(" " * 80 + "\n")
         return (X, y)
 
-    def normalize_feature(self, value: str) -> Any:
+    @staticmethod
+    def normalize_feature(value: str) -> Any:
         if value in ("False", "None"):
             ret = 0
         elif value == "True":
@@ -307,7 +312,8 @@ class TrainingBlueprint(ABC):
             ret = int(value)
         return ret
 
-    def test_train_split(self, X, y) -> List:
+    @staticmethod
+    def test_train_split(X, y) -> List:
         logger.debug("Performing test/train split")
         return train_test_split(
             X,
@@ -315,9 +321,8 @@ class TrainingBlueprint(ABC):
             random_state=random.randrange(0, 1000),
         )
 
-    def scale_data(self,
-                   X_train: np.ndarray,
-                   X_test: np.ndarray) -> Tuple[Any, np.ndarray, np.ndarray]:
+    @staticmethod
+    def scale_data(X_train: np.ndarray, X_test: np.ndarray) -> Tuple[Any, np.ndarray, np.ndarray]:
         logger.debug("Scaling data")
         scaler = MinMaxScaler()
         scaler.fit(X_train)
@@ -325,36 +330,36 @@ class TrainingBlueprint(ABC):
 
     # Note: children should implement.  Consider using @parallelize.
     @abstractmethod
-    def train_model(self,
-                    parameters,
-                    X_train_scaled: np.ndarray,
-                    y_train: np.ndarray) -> Any:
+    def train_model(self, parameters, X_train_scaled: np.ndarray, y_train: np.ndarray) -> Any:
         pass
 
+    @staticmethod
     def evaluate_model(
-            self,
-            model: Any,
-            X_train_scaled: np.ndarray,
-            y_train: np.ndarray,
-            X_test_scaled: np.ndarray,
-            y_test: np.ndarray) -> Tuple[np.float64, np.float64]:
+        model: Any,
+        X_train_scaled: np.ndarray,
+        y_train: np.ndarray,
+        X_test_scaled: np.ndarray,
+        y_test: np.ndarray,
+    ) -> Tuple[np.float64, np.float64]:
         logger.debug("Evaluating the model")
         training_score = model.score(X_train_scaled, y_train) * 100.0
         test_score = model.score(X_test_scaled, y_test) * 100.0
         logger.info(
-            f"Model evaluation results: test_score={test_score:.5f}, "
-            f"train_score={training_score:.5f}"
+            "Model evaluation results: test_score=%.5f, train_score=%.5f",
+            test_score,
+            training_score,
         )
         return (training_score, test_score)
 
     def maybe_persist_scaler_and_model(
-            self,
-            training_score: np.float64,
-            test_score: np.float64,
-            params: str,
-            num_examples: int,
-            scaler: Any,
-            model: Any) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+        self,
+        training_score: np.float64,
+        test_score: np.float64,
+        params: str,
+        num_examples: int,
+        scaler: Any,
+        model: Any,
+    ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
         if not self.spec.dry_run:
             import datetime_utils
             import input_utils
@@ -368,21 +373,20 @@ Training set score: {training_score:.2f}%
 Testing set score: {test_score:.2f}%"""
             print(f'\n{info}\n')
             if (
-                    (self.spec.persist_percentage_threshold is not None and
-                     test_score > self.spec.persist_percentage_threshold)
-                    or
-                    (not self.spec.quiet
-                     and input_utils.yn_response("Write the model? [y,n]: ") == "y")
+                self.spec.persist_percentage_threshold is not None
+                and test_score > self.spec.persist_percentage_threshold
+            ) or (
+                not self.spec.quiet and input_utils.yn_response("Write the model? [y,n]: ") == "y"
             ):
                 scaler_filename = f"{self.spec.basename}_scaler.sav"
-                with open(scaler_filename, "wb") as f:
-                    pickle.dump(scaler, f)
+                with open(scaler_filename, "wb") as fb:
+                    pickle.dump(scaler, fb)
                 msg = f"Wrote {scaler_filename}"
                 print(msg)
                 logger.info(msg)
                 model_filename = f"{self.spec.basename}_model.sav"
-                with open(model_filename, "wb") as f:
-                    pickle.dump(model, f)
+                with open(model_filename, "wb") as fb:
+                    pickle.dump(model, fb)
                 msg = f"Wrote {model_filename}"
                 print(msg)
                 logger.info(msg)
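
For readers who want to see this blueprint from the caller's side, here is a minimal sketch of how a child class might plug into it after this change: subclass TrainingBlueprint, implement the abstract train_model() hook (parallelized, as the "Consider using @parallelize" note suggests), and describe the inputs with an InputSpec. Only the InputSpec fields and the train_model() signature come from the diff above; the ExampleTrainer name, the KNeighborsRegressor choice, the example paths and parameters, the import path, and the assumption that the driver is exposed as train(spec) -> OutputSpec are all illustrative.

    from sklearn.neighbors import KNeighborsRegressor  # type: ignore

    import parallelize as par
    from ml.model_trainer import InputSpec, TrainingBlueprint


    class ExampleTrainer(TrainingBlueprint):
        """Hypothetical child class: fits one KNN regressor per parameter set."""

        @par.parallelize(method=par.Method.THREAD)
        def train_model(self, parameters, X_train_scaled, y_train):
            # Called once per entry in spec.training_parameters; the blueprint
            # waits on the futures, scores each model, and keeps the best one.
            model = KNeighborsRegressor(n_neighbors=parameters["n_neighbors"])
            model.fit(X_train_scaled, y_train)
            return model


    spec = InputSpec(
        file_glob="/tmp/features/*.txt",    # illustrative path; files of "key: value" lines
        feature_count=10,                   # expected feature count per example
        features_to_skip=set(),
        key_value_delimiter=":",
        label="price",                      # the key whose value is used as the label (y)
        basename="knn",                     # prefix for knn_scaler.sav / knn_model.sav
        training_parameters=[{"n_neighbors": k} for k in (3, 5, 9)],
        dry_run=False,
        quiet=True,
        persist_percentage_threshold=80.0,  # persist if test score beats 80%
        delete_bad_inputs=False,
    )
    # Assumption: the blueprint's driver (whose body appears in the hunks above but
    # whose signature does not) is invoked roughly like this and returns an OutputSpec.
    results = ExampleTrainer().train(spec)
    print(f"test score: {results.test_score:.2f}%")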