# © Copyright 2021-2022, Scott Gasch
-"""A helper to facilitate quick manual labeling of ML training data."""
+"""A helper to facilitate quick manual labeling of ML training data.
+
+To use, write a subclass that implements the QuickLabelHelper
+interface and pass it into the quick_label function.
+
+"""
import logging
import os
@abstractmethod
def render_example(self, filename: str, features: str, lines: List[str]) -> None:
- '''Render a raw file with its features for the user.'''
+ '''Render a raw file with its features for the user to judge.'''
pass
@abstractmethod
features: str,
lines: List[str],
) -> Any:
- '''Ask the current ML model about this example, if necessary.'''
+ '''Ask the current ML model about this example, if necessary/possible.
+ Returns None to indicate no model to consult.'''
pass
@abstractmethod
def _filter_images(
images: List[str], skip_list: Set[str], helper: QuickLabelHelper
) -> List[Tuple[str, str]]:
+ '''Discard examples that have particular characteristics, e.g.
+ those that are already labeled and whose current label agrees with
+ the ML model.'''
+
filtered_images = []
label_label = helper.get_label_feature()
for image in images:
def _make_prompt(
helper: QuickLabelHelper,
cursor: int,
- filtered_images: List[Tuple[str, str]],
- image: str,
- features: str,
- labeled_features: Dict[Tuple[str, str], str],
+ num_filtered_images: int,
+ current_image: str,
+ current_features: str,
+ labeled_features: Dict[Tuple[str, str], str], # Examples already labeled
) -> None:
- label_label = helper.get_label_feature()
+ '''Tell an interactive user where they are in the set of examples that
+ may be labeled and the details of the current example.'''
+
+ label_label = helper.get_label_feature() # the key: of a label in features
filtered_lines = []
- label = labeled_features.get((image, features), None)
- with open(features, 'r') as rf:
+ label = labeled_features.get((current_image, current_features), None)
+ with open(current_features, 'r') as rf:
lines = rf.readlines()
for line in lines:
line = line[:-1]
label = line
# Prompt...
- helper.render_example(image, features, filtered_lines)
- print(f'{cursor}/{len(filtered_images)} ({cursor/len(filtered_images)*100.0:.1f}%) | ', end='')
- print(f'{ansi.bold()}{image} / {features}{ansi.reset()}:')
- print(f' ...{len(labeled_features)} currently unsaved labels ("W" to save).')
+ helper.render_example(current_image, current_features, filtered_lines)
+ print(f'{cursor}/{num_filtered_images} ({cursor/num_filtered_images*100.0:.1f}%) | ', end='')
+ print(f'{ansi.bold()}{current_image} / {current_features}{ansi.reset()}:')
+ print(f' ...{len(labeled_features)} currently unwritten label(s) ("W" to write).')
if label:
- if (image, features) in labeled_features:
+ if (current_image, current_features) in labeled_features:
print(f' ...This example is labeled but not yet saved: {label}')
else:
print(f' ...This example is already labeled on disk: {label}')
else:
print(' ...This example is currently unlabeled')
- guess = helper.ask_current_model_about_example(image, features, filtered_lines)
+ guess = helper.ask_current_model_about_example(current_image, current_features, filtered_lines)
if guess:
print(f' ...The ML Model says {guess}')
print()
def quick_label(helper: QuickLabelHelper) -> None:
+ '''Pass an instance of your QuickLabelHelper implementation to this
+ function and it will allow users to label examples and persist them to disk.
+
+ '''
skip_list = _maybe_read_skip_list()
# Ask helper for an initial set of files.
logger.info('There are %d candidate images post filtering.', len(filtered_images))
# Allow the user to label the non-filtered images one by one.
-
labeled_features: Dict[Tuple[str, str], str] = {}
cursor = 0
while True:
assert features and os.path.exists(features)
# Render the features, image and prompt.
- _make_prompt(helper, cursor, filtered_images, image, features, labeled_features)
+ _make_prompt(helper, cursor, len(filtered_images), image, features, labeled_features)
try:
# Did they want everything labelled the same?
label_everything = helper.get_everything_label()