#!/usr/bin/env python3

"""A helper to facilitate quick manual labeling of ML training data."""

import glob
import logging
import os
import shlex
import warnings
from dataclasses import dataclass, field
from typing import Callable, List, Optional, Set

import argparse_utils
import config

logger = logging.getLogger(__name__)

parser = config.add_commandline_args(
    f"ML Quick Labeler ({__file__})",
    "Args related to quick labeling of ML training data",
)
parser.add_argument(
    "--ml_quick_label_skip_list_path",
    default="./qlabel_skip_list.txt",
    metavar="FILENAME",
    type=argparse_utils.valid_filename,
    help="Path to file in which to store already labeled data.",
)
parser.add_argument(
    "--ml_quick_label_use_skip_lists",
    default=True,
    action=argparse_utils.ActionNoYes,
    help='Should we use a skip list file to speed up execution?',
)
parser.add_argument(
    "--ml_quick_label_overwrite_labels",
    default=False,
    action=argparse_utils.ActionNoYes,
    help='Enable overwriting existing labels; default is to not relabel.',
)


@dataclass
class InputSpec:
    """A wrapper around the input data we need to operate; should be
    populated by the caller."""

    # Glob pattern expanded to find the images to label.  Takes precedence
    # over image_file_prepopulated_list when both are set.
    image_file_glob: Optional[str] = None

    # Explicit list of image paths; used only when image_file_glob is None.
    image_file_prepopulated_list: Optional[List[str]] = None

    # Maps an image path to the path of the features file to (re)label.
    image_file_to_features_file: Optional[Callable[[str], str]] = None

    # Label name.  Features-file lines containing this text are treated as
    # existing labels, and new labels are written as "<label>: <value>".
    label: str = ''

    # Keystrokes handed to input_utils.single_keystroke_response.  A
    # default_factory is required: a bare `[]` default makes @dataclass
    # raise ValueError when the class is created.
    valid_keystrokes: List[str] = field(default_factory=list)

    # Prompt handed to input_utils.single_keystroke_response.
    prompt: str = ''

    # Maps the keystroke that was read to the label value to record.
    keystroke_to_label: Optional[Callable[[str], str]] = None


def read_skip_list() -> Set[str]:
    """Load the set of already-handled image paths from the skip list file.

    Returns:
        The non-blank, whitespace-stripped lines of the file named by
        --ml_quick_label_skip_list_path.  The set is empty when skip lists
        are disabled via --ml_quick_label_use_skip_lists or when the file
        does not exist.
    """
    ret: Set[str] = set()
    if config.config['ml_quick_label_use_skip_lists']:
        quick_skip_file = config.config['ml_quick_label_skip_list_path']
        if os.path.exists(quick_skip_file):
            with open(quick_skip_file, 'r') as f:
                for line in f:
                    # strip() rather than chopping the last character so
                    # that a final line with no trailing newline is kept
                    # intact; blank lines are not entries.
                    entry = line.strip()
                    if entry:
                        ret.add(entry)
        logger.debug('Read %s and found %d entries.', quick_skip_file, len(ret))
    return ret


def write_skip_list(skip_list: Set[str]) -> None:
    """Overwrite the skip list file with the given entries, one per line.

    Does nothing when skip lists are disabled via
    --ml_quick_label_use_skip_lists.  Entries are whitespace-stripped and
    empty entries are dropped.

    Args:
        skip_list: the image paths to persist.
    """
    if config.config['ml_quick_label_use_skip_lists']:
        quick_skip_file = config.config['ml_quick_label_skip_list_path']
        with open(quick_skip_file, 'w') as f:
            for filename in skip_list:
                filename = filename.strip()
                if len(filename) > 0:
                    f.write(f'{filename}\n')
        logger.debug('Updated %s', quick_skip_file)


def label(in_spec: InputSpec) -> None:
    """Interactively label the features file of every image in in_spec.

    For each image not already on the skip list, the matching features file
    is read.  Unless it already has a line containing in_spec.label (and
    --ml_quick_label_overwrite_labels is off), the image is shown by running
    the external `xv` command, one keystroke is read, and the features file
    is rewritten with its previous label lines replaced by a single
    "<label>: <value>" line.  Each handled image is added to the skip list,
    which is written back on exit, including when labeling is interrupted
    by an exception.

    Args:
        in_spec: describes where the images are and how to label them.

    Raises:
        ValueError: if neither image_file_glob nor
            image_file_prepopulated_list is set on in_spec.
    """
    # Imported here, as in the original code, rather than at module level.
    import input_utils

    images: List[str] = []
    if in_spec.image_file_glob is not None:
        images += glob.glob(in_spec.image_file_glob)
    elif in_spec.image_file_prepopulated_list is not None:
        images += in_spec.image_file_prepopulated_list
    else:
        raise ValueError('One of image_file_glob or image_file_prepopulated_list is required')

    skip_list = read_skip_list()
    try:
        for image in images:
            if image in skip_list:
                logger.debug('Skipping %s because of the skip list', image)
                continue
            assert in_spec.image_file_to_features_file
            features = in_spec.image_file_to_features_file(image)
            if features is None or not os.path.exists(features):
                msg = f'File {image} yielded file {features} which does not exist, SKIPPING.'
                logger.warning(msg)
                warnings.warn(msg)
                continue

            # Split the features file into the lines to keep and any
            # pre-existing label lines (which are dropped on relabel).
            filtered_lines = []
            saw_label = False
            with open(features, "r") as f:
                for line in f:
                    # Remove only the line terminator so that an
                    # unterminated final line does not lose a character.
                    line = line.rstrip('\n')
                    if in_spec.label not in line:
                        filtered_lines.append(line)
                    else:
                        saw_label = True

            if not saw_label or config.config['ml_quick_label_overwrite_labels']:
                logger.info(features)
                # Quote the path: it is interpolated into a shell command
                # and may contain spaces or shell metacharacters.
                os.system(f'xv {shlex.quote(image)} &')
                keystroke = input_utils.single_keystroke_response(
                    in_spec.valid_keystrokes,
                    prompt=in_spec.prompt,
                )
                os.system('killall xv')
                assert in_spec.keystroke_to_label
                label_value = in_spec.keystroke_to_label(keystroke)
                # No trailing newline here; one is added per line on write,
                # so adding another would leave a blank line in the file.
                filtered_lines.append(f"{in_spec.label}: {label_value}")
                with open(features, 'w') as f:
                    f.writelines(line + '\n' for line in filtered_lines)
            skip_list.add(image)
    finally:
        # Persist progress even if the interactive session is interrupted.
        write_skip_list(skip_list)