3 # © Copyright 2021-2022, Scott Gasch
5 """A helper to facilitate quick manual labeling of ML training data.
7 To use, implement a subclass that implements the QuickLabelHelper
8 interface and pass it into the quick_label function.
15 from abc import abstractmethod
16 from typing import Any, Dict, List, Optional, Set, Tuple
23 logger = logging.getLogger(__name__)
24 parser = config.add_commandline_args(
25 f"ML Quick Labeler ({__file__})",
26 "Args related to quick labeling of ML training data",
29 "--ml_quick_label_skip_list_path",
30 default="./qlabel_skip_list.txt",
32 type=argparse_utils.valid_filename,
33 help="Path to file in which to store already labeled data.",
36 "--ml_quick_label_use_skip_lists",
38 action=argparse_utils.ActionNoYes,
39 help='Should we use a skip list file to speed up execution?',
42 "--ml_quick_label_overwrite_labels",
44 action=argparse_utils.ActionNoYes,
45 help='Enable overwriting of existing labels; default is to not relabel.',
48 '--ml_quick_label_skip_where_model_agrees',
50 action=argparse_utils.ActionNoYes,
51 help='Do not filter examples where the model disagrees with the current label.',
54 '--ml_quick_label_delete_invalid_examples',
57 help='If set we will delete invalid training examples.',
class QuickLabelHelper:
    '''Interface that client code subclasses in order to drive the quick
    labeler.  Override every abstract method below in your subclass; the
    per-method comments describe the expected semantics.'''

    @abstractmethod
    def get_candidate_files(self) -> List[str]:
        '''Return the list of raw candidate files to consider for labeling.'''

    @abstractmethod
    def get_features_for_file(self, filename: str) -> Optional[str]:
        '''Return the features file that corresponds to the given raw file.'''

    @abstractmethod
    def render_example(self, filename: str, features: str, lines: List[str]) -> None:
        '''Show a raw file along with its features so that the user can
        judge it.'''

    @abstractmethod
    def unrender_example(self, filename: str, features: str) -> None:
        '''Undo whatever render_example did for this raw file and its
        features (if necessary).'''

    @abstractmethod
    def is_valid_example(self, filename: str, features: str, lines: List[str]) -> bool:
        '''Return True iff the example is valid (all features are valid,
        there are the correct number of features, etc...).'''

    @abstractmethod
    def ask_current_model_about_example(
        self,
        filename: str,
        features: str,
        lines: List[str],
    ) -> Any:
        '''Consult the current ML model about this example, if
        necessary/possible.  Return None to indicate that there is no
        model to consult.  Callers in this module index a non-None result
        as result[0] (what the model says) and result[1] (its
        probability).'''

    @abstractmethod
    def get_labelling_keystrokes(self) -> Dict[str, Any]:
        '''Return the keystrokes that count as labeling actions, each
        mapped to the label it assigns.  e.g. to ask the user to hit 'y'
        for 'yes' and code that as 255 in your features, or to hit 'n'
        for 'no' and code that as 0 in your features, return:

            { 'y': 255, 'n': 0 }

        '''

    @abstractmethod
    def get_everything_label(self) -> Any:
        '''If this returns something other than None, every example
        selected is labeled with that result.  Caveat emptor: we will
        clobber all your files.

        '''

    @abstractmethod
    def get_label_feature(self) -> str:
        '''Return the name of the feature that denotes an example's label.
        This is used to detect examples that already have a label and to
        assign labels to examples.

        '''
def _maybe_read_skip_list() -> Set[str]:
    '''Reads the skip list (files to just bypass) into memory if using.

    Returns:
        The set of entries found in the file named by the
        ml_quick_label_skip_list_path config value.  The set is empty
        when ml_quick_label_use_skip_lists is off or when the file does
        not exist yet.
    '''
    ret: Set[str] = set()
    if not config.config['ml_quick_label_use_skip_lists']:
        return ret

    quick_skip_file = config.config['ml_quick_label_skip_list_path']
    if os.path.exists(quick_skip_file):
        with open(quick_skip_file, 'r') as f:
            # Stream the file instead of slurping it with readlines().
            for line in f:
                # Normalize the same way the skip list is written out:
                # strip surrounding whitespace (including the newline) and
                # ignore blank lines, so a hand-edited file or one lacking
                # a trailing newline cannot yield '' or a truncated name.
                entry = line.strip()
                if entry:
                    ret.add(entry)
    logger.debug('Read %s and found %d entries.', quick_skip_file, len(ret))
    return ret
def _maybe_write_skip_list(skip_list) -> None:
    '''Persists the skip list (files to just bypass) to disk if using.

    Does nothing unless ml_quick_label_use_skip_lists is set.  Otherwise
    the file named by ml_quick_label_skip_list_path is overwritten with
    one entry per line; entries are stripped of surrounding whitespace
    and entries that are empty after stripping are not written.
    '''
    if not config.config['ml_quick_label_use_skip_lists']:
        return

    quick_skip_file = config.config['ml_quick_label_skip_list_path']
    stripped_entries = (entry.strip() for entry in skip_list)
    with open(quick_skip_file, 'w') as f:
        f.writelines(f'{entry}\n' for entry in stripped_entries if len(entry) > 0)
    logger.debug('Updated %s', quick_skip_file)
164 images: List[str], skip_list: Set[str], helper: QuickLabelHelper
165 ) -> List[Tuple[str, str]]:
166 '''Discard examples that have particular characteristics. e.g.
167 those that are already labeled and whose current label agrees with
168 the ML model, etc...'''
171 label_label = helper.get_label_feature()
173 if image in skip_list:
174 logger.debug('Skipping %s because of the skip list.', image)
177 features = helper.get_features_for_file(image)
178 if features is None or not os.path.exists(features):
179 logger.warning('%s/%s: features file doesn\'t exist, skipping.', image, features)
185 with open(features, 'r') as rf:
186 lines = rf.readlines()
189 if line.startswith(label_label):
190 label = ''.join(line.split(':')[1:])
191 label = label.strip()
193 filtered_lines.append(line)
195 if not helper.is_valid_example(image, features, filtered_lines):
196 logger.warning('%s/%s: Invalid example.', image, features)
197 if config.config['ml_quick_label_delete_invalid_examples']:
202 if label and not config.config['ml_quick_label_overwrite_labels']:
203 logger.warning('%s/%s: already has label, skipping.', image, features)
206 if config.config['ml_quick_label_skip_where_model_agrees']:
207 model_says = helper.ask_current_model_about_example(image, features, filtered_lines)
208 if model_says and label:
209 if model_says[0] == int(label):
211 '%s/%s: Model agrees with current label (%s), skipping.',
218 print(f'{image}/{features}: The model disagrees with the current label.')
219 print(f' ...model says {model_says[0]} with probability {model_says[1]}.')
220 print(f' ...the example is currently labeled {label}')
222 filtered_images.append((image, features))
223 return filtered_images
227 helper: QuickLabelHelper,
229 num_filtered_images: int,
231 current_features: str,
232 labeled_features: Dict[Tuple[str, str], str], # Examples already labeled
234 '''Tell an interactive user where they are in the set of examples that
235 may be labeled and the details of the current example.'''
237 label_label = helper.get_label_feature() # the key: of a label in features
239 label = labeled_features.get((current_image, current_features), None)
240 with open(current_features, 'r') as rf:
241 lines = rf.readlines()
246 if not line.startswith(label_label):
247 filtered_lines.append(line)
253 helper.render_example(current_image, current_features, filtered_lines)
254 print(f'{cursor}/{num_filtered_images} ({cursor/num_filtered_images*100.0:.1f}%) | ', end='')
255 print(f'{ansi.bold()}{current_image} / {current_features}{ansi.reset()}:')
256 print(f' ...{len(labeled_features)} currently unwritten label(s) ("W" to write).')
258 if (current_image, current_features) in labeled_features:
259 print(f' ...This example is labeled but not yet saved: {label}')
261 print(f' ...This example is already labeled on disk: {label}')
263 print(' ...This example is currently unlabeled')
264 guess = helper.ask_current_model_about_example(current_image, current_features, filtered_lines)
266 print(f' ...The ML Model says {guess}')
270 def _write_labeled_features(
271 helper: QuickLabelHelper,
272 labeled_features: Dict[Tuple[str, str], str],
275 label_label = helper.get_label_feature()
276 for image_features, label in labeled_features.items():
277 image = image_features[0]
278 features = image_features[1]
280 with open(features, 'r') as rf:
281 lines = rf.readlines()
287 if not line.startswith(label_label):
288 filtered_lines.append(line)
290 filtered_lines.append(f'{label_label}: {label}')
291 with open(features, 'w') as f:
292 f.writelines(line + '\n' for line in filtered_lines)
293 if config.config['ml_quick_label_use_skip_lists']:
295 print(f'Wrote {len(labeled_features)} labels.')
298 def quick_label(helper: QuickLabelHelper) -> None:
299 '''Pass your QuickLabelHelper implementing class to this function and
300 it will allow users to label examples and persist them to disk.
303 skip_list = _maybe_read_skip_list()
305 # Ask helper for an initial set of files.
306 images = helper.get_candidate_files()
308 logger.warning('No images files to operate on.')
310 logger.info('There are %d starting candidate images.', len(images))
# Filter out any that can't be converted to features or already have a
# label (unless they used --ml_quick_label_overwrite_labels).
314 filtered_images = _filter_images(images, skip_list, helper)
315 if len(filtered_images) == 0:
316 logger.warning('No image files to operate on (post filter).')
318 logger.info('There are %d candidate images post filtering.', len(filtered_images))
320 # Allow the user to label the non-filtered images one by one.
321 labeled_features: Dict[Tuple[str, str], str] = {}
324 assert 0 <= cursor < len(filtered_images)
326 image = filtered_images[cursor][0]
327 assert os.path.exists(image)
328 features = filtered_images[cursor][1]
329 assert features and os.path.exists(features)
331 # Render the features, image and prompt.
332 _make_prompt(helper, cursor, len(filtered_images), image, features, labeled_features)
334 # Did they want everything labelled the same?
335 label_everything = helper.get_everything_label()
337 labeled_features[(image, features)] = label_everything
338 filtered_images.remove((image, features))
339 if len(filtered_images) == 0:
340 print('Nothing more to label.')
342 if cursor >= len(filtered_images):
345 # Otherwise ask about each individual example.
347 labelling_keystrokes = helper.get_labelling_keystrokes()
348 valid_keystrokes = ['<', '>', 'W', 'Q', '?']
349 valid_keystrokes += labelling_keystrokes.keys()
350 prompt = ','.join(valid_keystrokes)
351 print(f'What should I do? [{prompt}]: ', end='')
353 keystroke = input_utils.single_keystroke_response(valid_keystrokes)
359 > = Don't label, move to the next example.
360 < = Don't label, move to the previous example.
361 W = Write pending labels to disk now.
362 Q = Quit labeling now.
364 else = These keystrokes assign a label to the example and persist it.'''
366 input_utils.press_any_key()
367 elif keystroke == 'Q':
368 logger.info('Ok, stopping for now.')
369 if len(labeled_features):
370 logger.info('Discarding %d unsaved labels.', len(labeled_features))
372 elif keystroke == '>':
374 if cursor >= len(filtered_images):
375 print('Wrapping around...')
377 elif keystroke == '<':
380 print('Wrapping around...')
381 cursor = len(filtered_images) - 1
382 elif keystroke == 'W':
383 _write_labeled_features(helper, labeled_features, skip_list)
384 labeled_features = {}
385 elif keystroke in labelling_keystrokes:
386 label_value = labelling_keystrokes[keystroke]
387 labeled_features[(image, features)] = label_value
388 filtered_images.remove((image, features))
389 if len(filtered_images) == 0:
390 print('Nothing more to label.')
392 if cursor >= len(filtered_images):
395 print(f'Unknown keystroke: {keystroke}. Try again.')
397 helper.unrender_example(image, features)
399 if len(labeled_features):
400 yn = input_utils.yn_response(f'Save the {len(labeled_features)} labels to disk? [Y/N]: ')
402 _write_labeled_features(helper, labeled_features, skip_list)
403 _maybe_write_skip_list(skip_list)