3 # © Copyright 2021-2022, Scott Gasch
5 """A helper to facilitate quick manual labeling of ML training data."""
11 from abc import abstractmethod
12 from typing import Any, Dict, List, Optional, Set, Tuple
# Module-level logger named after this module.
17 logger = logging.getLogger(__name__)
# Register this module's command-line flags through the shared config module.
# The values are read back later via config.config['<flag name>'].
18 parser = config.add_commandline_args(
19 f"ML Quick Labeler ({__file__})",
20 "Args related to quick labeling of ML training data",
# Path of the skip-list file; read by _maybe_read_skip_list() and rewritten
# by _maybe_write_skip_list().
23 "--ml_quick_label_skip_list_path",
24 default="./qlabel_skip_list.txt",
26 type=argparse_utils.valid_filename,
27 help="Path to file in which to store already labeled data.",
# Master switch for the skip list: both skip-list helpers and the labeling
# loop consult this flag before touching the skip list.
30 "--ml_quick_label_use_skip_lists",
32 action=argparse_utils.ActionNoYes,
33 help='Should we use a skip list file to speed up execution?',
# When unset, _filter_images() drops examples that already carry a label.
36 "--ml_quick_label_overwrite_labels",
38 action=argparse_utils.ActionNoYes,
39 help='Enable overwriting existing labels; default is to not relabel.',
# When set, _filter_images() asks the model about each example and compares
# its answer with the example's existing label.
42 '--ml_quick_label_skip_where_model_agrees',
44 action=argparse_utils.ActionNoYes,
45 help='Do not filter examples where the model disagrees with the current label.',
# Consulted by _filter_images() when helper.is_valid_example() rejects an example.
# NOTE(review): unlike the four flags above, this name has no leading "--", so
# argparse would register it as a positional argument rather than an optional
# flag -- this looks unintended; confirm and add the "--" prefix.
48 'ml_quick_label_delete_invalid_examples',
51 help='If set we will delete invalid training examples.',
class QuickLabelHelper:
    '''To use this quick labeler your code must create a subclass of this
    class and implement the abstract methods below.  See comments for
    detailed semantics.'''

    @abstractmethod
    def get_candidate_files(self) -> List[str]:
        '''This must return a list of raw candidate files for labeling.'''
        pass

    @abstractmethod
    def get_features_for_file(self, filename: str) -> Optional[str]:
        '''Given a raw file, return its features file.  Return None if
        the file has no features; such files are skipped by the labeler.'''
        pass

    @abstractmethod
    def render_example(self, filename: str, features: str, lines: List[str]) -> None:
        '''Render a raw file with its features for the user.  The lines
        passed in are the contents of the features file without the
        label line.'''
        pass

    @abstractmethod
    def unrender_example(self, filename: str, features: str, lines: List[str]) -> None:
        '''Unrender a raw file with its features (if necessary), i.e. undo
        whatever render_example did for the same arguments.'''
        pass

    @abstractmethod
    def is_valid_example(self, filename: str, features: str, lines: List[str]) -> bool:
        '''Returns true iff the example is valid (all features are valid,
        there are the correct number of features, etc...).'''
        pass

    @abstractmethod
    def ask_current_model_about_example(
        self,
        filename: str,
        features: str,
        lines: List[str],
    ) -> Any:
        '''Ask the current ML model about this example, if necessary.

        Return something falsy if there is no model opinion.  Otherwise
        the labeler reads element [0] of the result as the predicted
        label (compared against the existing label as an int) and
        element [1] as the probability of that prediction.'''
        pass

    @abstractmethod
    def get_labelling_keystrokes(self) -> Dict[str, Any]:
        '''What keystrokes should be considered valid label actions and what
        label does each keystroke map into.  e.g. if you want to ask
        the user to hit 'y' for 'yes' and code that as 255 in your
        features or to hit 'n' for 'no' and code that as 0 in your
        features, return {'y': 255, 'n': 0}.

        The labeler itself uses '<', '>', 'Q' and '?' for navigation,
        quitting and help, so do not map those keys to labels.'''
        pass

    @abstractmethod
    def get_everything_label(self) -> Any:
        '''If this returns something other than None it indicates that every
        example selected should be labeled with this result.  Caveat
        emptor, we will clobber all your files.'''
        pass

    @abstractmethod
    def get_label_feature(self) -> str:
        '''What feature denotes the example's label?  This is used to detect
        when examples already have a label and to assign labels to
        examples: a features-file line that starts with this string is
        treated as the label line.'''
        pass
def _maybe_read_skip_list() -> Set[str]:
    '''Reads the skip list (files to just bypass) into memory if using.

    Returns:
        The non-empty, whitespace-stripped entries of the file named by
        --ml_quick_label_skip_list_path, one entry per line.  The set is
        empty when --ml_quick_label_use_skip_lists is off or when the
        file does not exist yet.
    '''

    ret: Set[str] = set()
    if config.config['ml_quick_label_use_skip_lists']:
        quick_skip_file = config.config['ml_quick_label_skip_list_path']
        if os.path.exists(quick_skip_file):
            with open(quick_skip_file, 'r') as f:
                # Same normalization the writer applies: strip each
                # line and ignore blank ones.
                ret = {entry for entry in map(str.strip, f) if entry}
        logger.debug('Read %s and found %d entries.', quick_skip_file, len(ret))
    return ret
def _maybe_write_skip_list(skip_list: Set[str]) -> None:
    '''Writes the skip list (files to just bypass) to disk if using.

    Does nothing unless --ml_quick_label_use_skip_lists is on.  Otherwise
    the file named by --ml_quick_label_skip_list_path is overwritten with
    one entry per line.

    Args:
        skip_list: the filenames to persist; entries are stripped of
            surrounding whitespace and empty entries are dropped.
    '''

    if not config.config['ml_quick_label_use_skip_lists']:
        return
    quick_skip_file = config.config['ml_quick_label_skip_list_path']

    # Sets iterate in arbitrary order; sorting keeps the file contents
    # stable from run to run.  Building a set after stripping also
    # collapses entries that differ only by surrounding whitespace.
    entries = sorted({filename.strip() for filename in skip_list} - {''})
    with open(quick_skip_file, 'w') as f:
        f.writelines(f'{entry}\n' for entry in entries)
    logger.debug('Updated %s', quick_skip_file)
# Parameter list and return type of the candidate-filtering helper that
# quick_label() calls as _filter_images(images, skip_list, helper).  It returns
# (image, features-file) pairs for the candidates that pass the checks below.
157 images: List[str], skip_list: Set[str], helper: QuickLabelHelper
158 ) -> List[Tuple[str, str]]:
# Name of the feature whose line in a features file holds the label.
160 label_label = helper.get_label_feature()
# Candidates already recorded in the skip list are not examined further.
162 if image in skip_list:
163 logger.debug('Skipping %s because of the skip list', image)
# Map the raw file to its features file; a missing mapping or a missing file
# on disk is reported as a warning.
166 features = helper.get_features_for_file(image)
167 if features is None or not os.path.exists(features):
168 logger.warning('%s/%s: features doesn\'t exist, SKIPPING.', image, features)
# Read the whole features file.  Any line that starts with the label feature
# name is parsed as the existing label.
173 with open(features, 'r') as rf:
174 lines = rf.readlines()
177 if line.startswith(label_label):
# The label text is everything after the first ':'.
# NOTE(review): joining the remaining pieces with '' silently removes any
# further ':' characters inside the label value -- confirm labels never
# contain ':' (line.partition(':')[2] would preserve them).
178 label = ''.join(line.split(':')[1:])
179 label = label.strip()
# filtered_lines is what gets handed to the helper callbacks below.
181 filtered_lines.append(line)
# Let the helper veto malformed examples; the delete flag is consulted only
# for examples the helper rejected.
183 if not helper.is_valid_example(image, features, filtered_lines):
184 logger.warning('%s/%s: Invalid example.', image, features)
185 if config.config['ml_quick_label_delete_invalid_examples']:
# An existing (non-empty) label is only reconsidered when overwriting is enabled.
190 if label and not config.config['ml_quick_label_overwrite_labels']:
191 logger.warning('%s/%s: already has label, SKIPPING.', image, features)
# Optionally compare the model's prediction (element [0] of its answer) with
# the existing label.
194 if config.config['ml_quick_label_skip_where_model_agrees']:
195 model_says = helper.ask_current_model_about_example(image, features, filtered_lines)
196 if model_says and label:
# NOTE(review): int(label) raises ValueError for a label that is not an
# integer literal -- confirm every label value is integral.
197 if model_says[0] == int(label):
199 print(f'{image}/{features}: The model disagrees with the current label.')
200 print(f' ...model says {model_says[0]} with probability {model_says[1]}.')
201 print(f' ...the example is currently labeled {label}')
# Survivors are collected as (image, features-file) pairs.
202 filtered_images.append((image, features))
203 return filtered_images
# Entry point of the labeler: collects candidate files from the helper,
# filters them, then shows the survivors to the user and writes the chosen
# label into each example's features file.  The skip list is loaded first and
# written back at the end.
206 def quick_label(helper: QuickLabelHelper) -> None:
207 skip_list = _maybe_read_skip_list()
209 # Ask helper for an initial set of files.
210 images = helper.get_candidate_files()
212 logger.warning('No images files to operate on.')
215 # Filter out any that can't be converted to features or already have a
216 # label (unless they used --ml_quick_label_overwrite_labels).
217 filtered_images = _filter_images(images, skip_list, helper)
218 if len(filtered_images) == 0:
219 logger.warning('No image files to operate on (post filter).')
222 # Allow the user to label the non-filtered images one by one.
226 label_label = helper.get_label_feature()
# cursor indexes into filtered_images, whose entries are (image, features) pairs.
# NOTE(review): these asserts act as runtime checks and vanish under
# `python -O`; consider explicit checks if they guard real conditions.
228 assert 0 <= cursor < len(filtered_images)
230 image = filtered_images[cursor][0]
231 assert os.path.exists(image)
232 features = filtered_images[cursor][1]
233 assert features and os.path.exists(features)
# Re-read the features file; lines that start with the label feature name are
# kept out of filtered_lines so a fresh label line can be appended later.
237 with open(features, 'r') as rf:
238 lines = rf.readlines()
241 if not line.startswith(label_label):
242 filtered_lines.append(line)
# Show the example to the user via the helper.
247 helper.render_example(image, features, filtered_lines)
# Progress line.  NOTE(review): cursor is the 0-based index used above, so the
# first example is reported as "0 of N" at 0.0% -- confirm that is intended.
251 f'{cursor} of {len(filtered_images)} {cursor/len(filtered_images)*100.0:.1f}%): {image}, {features}'
254 print(f' ...Already labelled: {label}')
256 print(' ...Currently unlabeled')
257 guess = helper.ask_current_model_about_example(image, features, filtered_lines)
259 print(f' ...Model says {guess}')
262 # Did they want everything labelled the same?
263 label_everything = helper.get_everything_label()
# The label is stored as a "<label feature>: <value>" line and the features
# file is rewritten in place.
# NOTE(review): the appended label line already ends in "\n" and writelines()
# adds another "\n" to every entry, so the label line is followed by an empty
# line in the file -- confirm whether that is intended.
265 filtered_lines.append(f"{label_label}: {label_everything}\n")
266 with open(features, 'w') as f:
267 f.writelines(line + '\n' for line in filtered_lines)
268 if config.config['ml_quick_label_use_skip_lists']:
271 if cursor >= len(filtered_images):
272 helper.unrender_example(image, features, filtered_lines)
275 # Otherwise ask about each example.
# Accepted keys are the four built-in commands plus every key the helper maps
# to a label value.
277 labelling_keystrokes = helper.get_labelling_keystrokes()
278 valid_keystrokes = ['<', '>', 'Q', '?']
279 valid_keystrokes += labelling_keystrokes.keys()
280 prompt = ','.join(valid_keystrokes)
281 print(f'What should I do ({prompt})? ', end='')
283 keystroke = input_utils.single_keystroke_response(valid_keystrokes)
286 logger.info('Ok, stopping for now. Labeled examples are written to disk')
287 helper.unrender_example(image, features, filtered_lines)
289 elif keystroke == '?':
292 > = Don't label, move to the next example.
293 < = Don't label, move to the previous example.
294 Q = Quit labeling now.
296 else = These keystrokes assign a label to the example and persist it.'''
# '>' and '<' move the cursor without labeling; both directions wrap around
# the ends of filtered_images.
300 elif keystroke == '>':
302 if cursor >= len(filtered_images):
303 print('Wrapping around...')
305 elif keystroke == '<':
308 print('Wrapping around...')
309 cursor = len(filtered_images) - 1
# A labeling keystroke: look up its label value, append the label line and
# rewrite the features file.
# NOTE(review): same doubled-newline concern as the label-everything path
# above (the label line ends in "\n" and writelines() adds another).
310 elif keystroke in labelling_keystrokes:
311 label_value = labelling_keystrokes[keystroke]
312 filtered_lines.append(f"{label_label}: {label_value}\n")
313 with open(features, 'w') as f:
314 f.writelines(line + '\n' for line in filtered_lines)
315 if config.config['ml_quick_label_use_skip_lists']:
318 if cursor >= len(filtered_images):
319 print('Wrapping around...')
322 print(f'Unknown keystroke: {keystroke}')
# Take the current example off screen, then persist the skip list.
323 helper.unrender_example(image, features, filtered_lines)
324 _maybe_write_skip_list(skip_list)