3 # © Copyright 2021-2022, Scott Gasch
5 """A helper to facilitate quick manual labeling of ML training data."""
12 from abc import abstractmethod
13 from typing import Any, Dict, List, Optional, Set
18 logger = logging.getLogger(__name__)
19 parser = config.add_commandline_args(
20 f"ML Quick Labeler ({__file__})",
21 "Args related to quick labeling of ML training data",
24 "--ml_quick_label_skip_list_path",
25 default="./qlabel_skip_list.txt",
27 type=argparse_utils.valid_filename,
28 help="Path to file in which to store already labeled data.",
31 "--ml_quick_label_use_skip_lists",
33 action=argparse_utils.ActionNoYes,
34 help='Should we use a skip list file to speed up execution?',
37 "--ml_quick_label_overwrite_labels",
39 action=argparse_utils.ActionNoYes,
40 help='Enable overwriting existing labels; default is to not relabel.',
44 class QuickLabelHelper:
45 '''To use this quick labeler your code must create a subclass of this
46 class and implement the abstract methods below. See comments for
47 detailed semantics.'''
50 def get_candidate_files(self) -> List[str]:
51 '''This must return a list of raw candidate files for labeling.'''
55 def get_features_for_file(self, filename: str) -> Optional[str]:
56 '''Given a raw file, return its features file.'''
60 def render_example(self, filename: str, features: str, lines: List[str]) -> None:
61 '''Render a raw file with its features for the user.'''
65 def unrender_example(self, filename: str, features: str, lines: List[str]) -> None:
66 '''Unrender a raw file with its features (if necessary)...'''
70 def ask_current_model_about_example(
76 '''Ask the current ML model about this example, if necessary.'''
80 def get_labelling_keystrokes(self) -> Dict[str, Any]:
81 '''What keystrokes should be considered valid label actions and what
82 label does each keystroke map into. e.g. if you want to ask
83 the user to hit 'y' for 'yes' and code that as 255 in your
84 features or to hit 'n' for 'no' and code that as 0 in your
93 def get_everything_label(self) -> Any:
94 '''If this returns something other than None it indicates that every
95 example selected should be labeled with this result. Caveat
96 emptor, we will clobber all your files.
102 def get_label_feature(self) -> str:
103 '''What feature denotes the example's label? This is used to detect
104 when examples already have a label and to assign labels to
109 def _maybe_read_skip_list() -> Set[str]:
110 '''Reads the skip list (files to just bypass) into memory if using.'''
112 ret: Set[str] = set()
113 if config.config['ml_quick_label_use_skip_lists']:
114 quick_skip_file = config.config['ml_quick_label_skip_list_path']
115 if os.path.exists(quick_skip_file):
116 with open(quick_skip_file, 'r') as f:
117 lines = f.readlines()
122 logger.debug('Read %s and found %d entries.', quick_skip_file, len(ret))
def _maybe_write_skip_list(skip_list) -> None:
    '''Persist the skip list to disk, but only when skip lists are in use.

    Does nothing unless the ml_quick_label_use_skip_lists config flag is
    set.  Otherwise the file named by ml_quick_label_skip_list_path is
    rewritten from scratch with one entry per line.

    Args:
        skip_list: iterable of filename strings to record; each entry is
            whitespace-stripped and entries that end up empty are dropped.
    '''
    if not config.config['ml_quick_label_use_skip_lists']:
        return

    destination = config.config['ml_quick_label_skip_list_path']
    # Mode 'w' truncates, so the file ends up holding exactly this list.
    with open(destination, 'w') as out:
        stripped = (entry.strip() for entry in skip_list)
        out.writelines(f'{entry}\n' for entry in stripped if entry)
    logger.debug('Updated %s', destination)
139 def quick_label(helper: QuickLabelHelper) -> None:
140 # Ask helper for an initial set of files.
141 images = helper.get_candidate_files()
143 logger.warning('No images files to operate on.')
146 # Filter out any that can't be converted to features or already have a
147 # label (unless they used --ml_quick_label_overwrite_labels).
149 skip_list = _maybe_read_skip_list()
151 if image in skip_list:
152 logger.debug('Skipping %s because of the skip list', image)
155 features = helper.get_features_for_file(image)
156 if features is None or not os.path.exists(features):
157 msg = f'{image}/{features}: {features} doesn\'t exist, SKIPPING.'
162 label_label = helper.get_label_feature()
164 with open(features, 'r') as rf:
165 lines = rf.readlines()
168 if line.startswith(label_label):
170 if label and not config.config['ml_quick_label_overwrite_labels']:
171 msg = f'{image}/{features}: already has label, SKIPPING'
175 filtered_images.append((image, features))
177 if len(filtered_images) == 0:
178 logger.warning('No image files to operate on (post filter).')
185 assert 0 <= cursor < len(filtered_images)
187 image = filtered_images[cursor][0]
188 assert os.path.exists(image)
189 features = filtered_images[cursor][1]
190 assert features and os.path.exists(features)
194 with open(features, 'r') as rf:
195 lines = rf.readlines()
198 if not line.startswith(label_label):
199 filtered_lines.append(line)
204 helper.render_example(image, features, filtered_lines)
208 f'{cursor} of {len(filtered_images)} {cursor/len(filtered_images)*100.0:.1f}%): {image}, {features}'
211 print(f' ...Already labelled: {label}')
213 print(' ...Currently unlabeled')
214 guess = helper.ask_current_model_about_example(image, features, filtered_lines)
216 print(f' ...Model says {guess}')
219 # Did they want everything labelled the same?
220 label_everything = helper.get_everything_label()
222 filtered_lines.append(f"{label_label}: {label_everything}\n")
223 with open(features, 'w') as f:
224 f.writelines(line + '\n' for line in filtered_lines)
225 if config.config['ml_quick_label_use_skip_lists']:
228 if cursor >= len(filtered_images):
229 helper.unrender_example(image, features, filtered_lines)
232 # Otherwise ask about each example.
234 labelling_keystrokes = helper.get_labelling_keystrokes()
235 valid_keystrokes = ['<', '>', 'Q', '?']
236 valid_keystrokes += labelling_keystrokes.keys()
237 prompt = ','.join(valid_keystrokes)
238 print(f'What should I do ({prompt})? ', end='')
240 keystroke = input_utils.single_keystroke_response(valid_keystrokes)
243 logger.info('Ok, stopping for now. Labeled examples are written to disk')
244 helper.unrender_example(image, features, filtered_lines)
246 elif keystroke == '?':
249 > = Don't label, move to the next example.
250 < = Don't label, move to the previous example.
251 Q = Quit labeling now.
253 else = These keystrokes assign a label to the example and persist it.'''
257 elif keystroke == '>':
259 if cursor >= len(filtered_images):
260 print('Wrapping around...')
262 elif keystroke == '<':
265 print('Wrapping around...')
266 cursor = len(filtered_images) - 1
267 elif keystroke in labelling_keystrokes:
268 label_value = labelling_keystrokes[keystroke]
269 filtered_lines.append(f"{label_label}: {label_value}\n")
270 with open(features, 'w') as f:
271 f.writelines(line + '\n' for line in filtered_lines)
272 if config.config['ml_quick_label_use_skip_lists']:
275 if cursor >= len(filtered_images):
276 print('Wrapping around...')
279 print(f'Unknown keystroke: {keystroke}')
280 helper.unrender_example(image, features, filtered_lines)
281 _maybe_write_skip_list(skip_list)