3 # © Copyright 2021-2022, Scott Gasch
5 """A fast word unscrambler library."""
8 from typing import Dict, Mapping, Optional
10 from pyutils import config, decorator_utils, list_utils
11 from pyutils.files import file_utils
13 cfg = config.add_commandline_args(
14 f'Unscrambler base library ({__file__})', 'A fast word unscrambler.'
17 "--unscrambler_default_indexfile",
18 help="Path to a file of signature -> word index.",
20 default="/usr/share/dict/sparse_index",
23 logger = logging.getLogger(__name__)
26 letters_mask = 2**letters_bits - 1
29 fprint_mask = (2**fprint_bits - 1) << letters_bits
31 fprint_feature_bit = {
90 class Unscrambler(object):
91 """A class that unscrambles words quickly by computing a signature
92 (sig) for the word based on its position independent letter
93 population and then using a pregenerated index to look up known
94 words the same set of letters.
96 Note that each instance of Unscrambler caches its index to speed
97 up lookups number 2..N; careless reinstantiation will by slower.
99 Sigs are designed to cluster similar words near each other so both
100 lookup methods support a "fuzzy match" argument that can be set to
101 request similar words that do not match exactly in addition to any
105 def __init__(self, indexfile: Optional[str] = None):
107 Constructs an unscrambler.
110 indexfile: overrides the default indexfile location if provided
113 # Cached index per instance.
117 filename = Unscrambler.get_indexfile(indexfile)
118 with open(filename, 'r') as rf:
119 lines = rf.readlines()
122 (fsig, word) = line.split('+')
124 self.sigs.append(isig)
125 self.words.append(word)
128 def get_indexfile(indexfile: Optional[str]) -> str:
129 """Returns the current indexfile location."""
130 if indexfile is None:
131 if 'unscrambler_default_indexfile' in config.config:
132 indexfile = config.config['unscrambler_default_indexfile']
134 indexfile = "/usr/share/dict/sparse_index"
136 assert file_utils.file_is_readable(indexfile), f"Can't read {indexfile}"
141 def _compute_word_fingerprint(population: Mapping[str, int]) -> int:
143 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
145 if letter in fprint_feature_bit:
146 count = min(pair[1], 3)
147 shift = fprint_feature_bit[letter]
150 return fp << letters_bits
154 def _compute_word_letter_sig(
155 lsigs: Mapping[str, int],
157 population: Mapping[str, int],
160 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
162 if letter not in lsigs:
171 length = min(len(word), 31)
178 @decorator_utils.memoized
179 def compute_word_sig(word: str) -> int:
180 """Given a word, compute its signature for subsequent lookup
181 operations. Signatures are computed based on the letters in
182 the word and their frequencies. We try to cluster "similar"
183 words close to each other in the signature space.
186 word: the word to compute a signature for
189 The word's signature.
191 >>> train = Unscrambler.compute_word_sig('train')
195 >>> retain = Unscrambler.compute_word_sig('retrain')
203 population = list_utils.population_counts(word)
204 fprint = Unscrambler._compute_word_fingerprint(population)
205 letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
206 assert fprint & letter_sig == 0
207 sig = fprint | letter_sig
212 dictfile: str = '/usr/share/dict/words',
213 indexfile: str = '/usr/share/dict/sparse_index',
216 Repopulates the indexfile.
220 Before calling this method, change letter_sigs from the
221 default above unless you want to populate the same exact
224 words_by_sigs: Dict[int, str] = {}
226 with open(dictfile, "r") as f:
228 word = word.replace('\n', '')
230 sig = Unscrambler.compute_word_sig(word)
231 logger.debug("%s => 0x%x", word, sig)
235 if sig in words_by_sigs:
236 words_by_sigs[sig] += f",{word}"
238 words_by_sigs[sig] = word
239 with open(indexfile, 'w') as f:
240 for sig in sorted(words_by_sigs.keys()):
241 word = words_by_sigs[sig]
242 print(f'0x{sig:x}+{word}', file=f)
244 def lookup(self, word: str, *, window_size: int = 5) -> Dict[str, bool]:
245 """Looks up a potentially scrambled word optionally including near
249 word: the word to lookup
250 window_size: the number of nearby fuzzy matches to return
253 A dict of word -> bool containing unscrambled words with (close
254 to or precisely) the same letters as the input word. The bool
255 values in this dict indicate whether the key word is an exact
256 or near match. The count of entries in this dict is controlled
257 by the window_size param.
259 >>> u = Unscrambler()
260 >>> u.lookup('eanycleocipd', window_size=0)
261 {'encyclopedia': True}
263 sig = Unscrambler.compute_word_sig(word)
264 return self.lookup_by_sig(sig, window_size=window_size)
266 def lookup_by_sig(self, sig: int, *, window_size: int = 5) -> Dict[str, bool]:
267 """Looks up a word that has already been translated into a signature by
268 a previous call to Unscrambler.compute_word_sig. Optionally returns
269 near "fuzzy" matches.
272 sig: the signature of the word to lookup (see :meth:`compute_word_sig`
273 to generate these signatures).
274 window_size: the number of nearby fuzzy matches to return
277 A dict of word -> bool containing unscrambled words with (close
278 to or precisely) the same letters as the input word. The bool
279 values in this dict indicate whether the key word is an exact
280 or near match. The count of entries in this dict is controlled
281 by the window_size param.
283 >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
287 >>> u = Unscrambler()
288 >>> u.lookup_by_sig(sig)
289 {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
292 (_, location) = list_utils.binary_search(self.sigs, sig)
293 start = location - window_size
294 start = max(start, 0)
295 end = location + 1 + window_size
296 end = min(end, len(self.words))
298 for x in range(start, end):
301 if window_size > 0 or (fsig == sig):
302 ret[word] = fsig == sig
307 # To repopulate, change letter_sigs and then call Unscrambler.repopulate.
308 # See notes above. See also ~/bin/unscramble.py --populate_destructively.
312 if __name__ == "__main__":