3 # © Copyright 2021-2022, Scott Gasch
5 """A fast (English) word unscrambler."""
8 from typing import Dict, Mapping, Optional
10 from pyutils import config, decorator_utils, list_utils
11 from pyutils.files import file_utils
13 cfg = config.add_commandline_args(
14 f'Unscrambler base library ({__file__})', 'A fast word unscrambler.'
17 "--unscrambler_default_indexfile",
18 help="Path to a file of signature -> word index.",
20 default="/usr/share/dict/sparse_index",
23 logger = logging.getLogger(__name__)
26 letters_mask = 2**letters_bits - 1
29 fprint_mask = (2**fprint_bits - 1) << letters_bits
31 fprint_feature_bit = {
90 class Unscrambler(object):
91 """A class that unscrambles words quickly by computing a signature
92 (sig) for the word based on its position independent letter
93 population and then using a pregenerated index to look up known
94 words the same set of letters.
96 Sigs are designed to cluster similar words near each other so both
97 lookup methods support a "fuzzy match" argument that can be set to
98 request similar words that do not match exactly in addition to any
103 Each instance of Unscrambler caches its index to speed up
104 lookups number 2..N; careless deletion / reinstantiation will
105 suffer slower performance.
108 def __init__(self, indexfile: Optional[str] = None):
110 Constructs an unscrambler.
113 indexfile: overrides the default indexfile location if provided.
114 To [re]generate the indexfile, see :meth:`repopulate`.
117 # Cached index per instance.
121 filename = Unscrambler.get_indexfile(indexfile)
122 with open(filename, 'r') as rf:
123 lines = rf.readlines()
126 (fsig, word) = line.split('+')
128 self.sigs.append(isig)
129 self.words.append(word)
132 def get_indexfile(indexfile: Optional[str]) -> str:
135 The current indexfile location
137 if indexfile is None:
138 if 'unscrambler_default_indexfile' in config.config:
139 indexfile = config.config['unscrambler_default_indexfile']
140 assert type(indexfile) == str
142 indexfile = "/usr/share/dict/sparse_index"
144 assert file_utils.file_is_readable(indexfile), f"Can't read {indexfile}"
149 def _compute_word_fingerprint(population: Mapping[str, int]) -> int:
151 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
153 if letter in fprint_feature_bit:
154 count = min(pair[1], 3)
155 shift = fprint_feature_bit[letter]
158 return fp << letters_bits
162 def _compute_word_letter_sig(
163 lsigs: Mapping[str, int],
165 population: Mapping[str, int],
168 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
170 if letter not in lsigs:
179 length = min(len(word), 31)
186 @decorator_utils.memoized
187 def compute_word_sig(word: str) -> int:
188 """Given a word, compute its signature for subsequent lookup
189 operations. Signatures are computed based on the letters in
190 the word and their frequencies. We try to cluster "similar"
191 words close to each other in the signature space.
194 word: the word to compute a signature for
197 The word's signature.
199 >>> test = Unscrambler.compute_word_sig('test')
203 >>> teste = Unscrambler.compute_word_sig('teste')
211 population = list_utils.population_counts(word)
212 fprint = Unscrambler._compute_word_fingerprint(population)
213 letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
214 assert fprint & letter_sig == 0
215 sig = fprint | letter_sig
220 dictfile: str = '/usr/share/dict/words',
221 indexfile: str = '/usr/share/dict/sparse_index',
224 Repopulates the indexfile.
227 dictfile: a file that contains one word per line
228 indexfile: the file to populate with sig, word pairs for future use
233 Before calling this method, change `letter_sigs` from the
234 default above unless you want to populate the same exact
237 words_by_sigs: Dict[int, str] = {}
239 with open(dictfile, "r") as f:
241 word = word.replace('\n', '')
243 sig = Unscrambler.compute_word_sig(word)
244 logger.debug("%s => 0x%x", word, sig)
248 if sig in words_by_sigs:
249 words_by_sigs[sig] += f",{word}"
251 words_by_sigs[sig] = word
252 with open(indexfile, 'w') as f:
253 for sig in sorted(words_by_sigs.keys()):
254 word = words_by_sigs[sig]
255 print(f'0x{sig:x}+{word}', file=f)
257 def lookup(self, word: str, *, window_size: int = 5) -> Dict[str, bool]:
258 """Looks up a potentially scrambled word optionally including near
262 word: the word to lookup
263 window_size: the number of nearby fuzzy matches to return
266 A dict of word -> bool containing unscrambled words with (close
267 to or precisely) the same letters as the input word. The bool
268 values in this dict indicate whether the key word is an exact
269 or near match. The count of entries in this dict is controlled
270 by the window_size param.
272 >>> u = Unscrambler()
273 >>> u.lookup('eanycleocipd', window_size=0)
274 {'encyclopedia': True}
276 sig = Unscrambler.compute_word_sig(word)
277 return self.lookup_by_sig(sig, window_size=window_size)
279 def lookup_by_sig(self, sig: int, *, window_size: int = 5) -> Dict[str, bool]:
280 """Looks up a word that has already been translated into a signature by
281 a previous call to Unscrambler.compute_word_sig. Optionally returns
282 near "fuzzy" matches.
285 sig: the signature of the word to lookup (see :meth:`compute_word_sig`
286 to generate these signatures).
287 window_size: the number of nearby fuzzy matches to return
290 A dict of word -> bool containing unscrambled words with (close
291 to or precisely) the same letters as the input word. The bool
292 values in this dict indicate whether the key word is an exact
293 or near match. The count of entries in this dict is controlled
294 by the window_size param.
296 >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
300 >>> u = Unscrambler()
301 >>> u.lookup_by_sig(sig)
302 {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
305 (_, location) = list_utils.binary_search(self.sigs, sig)
306 start = location - window_size
307 start = max(start, 0)
308 end = location + 1 + window_size
309 end = min(end, len(self.words))
311 for x in range(start, end):
314 if window_size > 0 or (fsig == sig):
315 ret[word] = fsig == sig
319 if __name__ == "__main__":