4 from typing import Dict, Mapping, Optional
11 cfg = config.add_commandline_args(
12 f'Unscrambler base library ({__file__})', 'A fast word unscrambler.'
15 "--unscrambler_default_indexfile",
16 help="Path to a file of signature -> word index.",
18 default="/usr/share/dict/sparse_index",
21 logger = logging.getLogger(__name__)
24 letters_mask = 2 ** letters_bits - 1
27 fprint_mask = (2 ** fprint_bits - 1) << letters_bits
29 fprint_feature_bit = {
88 class Unscrambler(object):
89 """A class that unscrambles words quickly by computing a signature
90 (sig) for the word based on its position independent letter
91 population and then using a pregenerated index to look up known
92 words the same set of letters.
94 Note that each instance of Unscrambler caches its index to speed
95 up lookups number 2..N; careless reinstantiation will by slower.
97 Sigs are designed to cluster similar words near each other so both
98 lookup methods support a "fuzzy match" argument that can be set to
99 request similar words that do not match exactly in addition to any
104 def __init__(self, indexfile: Optional[str] = None):
105 # Cached index per instance.
109 filename = self.get_indexfile(indexfile)
110 with open(filename, 'r') as rf:
111 lines = rf.readlines()
114 (fsig, word) = line.split('+')
116 self.sigs.append(isig)
117 self.words.append(word)
119 def get_indexfile(self, indexfile: Optional[str]) -> str:
120 if indexfile is None:
121 if 'unscrambler_default_indexfile' in config.config:
122 indexfile = config.config['unscramble_indexfile']
124 indexfile = "/usr/share/dict/sparse_index"
126 assert file_utils.file_is_readable(indexfile), f"Can't read {indexfile}"
131 def _compute_word_fingerprint(population: Mapping[str, int]) -> int:
133 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
135 if letter in fprint_feature_bit:
136 count = min(pair[1], 3)
137 shift = fprint_feature_bit[letter]
140 return fp << letters_bits
144 def _compute_word_letter_sig(
145 lsigs: Mapping[str, int],
147 population: Mapping[str, int],
150 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
152 if letter not in lsigs:
161 length = min(len(word), 31)
168 @decorator_utils.memoized
169 def compute_word_sig(word: str) -> int:
170 """Given a word, compute its signature for subsequent lookup
171 operations. Signatures are computed based on the letters in
172 the word and their frequencies. We try to cluster "similar"
173 words close to each other in the signature space.
175 >>> train = Unscrambler.compute_word_sig('train')
179 >>> retain = Unscrambler.compute_word_sig('retrain')
187 population = list_utils.population_counts(word)
188 fprint = Unscrambler._compute_word_fingerprint(population)
189 letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
190 assert fprint & letter_sig == 0
191 sig = fprint | letter_sig
196 lsigs: Dict[str, int],
197 dictfile: str = '/usr/share/dict/words',
198 indexfile: str = '/usr/share/dict/sparse_index',
200 """Before calling this method, change letter_sigs from the default above
201 unless you want to populate the same exact files.
204 words_by_sigs: Dict[int, str] = {}
206 with open(dictfile, "r") as f:
208 word = word.replace('\n', '')
210 sig = Unscrambler.compute_word_sig(word)
211 logger.debug("%s => 0x%x", word, sig)
215 if sig in words_by_sigs:
216 words_by_sigs[sig] += f",{word}"
218 words_by_sigs[sig] = word
219 with open(indexfile, 'w') as f:
220 for sig in sorted(words_by_sigs.keys()):
221 word = words_by_sigs[sig]
222 print(f'0x{sig:x}+{word}', file=f)
224 def lookup(self, word: str, *, window_size: int = 5) -> Dict[str, bool]:
225 """Looks up a potentially scrambled word optionally including near
228 >>> u = Unscrambler()
229 >>> u.lookup('eanycleocipd', window_size=0)
230 {'encyclopedia': True}
233 sig = Unscrambler.compute_word_sig(word)
234 return self.lookup_by_sig(sig, window_size=window_size)
236 def lookup_by_sig(self, sig: int, *, window_size: int = 5) -> Dict[str, bool]:
237 """Looks up a word that has already been translated into a signature by
238 a previous call to Unscrambler.compute_word_sig. Optionally returns
239 near "fuzzy" matches.
241 >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
245 >>> u = Unscrambler()
246 >>> u.lookup_by_sig(sig)
247 {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
251 (_, location) = list_utils.binary_search(self.sigs, sig)
252 start = location - window_size
253 start = max(start, 0)
254 end = location + 1 + window_size
255 end = min(end, len(self.words))
257 for x in range(start, end):
260 if window_size > 0 or (fsig == sig):
261 ret[word] = fsig == sig
266 # To repopulate, change letter_sigs and then call Unscrambler.repopulate.
267 # See notes above. See also ~/bin/unscramble.py --populate_destructively.
271 if __name__ == "__main__":