#!/usr/bin/env python3 import logging from typing import Dict import config import decorator_utils import list_utils cfg = config.add_commandline_args( f'Unscramble! ({__file__})', 'A fast word unscrambler.' ) cfg.add_argument( "--unscramble_indexfile", help="Path to a file of signature -> word index.", metavar="FILENAME", default="/usr/share/dict/sparse_index", ) logger = logging.getLogger(__name__) letters_bits = 32 letters_mask = 2 ** letters_bits - 1 fprint_bits = 52 fprint_mask = (2 ** fprint_bits - 1) << letters_bits fprint_feature_bit = { 'e': 0, 'i': 2, 'a': 4, 'o': 6, 'r': 8, 'n': 10, 't': 12, 's': 14, 'l': 16, 'c': 18, 'u': 20, 'p': 22, 'm': 24, 'd': 26, 'h': 28, 'y': 30, 'g': 32, 'b': 34, 'f': 36, 'v': 38, 'k': 40, 'w': 42, 'z': 44, 'x': 46, 'q': 48, 'j': 50, } letter_sigs = { 'a': 1789368711, 'b': 3146859322, 'c': 43676229, 'd': 3522623596, 'e': 3544234957, 'f': 3448207591, 'g': 1282648386, 'h': 3672791226, 'i': 1582316135, 'j': 4001984784, 'k': 831769172, 'l': 1160692746, 'm': 2430986565, 'n': 1873586768, 'o': 694443915, 'p': 1602297017, 'q': 533722196, 'r': 3754550193, 's': 1859447115, 't': 1121373020, 'u': 2414108708, 'v': 2693866766, 'w': 748799881, 'x': 2627529228, 'y': 2376066489, 'z': 802338724, } class Unscrambler(object): sigs = [] words = [] def __init__(self): pass # 52 bits @staticmethod def _compute_word_fingerprint(word: str, population) -> int: fp = 0 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True): letter = pair[0] if letter in fprint_feature_bit: count = pair[1] if count > 3: count = 3 shift = fprint_feature_bit[letter] s = count << shift fp |= s return fp << letters_bits # 32 bits @staticmethod def _compute_word_letter_sig(letter_sigs, word: str, population) -> int: sig = 0 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True): letter = pair[0] if letter not in letter_sigs: continue s = letter_sigs[letter] count = pair[1] if count > 1: s <<= count s |= count s &= letters_mask sig ^= s length = len(word) if length > 31: length = 31 sig ^= length << 8 sig &= letters_mask return sig # 52 + 32 bits @staticmethod @decorator_utils.memoized def compute_word_sig(word: str) -> int: """Given a word, compute its signature for subsequent lookup operations. Signatures are computed based on the letters in the word and their frequencies. We try to cluster "similar" words close to each other in the signature space. >>> train = Unscrambler.compute_word_sig('train') >>> train 23178969883741 >>> retain = Unscrambler.compute_word_sig('retrain') >>> retain 24282502197479 >>> retain - train 1103532313738 """ population = list_utils.population_counts(word) fprint = Unscrambler._compute_word_fingerprint(word, population) letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population) assert fprint & letter_sig == 0 sig = fprint | letter_sig return sig @staticmethod def repopulate( letter_sigs: Dict[str, int], dictfile: str = '/usr/share/dict/words', indexfile: str = '/usr/share/dict/sparse_index', ) -> None: """Before calling this method, change letter_sigs from the default above unless you want to populate the same exact files.""" words_by_sigs = {} seen = set() with open(dictfile, "r") as f: for word in f: word = word.replace('\n', '') word = word.lower() sig = Unscrambler.compute_word_sig(letter_sigs, word) logger.debug("%s => 0x%x" % (word, sig)) if word in seen: continue seen.add(word) if sig in words_by_sigs: words_by_sigs[sig] += ",%s" % word else: words_by_sigs[sig] = word with open(indexfile, 'w') as f: for sig in sorted(words_by_sigs.keys()): word = words_by_sigs[sig] print(f'0x{sig:x}+{word}', file=f) @staticmethod def lookup(word: str, *, include_fuzzy_matches=False) -> Dict[str, bool]: """Looks up a potentially scrambled word optionally including near "fuzzy" matches. >>> Unscrambler.lookup('eanycleocipd', include_fuzzy_matches=False) {'encyclopedia': True} """ sig = Unscrambler.compute_word_sig(word) return Unscrambler.lookup_by_sig( sig, include_fuzzy_matches=include_fuzzy_matches ) @staticmethod def lookup_by_sig(sig, *, include_fuzzy_matches=False) -> Dict[str, bool]: """Looks up a word that has already been translated into a signature by a previous call to Unscrambler.compute_word_sig. Optionally returns near "fuzzy" matches. >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin') >>> sig 18491949645300288339 >>> Unscrambler.lookup_by_sig(sig, include_fuzzy_matches=True) {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False} """ # Cache the index; it doesn't change and this may be called # more than once. if len(Unscrambler.sigs) == 0: if 'unscramble_indexfile' in config.config: indexfile = config.config['unscramble_indexfile'] else: indexfile = "/usr/share/dict/sparse_index" with open(indexfile, 'r') as rf: lines = rf.readlines() for line in lines: line = line[:-1] (fsig, word) = line.split('+') fsig = int(fsig, 16) Unscrambler.sigs.append(fsig) Unscrambler.words.append(word) ret = {} (exact, location) = list_utils.binary_search(Unscrambler.sigs, sig) start = location - 5 if start < 0: start = 0 end = location + 6 if end > len(Unscrambler.words): end = len(Unscrambler.words) for x in range(start, end): word = Unscrambler.words[x] fsig = Unscrambler.sigs[x] if include_fuzzy_matches is True or (fsig == sig): ret[word] = fsig == sig return ret # # To repopulate, change letter_sigs and then call Unscrambler.repopulate. # See notes above. See also ~/bin/unscramble.py --populate_destructively. # if __name__ == "__main__": import doctest doctest.testmod()