From fc113e1f62876278f5a181a86f9773c62e63a395 Mon Sep 17 00:00:00 2001 From: Scott Date: Tue, 25 Jan 2022 22:05:25 -0800 Subject: [PATCH] Adds unscrambler which contains the guts of a jumble/scrabble player I wrote outside of here. --- unscrambler.py | 255 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 255 insertions(+) create mode 100755 unscrambler.py diff --git a/unscrambler.py b/unscrambler.py new file mode 100755 index 0000000..056107f --- /dev/null +++ b/unscrambler.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python3 + +import logging +from typing import Dict + +import config +import decorator_utils +import list_utils + +cfg = config.add_commandline_args( + f'Unscramble! ({__file__})', + 'A fast word unscrambler.' +) +cfg.add_argument( + "--unscramble_indexfile", + help="Path to a file of signature -> word index.", + metavar="FILENAME", + default="/usr/share/dict/sparse_index", +) + +logger = logging.getLogger(__name__) + +letters_bits = 32 +letters_mask = 2 ** letters_bits - 1 + +fprint_bits = 52 +fprint_mask = (2 ** fprint_bits - 1) << letters_bits + +fprint_feature_bit = { + 'e': 0, + 'i': 2, + 'a': 4, + 'o': 6, + 'r': 8, + 'n': 10, + 't': 12, + 's': 14, + 'l': 16, + 'c': 18, + 'u': 20, + 'p': 22, + 'm': 24, + 'd': 26, + 'h': 28, + 'y': 30, + 'g': 32, + 'b': 34, + 'f': 36, + 'v': 38, + 'k': 40, + 'w': 42, + 'z': 44, + 'x': 46, + 'q': 48, + 'j': 50, +} + +letter_sigs = { + 'a': 1789368711, + 'b': 3146859322, + 'c': 43676229, + 'd': 3522623596, + 'e': 3544234957, + 'f': 3448207591, + 'g': 1282648386, + 'h': 3672791226, + 'i': 1582316135, + 'j': 4001984784, + 'k': 831769172, + 'l': 1160692746, + 'm': 2430986565, + 'n': 1873586768, + 'o': 694443915, + 'p': 1602297017, + 'q': 533722196, + 'r': 3754550193, + 's': 1859447115, + 't': 1121373020, + 'u': 2414108708, + 'v': 2693866766, + 'w': 748799881, + 'x': 2627529228, + 'y': 2376066489, + 'z': 802338724, +} + + +class Unscrambler(object): + sigs = [] + words = [] + + def __init__(self): + pass + + # 52 bits + @staticmethod + def _compute_word_fingerprint(word: str, population) -> int: + fp = 0 + for pair in sorted(population.items(), key=lambda x: x[1], reverse=True): + letter = pair[0] + if letter in fprint_feature_bit: + count = pair[1] + if count > 3: + count = 3 + shift = fprint_feature_bit[letter] + s = count << shift + fp |= s + return fp << letters_bits + + # 32 bits + @staticmethod + def _compute_word_letter_sig(letter_sigs, word: str, population) -> int: + sig = 0 + for pair in sorted(population.items(), key=lambda x: x[1], reverse=True): + letter = pair[0] + if letter not in letter_sigs: + continue + s = letter_sigs[letter] + count = pair[1] + if count > 1: + s <<= count + s |= count + s &= letters_mask + sig ^= s + length = len(word) + if length > 31: + length = 31 + sig ^= length << 8 + sig &= letters_mask + return sig + + # 52 + 32 bits + @staticmethod + @decorator_utils.memoized + def compute_word_sig(word: str) -> int: + """Given a word, compute its signature for subsequent lookup + operations. Signatures are computed based on the letters in + the word and their frequencies. We try to cluster "similar" + words close to each other in the signature space. + + >>> train = Unscrambler.compute_word_sig('train') + >>> train + 23178969883741 + + >>> retain = Unscrambler.compute_word_sig('retrain') + >>> retain + 24282502197479 + + >>> retain - train + 1103532313738 + + """ + population = list_utils.population_counts(word) + fprint = Unscrambler._compute_word_fingerprint(word, population) + letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population) + assert fprint & letter_sig == 0 + sig = fprint | letter_sig + return sig + + @staticmethod + def repopulate( + letter_sigs: Dict[str, int], + dictfile: str = '/usr/share/dict/words', + indexfile: str = '/usr/share/dict/sparse_index', + ) -> None: + """Before calling this method, change letter_sigs from the default above + unless you want to populate the same exact files.""" + + words_by_sigs = {} + seen = set() + with open(dictfile, "r") as f: + for word in f: + word = word.replace('\n', '') + word = word.lower() + sig = Unscrambler.compute_word_sig(letter_sigs, word) + logger.debug("%s => 0x%x" % (word, sig)) + if word in seen: + continue + seen.add(word) + if sig in words_by_sigs: + words_by_sigs[sig] += ",%s" % word + else: + words_by_sigs[sig] = word + with open(indexfile, 'w') as f: + for sig in sorted(words_by_sigs.keys()): + word = words_by_sigs[sig] + print(f'0x{sig:x}+{word}', file=f) + + @staticmethod + def lookup(word: str, *, include_fuzzy_matches=False) -> Dict[str, bool]: + """Looks up a potentially scrambled word optionally including near + "fuzzy" matches. + + >>> Unscrambler.lookup('eanycleocipd', include_fuzzy_matches=False) + {'encyclopedia': True} + + """ + sig = Unscrambler.compute_word_sig(word) + return Unscrambler.lookup_by_sig(sig, include_fuzzy_matches=include_fuzzy_matches) + + @staticmethod + def lookup_by_sig(sig, *, include_fuzzy_matches=False) -> Dict[str, bool]: + """Looks up a word that has already been translated into a signature by + a previous call to Unscrambler.compute_word_sig. Optionally returns + near "fuzzy" matches. + + >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin') + >>> sig + 18491949645300288339 + + >>> Unscrambler.lookup_by_sig(sig, include_fuzzy_matches=True) + {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False} + + """ + # Cache the index; it doesn't change and this may be called + # more than once. + if len(Unscrambler.sigs) == 0: + if 'unscramble_indexfile' in config.config: + indexfile = config.config['unscramble_indexfile'] + else: + indexfile = "/usr/share/dict/sparse_index" + with open(indexfile, 'r') as rf: + lines = rf.readlines() + for line in lines: + line = line[:-1] + (fsig, word) = line.split('+') + fsig = int(fsig, 16) + Unscrambler.sigs.append(fsig) + Unscrambler.words.append(word) + + ret = {} + (exact, location) = list_utils.binary_search(Unscrambler.sigs, sig) + start = location - 5 + if start < 0: + start = 0 + end = location + 6 + if end > len(Unscrambler.words): + end = len(Unscrambler.words) + + for x in range(start, end): + word = Unscrambler.words[x] + fsig = Unscrambler.sigs[x] + if include_fuzzy_matches is True or (fsig == sig): + ret[word] = (fsig == sig) + return ret + +# +# To repopulate, change letter_sigs and then call Unscrambler.repopulate. +# See notes above. +# + + +if __name__ == "__main__": + import doctest + doctest.testmod() -- 2.46.0