Adds unscrambler which contains the guts of a jumble/scrabble player

author Scott <[email protected]>

Wed, 26 Jan 2022 06:05:25 +0000 (22:05 -0800)

committer Scott <[email protected]>

Wed, 26 Jan 2022 06:05:25 +0000 (22:05 -0800)
author Scott <[email protected]>
Wed, 26 Jan 2022 06:05:25 +0000 (22:05 -0800)
committer Scott <[email protected]>
Wed, 26 Jan 2022 06:05:25 +0000 (22:05 -0800)
diff --git a/unscrambler.py b/unscrambler.py

new file mode 100755 (executable)

index 0000000..056107f
--- /dev/null
+++ b/unscrambler.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python3
+
+import logging
+from typing import Dict
+
+import config
+import decorator_utils
+import list_utils
+
+cfg = config.add_commandline_args(
+    f'Unscramble! ({__file__})',
+    'A fast word unscrambler.'
+)
+cfg.add_argument(
+    "--unscramble_indexfile",
+    help="Path to a file of signature -> word index.",
+    metavar="FILENAME",
+    default="/usr/share/dict/sparse_index",
+)
+
+logger = logging.getLogger(__name__)
+
+letters_bits = 32
+letters_mask = 2 ** letters_bits - 1
+
+fprint_bits = 52
+fprint_mask = (2 ** fprint_bits - 1) << letters_bits
+
+fprint_feature_bit = {
+    'e': 0,
+    'i': 2,
+    'a': 4,
+    'o': 6,
+    'r': 8,
+    'n': 10,
+    't': 12,
+    's': 14,
+    'l': 16,
+    'c': 18,
+    'u': 20,
+    'p': 22,
+    'm': 24,
+    'd': 26,
+    'h': 28,
+    'y': 30,
+    'g': 32,
+    'b': 34,
+    'f': 36,
+    'v': 38,
+    'k': 40,
+    'w': 42,
+    'z': 44,
+    'x': 46,
+    'q': 48,
+    'j': 50,
+}
+
+letter_sigs = {
+    'a': 1789368711,
+    'b': 3146859322,
+    'c': 43676229,
+    'd': 3522623596,
+    'e': 3544234957,
+    'f': 3448207591,
+    'g': 1282648386,
+    'h': 3672791226,
+    'i': 1582316135,
+    'j': 4001984784,
+    'k': 831769172,
+    'l': 1160692746,
+    'm': 2430986565,
+    'n': 1873586768,
+    'o': 694443915,
+    'p': 1602297017,
+    'q': 533722196,
+    'r': 3754550193,
+    's': 1859447115,
+    't': 1121373020,
+    'u': 2414108708,
+    'v': 2693866766,
+    'w': 748799881,
+    'x': 2627529228,
+    'y': 2376066489,
+    'z': 802338724,
+}
+
+
+class Unscrambler(object):
+    sigs = []
+    words = []
+
+    def __init__(self):
+        pass
+
+    # 52 bits
+    @staticmethod
+    def _compute_word_fingerprint(word: str, population) -> int:
+        fp = 0
+        for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
+            letter = pair[0]
+            if letter in fprint_feature_bit:
+                count = pair[1]
+                if count > 3:
+                    count = 3
+                shift = fprint_feature_bit[letter]
+                s = count << shift
+                fp |= s
+        return fp << letters_bits
+
+    # 32 bits
+    @staticmethod
+    def _compute_word_letter_sig(letter_sigs, word: str, population) -> int:
+        sig = 0
+        for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
+            letter = pair[0]
+            if letter not in letter_sigs:
+                continue
+            s = letter_sigs[letter]
+            count = pair[1]
+            if count > 1:
+                s <<= count
+                s |= count
+            s &= letters_mask
+            sig ^= s
+        length = len(word)
+        if length > 31:
+            length = 31
+        sig ^= length << 8
+        sig &= letters_mask
+        return sig
+
+    # 52 + 32 bits
+    @staticmethod
+    @decorator_utils.memoized
+    def compute_word_sig(word: str) -> int:
+        """Given a word, compute its signature for subsequent lookup
+        operations.  Signatures are computed based on the letters in
+        the word and their frequencies.  We try to cluster "similar"
+        words close to each other in the signature space.
+
+        >>> train = Unscrambler.compute_word_sig('train')
+        >>> train
+        23178969883741
+
+        >>> retain = Unscrambler.compute_word_sig('retrain')
+        >>> retain
+        24282502197479
+
+        >>> retain - train
+        1103532313738
+
+        """
+        population = list_utils.population_counts(word)
+        fprint = Unscrambler._compute_word_fingerprint(word, population)
+        letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
+        assert fprint & letter_sig == 0
+        sig = fprint | letter_sig
+        return sig
+
+    @staticmethod
+    def repopulate(
+            letter_sigs: Dict[str, int],
+            dictfile: str = '/usr/share/dict/words',
+            indexfile: str = '/usr/share/dict/sparse_index',
+    ) -> None:
+        """Before calling this method, change letter_sigs from the default above
+        unless you want to populate the same exact files."""
+
+        words_by_sigs = {}
+        seen = set()
+        with open(dictfile, "r") as f:
+            for word in f:
+                word = word.replace('\n', '')
+                word = word.lower()
+                sig = Unscrambler.compute_word_sig(letter_sigs, word)
+                logger.debug("%s => 0x%x" % (word, sig))
+                if word in seen:
+                    continue
+                seen.add(word)
+                if sig in words_by_sigs:
+                    words_by_sigs[sig] += ",%s" % word
+                else:
+                    words_by_sigs[sig] = word
+        with open(indexfile, 'w') as f:
+            for sig in sorted(words_by_sigs.keys()):
+                word = words_by_sigs[sig]
+                print(f'0x{sig:x}+{word}', file=f)
+
+    @staticmethod
+    def lookup(word: str, *, include_fuzzy_matches=False) -> Dict[str, bool]:
+        """Looks up a potentially scrambled word optionally including near
+        "fuzzy" matches.
+
+        >>> Unscrambler.lookup('eanycleocipd', include_fuzzy_matches=False)
+        {'encyclopedia': True}
+
+        """
+        sig = Unscrambler.compute_word_sig(word)
+        return Unscrambler.lookup_by_sig(sig, include_fuzzy_matches=include_fuzzy_matches)
+
+    @staticmethod
+    def lookup_by_sig(sig, *, include_fuzzy_matches=False) -> Dict[str, bool]:
+        """Looks up a word that has already been translated into a signature by
+        a previous call to Unscrambler.compute_word_sig.  Optionally returns
+        near "fuzzy" matches.
+
+        >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
+        >>> sig
+        18491949645300288339
+
+        >>> Unscrambler.lookup_by_sig(sig, include_fuzzy_matches=True)
+        {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
+
+        """
+        # Cache the index; it doesn't change and this may be called
+        # more than once.
+        if len(Unscrambler.sigs) == 0:
+            if 'unscramble_indexfile' in config.config:
+                indexfile = config.config['unscramble_indexfile']
+            else:
+                indexfile = "/usr/share/dict/sparse_index"
+            with open(indexfile, 'r') as rf:
+                lines = rf.readlines()
+            for line in lines:
+                line = line[:-1]
+                (fsig, word) = line.split('+')
+                fsig = int(fsig, 16)
+                Unscrambler.sigs.append(fsig)
+                Unscrambler.words.append(word)
+
+        ret = {}
+        (exact, location) = list_utils.binary_search(Unscrambler.sigs, sig)
+        start = location - 5
+        if start < 0:
+            start = 0
+        end = location + 6
+        if end > len(Unscrambler.words):
+            end = len(Unscrambler.words)
+
+        for x in range(start, end):
+            word = Unscrambler.words[x]
+            fsig = Unscrambler.sigs[x]
+            if include_fuzzy_matches is True or (fsig == sig):
+                ret[word] = (fsig == sig)
+        return ret
+
+#
+# To repopulate, change letter_sigs and then call Unscrambler.repopulate.
+# See notes above.
+#
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
author	Scott <[email protected]>
	Wed, 26 Jan 2022 06:05:25 +0000 (22:05 -0800)
committer	Scott <[email protected]>
	Wed, 26 Jan 2022 06:05:25 +0000 (22:05 -0800)