--- /dev/null
+#!/usr/bin/env python3
+
+import logging
+from typing import Dict
+
+import config
+import decorator_utils
+import list_utils
+
+cfg = config.add_commandline_args(
+ f'Unscramble! ({__file__})',
+ 'A fast word unscrambler.'
+)
+cfg.add_argument(
+ "--unscramble_indexfile",
+ help="Path to a file of signature -> word index.",
+ metavar="FILENAME",
+ default="/usr/share/dict/sparse_index",
+)
+
+logger = logging.getLogger(__name__)
+
+letters_bits = 32
+letters_mask = 2 ** letters_bits - 1
+
+fprint_bits = 52
+fprint_mask = (2 ** fprint_bits - 1) << letters_bits
+
+fprint_feature_bit = {
+ 'e': 0,
+ 'i': 2,
+ 'a': 4,
+ 'o': 6,
+ 'r': 8,
+ 'n': 10,
+ 't': 12,
+ 's': 14,
+ 'l': 16,
+ 'c': 18,
+ 'u': 20,
+ 'p': 22,
+ 'm': 24,
+ 'd': 26,
+ 'h': 28,
+ 'y': 30,
+ 'g': 32,
+ 'b': 34,
+ 'f': 36,
+ 'v': 38,
+ 'k': 40,
+ 'w': 42,
+ 'z': 44,
+ 'x': 46,
+ 'q': 48,
+ 'j': 50,
+}
+
+letter_sigs = {
+ 'a': 1789368711,
+ 'b': 3146859322,
+ 'c': 43676229,
+ 'd': 3522623596,
+ 'e': 3544234957,
+ 'f': 3448207591,
+ 'g': 1282648386,
+ 'h': 3672791226,
+ 'i': 1582316135,
+ 'j': 4001984784,
+ 'k': 831769172,
+ 'l': 1160692746,
+ 'm': 2430986565,
+ 'n': 1873586768,
+ 'o': 694443915,
+ 'p': 1602297017,
+ 'q': 533722196,
+ 'r': 3754550193,
+ 's': 1859447115,
+ 't': 1121373020,
+ 'u': 2414108708,
+ 'v': 2693866766,
+ 'w': 748799881,
+ 'x': 2627529228,
+ 'y': 2376066489,
+ 'z': 802338724,
+}
+
+
+class Unscrambler(object):
+ sigs = []
+ words = []
+
+ def __init__(self):
+ pass
+
+ # 52 bits
+ @staticmethod
+ def _compute_word_fingerprint(word: str, population) -> int:
+ fp = 0
+ for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
+ letter = pair[0]
+ if letter in fprint_feature_bit:
+ count = pair[1]
+ if count > 3:
+ count = 3
+ shift = fprint_feature_bit[letter]
+ s = count << shift
+ fp |= s
+ return fp << letters_bits
+
+ # 32 bits
+ @staticmethod
+ def _compute_word_letter_sig(letter_sigs, word: str, population) -> int:
+ sig = 0
+ for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
+ letter = pair[0]
+ if letter not in letter_sigs:
+ continue
+ s = letter_sigs[letter]
+ count = pair[1]
+ if count > 1:
+ s <<= count
+ s |= count
+ s &= letters_mask
+ sig ^= s
+ length = len(word)
+ if length > 31:
+ length = 31
+ sig ^= length << 8
+ sig &= letters_mask
+ return sig
+
+ # 52 + 32 bits
+ @staticmethod
+ @decorator_utils.memoized
+ def compute_word_sig(word: str) -> int:
+ """Given a word, compute its signature for subsequent lookup
+ operations. Signatures are computed based on the letters in
+ the word and their frequencies. We try to cluster "similar"
+ words close to each other in the signature space.
+
+ >>> train = Unscrambler.compute_word_sig('train')
+ >>> train
+ 23178969883741
+
+ >>> retain = Unscrambler.compute_word_sig('retrain')
+ >>> retain
+ 24282502197479
+
+ >>> retain - train
+ 1103532313738
+
+ """
+ population = list_utils.population_counts(word)
+ fprint = Unscrambler._compute_word_fingerprint(word, population)
+ letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
+ assert fprint & letter_sig == 0
+ sig = fprint | letter_sig
+ return sig
+
+ @staticmethod
+ def repopulate(
+ letter_sigs: Dict[str, int],
+ dictfile: str = '/usr/share/dict/words',
+ indexfile: str = '/usr/share/dict/sparse_index',
+ ) -> None:
+ """Before calling this method, change letter_sigs from the default above
+ unless you want to populate the same exact files."""
+
+ words_by_sigs = {}
+ seen = set()
+ with open(dictfile, "r") as f:
+ for word in f:
+ word = word.replace('\n', '')
+ word = word.lower()
+ sig = Unscrambler.compute_word_sig(letter_sigs, word)
+ logger.debug("%s => 0x%x" % (word, sig))
+ if word in seen:
+ continue
+ seen.add(word)
+ if sig in words_by_sigs:
+ words_by_sigs[sig] += ",%s" % word
+ else:
+ words_by_sigs[sig] = word
+ with open(indexfile, 'w') as f:
+ for sig in sorted(words_by_sigs.keys()):
+ word = words_by_sigs[sig]
+ print(f'0x{sig:x}+{word}', file=f)
+
+ @staticmethod
+ def lookup(word: str, *, include_fuzzy_matches=False) -> Dict[str, bool]:
+ """Looks up a potentially scrambled word optionally including near
+ "fuzzy" matches.
+
+ >>> Unscrambler.lookup('eanycleocipd', include_fuzzy_matches=False)
+ {'encyclopedia': True}
+
+ """
+ sig = Unscrambler.compute_word_sig(word)
+ return Unscrambler.lookup_by_sig(sig, include_fuzzy_matches=include_fuzzy_matches)
+
+ @staticmethod
+ def lookup_by_sig(sig, *, include_fuzzy_matches=False) -> Dict[str, bool]:
+ """Looks up a word that has already been translated into a signature by
+ a previous call to Unscrambler.compute_word_sig. Optionally returns
+ near "fuzzy" matches.
+
+ >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
+ >>> sig
+ 18491949645300288339
+
+ >>> Unscrambler.lookup_by_sig(sig, include_fuzzy_matches=True)
+ {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
+
+ """
+ # Cache the index; it doesn't change and this may be called
+ # more than once.
+ if len(Unscrambler.sigs) == 0:
+ if 'unscramble_indexfile' in config.config:
+ indexfile = config.config['unscramble_indexfile']
+ else:
+ indexfile = "/usr/share/dict/sparse_index"
+ with open(indexfile, 'r') as rf:
+ lines = rf.readlines()
+ for line in lines:
+ line = line[:-1]
+ (fsig, word) = line.split('+')
+ fsig = int(fsig, 16)
+ Unscrambler.sigs.append(fsig)
+ Unscrambler.words.append(word)
+
+ ret = {}
+ (exact, location) = list_utils.binary_search(Unscrambler.sigs, sig)
+ start = location - 5
+ if start < 0:
+ start = 0
+ end = location + 6
+ if end > len(Unscrambler.words):
+ end = len(Unscrambler.words)
+
+ for x in range(start, end):
+ word = Unscrambler.words[x]
+ fsig = Unscrambler.sigs[x]
+ if include_fuzzy_matches is True or (fsig == sig):
+ ret[word] = (fsig == sig)
+ return ret
+
+#
+# To repopulate, change letter_sigs and then call Unscrambler.repopulate.
+# See notes above.
+#
+
+
+if __name__ == "__main__":
+ import doctest
+ doctest.testmod()