unscrambler.py

   1 #!/usr/bin/env python3
   2
   3 import logging
   4 from typing import Dict
   5
   6 import config
   7 import decorator_utils
   8 import list_utils
   9
  10 cfg = config.add_commandline_args(
  11     f'Unscramble! ({__file__})', 'A fast word unscrambler.'
  12 )
  13 cfg.add_argument(
  14     "--unscramble_indexfile",
  15     help="Path to a file of signature -> word index.",
  16     metavar="FILENAME",
  17     default="/usr/share/dict/sparse_index",
  18 )
  19
  20 logger = logging.getLogger(__name__)
  21
  22 letters_bits = 32
  23 letters_mask = 2 ** letters_bits - 1
  24
  25 fprint_bits = 52
  26 fprint_mask = (2 ** fprint_bits - 1) << letters_bits
  27
  28 fprint_feature_bit = {
  29     'e': 0,
  30     'i': 2,
  31     'a': 4,
  32     'o': 6,
  33     'r': 8,
  34     'n': 10,
  35     't': 12,
  36     's': 14,
  37     'l': 16,
  38     'c': 18,
  39     'u': 20,
  40     'p': 22,
  41     'm': 24,
  42     'd': 26,
  43     'h': 28,
  44     'y': 30,
  45     'g': 32,
  46     'b': 34,
  47     'f': 36,
  48     'v': 38,
  49     'k': 40,
  50     'w': 42,
  51     'z': 44,
  52     'x': 46,
  53     'q': 48,
  54     'j': 50,
  55 }
  56
  57 letter_sigs = {
  58     'a': 1789368711,
  59     'b': 3146859322,
  60     'c': 43676229,
  61     'd': 3522623596,
  62     'e': 3544234957,
  63     'f': 3448207591,
  64     'g': 1282648386,
  65     'h': 3672791226,
  66     'i': 1582316135,
  67     'j': 4001984784,
  68     'k': 831769172,
  69     'l': 1160692746,
  70     'm': 2430986565,
  71     'n': 1873586768,
  72     'o': 694443915,
  73     'p': 1602297017,
  74     'q': 533722196,
  75     'r': 3754550193,
  76     's': 1859447115,
  77     't': 1121373020,
  78     'u': 2414108708,
  79     'v': 2693866766,
  80     'w': 748799881,
  81     'x': 2627529228,
  82     'y': 2376066489,
  83     'z': 802338724,
  84 }
  85
  86
  87 class Unscrambler(object):
  88     """A class that unscrambles words quickly by computing a signature
  89     (sig) for the word based on its position independent letter
  90     population and then using a pregenerated index to look up known
  91     words the same set of letters.
  92
  93     Note that each instance of Unscrambler caches its index to speed
  94     up lookups number 2..N; careless reinstantiation will by slower.
  95
  96     Sigs are designed to cluster similar words near each other so both
  97     lookup methods support a "fuzzy match" argument that can be set to
  98     request similar words that do not match exactly in addition to any
  99     exact matches.
 100
 101     """
 102
 103     def __init__(self):
 104         # Cached index per instance.
 105         self.sigs = []
 106         self.words = []
 107
 108         if 'unscramble_indexfile' in config.config:
 109             indexfile = config.config['unscramble_indexfile']
 110         else:
 111             indexfile = "/usr/share/dict/sparse_index"
 112
 113         with open(indexfile, 'r') as rf:
 114             lines = rf.readlines()
 115         for line in lines:
 116             line = line[:-1]
 117             (fsig, word) = line.split('+')
 118             fsig = int(fsig, 16)
 119             self.sigs.append(fsig)
 120             self.words.append(word)
 121
 122     # 52 bits
 123     @staticmethod
 124     def _compute_word_fingerprint(word: str, population) -> int:
 125         fp = 0
 126         for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
 127             letter = pair[0]
 128             if letter in fprint_feature_bit:
 129                 count = pair[1]
 130                 if count > 3:
 131                     count = 3
 132                 shift = fprint_feature_bit[letter]
 133                 s = count << shift
 134                 fp |= s
 135         return fp << letters_bits
 136
 137     # 32 bits
 138     @staticmethod
 139     def _compute_word_letter_sig(letter_sigs, word: str, population) -> int:
 140         sig = 0
 141         for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
 142             letter = pair[0]
 143             if letter not in letter_sigs:
 144                 continue
 145             s = letter_sigs[letter]
 146             count = pair[1]
 147             if count > 1:
 148                 s <<= count
 149                 s |= count
 150             s &= letters_mask
 151             sig ^= s
 152         length = len(word)
 153         if length > 31:
 154             length = 31
 155         sig ^= length << 8
 156         sig &= letters_mask
 157         return sig
 158
 159     # 52 + 32 bits
 160     @staticmethod
 161     @decorator_utils.memoized
 162     def compute_word_sig(word: str) -> int:
 163         """Given a word, compute its signature for subsequent lookup
 164         operations.  Signatures are computed based on the letters in
 165         the word and their frequencies.  We try to cluster "similar"
 166         words close to each other in the signature space.
 167
 168         >>> train = Unscrambler.compute_word_sig('train')
 169         >>> train
 170         23178969883741
 171
 172         >>> retain = Unscrambler.compute_word_sig('retrain')
 173         >>> retain
 174         24282502197479
 175
 176         >>> retain - train
 177         1103532313738
 178
 179         """
 180         population = list_utils.population_counts(word)
 181         fprint = Unscrambler._compute_word_fingerprint(word, population)
 182         letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
 183         assert fprint & letter_sig == 0
 184         sig = fprint | letter_sig
 185         return sig
 186
 187     @staticmethod
 188     def repopulate(
 189         letter_sigs: Dict[str, int],
 190         dictfile: str = '/usr/share/dict/words',
 191         indexfile: str = '/usr/share/dict/sparse_index',
 192     ) -> None:
 193         """Before calling this method, change letter_sigs from the default above
 194         unless you want to populate the same exact files.
 195
 196         """
 197         words_by_sigs = {}
 198         seen = set()
 199         with open(dictfile, "r") as f:
 200             for word in f:
 201                 word = word.replace('\n', '')
 202                 word = word.lower()
 203                 sig = Unscrambler.compute_word_sig(letter_sigs, word)
 204                 logger.debug("%s => 0x%x" % (word, sig))
 205                 if word in seen:
 206                     continue
 207                 seen.add(word)
 208                 if sig in words_by_sigs:
 209                     words_by_sigs[sig] += ",%s" % word
 210                 else:
 211                     words_by_sigs[sig] = word
 212         with open(indexfile, 'w') as f:
 213             for sig in sorted(words_by_sigs.keys()):
 214                 word = words_by_sigs[sig]
 215                 print(f'0x{sig:x}+{word}', file=f)
 216
 217     def lookup(self, word: str, *, include_fuzzy_matches=False) -> Dict[str, bool]:
 218         """Looks up a potentially scrambled word optionally including near
 219         "fuzzy" matches.
 220
 221         >>> u = Unscrambler()
 222         >>> u.lookup('eanycleocipd', include_fuzzy_matches=False)
 223         {'encyclopedia': True}
 224
 225         """
 226         sig = Unscrambler.compute_word_sig(word)
 227         return self.lookup_by_sig(sig, include_fuzzy_matches=include_fuzzy_matches)
 228
 229     def lookup_by_sig(self, sig, *, include_fuzzy_matches=False) -> Dict[str, bool]:
 230         """Looks up a word that has already been translated into a signature by
 231         a previous call to Unscrambler.compute_word_sig.  Optionally returns
 232         near "fuzzy" matches.
 233
 234         >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
 235         >>> sig
 236         18491949645300288339
 237
 238         >>> u = Unscrambler()
 239         >>> u.lookup_by_sig(sig, include_fuzzy_matches=True)
 240         {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
 241
 242         """
 243         ret = {}
 244         (exact, location) = list_utils.binary_search(self.sigs, sig)
 245         start = location - 5
 246         if start < 0:
 247             start = 0
 248         end = location + 6
 249         if end > len(self.words):
 250             end = len(self.words)
 251
 252         for x in range(start, end):
 253             word = self.words[x]
 254             fsig = self.sigs[x]
 255             if include_fuzzy_matches is True or (fsig == sig):
 256                 ret[word] = fsig == sig
 257         return ret
 258
 259
 260 #
 261 # To repopulate, change letter_sigs and then call Unscrambler.repopulate.
 262 # See notes above.  See also ~/bin/unscramble.py --populate_destructively.
 263 #
 264
 265
 266 if __name__ == "__main__":
 267     import doctest
 268
 269     doctest.testmod()