src/pyutils/unscrambler.py

   1 #!/usr/bin/env python3
   2
   3 # © Copyright 2021-2022, Scott Gasch
   4
   5 """A fast word unscrambler library."""
   6
   7 import logging
   8 from typing import Dict, Mapping, Optional
   9
  10 from pyutils import config, decorator_utils, list_utils
  11 from pyutils.files import file_utils
  12
  13 cfg = config.add_commandline_args(
  14     f'Unscrambler base library ({__file__})', 'A fast word unscrambler.'
  15 )
  16 cfg.add_argument(
  17     "--unscrambler_default_indexfile",
  18     help="Path to a file of signature -> word index.",
  19     metavar="FILENAME",
  20     default="/usr/share/dict/sparse_index",
  21 )
  22
  23 logger = logging.getLogger(__name__)
  24
  25 letters_bits = 32
  26 letters_mask = 2**letters_bits - 1
  27
  28 fprint_bits = 52
  29 fprint_mask = (2**fprint_bits - 1) << letters_bits
  30
  31 fprint_feature_bit = {
  32     'e': 0,
  33     'i': 2,
  34     'a': 4,
  35     'o': 6,
  36     'r': 8,
  37     'n': 10,
  38     't': 12,
  39     's': 14,
  40     'l': 16,
  41     'c': 18,
  42     'u': 20,
  43     'p': 22,
  44     'm': 24,
  45     'd': 26,
  46     'h': 28,
  47     'y': 30,
  48     'g': 32,
  49     'b': 34,
  50     'f': 36,
  51     'v': 38,
  52     'k': 40,
  53     'w': 42,
  54     'z': 44,
  55     'x': 46,
  56     'q': 48,
  57     'j': 50,
  58 }
  59
  60 letter_sigs = {
  61     'a': 1789368711,
  62     'b': 3146859322,
  63     'c': 43676229,
  64     'd': 3522623596,
  65     'e': 3544234957,
  66     'f': 3448207591,
  67     'g': 1282648386,
  68     'h': 3672791226,
  69     'i': 1582316135,
  70     'j': 4001984784,
  71     'k': 831769172,
  72     'l': 1160692746,
  73     'm': 2430986565,
  74     'n': 1873586768,
  75     'o': 694443915,
  76     'p': 1602297017,
  77     'q': 533722196,
  78     'r': 3754550193,
  79     's': 1859447115,
  80     't': 1121373020,
  81     'u': 2414108708,
  82     'v': 2693866766,
  83     'w': 748799881,
  84     'x': 2627529228,
  85     'y': 2376066489,
  86     'z': 802338724,
  87 }
  88
  89
  90 class Unscrambler(object):
  91     """A class that unscrambles words quickly by computing a signature
  92     (sig) for the word based on its position independent letter
  93     population and then using a pregenerated index to look up known
  94     words the same set of letters.
  95
  96     Note that each instance of Unscrambler caches its index to speed
  97     up lookups number 2..N; careless reinstantiation will by slower.
  98
  99     Sigs are designed to cluster similar words near each other so both
 100     lookup methods support a "fuzzy match" argument that can be set to
 101     request similar words that do not match exactly in addition to any
 102     exact matches.
 103     """
 104
 105     def __init__(self, indexfile: Optional[str] = None):
 106         """
 107         Constructs an unscrambler.
 108
 109         Args:
 110             indexfile: overrides the default indexfile location if provided
 111         """
 112
 113         # Cached index per instance.
 114         self.sigs = []
 115         self.words = []
 116
 117         filename = Unscrambler.get_indexfile(indexfile)
 118         with open(filename, 'r') as rf:
 119             lines = rf.readlines()
 120         for line in lines:
 121             line = line[:-1]
 122             (fsig, word) = line.split('+')
 123             isig = int(fsig, 16)
 124             self.sigs.append(isig)
 125             self.words.append(word)
 126
 127     @staticmethod
 128     def get_indexfile(indexfile: Optional[str]) -> str:
 129         """Returns the current indexfile location."""
 130         if indexfile is None:
 131             if 'unscrambler_default_indexfile' in config.config:
 132                 indexfile = config.config['unscrambler_default_indexfile']
 133                 assert type(indexfile) == str
 134             else:
 135                 indexfile = "/usr/share/dict/sparse_index"
 136         else:
 137             assert file_utils.file_is_readable(indexfile), f"Can't read {indexfile}"
 138         return indexfile
 139
 140     # 52 bits
 141     @staticmethod
 142     def _compute_word_fingerprint(population: Mapping[str, int]) -> int:
 143         fp = 0
 144         for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
 145             letter = pair[0]
 146             if letter in fprint_feature_bit:
 147                 count = min(pair[1], 3)
 148                 shift = fprint_feature_bit[letter]
 149                 s = count << shift
 150                 fp |= s
 151         return fp << letters_bits
 152
 153     # 32 bits
 154     @staticmethod
 155     def _compute_word_letter_sig(
 156         lsigs: Mapping[str, int],
 157         word: str,
 158         population: Mapping[str, int],
 159     ) -> int:
 160         sig = 0
 161         for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
 162             letter = pair[0]
 163             if letter not in lsigs:
 164                 continue
 165             s = lsigs[letter]
 166             count = pair[1]
 167             if count > 1:
 168                 s <<= count
 169                 s |= count
 170             s &= letters_mask
 171             sig ^= s
 172         length = min(len(word), 31)
 173         sig ^= length << 8
 174         sig &= letters_mask
 175         return sig
 176
 177     # 52 + 32 bits
 178     @staticmethod
 179     @decorator_utils.memoized
 180     def compute_word_sig(word: str) -> int:
 181         """Given a word, compute its signature for subsequent lookup
 182         operations.  Signatures are computed based on the letters in
 183         the word and their frequencies.  We try to cluster "similar"
 184         words close to each other in the signature space.
 185
 186         Args:
 187             word: the word to compute a signature for
 188
 189         Returns:
 190             The word's signature.
 191
 192         >>> train = Unscrambler.compute_word_sig('train')
 193         >>> train
 194         23178969883741
 195
 196         >>> retain = Unscrambler.compute_word_sig('retrain')
 197         >>> retain
 198         24282502197479
 199
 200         >>> retain - train
 201         1103532313738
 202
 203         """
 204         population = list_utils.population_counts(word)
 205         fprint = Unscrambler._compute_word_fingerprint(population)
 206         letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
 207         assert fprint & letter_sig == 0
 208         sig = fprint | letter_sig
 209         return sig
 210
 211     @staticmethod
 212     def repopulate(
 213         dictfile: str = '/usr/share/dict/words',
 214         indexfile: str = '/usr/share/dict/sparse_index',
 215     ) -> None:
 216         """
 217         Repopulates the indexfile.
 218
 219         .. warning::
 220
 221             Before calling this method, change letter_sigs from the
 222             default above unless you want to populate the same exact
 223             files.
 224         """
 225         words_by_sigs: Dict[int, str] = {}
 226         seen = set()
 227         with open(dictfile, "r") as f:
 228             for word in f:
 229                 word = word.replace('\n', '')
 230                 word = word.lower()
 231                 sig = Unscrambler.compute_word_sig(word)
 232                 logger.debug("%s => 0x%x", word, sig)
 233                 if word in seen:
 234                     continue
 235                 seen.add(word)
 236                 if sig in words_by_sigs:
 237                     words_by_sigs[sig] += f",{word}"
 238                 else:
 239                     words_by_sigs[sig] = word
 240         with open(indexfile, 'w') as f:
 241             for sig in sorted(words_by_sigs.keys()):
 242                 word = words_by_sigs[sig]
 243                 print(f'0x{sig:x}+{word}', file=f)
 244
 245     def lookup(self, word: str, *, window_size: int = 5) -> Dict[str, bool]:
 246         """Looks up a potentially scrambled word optionally including near
 247         "fuzzy" matches.
 248
 249         Args:
 250             word: the word to lookup
 251             window_size: the number of nearby fuzzy matches to return
 252
 253         Returns:
 254             A dict of word -> bool containing unscrambled words with (close
 255             to or precisely) the same letters as the input word.  The bool
 256             values in this dict indicate whether the key word is an exact
 257             or near match.  The count of entries in this dict is controlled
 258             by the window_size param.
 259
 260         >>> u = Unscrambler()
 261         >>> u.lookup('eanycleocipd', window_size=0)
 262         {'encyclopedia': True}
 263         """
 264         sig = Unscrambler.compute_word_sig(word)
 265         return self.lookup_by_sig(sig, window_size=window_size)
 266
 267     def lookup_by_sig(self, sig: int, *, window_size: int = 5) -> Dict[str, bool]:
 268         """Looks up a word that has already been translated into a signature by
 269         a previous call to Unscrambler.compute_word_sig.  Optionally returns
 270         near "fuzzy" matches.
 271
 272         Args:
 273             sig: the signature of the word to lookup (see :meth:`compute_word_sig`
 274                 to generate these signatures).
 275             window_size: the number of nearby fuzzy matches to return
 276
 277         Returns:
 278             A dict of word -> bool containing unscrambled words with (close
 279             to or precisely) the same letters as the input word.  The bool
 280             values in this dict indicate whether the key word is an exact
 281             or near match.  The count of entries in this dict is controlled
 282             by the window_size param.
 283
 284         >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
 285         >>> sig
 286         18491949645300288339
 287
 288         >>> u = Unscrambler()
 289         >>> u.lookup_by_sig(sig)
 290         {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
 291         """
 292         ret = {}
 293         (_, location) = list_utils.binary_search(self.sigs, sig)
 294         start = location - window_size
 295         start = max(start, 0)
 296         end = location + 1 + window_size
 297         end = min(end, len(self.words))
 298
 299         for x in range(start, end):
 300             word = self.words[x]
 301             fsig = self.sigs[x]
 302             if window_size > 0 or (fsig == sig):
 303                 ret[word] = fsig == sig
 304         return ret
 305
 306
 307 #
 308 # To repopulate, change letter_sigs and then call Unscrambler.repopulate.
 309 # See notes above.  See also ~/bin/unscramble.py --populate_destructively.
 310 #
 311
 312
 313 if __name__ == "__main__":
 314     import doctest
 315
 316     doctest.testmod()