src/pyutils/unscrambler.py

   1 #!/usr/bin/env python3
   2
   3 # © Copyright 2021-2022, Scott Gasch
   4
   5 """A fast word unscrambler library."""
   6
   7 import logging
   8 from typing import Dict, Mapping, Optional
   9
  10 from pyutils import config, decorator_utils, list_utils
  11 from pyutils.files import file_utils
  12
  13 cfg = config.add_commandline_args(
  14     f'Unscrambler base library ({__file__})', 'A fast word unscrambler.'
  15 )
  16 cfg.add_argument(
  17     "--unscrambler_default_indexfile",
  18     help="Path to a file of signature -> word index.",
  19     metavar="FILENAME",
  20     default="/usr/share/dict/sparse_index",
  21 )
  22
  23 logger = logging.getLogger(__name__)
  24
  25 letters_bits = 32
  26 letters_mask = 2**letters_bits - 1
  27
  28 fprint_bits = 52
  29 fprint_mask = (2**fprint_bits - 1) << letters_bits
  30
  31 fprint_feature_bit = {
  32     'e': 0,
  33     'i': 2,
  34     'a': 4,
  35     'o': 6,
  36     'r': 8,
  37     'n': 10,
  38     't': 12,
  39     's': 14,
  40     'l': 16,
  41     'c': 18,
  42     'u': 20,
  43     'p': 22,
  44     'm': 24,
  45     'd': 26,
  46     'h': 28,
  47     'y': 30,
  48     'g': 32,
  49     'b': 34,
  50     'f': 36,
  51     'v': 38,
  52     'k': 40,
  53     'w': 42,
  54     'z': 44,
  55     'x': 46,
  56     'q': 48,
  57     'j': 50,
  58 }
  59
  60 letter_sigs = {
  61     'a': 1789368711,
  62     'b': 3146859322,
  63     'c': 43676229,
  64     'd': 3522623596,
  65     'e': 3544234957,
  66     'f': 3448207591,
  67     'g': 1282648386,
  68     'h': 3672791226,
  69     'i': 1582316135,
  70     'j': 4001984784,
  71     'k': 831769172,
  72     'l': 1160692746,
  73     'm': 2430986565,
  74     'n': 1873586768,
  75     'o': 694443915,
  76     'p': 1602297017,
  77     'q': 533722196,
  78     'r': 3754550193,
  79     's': 1859447115,
  80     't': 1121373020,
  81     'u': 2414108708,
  82     'v': 2693866766,
  83     'w': 748799881,
  84     'x': 2627529228,
  85     'y': 2376066489,
  86     'z': 802338724,
  87 }
  88
  89
  90 class Unscrambler(object):
  91     """A class that unscrambles words quickly by computing a signature
  92     (sig) for the word based on its position independent letter
  93     population and then using a pregenerated index to look up known
  94     words the same set of letters.
  95
  96     Note that each instance of Unscrambler caches its index to speed
  97     up lookups number 2..N; careless reinstantiation will by slower.
  98
  99     Sigs are designed to cluster similar words near each other so both
 100     lookup methods support a "fuzzy match" argument that can be set to
 101     request similar words that do not match exactly in addition to any
 102     exact matches.
 103     """
 104
 105     def __init__(self, indexfile: Optional[str] = None):
 106         """
 107         Constructs an unscrambler.
 108
 109         Args:
 110             indexfile: overrides the default indexfile location if provided
 111         """
 112
 113         # Cached index per instance.
 114         self.sigs = []
 115         self.words = []
 116
 117         filename = Unscrambler.get_indexfile(indexfile)
 118         with open(filename, 'r') as rf:
 119             lines = rf.readlines()
 120         for line in lines:
 121             line = line[:-1]
 122             (fsig, word) = line.split('+')
 123             isig = int(fsig, 16)
 124             self.sigs.append(isig)
 125             self.words.append(word)
 126
 127     @staticmethod
 128     def get_indexfile(indexfile: Optional[str]) -> str:
 129         """Returns the current indexfile location."""
 130         if indexfile is None:
 131             if 'unscrambler_default_indexfile' in config.config:
 132                 indexfile = config.config['unscrambler_default_indexfile']
 133             else:
 134                 indexfile = "/usr/share/dict/sparse_index"
 135         else:
 136             assert file_utils.file_is_readable(indexfile), f"Can't read {indexfile}"
 137         return indexfile
 138
 139     # 52 bits
 140     @staticmethod
 141     def _compute_word_fingerprint(population: Mapping[str, int]) -> int:
 142         fp = 0
 143         for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
 144             letter = pair[0]
 145             if letter in fprint_feature_bit:
 146                 count = min(pair[1], 3)
 147                 shift = fprint_feature_bit[letter]
 148                 s = count << shift
 149                 fp |= s
 150         return fp << letters_bits
 151
 152     # 32 bits
 153     @staticmethod
 154     def _compute_word_letter_sig(
 155         lsigs: Mapping[str, int],
 156         word: str,
 157         population: Mapping[str, int],
 158     ) -> int:
 159         sig = 0
 160         for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
 161             letter = pair[0]
 162             if letter not in lsigs:
 163                 continue
 164             s = lsigs[letter]
 165             count = pair[1]
 166             if count > 1:
 167                 s <<= count
 168                 s |= count
 169             s &= letters_mask
 170             sig ^= s
 171         length = min(len(word), 31)
 172         sig ^= length << 8
 173         sig &= letters_mask
 174         return sig
 175
 176     # 52 + 32 bits
 177     @staticmethod
 178     @decorator_utils.memoized
 179     def compute_word_sig(word: str) -> int:
 180         """Given a word, compute its signature for subsequent lookup
 181         operations.  Signatures are computed based on the letters in
 182         the word and their frequencies.  We try to cluster "similar"
 183         words close to each other in the signature space.
 184
 185         Args:
 186             word: the word to compute a signature for
 187
 188         Returns:
 189             The word's signature.
 190
 191         >>> train = Unscrambler.compute_word_sig('train')
 192         >>> train
 193         23178969883741
 194
 195         >>> retain = Unscrambler.compute_word_sig('retrain')
 196         >>> retain
 197         24282502197479
 198
 199         >>> retain - train
 200         1103532313738
 201
 202         """
 203         population = list_utils.population_counts(word)
 204         fprint = Unscrambler._compute_word_fingerprint(population)
 205         letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
 206         assert fprint & letter_sig == 0
 207         sig = fprint | letter_sig
 208         return sig
 209
 210     @staticmethod
 211     def repopulate(
 212         dictfile: str = '/usr/share/dict/words',
 213         indexfile: str = '/usr/share/dict/sparse_index',
 214     ) -> None:
 215         """
 216         Repopulates the indexfile.
 217
 218         .. warning::
 219
 220             Before calling this method, change letter_sigs from the
 221             default above unless you want to populate the same exact
 222             files.
 223         """
 224         words_by_sigs: Dict[int, str] = {}
 225         seen = set()
 226         with open(dictfile, "r") as f:
 227             for word in f:
 228                 word = word.replace('\n', '')
 229                 word = word.lower()
 230                 sig = Unscrambler.compute_word_sig(word)
 231                 logger.debug("%s => 0x%x", word, sig)
 232                 if word in seen:
 233                     continue
 234                 seen.add(word)
 235                 if sig in words_by_sigs:
 236                     words_by_sigs[sig] += f",{word}"
 237                 else:
 238                     words_by_sigs[sig] = word
 239         with open(indexfile, 'w') as f:
 240             for sig in sorted(words_by_sigs.keys()):
 241                 word = words_by_sigs[sig]
 242                 print(f'0x{sig:x}+{word}', file=f)
 243
 244     def lookup(self, word: str, *, window_size: int = 5) -> Dict[str, bool]:
 245         """Looks up a potentially scrambled word optionally including near
 246         "fuzzy" matches.
 247
 248         Args:
 249             word: the word to lookup
 250             window_size: the number of nearby fuzzy matches to return
 251
 252         Returns:
 253             A dict of word -> bool containing unscrambled words with (close
 254             to or precisely) the same letters as the input word.  The bool
 255             values in this dict indicate whether the key word is an exact
 256             or near match.  The count of entries in this dict is controlled
 257             by the window_size param.
 258
 259         >>> u = Unscrambler()
 260         >>> u.lookup('eanycleocipd', window_size=0)
 261         {'encyclopedia': True}
 262         """
 263         sig = Unscrambler.compute_word_sig(word)
 264         return self.lookup_by_sig(sig, window_size=window_size)
 265
 266     def lookup_by_sig(self, sig: int, *, window_size: int = 5) -> Dict[str, bool]:
 267         """Looks up a word that has already been translated into a signature by
 268         a previous call to Unscrambler.compute_word_sig.  Optionally returns
 269         near "fuzzy" matches.
 270
 271         Args:
 272             sig: the signature of the word to lookup (see :meth:`compute_word_sig`
 273                 to generate these signatures).
 274             window_size: the number of nearby fuzzy matches to return
 275
 276         Returns:
 277             A dict of word -> bool containing unscrambled words with (close
 278             to or precisely) the same letters as the input word.  The bool
 279             values in this dict indicate whether the key word is an exact
 280             or near match.  The count of entries in this dict is controlled
 281             by the window_size param.
 282
 283         >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
 284         >>> sig
 285         18491949645300288339
 286
 287         >>> u = Unscrambler()
 288         >>> u.lookup_by_sig(sig)
 289         {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
 290         """
 291         ret = {}
 292         (_, location) = list_utils.binary_search(self.sigs, sig)
 293         start = location - window_size
 294         start = max(start, 0)
 295         end = location + 1 + window_size
 296         end = min(end, len(self.words))
 297
 298         for x in range(start, end):
 299             word = self.words[x]
 300             fsig = self.sigs[x]
 301             if window_size > 0 or (fsig == sig):
 302                 ret[word] = fsig == sig
 303         return ret
 304
 305
 306 #
 307 # To repopulate, change letter_sigs and then call Unscrambler.repopulate.
 308 # See notes above.  See also ~/bin/unscramble.py --populate_destructively.
 309 #
 310
 311
 312 if __name__ == "__main__":
 313     import doctest
 314
 315     doctest.testmod()