src/pyutils/unscrambler.py

   1 #!/usr/bin/env python3
   2
   3 # © Copyright 2021-2022, Scott Gasch
   4
   5 """A fast (English) word unscrambler."""
   6
   7 import logging
   8 from typing import Dict, Mapping, Optional
   9
  10 from pyutils import config, decorator_utils, list_utils
  11 from pyutils.files import file_utils
  12
  13 cfg = config.add_commandline_args(
  14     f'Unscrambler base library ({__file__})', 'A fast word unscrambler.'
  15 )
  16 cfg.add_argument(
  17     "--unscrambler_default_indexfile",
  18     help="Path to a file of signature -> word index.",
  19     metavar="FILENAME",
  20     default="/usr/share/dict/sparse_index",
  21 )
  22
  23 logger = logging.getLogger(__name__)
  24
  25 letters_bits = 32
  26 letters_mask = 2**letters_bits - 1
  27
  28 fprint_bits = 52
  29 fprint_mask = (2**fprint_bits - 1) << letters_bits
  30
  31 fprint_feature_bit = {
  32     'e': 0,
  33     'i': 2,
  34     'a': 4,
  35     'o': 6,
  36     'r': 8,
  37     'n': 10,
  38     't': 12,
  39     's': 14,
  40     'l': 16,
  41     'c': 18,
  42     'u': 20,
  43     'p': 22,
  44     'm': 24,
  45     'd': 26,
  46     'h': 28,
  47     'y': 30,
  48     'g': 32,
  49     'b': 34,
  50     'f': 36,
  51     'v': 38,
  52     'k': 40,
  53     'w': 42,
  54     'z': 44,
  55     'x': 46,
  56     'q': 48,
  57     'j': 50,
  58 }
  59
  60 letter_sigs = {
  61     'a': 1789368711,
  62     'b': 3146859322,
  63     'c': 43676229,
  64     'd': 3522623596,
  65     'e': 3544234957,
  66     'f': 3448207591,
  67     'g': 1282648386,
  68     'h': 3672791226,
  69     'i': 1582316135,
  70     'j': 4001984784,
  71     'k': 831769172,
  72     'l': 1160692746,
  73     'm': 2430986565,
  74     'n': 1873586768,
  75     'o': 694443915,
  76     'p': 1602297017,
  77     'q': 533722196,
  78     'r': 3754550193,
  79     's': 1859447115,
  80     't': 1121373020,
  81     'u': 2414108708,
  82     'v': 2693866766,
  83     'w': 748799881,
  84     'x': 2627529228,
  85     'y': 2376066489,
  86     'z': 802338724,
  87 }
  88
  89
  90 class Unscrambler(object):
  91     """A class that unscrambles words quickly by computing a signature
  92     (sig) for the word based on its position independent letter
  93     population and then using a pregenerated index to look up known
  94     words the same set of letters.
  95
  96     Sigs are designed to cluster similar words near each other so both
  97     lookup methods support a "fuzzy match" argument that can be set to
  98     request similar words that do not match exactly in addition to any
  99     exact matches.
 100
 101     .. note::
 102
 103         Each instance of Unscrambler caches its index to speed up
 104         lookups number 2..N; careless deletion / reinstantiation will
 105         suffer slower performance.
 106     """
 107
 108     def __init__(self, indexfile: Optional[str] = None):
 109         """
 110         Constructs an unscrambler.
 111
 112         Args:
 113             indexfile: overrides the default indexfile location if provided.
 114                 To [re]generate the indexfile, see :meth:`repopulate`.
 115         """
 116
 117         # Cached index per instance.
 118         self.sigs = []
 119         self.words = []
 120
 121         filename = Unscrambler.get_indexfile(indexfile)
 122         with open(filename, 'r') as rf:
 123             lines = rf.readlines()
 124         for line in lines:
 125             line = line[:-1]
 126             (fsig, word) = line.split('+')
 127             isig = int(fsig, 16)
 128             self.sigs.append(isig)
 129             self.words.append(word)
 130
 131     @staticmethod
 132     def get_indexfile(indexfile: Optional[str]) -> str:
 133         """
 134         Returns:
 135             The current indexfile location
 136         """
 137         if indexfile is None:
 138             if 'unscrambler_default_indexfile' in config.config:
 139                 indexfile = config.config['unscrambler_default_indexfile']
 140                 assert type(indexfile) == str
 141             else:
 142                 indexfile = "/usr/share/dict/sparse_index"
 143         else:
 144             assert file_utils.file_is_readable(indexfile), f"Can't read {indexfile}"
 145         return indexfile
 146
 147     # 52 bits
 148     @staticmethod
 149     def _compute_word_fingerprint(population: Mapping[str, int]) -> int:
 150         fp = 0
 151         for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
 152             letter = pair[0]
 153             if letter in fprint_feature_bit:
 154                 count = min(pair[1], 3)
 155                 shift = fprint_feature_bit[letter]
 156                 s = count << shift
 157                 fp |= s
 158         return fp << letters_bits
 159
 160     # 32 bits
 161     @staticmethod
 162     def _compute_word_letter_sig(
 163         lsigs: Mapping[str, int],
 164         word: str,
 165         population: Mapping[str, int],
 166     ) -> int:
 167         sig = 0
 168         for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
 169             letter = pair[0]
 170             if letter not in lsigs:
 171                 continue
 172             s = lsigs[letter]
 173             count = pair[1]
 174             if count > 1:
 175                 s <<= count
 176                 s |= count
 177             s &= letters_mask
 178             sig ^= s
 179         length = min(len(word), 31)
 180         sig ^= length << 8
 181         sig &= letters_mask
 182         return sig
 183
 184     # 52 + 32 bits
 185     @staticmethod
 186     @decorator_utils.memoized
 187     def compute_word_sig(word: str) -> int:
 188         """Given a word, compute its signature for subsequent lookup
 189         operations.  Signatures are computed based on the letters in
 190         the word and their frequencies.  We try to cluster "similar"
 191         words close to each other in the signature space.
 192
 193         Args:
 194             word: the word to compute a signature for
 195
 196         Returns:
 197             The word's signature.
 198
 199         >>> test = Unscrambler.compute_word_sig('test')
 200         >>> test
 201         105560478284788
 202
 203         >>> teste = Unscrambler.compute_word_sig('teste')
 204         >>> teste
 205         105562386542095
 206
 207         >>> teste - test
 208         1908257307
 209
 210         """
 211         population = list_utils.population_counts(word)
 212         fprint = Unscrambler._compute_word_fingerprint(population)
 213         letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
 214         assert fprint & letter_sig == 0
 215         sig = fprint | letter_sig
 216         return sig
 217
 218     @staticmethod
 219     def repopulate(
 220         dictfile: str = '/usr/share/dict/words',
 221         indexfile: str = '/usr/share/dict/sparse_index',
 222     ) -> None:
 223         """
 224         Repopulates the indexfile.
 225
 226         Args:
 227             dictfile: a file that contains one word per line
 228             indexfile: the file to populate with sig, word pairs for future use
 229                 by this class.
 230
 231         .. warning::
 232
 233             Before calling this method, change `letter_sigs` from the
 234             default above unless you want to populate the same exact
 235             files.
 236         """
 237         words_by_sigs: Dict[int, str] = {}
 238         seen = set()
 239         with open(dictfile, "r") as f:
 240             for word in f:
 241                 word = word.replace('\n', '')
 242                 word = word.lower()
 243                 sig = Unscrambler.compute_word_sig(word)
 244                 logger.debug("%s => 0x%x", word, sig)
 245                 if word in seen:
 246                     continue
 247                 seen.add(word)
 248                 if sig in words_by_sigs:
 249                     words_by_sigs[sig] += f",{word}"
 250                 else:
 251                     words_by_sigs[sig] = word
 252         with open(indexfile, 'w') as f:
 253             for sig in sorted(words_by_sigs.keys()):
 254                 word = words_by_sigs[sig]
 255                 print(f'0x{sig:x}+{word}', file=f)
 256
 257     def lookup(self, word: str, *, window_size: int = 5) -> Dict[str, bool]:
 258         """Looks up a potentially scrambled word optionally including near
 259         "fuzzy" matches.
 260
 261         Args:
 262             word: the word to lookup
 263             window_size: the number of nearby fuzzy matches to return
 264
 265         Returns:
 266             A dict of word -> bool containing unscrambled words with (close
 267             to or precisely) the same letters as the input word.  The bool
 268             values in this dict indicate whether the key word is an exact
 269             or near match.  The count of entries in this dict is controlled
 270             by the window_size param.
 271
 272         >>> u = Unscrambler()
 273         >>> u.lookup('eanycleocipd', window_size=0)
 274         {'encyclopedia': True}
 275         """
 276         sig = Unscrambler.compute_word_sig(word)
 277         return self.lookup_by_sig(sig, window_size=window_size)
 278
 279     def lookup_by_sig(self, sig: int, *, window_size: int = 5) -> Dict[str, bool]:
 280         """Looks up a word that has already been translated into a signature by
 281         a previous call to Unscrambler.compute_word_sig.  Optionally returns
 282         near "fuzzy" matches.
 283
 284         Args:
 285             sig: the signature of the word to lookup (see :meth:`compute_word_sig`
 286                 to generate these signatures).
 287             window_size: the number of nearby fuzzy matches to return
 288
 289         Returns:
 290             A dict of word -> bool containing unscrambled words with (close
 291             to or precisely) the same letters as the input word.  The bool
 292             values in this dict indicate whether the key word is an exact
 293             or near match.  The count of entries in this dict is controlled
 294             by the window_size param.
 295
 296         >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
 297         >>> sig
 298         18491949645300288339
 299
 300         >>> u = Unscrambler()
 301         >>> u.lookup_by_sig(sig)
 302         {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
 303         """
 304         ret = {}
 305         (_, location) = list_utils.binary_search(self.sigs, sig)
 306         start = location - window_size
 307         start = max(start, 0)
 308         end = location + 1 + window_size
 309         end = min(end, len(self.words))
 310
 311         for x in range(start, end):
 312             word = self.words[x]
 313             fsig = self.sigs[x]
 314             if window_size > 0 or (fsig == sig):
 315                 ret[word] = fsig == sig
 316         return ret
 317
 318
 319 if __name__ == "__main__":
 320     import doctest
 321
 322     doctest.testmod()