Hammer on that run_tests.py thing again.

[python_utils.git] / unscrambler.py
diff --git a/unscrambler.py b/unscrambler.py

old mode 100755 (executable)

new mode 100644 (file)

index e53ec2a..3ccddbb
--- a/unscrambler.py
+++ b/unscrambler.py
@@ -1,17 +1,22 @@
  #!/usr/bin/env python3
  
  #!/usr/bin/env python3
  
+# © Copyright 2021-2022, Scott Gasch
+
+"""A fast word unscrambler library."""
+
  import logging
  import logging
-from typing import Dict
+from typing import Dict, Mapping, Optional
  
  import config
  import decorator_utils
  
  import config
  import decorator_utils
+import file_utils
  import list_utils
  
  cfg = config.add_commandline_args(
  import list_utils
  
  cfg = config.add_commandline_args(
-    f'Unscramble! ({__file__})', 'A fast word unscrambler.'
+    f'Unscrambler base library ({__file__})', 'A fast word unscrambler.'
  )
  cfg.add_argument(
  )
  cfg.add_argument(
-    "--unscramble_indexfile",
+    "--unscrambler_default_indexfile",
      help="Path to a file of signature -> word index.",
      metavar="FILENAME",
      default="/usr/share/dict/sparse_index",
      help="Path to a file of signature -> word index.",
      metavar="FILENAME",
      default="/usr/share/dict/sparse_index",
@@ -20,10 +25,10 @@ cfg.add_argument(
  logger = logging.getLogger(__name__)
  
  letters_bits = 32
  logger = logging.getLogger(__name__)
  
  letters_bits = 32
-letters_mask = 2 ** letters_bits - 1
+letters_mask = 2**letters_bits - 1
  
  fprint_bits = 52
  
  fprint_bits = 52
-fprint_mask = (2 ** fprint_bits - 1) << letters_bits
+fprint_mask = (2**fprint_bits - 1) << letters_bits
  
  fprint_feature_bit = {
      'e': 0,
  
  fprint_feature_bit = {
      'e': 0,
@@ -85,22 +90,62 @@ letter_sigs = {
  
  
  class Unscrambler(object):
  
  
  class Unscrambler(object):
-    sigs = []
-    words = []
+    """A class that unscrambles words quickly by computing a signature
+    (sig) for the word based on its position independent letter
+    population and then using a pregenerated index to look up known
+    words the same set of letters.
+
+    Note that each instance of Unscrambler caches its index to speed
+    up lookups number 2..N; careless reinstantiation will by slower.
+
+    Sigs are designed to cluster similar words near each other so both
+    lookup methods support a "fuzzy match" argument that can be set to
+    request similar words that do not match exactly in addition to any
+    exact matches.
+    """
+
+    def __init__(self, indexfile: Optional[str] = None):
+        """
+        Constructs an unscrambler.
+
+        Args:
+            indexfile: overrides the default indexfile location if provided
+        """
  
  
-    def __init__(self):
-        pass
+        # Cached index per instance.
+        self.sigs = []
+        self.words = []
+
+        filename = Unscrambler.get_indexfile(indexfile)
+        with open(filename, 'r') as rf:
+            lines = rf.readlines()
+        for line in lines:
+            line = line[:-1]
+            (fsig, word) = line.split('+')
+            isig = int(fsig, 16)
+            self.sigs.append(isig)
+            self.words.append(word)
+
+    @staticmethod
+    def get_indexfile(indexfile: Optional[str]) -> str:
+        """Returns the current indexfile location."""
+        if indexfile is None:
+            if 'unscrambler_default_indexfile' in config.config:
+                indexfile = config.config['unscramble_indexfile']
+            else:
+                indexfile = "/usr/share/dict/sparse_index"
+        else:
+            assert file_utils.file_is_readable(indexfile), f"Can't read {indexfile}"
+        return indexfile
  
      # 52 bits
      @staticmethod
  
      # 52 bits
      @staticmethod
-    def _compute_word_fingerprint(word: str, population) -> int:
+    def _compute_word_fingerprint(population: Mapping[str, int]) -> int:
          fp = 0
          for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
              letter = pair[0]
              if letter in fprint_feature_bit:
          fp = 0
          for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
              letter = pair[0]
              if letter in fprint_feature_bit:
-                count = pair[1]
-                if count > 3:
-                    count = 3
+                count = min(pair[1], 3)
                  shift = fprint_feature_bit[letter]
                  s = count << shift
                  fp |= s
                  shift = fprint_feature_bit[letter]
                  s = count << shift
                  fp |= s
@@ -108,22 +153,24 @@ class Unscrambler(object):
  
      # 32 bits
      @staticmethod
  
      # 32 bits
      @staticmethod
-    def _compute_word_letter_sig(letter_sigs, word: str, population) -> int:
+    def _compute_word_letter_sig(
+        lsigs: Mapping[str, int],
+        word: str,
+        population: Mapping[str, int],
+    ) -> int:
          sig = 0
          for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
              letter = pair[0]
          sig = 0
          for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
              letter = pair[0]
-            if letter not in letter_sigs:
+            if letter not in lsigs:
                  continue
                  continue
-            s = letter_sigs[letter]
+            s = lsigs[letter]
              count = pair[1]
              if count > 1:
                  s <<= count
                  s |= count
              s &= letters_mask
              sig ^= s
              count = pair[1]
              if count > 1:
                  s <<= count
                  s |= count
              s &= letters_mask
              sig ^= s
-        length = len(word)
-        if length > 31:
-            length = 31
+        length = min(len(word), 31)
          sig ^= length << 8
          sig &= letters_mask
          return sig
          sig ^= length << 8
          sig &= letters_mask
          return sig
@@ -137,6 +184,12 @@ class Unscrambler(object):
          the word and their frequencies.  We try to cluster "similar"
          words close to each other in the signature space.
  
          the word and their frequencies.  We try to cluster "similar"
          words close to each other in the signature space.
  
+        Args:
+            word: the word to compute a signature for
+
+        Returns:
+            The word's signature.
+
          >>> train = Unscrambler.compute_word_sig('train')
          >>> train
          23178969883741
          >>> train = Unscrambler.compute_word_sig('train')
          >>> train
          23178969883741
@@ -150,7 +203,7 @@ class Unscrambler(object):
  
          """
          population = list_utils.population_counts(word)
  
          """
          population = list_utils.population_counts(word)
-        fprint = Unscrambler._compute_word_fingerprint(word, population)
+        fprint = Unscrambler._compute_word_fingerprint(population)
          letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
          assert fprint & letter_sig == 0
          sig = fprint | letter_sig
          letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
          assert fprint & letter_sig == 0
          sig = fprint | letter_sig
@@ -158,26 +211,31 @@ class Unscrambler(object):
  
      @staticmethod
      def repopulate(
  
      @staticmethod
      def repopulate(
-        letter_sigs: Dict[str, int],
          dictfile: str = '/usr/share/dict/words',
          indexfile: str = '/usr/share/dict/sparse_index',
      ) -> None:
          dictfile: str = '/usr/share/dict/words',
          indexfile: str = '/usr/share/dict/sparse_index',
      ) -> None:
-        """Before calling this method, change letter_sigs from the default above
-        unless you want to populate the same exact files."""
+        """
+        Repopulates the indexfile.
+
+        .. warning::
  
  
-        words_by_sigs = {}
+            Before calling this method, change letter_sigs from the
+            default above unless you want to populate the same exact
+            files.
+        """
+        words_by_sigs: Dict[int, str] = {}
          seen = set()
          with open(dictfile, "r") as f:
              for word in f:
                  word = word.replace('\n', '')
                  word = word.lower()
          seen = set()
          with open(dictfile, "r") as f:
              for word in f:
                  word = word.replace('\n', '')
                  word = word.lower()
-                sig = Unscrambler.compute_word_sig(letter_sigs, word)
-                logger.debug("%s => 0x%x" % (word, sig))
+                sig = Unscrambler.compute_word_sig(word)
+                logger.debug("%s => 0x%x", word, sig)
                  if word in seen:
                      continue
                  seen.add(word)
                  if sig in words_by_sigs:
                  if word in seen:
                      continue
                  seen.add(word)
                  if sig in words_by_sigs:
-                    words_by_sigs[sig] += ",%s" % word
+                    words_by_sigs[sig] += f",{word}"
                  else:
                      words_by_sigs[sig] = word
          with open(indexfile, 'w') as f:
                  else:
                      words_by_sigs[sig] = word
          with open(indexfile, 'w') as f:
@@ -185,63 +243,64 @@ class Unscrambler(object):
                  word = words_by_sigs[sig]
                  print(f'0x{sig:x}+{word}', file=f)
  
                  word = words_by_sigs[sig]
                  print(f'0x{sig:x}+{word}', file=f)
  
-    @staticmethod
-    def lookup(word: str, *, include_fuzzy_matches=False) -> Dict[str, bool]:
+    def lookup(self, word: str, *, window_size: int = 5) -> Dict[str, bool]:
          """Looks up a potentially scrambled word optionally including near
          "fuzzy" matches.
  
          """Looks up a potentially scrambled word optionally including near
          "fuzzy" matches.
  
-        >>> Unscrambler.lookup('eanycleocipd', include_fuzzy_matches=False)
-        {'encyclopedia': True}
+        Args:
+            word: the word to lookup
+            window_size: the number of nearby fuzzy matches to return
+
+        Returns:
+            A dict of word -> bool containing unscrambled words with (close
+            to or precisely) the same letters as the input word.  The bool
+            values in this dict indicate whether the key word is an exact
+            or near match.  The count of entries in this dict is controlled
+            by the window_size param.
  
  
+        >>> u = Unscrambler()
+        >>> u.lookup('eanycleocipd', window_size=0)
+        {'encyclopedia': True}
          """
          sig = Unscrambler.compute_word_sig(word)
          """
          sig = Unscrambler.compute_word_sig(word)
-        return Unscrambler.lookup_by_sig(
-            sig, include_fuzzy_matches=include_fuzzy_matches
-        )
+        return self.lookup_by_sig(sig, window_size=window_size)
  
  
-    @staticmethod
-    def lookup_by_sig(sig, *, include_fuzzy_matches=False) -> Dict[str, bool]:
+    def lookup_by_sig(self, sig: int, *, window_size: int = 5) -> Dict[str, bool]:
          """Looks up a word that has already been translated into a signature by
          a previous call to Unscrambler.compute_word_sig.  Optionally returns
          near "fuzzy" matches.
  
          """Looks up a word that has already been translated into a signature by
          a previous call to Unscrambler.compute_word_sig.  Optionally returns
          near "fuzzy" matches.
  
+        Args:
+            sig: the signature of the word to lookup (see :meth:`compute_word_sig`
+                to generate these signatures).
+            window_size: the number of nearby fuzzy matches to return
+
+        Returns:
+            A dict of word -> bool containing unscrambled words with (close
+            to or precisely) the same letters as the input word.  The bool
+            values in this dict indicate whether the key word is an exact
+            or near match.  The count of entries in this dict is controlled
+            by the window_size param.
+
          >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
          >>> sig
          18491949645300288339
  
          >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
          >>> sig
          18491949645300288339
  
-        >>> Unscrambler.lookup_by_sig(sig, include_fuzzy_matches=True)
+        >>> u = Unscrambler()
+        >>> u.lookup_by_sig(sig)
          {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
          {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
-
          """
          """
-        # Cache the index; it doesn't change and this may be called
-        # more than once.
-        if len(Unscrambler.sigs) == 0:
-            if 'unscramble_indexfile' in config.config:
-                indexfile = config.config['unscramble_indexfile']
-            else:
-                indexfile = "/usr/share/dict/sparse_index"
-            with open(indexfile, 'r') as rf:
-                lines = rf.readlines()
-            for line in lines:
-                line = line[:-1]
-                (fsig, word) = line.split('+')
-                fsig = int(fsig, 16)
-                Unscrambler.sigs.append(fsig)
-                Unscrambler.words.append(word)
-
          ret = {}
          ret = {}
-        (exact, location) = list_utils.binary_search(Unscrambler.sigs, sig)
-        start = location - 5
-        if start < 0:
-            start = 0
-        end = location + 6
-        if end > len(Unscrambler.words):
-            end = len(Unscrambler.words)
+        (_, location) = list_utils.binary_search(self.sigs, sig)
+        start = location - window_size
+        start = max(start, 0)
+        end = location + 1 + window_size
+        end = min(end, len(self.words))
  
          for x in range(start, end):
  
          for x in range(start, end):
-            word = Unscrambler.words[x]
-            fsig = Unscrambler.sigs[x]
-            if include_fuzzy_matches is True or (fsig == sig):
+            word = self.words[x]
+            fsig = self.sigs[x]
+            if window_size > 0 or (fsig == sig):
                  ret[word] = fsig == sig
          return ret
  
                  ret[word] = fsig == sig
          return ret