Hammer on that run_tests.py thing again.

[python_utils.git] / unscrambler.py
diff --git a/unscrambler.py b/unscrambler.py

index 8fcd65d4e3ef732bc529cd15f3d3de1fa13dede2..3ccddbbea5ef3f59e3915c5046586e62f74f1182 100644 (file)
--- a/unscrambler.py
+++ b/unscrambler.py
@@ -1,17 +1,22 @@
  #!/usr/bin/env python3
  
  #!/usr/bin/env python3
  
+# © Copyright 2021-2022, Scott Gasch
+
+"""A fast word unscrambler library."""
+
  import logging
  import logging
-from typing import Dict, Mapping
+from typing import Dict, Mapping, Optional
  
  import config
  import decorator_utils
  
  import config
  import decorator_utils
+import file_utils
  import list_utils
  
  cfg = config.add_commandline_args(
  import list_utils
  
  cfg = config.add_commandline_args(
-    f'Unscramble! ({__file__})', 'A fast word unscrambler.'
+    f'Unscrambler base library ({__file__})', 'A fast word unscrambler.'
  )
  cfg.add_argument(
  )
  cfg.add_argument(
-    "--unscramble_indexfile",
+    "--unscrambler_default_indexfile",
      help="Path to a file of signature -> word index.",
      metavar="FILENAME",
      default="/usr/share/dict/sparse_index",
      help="Path to a file of signature -> word index.",
      metavar="FILENAME",
      default="/usr/share/dict/sparse_index",
@@ -20,10 +25,10 @@ cfg.add_argument(
  logger = logging.getLogger(__name__)
  
  letters_bits = 32
  logger = logging.getLogger(__name__)
  
  letters_bits = 32
-letters_mask = 2 ** letters_bits - 1
+letters_mask = 2**letters_bits - 1
  
  fprint_bits = 52
  
  fprint_bits = 52
-fprint_mask = (2 ** fprint_bits - 1) << letters_bits
+fprint_mask = (2**fprint_bits - 1) << letters_bits
  
  fprint_feature_bit = {
      'e': 0,
  
  fprint_feature_bit = {
      'e': 0,
@@ -97,38 +102,50 @@ class Unscrambler(object):
      lookup methods support a "fuzzy match" argument that can be set to
      request similar words that do not match exactly in addition to any
      exact matches.
      lookup methods support a "fuzzy match" argument that can be set to
      request similar words that do not match exactly in addition to any
      exact matches.
-
      """
  
      """
  
-    def __init__(self):
+    def __init__(self, indexfile: Optional[str] = None):
+        """
+        Constructs an unscrambler.
+
+        Args:
+            indexfile: overrides the default indexfile location if provided
+        """
+
          # Cached index per instance.
          self.sigs = []
          self.words = []
  
          # Cached index per instance.
          self.sigs = []
          self.words = []
  
-        if 'unscramble_indexfile' in config.config:
-            indexfile = config.config['unscramble_indexfile']
-        else:
-            indexfile = "/usr/share/dict/sparse_index"
-
-        with open(indexfile, 'r') as rf:
+        filename = Unscrambler.get_indexfile(indexfile)
+        with open(filename, 'r') as rf:
              lines = rf.readlines()
          for line in lines:
              line = line[:-1]
              (fsig, word) = line.split('+')
              lines = rf.readlines()
          for line in lines:
              line = line[:-1]
              (fsig, word) = line.split('+')
-            fsig = int(fsig, 16)
-            self.sigs.append(fsig)
+            isig = int(fsig, 16)
+            self.sigs.append(isig)
              self.words.append(word)
  
              self.words.append(word)
  
+    @staticmethod
+    def get_indexfile(indexfile: Optional[str]) -> str:
+        """Returns the current indexfile location."""
+        if indexfile is None:
+            if 'unscrambler_default_indexfile' in config.config:
+                indexfile = config.config['unscramble_indexfile']
+            else:
+                indexfile = "/usr/share/dict/sparse_index"
+        else:
+            assert file_utils.file_is_readable(indexfile), f"Can't read {indexfile}"
+        return indexfile
+
      # 52 bits
      @staticmethod
      # 52 bits
      @staticmethod
-    def _compute_word_fingerprint(word: str, population: Mapping[str, int]) -> int:
+    def _compute_word_fingerprint(population: Mapping[str, int]) -> int:
          fp = 0
          for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
              letter = pair[0]
              if letter in fprint_feature_bit:
          fp = 0
          for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
              letter = pair[0]
              if letter in fprint_feature_bit:
-                count = pair[1]
-                if count > 3:
-                    count = 3
+                count = min(pair[1], 3)
                  shift = fprint_feature_bit[letter]
                  s = count << shift
                  fp |= s
                  shift = fprint_feature_bit[letter]
                  s = count << shift
                  fp |= s
@@ -137,25 +154,23 @@ class Unscrambler(object):
      # 32 bits
      @staticmethod
      def _compute_word_letter_sig(
      # 32 bits
      @staticmethod
      def _compute_word_letter_sig(
-        letter_sigs: Mapping[str, int],
+        lsigs: Mapping[str, int],
          word: str,
          population: Mapping[str, int],
      ) -> int:
          sig = 0
          for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
              letter = pair[0]
          word: str,
          population: Mapping[str, int],
      ) -> int:
          sig = 0
          for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
              letter = pair[0]
-            if letter not in letter_sigs:
+            if letter not in lsigs:
                  continue
                  continue
-            s = letter_sigs[letter]
+            s = lsigs[letter]
              count = pair[1]
              if count > 1:
                  s <<= count
                  s |= count
              s &= letters_mask
              sig ^= s
              count = pair[1]
              if count > 1:
                  s <<= count
                  s |= count
              s &= letters_mask
              sig ^= s
-        length = len(word)
-        if length > 31:
-            length = 31
+        length = min(len(word), 31)
          sig ^= length << 8
          sig &= letters_mask
          return sig
          sig ^= length << 8
          sig &= letters_mask
          return sig
@@ -169,6 +184,12 @@ class Unscrambler(object):
          the word and their frequencies.  We try to cluster "similar"
          words close to each other in the signature space.
  
          the word and their frequencies.  We try to cluster "similar"
          words close to each other in the signature space.
  
+        Args:
+            word: the word to compute a signature for
+
+        Returns:
+            The word's signature.
+
          >>> train = Unscrambler.compute_word_sig('train')
          >>> train
          23178969883741
          >>> train = Unscrambler.compute_word_sig('train')
          >>> train
          23178969883741
@@ -182,7 +203,7 @@ class Unscrambler(object):
  
          """
          population = list_utils.population_counts(word)
  
          """
          population = list_utils.population_counts(word)
-        fprint = Unscrambler._compute_word_fingerprint(word, population)
+        fprint = Unscrambler._compute_word_fingerprint(population)
          letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
          assert fprint & letter_sig == 0
          sig = fprint | letter_sig
          letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
          assert fprint & letter_sig == 0
          sig = fprint | letter_sig
@@ -190,27 +211,31 @@ class Unscrambler(object):
  
      @staticmethod
      def repopulate(
  
      @staticmethod
      def repopulate(
-        letter_sigs: Dict[str, int],
          dictfile: str = '/usr/share/dict/words',
          indexfile: str = '/usr/share/dict/sparse_index',
      ) -> None:
          dictfile: str = '/usr/share/dict/words',
          indexfile: str = '/usr/share/dict/sparse_index',
      ) -> None:
-        """Before calling this method, change letter_sigs from the default above
-        unless you want to populate the same exact files.
+        """
+        Repopulates the indexfile.
+
+        .. warning::
  
  
+            Before calling this method, change letter_sigs from the
+            default above unless you want to populate the same exact
+            files.
          """
          """
-        words_by_sigs = {}
+        words_by_sigs: Dict[int, str] = {}
          seen = set()
          with open(dictfile, "r") as f:
              for word in f:
                  word = word.replace('\n', '')
                  word = word.lower()
          seen = set()
          with open(dictfile, "r") as f:
              for word in f:
                  word = word.replace('\n', '')
                  word = word.lower()
-                sig = Unscrambler.compute_word_sig(letter_sigs, word)
-                logger.debug("%s => 0x%x" % (word, sig))
+                sig = Unscrambler.compute_word_sig(word)
+                logger.debug("%s => 0x%x", word, sig)
                  if word in seen:
                      continue
                  seen.add(word)
                  if sig in words_by_sigs:
                  if word in seen:
                      continue
                  seen.add(word)
                  if sig in words_by_sigs:
-                    words_by_sigs[sig] += ",%s" % word
+                    words_by_sigs[sig] += f",{word}"
                  else:
                      words_by_sigs[sig] = word
          with open(indexfile, 'w') as f:
                  else:
                      words_by_sigs[sig] = word
          with open(indexfile, 'w') as f:
@@ -222,10 +247,20 @@ class Unscrambler(object):
          """Looks up a potentially scrambled word optionally including near
          "fuzzy" matches.
  
          """Looks up a potentially scrambled word optionally including near
          "fuzzy" matches.
  
+        Args:
+            word: the word to lookup
+            window_size: the number of nearby fuzzy matches to return
+
+        Returns:
+            A dict of word -> bool containing unscrambled words with (close
+            to or precisely) the same letters as the input word.  The bool
+            values in this dict indicate whether the key word is an exact
+            or near match.  The count of entries in this dict is controlled
+            by the window_size param.
+
          >>> u = Unscrambler()
          >>> u.lookup('eanycleocipd', window_size=0)
          {'encyclopedia': True}
          >>> u = Unscrambler()
          >>> u.lookup('eanycleocipd', window_size=0)
          {'encyclopedia': True}
-
          """
          sig = Unscrambler.compute_word_sig(word)
          return self.lookup_by_sig(sig, window_size=window_size)
          """
          sig = Unscrambler.compute_word_sig(word)
          return self.lookup_by_sig(sig, window_size=window_size)
@@ -235,6 +270,18 @@ class Unscrambler(object):
          a previous call to Unscrambler.compute_word_sig.  Optionally returns
          near "fuzzy" matches.
  
          a previous call to Unscrambler.compute_word_sig.  Optionally returns
          near "fuzzy" matches.
  
+        Args:
+            sig: the signature of the word to lookup (see :meth:`compute_word_sig`
+                to generate these signatures).
+            window_size: the number of nearby fuzzy matches to return
+
+        Returns:
+            A dict of word -> bool containing unscrambled words with (close
+            to or precisely) the same letters as the input word.  The bool
+            values in this dict indicate whether the key word is an exact
+            or near match.  The count of entries in this dict is controlled
+            by the window_size param.
+
          >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
          >>> sig
          18491949645300288339
          >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
          >>> sig
          18491949645300288339
@@ -242,16 +289,13 @@ class Unscrambler(object):
          >>> u = Unscrambler()
          >>> u.lookup_by_sig(sig)
          {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
          >>> u = Unscrambler()
          >>> u.lookup_by_sig(sig)
          {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
-
          """
          ret = {}
          """
          ret = {}
-        (exact, location) = list_utils.binary_search(self.sigs, sig)
+        (_, location) = list_utils.binary_search(self.sigs, sig)
          start = location - window_size
          start = location - window_size
-        if start < 0:
-            start = 0
+        start = max(start, 0)
          end = location + 1 + window_size
          end = location + 1 + window_size
-        if end > len(self.words):
-            end = len(self.words)
+        end = min(end, len(self.words))
  
          for x in range(start, end):
              word = self.words[x]
  
          for x in range(start, end):
              word = self.words[x]