X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=letter_compress.py;h=9b4cf194c3d97a83c68c47bd0199a65c3a6e1698;hb=36fea7f15ed17150691b5b3ead75450e575229ef;hp=4374edd352eab5e19e7c0a6a39345d63220d0d10;hpb=0bc6e4312cad0f997751739e750954ac39dfa6cc;p=python_utils.git

diff --git a/letter_compress.py b/letter_compress.py
index 4374edd..9b4cf19 100644
--- a/letter_compress.py
+++ b/letter_compress.py
@@ -2,10 +2,9 @@
 
 import bitstring
 
-from collect.bidict import bidict
+from collect.bidict import BiDict
 
-
-special_characters = bidict(
+special_characters = BiDict(
     {
         ' ': 27,
         '.': 28,
@@ -17,8 +16,7 @@ special_characters = bidict(
 
 
 def compress(uncompressed: str) -> bytes:
-    """
-    Compress a word sequence into a stream of bytes.  The compressed
+    """Compress a word sequence into a stream of bytes.  The compressed
     form will be 5/8th the size of the original.  Words can be lower
     case letters or special_characters (above).
 
@@ -26,14 +24,22 @@ def compress(uncompressed: str) -> bytes:
     >>> binascii.hexlify(compress('this is a test'))
     b'a2133da67b0ee859d0'
 
+    >>> binascii.hexlify(compress('scot'))
+    b'98df40'
+
+    >>> binascii.hexlify(compress('scott'))  # Note the last byte
+    b'98df4a00'
+
     """
     compressed = bitstring.BitArray()
     for (n, letter) in enumerate(uncompressed):
         if 'a' <= letter <= 'z':
-            bits = ord(letter) - ord('a') + 1   # 1..26
+            bits = ord(letter) - ord('a') + 1  # 1..26
         else:
             if letter not in special_characters:
-                raise Exception(f'"{uncompressed}" contains uncompressable char="{letter}"')
+                raise Exception(
+                    f'"{uncompressed}" contains uncompressable char="{letter}"'
+                )
             bits = special_characters[letter]
         compressed.append(f"uint:5={bits}")
     while len(compressed) % 8 != 0:
@@ -50,9 +56,38 @@ def decompress(kompressed: bytes) -> str:
     >>> decompress(binascii.unhexlify(b'a2133da67b0ee859d0'))
     'this is a test'
 
+    >>> decompress(binascii.unhexlify(b'98df4a00'))
+    'scott'
+
     """
     decompressed = ''
     compressed = bitstring.BitArray(kompressed)
+
+    # There are compressed messages that legitimately end with the
+    # byte 0x00.  The message "scott" is an example; compressed it is
+    # 0x98df4a00.  It's 5 characters long which means there are 5 x 5
+    # bits of compressed info (25 bits, just over 3 bytes).  The last
+    # (25th) bit in the stream happens to be a zero.  The compress code
+    # padded out the compressed message by adding seven more zeros to
+    # complete the partial 4th byte.  In the 4th byte, however, one
+    # bit is information and seven are padding.
+    #
+    # It's likely that this API's client code may treat a zero byte as
+    # a termination character and not regard it as a legitimate part
+    # of the message.  This is a bug in that client code, to be clear.
+    #
+    # However, it's a bug we can work around:
+    #
+    # Here, I'm appending an extra 0x00 byte to the compressed message
+    # passed in.  If the client code dropped the last 0x00 byte (and,
+    # with it, some of the legitimate message bits) by treating it as
+    # a termination mark, this 0x00 will replace it (and the missing
+    # message bits).  If the client code didn't drop the last 0x00 (or
+    # if the compressed message didn't end in 0x00), adding an extra
+    # 0x00 is a no-op because the codepoint 0b00000 is a "stop" message
+    # so we'll ignore the extras.
+    compressed.append("uint:8=0")
+
     for chunk in compressed.cut(5):
         chunk = chunk.uint
         if chunk == 0:
@@ -67,4 +102,5 @@ def decompress(kompressed: bytes) -> str:
 
 if __name__ == '__main__':
     import doctest
+
     doctest.testmod()