letter_compress.py

   1 #!/usr/bin/env python3
   2
   3 import bitstring
   4
   5 from collect.bidict import bidict
   6
   7 special_characters = bidict(
   8     {
   9         ' ': 27,
  10         '.': 28,
  11         ',': 29,
  12         "-": 30,
  13         '"': 31,
  14     }
  15 )
  16
  17
  18 def compress(uncompressed: str) -> bytes:
  19     """Compress a word sequence into a stream of bytes.  The compressed
  20     form will be 5/8th the size of the original.  Words can be lower
  21     case letters or special_characters (above).
  22
  23     >>> import binascii
  24     >>> binascii.hexlify(compress('this is a test'))
  25     b'a2133da67b0ee859d0'
  26
  27     >>> binascii.hexlify(compress('scot'))
  28     b'98df40'
  29
  30     >>> binascii.hexlify(compress('scott'))  # Note the last byte
  31     b'98df4a00'
  32
  33     """
  34     compressed = bitstring.BitArray()
  35     for (n, letter) in enumerate(uncompressed):
  36         if 'a' <= letter <= 'z':
  37             bits = ord(letter) - ord('a') + 1   # 1..26
  38         else:
  39             if letter not in special_characters:
  40                 raise Exception(f'"{uncompressed}" contains uncompressable char="{letter}"')
  41             bits = special_characters[letter]
  42         compressed.append(f"uint:5={bits}")
  43     while len(compressed) % 8 != 0:
  44         compressed.append("uint:1=0")
  45     return compressed.bytes
  46
  47
  48 def decompress(kompressed: bytes) -> str:
  49     """
  50     Decompress a previously compressed stream of bytes back into
  51     its original form.
  52
  53     >>> import binascii
  54     >>> decompress(binascii.unhexlify(b'a2133da67b0ee859d0'))
  55     'this is a test'
  56
  57     >>> decompress(binascii.unhexlify(b'98df4a00'))
  58     'scott'
  59
  60     """
  61     decompressed = ''
  62     compressed = bitstring.BitArray(kompressed)
  63
  64     # There are compressed messages that legitimately end with the
  65     # byte 0x00.  The message "scott" is an example; compressed it is
  66     # 0x98df4a00.  It's 5 characters long which means there are 5 x 5
  67     # bits of compressed info (25 bits, just over 3 bytes).  The last
  68     # (25th) bit in the steam happens to be a zero.  The compress code
  69     # padded out the compressed message by adding seven more zeros to
  70     # complete the partial 4th byte.  In the 4th byte, however, one
  71     # bit is information and seven are padding.
  72     #
  73     # It's likely that this APIs client code will treat a zero byte as
  74     # a termination character and not regard it as part of the
  75     # message.  This is a bug in the client code.
  76     #
  77     # However, it's a bug we can work around:
  78
  79     # Here, I'm appending an extra 0x00 byte to the compressed message
  80     # passed in.  If the client code dropped the last 0x00 byte (and,
  81     # with it, some of the legitimate message bits) by treating it as
  82     # a termination mark, this 0x00 will replace it (and the missing
  83     # message bits).  If the client code didn't drop the last 0x00 (or
  84     # if the compressed message didn't end in 0x00), adding an extra
  85     # 0x00 is a no op because the codepoint 0b00000 is a "stop" message
  86     # so we'll ignore the extras.
  87     compressed.append("uint:8=0")
  88
  89     for chunk in compressed.cut(5):
  90         chunk = chunk.uint
  91         if chunk == 0:
  92             break
  93         elif 1 <= chunk <= 26:
  94             letter = chr(chunk - 1 + ord('a'))
  95         else:
  96             letter = special_characters.inverse[chunk][0]
  97         decompressed += letter
  98     return decompressed
  99
 100
 101 if __name__ == '__main__':
 102     import doctest
 103     doctest.testmod()