#!/usr/bin/env python3
-import bitstring
+"""A simple compression helper for lowercase ascii text."""
-from collect.bidict import bidict
+import bitstring
+from collect.bidict import BiDict
-special_characters = bidict(
+special_characters = BiDict(
{
' ': 27,
'.': 28,
def compress(uncompressed: str) -> bytes:
- """
- Compress a word sequence into a stream of bytes. The compressed
+ """Compress a word sequence into a stream of bytes. The compressed
form will be 5/8th the size of the original. Words can be lower
case letters or special_characters (above).
>>> binascii.hexlify(compress('this is a test'))
b'a2133da67b0ee859d0'
+ >>> binascii.hexlify(compress('scot'))
+ b'98df40'
+
+ >>> binascii.hexlify(compress('scott')) # Note the last byte
+ b'98df4a00'
+
"""
compressed = bitstring.BitArray()
- for (n, letter) in enumerate(uncompressed):
+ for letter in uncompressed:
if 'a' <= letter <= 'z':
- bits = ord(letter) - ord('a') + 1 # 1..26
+ bits = ord(letter) - ord('a') + 1 # 1..26
else:
if letter not in special_characters:
raise Exception(f'"{uncompressed}" contains uncompressable char="{letter}"')
>>> decompress(binascii.unhexlify(b'a2133da67b0ee859d0'))
'this is a test'
+ >>> decompress(binascii.unhexlify(b'98df4a00'))
+ 'scott'
+
"""
decompressed = ''
compressed = bitstring.BitArray(kompressed)
+
+ # There are compressed messages that legitimately end with the
+ # byte 0x00. The message "scott" is an example; compressed it is
+ # 0x98df4a00. It's 5 characters long which means there are 5 x 5
+ # bits of compressed info (25 bits, just over 3 bytes). The last
+ # (25th) bit in the stream happens to be a zero. The compress code
+ # padded out the compressed message by adding seven more zeros to
+ # complete the partial 4th byte. In the 4th byte, however, one
+ # bit is information and seven are padding.
+ #
+ # It's likely that this API's client code may treat a zero byte as
+ # a termination character and not regard it as a legitimate part
+ # of the message. This is a bug in that client code, to be clear.
+ #
+ # However, it's a bug we can work around:
+ #
+ # Here, I'm appending an extra 0x00 byte to the compressed message
+ # passed in. If the client code dropped the last 0x00 byte (and,
+ # with it, some of the legitimate message bits) by treating it as
+ # a termination mark, this 0x00 will replace it (and the missing
+ # message bits). If the client code didn't drop the last 0x00 (or
+ # if the compressed message didn't end in 0x00), adding an extra
+ # 0x00 is a no-op because the codepoint 0b00000 is a "stop" message,
+ # so we'll ignore the extras.
+ compressed.append("uint:8=0")
+
for chunk in compressed.cut(5):
chunk = chunk.uint
if chunk == 0:
if __name__ == '__main__':
import doctest
+
doctest.testmod()