#!/usr/bin/env python3

# © Copyright 2021-2022, Scott Gasch

"""A simple toy compression helper for lowercase ascii text.

Every supported character is packed into a 5-bit code:

* ``0`` is reserved as the "stop" code (it is also what padding looks like),
* ``1..26`` are the letters ``a..z``,
* ``27..31`` are the punctuation characters in :data:`special_characters`.
"""

import bitstring

from collect.bidict import BiDict

# Non-letter characters that can be compressed and the 5-bit codes they
# use.  Codes 1..26 belong to 'a'..'z' and 0 is the stop code, so these
# five entries use up the remainder of the 5-bit code space.
special_characters = BiDict(
    {
        ' ': 27,
        '.': 28,
        ',': 29,
        "-": 30,
        '"': 31,
    }
)


def compress(uncompressed: str) -> bytes:
    """Compress a word sequence into a stream of bytes.  The compressed
    form will be 5/8th the size of the original.  Words can be lower
    case letters or special_characters (above).

    Args:
        uncompressed: the text to compress; it may only contain the
            letters 'a'..'z' and the keys of special_characters.

    Returns:
        The 5-bit codes of each character packed back to back and
        zero-padded at the end up to a whole number of bytes.

    Raises:
        ValueError: if uncompressed contains a character that has no
            5-bit code (e.g. an upper case letter or a digit).

    >>> import binascii
    >>> binascii.hexlify(compress('this is a test'))
    b'a2133da67b0ee859d0'

    >>> binascii.hexlify(compress('scot'))
    b'98df40'

    >>> binascii.hexlify(compress('scott'))     # Note the last byte
    b'98df4a00'

    """
    compressed = bitstring.BitArray()
    for letter in uncompressed:
        if 'a' <= letter <= 'z':
            bits = ord(letter) - ord('a') + 1  # 1..26
        else:
            if letter not in special_characters:
                # ValueError is a subclass of Exception, so callers that
                # catch the old, generic Exception continue to work.
                raise ValueError(
                    f'"{uncompressed}" contains uncompressable char="{letter}"'
                )
            bits = special_characters[letter]
        compressed.append(f"uint:5={bits}")

    # Pad with zero bits up to a byte boundary.  At most seven bits are
    # added; decompress treats an all-zero 5-bit code as "stop".
    while len(compressed) % 8 != 0:
        compressed.append("uint:1=0")
    return compressed.bytes


def decompress(kompressed: bytes) -> str:
    """
    Decompress a previously compressed stream of bytes back into
    its original form.

    Args:
        kompressed: bytes previously produced by compress.

    Returns:
        The decoded text.  Decoding stops at the first all-zero 5-bit
        code or when fewer than five bits remain.

    >>> import binascii
    >>> decompress(binascii.unhexlify(b'a2133da67b0ee859d0'))
    'this is a test'

    >>> decompress(binascii.unhexlify(b'98df4a00'))
    'scott'

    """
    compressed = bitstring.BitArray(kompressed)

    # There are compressed messages that legitimately end with the
    # byte 0x00.  The message "scott" is an example; compressed it is
    # 0x98df4a00.  It's 5 characters long which means there are 5 x 5
    # bits of compressed info (25 bits, just over 3 bytes).  The last
    # (25th) bit in the stream happens to be a zero.  The compress code
    # padded out the compressed message by adding seven more zeros to
    # complete the partial 4th byte.  In the 4th byte, however, one
    # bit is information and seven are padding.
    #
    # It's likely that this API's client code may treat a zero byte as
    # a termination character and not regard it as a legitimate part
    # of the message.  This is a bug in that client code, to be clear.
    #
    # However, it's a bug we can work around:
    #
    # Here, I'm appending an extra 0x00 byte to the compressed message
    # passed in.  If the client code dropped the last 0x00 byte (and,
    # with it, some of the legitimate message bits) by treating it as
    # a termination mark, this 0x00 will replace it (and the missing
    # message bits).  If the client code didn't drop the last 0x00 (or
    # if the compressed message didn't end in 0x00), adding an extra
    # 0x00 is a no op because the codepoint 0b00000 is a "stop" message
    # so we'll ignore the extras.
    compressed.append("uint:8=0")

    # Collect the decoded characters and join once at the end rather
    # than growing a string one character at a time.
    letters = []
    for chunk in compressed.cut(5):
        code = chunk.uint
        if code == 0:
            # Stop code: everything from here on is padding.
            break
        if 1 <= code <= 26:
            letter = chr(code - 1 + ord('a'))
        else:
            # 27..31: indexing the inverse lookup with [0] takes the
            # first character registered for this code.
            letter = special_characters.inverse[code][0]
        letters.append(letter)
    return ''.join(letters)


if __name__ == '__main__':
    import doctest

    doctest.testmod()