X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=letter_compress.py;h=6cb6b74e87c54928a5aeb7e184114e81ff877b02;hb=532df2c5b57c7517dfb3dddd8c1358fbadf8baf3;hp=01d40f9ed636275cb2c3727d79e4a61db67f941d;hpb=37d09d6ac30c8c66477149b7c73139c3e6782468;p=python_utils.git diff --git a/letter_compress.py b/letter_compress.py index 01d40f9..6cb6b74 100644 --- a/letter_compress.py +++ b/letter_compress.py @@ -1,11 +1,14 @@ #!/usr/bin/env python3 -import bitstring +# © Copyright 2021-2022, Scott Gasch + +"""A simple compression helper for lowercase ascii text.""" -import bidict +import bitstring +from collect.bidict import BiDict -special_characters = bidict.bidict( +special_characters = BiDict( { ' ': 27, '.': 28, @@ -17,20 +20,25 @@ special_characters = bidict.bidict( def compress(uncompressed: str) -> bytes: - """ - Compress a word sequence into a stream of bytes. The compressed + """Compress a word sequence into a stream of bytes. The compressed form will be 5/8th the size of the original. Words can be lower case letters or special_characters (above). >>> import binascii >>> binascii.hexlify(compress('this is a test')) - b'99d12d225a06a6494c' + b'a2133da67b0ee859d0' + + >>> binascii.hexlify(compress('scot')) + b'98df40' + + >>> binascii.hexlify(compress('scott')) # Note the last byte + b'98df4a00' """ compressed = bitstring.BitArray() - for (n, letter) in enumerate(uncompressed): + for letter in uncompressed: if 'a' <= letter <= 'z': - bits = ord(letter) - ord('a') + 1 # 1..26 + bits = ord(letter) - ord('a') + 1 # 1..26 else: if letter not in special_characters: raise Exception(f'"{uncompressed}" contains uncompressable char="{letter}"') @@ -47,12 +55,41 @@ def decompress(kompressed: bytes) -> str: its original form. 
>>> import binascii - >>> decompress(binascii.unhexlify(b'99d12d225a06a6494c')) + >>> decompress(binascii.unhexlify(b'a2133da67b0ee859d0')) 'this is a test' + >>> decompress(binascii.unhexlify(b'98df4a00')) + 'scott' + """ decompressed = '' compressed = bitstring.BitArray(kompressed) + + # There are compressed messages that legitimately end with the + # byte 0x00. The message "scott" is an example; compressed it is + # 0x98df4a00. It's 5 characters long which means there are 5 x 5 + # bits of compressed info (25 bits, just over 3 bytes). The last + # (25th) bit in the stream happens to be a zero. The compress code + # padded out the compressed message by adding seven more zeros to + # complete the partial 4th byte. In the 4th byte, however, one + # bit is information and seven are padding. + # + # It's likely that this API's client code may treat a zero byte as + # a termination character and not regard it as a legitimate part + # of the message. This is a bug in that client code, to be clear. + # + # However, it's a bug we can work around: + # + # Here, I'm appending an extra 0x00 byte to the compressed message + # passed in. If the client code dropped the last 0x00 byte (and, + # with it, some of the legitimate message bits) by treating it as + # a termination mark, this 0x00 will replace it (and the missing + # message bits). If the client code didn't drop the last 0x00 (or + # if the compressed message didn't end in 0x00), adding an extra + # 0x00 is a no-op because the codepoint 0b00000 is a "stop" message + # so we'll ignore the extras. + compressed.append("uint:8=0") + for chunk in compressed.cut(5): chunk = chunk.uint if chunk == 0: @@ -67,4 +104,5 @@ def decompress(kompressed: bytes) -> str: if __name__ == '__main__': import doctest + doctest.testmod()