3 # © Copyright 2021-2022, Scott Gasch
5 """A simple toy compression helper for lowercase ascii text."""
9 from pyutils.collectionz.bidict import BiDict
11 special_characters = BiDict(
def compress(uncompressed: str) -> bytes:
    """Compress a word sequence into a stream of bytes.

    Every character is packed into a 5-bit code, so the compressed
    form is roughly 5/8th the size of the original.  Lowercase letters
    ``'a'``..``'z'`` are encoded as 1..26; any other character must be
    a key of the module-level ``special_characters`` mapping, whose
    value is used as the code.  The resulting bit stream is zero-padded
    up to a whole number of bytes.

    Args:
        uncompressed: the text to compress.

    Returns:
        The packed 5-bit codes as bytes, zero-padded to a byte boundary.
        An empty input yields empty output.

    Raises:
        ValueError: if ``uncompressed`` contains a character that is
            neither a lowercase letter nor a key of
            ``special_characters``.

    >>> import binascii
    >>> binascii.hexlify(compress('this is a test'))
    b'a2133da67b0ee859d0'

    >>> binascii.hexlify(compress('scot'))
    b'98df40'

    >>> binascii.hexlify(compress('scott'))  # Note the last byte
    b'98df4a00'

    """
    first_letter = ord('a')
    compressed = bitstring.BitArray()
    for letter in uncompressed:
        if 'a' <= letter <= 'z':
            bits = ord(letter) - first_letter + 1  # 1..26
        elif letter in special_characters:
            bits = special_characters[letter]
        else:
            raise ValueError(
                f'"{uncompressed}" contains uncompressable char="{letter}"'
            )
        compressed.append(f"uint:5={bits}")

    # Zero-fill whatever is left of the final partial byte in one append
    # instead of adding one bit at a time.
    padding = -len(compressed) % 8
    if padding:
        compressed.append(f"uint:{padding}=0")
    return compressed.bytes
def decompress(kompressed: bytes) -> str:
    """
    Decompress a previously compressed stream of bytes back into
    its original form.

    The input is read as consecutive 5-bit codes.  A code of zero ends
    the message; codes 1..26 are the letters ``'a'``..``'z'``; any other
    code is looked up in ``special_characters.inverse``.

    Args:
        kompressed: bytes previously produced by compress().

    Returns:
        The decoded text.

    >>> import binascii
    >>> decompress(binascii.unhexlify(b'a2133da67b0ee859d0'))
    'this is a test'

    >>> decompress(binascii.unhexlify(b'98df4a00'))
    'scott'

    """
    compressed = bitstring.BitArray(kompressed)

    # There are compressed messages that legitimately end with the
    # byte 0x00.  The message "scott" is an example; compressed it is
    # 0x98df4a00.  It's 5 characters long which means there are 5 x 5
    # bits of compressed info (25 bits, just over 3 bytes).  The last
    # (25th) bit in the stream happens to be a zero.  The compress code
    # padded out the compressed message by adding seven more zeros to
    # complete the partial 4th byte.  In the 4th byte, however, one
    # bit is information and seven are padding.
    #
    # It's likely that this API's client code may treat a zero byte as
    # a termination character and not regard it as a legitimate part
    # of the message.  This is a bug in that client code, to be clear.
    #
    # However, it's a bug we can work around:
    #
    # Append an extra 0x00 byte to the compressed message passed in.
    # If the client code dropped the last 0x00 byte (and, with it, some
    # of the legitimate message bits) by treating it as a termination
    # mark, this 0x00 will replace it (and the missing message bits).
    # If the client code didn't drop the last 0x00 (or if the compressed
    # message didn't end in 0x00), adding an extra 0x00 is a no-op
    # because the codepoint 0b00000 is a "stop" message so we'll ignore
    # the extras.
    compressed.append("uint:8=0")

    # Collect the decoded characters and join once at the end rather
    # than growing a string with repeated +=.
    letters = []
    for chunk in compressed.cut(5):
        code = chunk.uint
        if code == 0:
            # Stop code: everything after it is padding.
            break
        if 1 <= code <= 26:
            letters.append(chr(code - 1 + ord('a')))
        else:
            # The inverse lookup is indexed with [0]: take its first entry.
            letters.append(special_characters.inverse[code][0])
    return "".join(letters)
107 if __name__ == '__main__':