3 # © Copyright 2021-2023, Scott Gasch
6 This is a simple, honestly, toy compression scheme that uses a custom
7 alphabet of 32 characters which can each be represented in five bits
8 instead of eight. It therefore reduces the size of data composed of
9 only those letters by 37.5% (to 5/8 of the original) without loss.
14 from pyutils.collectionz.bidict import BiDict
# Two-way mapping between each non-letter symbol and its 5-bit code:
# compress() looks a symbol up to get its code, and decompress() goes
# through .inverse to turn a code back into a symbol.
16 special_characters = BiDict(
27 def compress(uncompressed: str) -> bytes:
28 """Compress a word sequence into a stream of bytes. The compressed
29 form will be 5/8th the size of the original. Words can be lower
30 case letters or special_characters (above).
33 uncompressed: the uncompressed string to be compressed
39 >>> binascii.hexlify(compress('this is a test'))
42 >>> binascii.hexlify(compress('scot'))
45 >>> binascii.hexlify(compress('scott')) # Note the last byte
49 compressed = bitstring.BitArray()
50 for letter in uncompressed:
51 if 'a' <= letter <= 'z':
52 bits = ord(letter) - ord('a') + 1 # 1..26
# Not a lower-case letter: it must be a key of special_characters,
# otherwise the input cannot be encoded and is rejected.
54 if letter not in special_characters:
56 f'"{uncompressed}" contains uncompressable char="{letter}"'
58 bits = special_characters[letter]
# Every symbol is written as a 5-bit unsigned integer.
59 compressed.append(f"uint:5={bits}")
# Pad with zero bits up to a whole number of bytes so the bit array
# can be returned as bytes.
60 while len(compressed) % 8 != 0:
61 compressed.append("uint:1=0")
62 return compressed.bytes
65 def decompress(compressed: bytes) -> str:
67 Decompress a previously compressed stream of bytes back into
71 compressed: the compressed data to decompress
74 The decompressed string
77 >>> decompress(binascii.unhexlify(b'a2133da67b0ee859d0'))
80 >>> decompress(binascii.unhexlify(b'98df4a00'))
85 kompressed = bitstring.BitArray(compressed)
87 # There are compressed messages that legitimately end with the
88 # byte 0x00. The message "scott" is an example; compressed it is
89 # 0x98df4a00. It's 5 characters long which means there are 5 x 5
90 # bits of compressed info (25 bits, just over 3 bytes). The last
91 # (25th) bit in the stream happens to be a zero. The compress code
92 # padded out the compressed message by adding seven more zeros to
93 # complete the partial 4th byte. In the 4th byte, however, one
94 # bit is information and seven are padding.
96 # It's likely that this API's client code may treat a zero byte as
97 # a termination character and not regard it as a legitimate part
98 # of the message. This is a bug in that client code, to be clear.
100 # However, it's a bug we can work around:
102 # Here, I'm appending an extra 0x00 byte to the compressed message
103 # passed in. If the client code dropped the last 0x00 byte (and,
104 # with it, some of the legitimate message bits) by treating it as
105 # a termination mark, this 0x00 will replace it (and the missing
106 # message bits). If the client code didn't drop the last 0x00 (or
107 # if the compressed message didn't end in 0x00), adding an extra
108 # 0x00 is a no op because the codepoint 0b00000 is a "stop" message
109 # so we'll ignore the extras.
110 kompressed.append("uint:8=0")
# Walk the bit stream five bits at a time; each 5-bit chunk is one code.
112 for chunk in kompressed.cut(5):
# Codes 1..26 are the letters 'a'..'z' (the inverse of compress()).
116 elif 1 <= chunk <= 26:
117 letter = chr(chunk - 1 + ord('a'))
# Any other code is resolved through the reverse mapping of
# special_characters. NOTE(review): the [0] suggests .inverse yields a
# sequence of keys per code -- confirm against BiDict.
119 letter = special_characters.inverse[chunk][0]
120 decompressed += letter
124 if __name__ == '__main__':