3 # © Copyright 2021-2023, Scott Gasch
6 This is a simple, honestly, toy compression scheme that uses a custom
7 alphabet of 32 characters which can each be represented in five bits
8 instead of eight. It therefore reduces the size of data composed of
9 only those letters by 37.5% without loss.
14 from pyutils.collectionz.bidict import BiDict
16 special_characters = BiDict(
def compress(uncompressed: str) -> bytes:
    """Pack a string of lower case letters and special_characters into
    five bits per character.

    Each lower case letter is encoded as 1..26 and any other character
    is looked up in special_characters; every code is written as a
    5-bit unsigned integer.  The bit stream is then padded with zero
    bits up to the next byte boundary, so the result is roughly 5/8th
    the size of the input.

    Args:
        uncompressed: the text to be compressed

    Returns:
        The packed bytes

    Raises:
        ValueError: uncompressed contains a character that is neither
            a lower case letter nor a key of special_characters

    >>> import binascii
    >>> binascii.hexlify(compress('this is a test'))
    b'a2133da67b0ee859d0'

    >>> binascii.hexlify(compress('scot'))
    b'98df40'

    >>> binascii.hexlify(compress('scott'))  # Note the last byte
    b'98df4a00'

    """
    first_letter = ord("a")
    packed = bitstring.BitArray()

    for symbol in uncompressed:
        if not "a" <= symbol <= "z":
            # Not a plain letter: it must be one of the special characters.
            if symbol not in special_characters:
                raise ValueError(
                    f'"{uncompressed}" contains uncompressable char="{symbol}"'
                )
            code = special_characters[symbol]
        else:
            code = ord(symbol) - first_letter + 1  # 1..26
        packed.append(f"uint:5={code}")

    # Zero-fill the final partial byte (0..7 bits of padding).
    for _ in range(-len(packed) % 8):
        packed.append("uint:1=0")
    return packed.bytes
def decompress(compressed: bytes) -> str:
    """
    Decompress a previously compressed stream of bytes back into
    its original form.

    The data is read as consecutive 5-bit codes: 1..26 map back to
    'a'..'z', any other non-zero code is looked up in
    special_characters.inverse, and a zero code ends the message.

    Args:
        compressed: the compressed data to decompress

    Returns:
        The decompressed string

    >>> import binascii
    >>> decompress(binascii.unhexlify(b'a2133da67b0ee859d0'))
    'this is a test'

    >>> decompress(binascii.unhexlify(b'98df4a00'))
    'scott'

    """
    bits = bitstring.BitArray(compressed)

    # There are compressed messages that legitimately end with the
    # byte 0x00.  The message "scott" is an example; compressed it is
    # 0x98df4a00.  It's 5 characters long which means there are 5 x 5
    # bits of compressed info (25 bits, just over 3 bytes).  The last
    # (25th) bit in the stream happens to be a zero.  The compress code
    # padded out the compressed message by adding seven more zeros to
    # complete the partial 4th byte.  In the 4th byte, however, one
    # bit is information and seven are padding.
    #
    # It's likely that this API's client code may treat a zero byte as
    # a termination character and not regard it as a legitimate part
    # of the message.  This is a bug in that client code, to be clear.
    #
    # However, it's a bug we can work around:
    #
    # Here, I'm appending an extra 0x00 byte to the compressed message
    # passed in.  If the client code dropped the last 0x00 byte (and,
    # with it, some of the legitimate message bits) by treating it as
    # a termination mark, this 0x00 will replace it (and the missing
    # message bits).  If the client code didn't drop the last 0x00 (or
    # if the compressed message didn't end in 0x00), adding an extra
    # 0x00 is a no op because the codepoint 0b00000 is a "stop" message
    # so we'll ignore the extras.
    bits.append("uint:8=0")

    first_letter = ord("a")
    # Collect the decoded characters and join once at the end instead of
    # growing a string with += for every character.
    letters = []
    for chunk in bits.cut(5):
        code = chunk.uint
        if code == 0:
            # The stop codepoint: everything after it is padding.
            break
        if 1 <= code <= 26:
            letters.append(chr(code - 1 + first_letter))
        else:
            letters.append(special_characters.inverse[code][0])
    return "".join(letters)
128 if __name__ == "__main__":