Various changes
[python_utils.git] / letter_compress.py
1 #!/usr/bin/env python3
2
3 import bitstring
4
5 from collect.bidict import bidict
6
# 5-bit codes for the supported non-letter symbols.  Lower case letters are
# encoded as 1..26 by compress(), so the symbols here use 27..31.
special_characters = bidict({' ': 27, '.': 28, ',': 29, '-': 30, '"': 31})
16
17
def compress(uncompressed: str) -> bytes:
    """
    Compress a word sequence into a stream of bytes.

    Every character is packed into 5 bits, so the compressed form is
    roughly 5/8th the size of the original (rounded up to a whole
    number of bytes).  Words can be lower case letters, which are
    encoded as 1..26, or special_characters (above).

    Args:
        uncompressed: the text to pack.

    Returns:
        The packed bytes; the last byte is padded with zero bits.

    Raises:
        ValueError: if uncompressed contains a character that is
            neither a lower case letter nor a key of
            special_characters.

    >>> import binascii
    >>> binascii.hexlify(compress('this is a test'))
    b'a2133da67b0ee859d0'

    >>> binascii.hexlify(compress('scot'))
    b'98df40'

    >>> binascii.hexlify(compress('scott'))
    b'98df4a00'

    """
    compressed = bitstring.BitArray()
    for letter in uncompressed:
        if 'a' <= letter <= 'z':
            bits = ord(letter) - ord('a') + 1   # 1..26
        elif letter in special_characters:
            bits = special_characters[letter]
        else:
            # ValueError is a subclass of Exception, so callers that
            # catch the broader type keep working.
            raise ValueError(f'"{uncompressed}" contains uncompressable char="{letter}"')
        compressed.append(f"uint:5={bits}")

    # .bytes requires a whole number of bytes, so pad the tail with
    # zero bits up to the next byte boundary.
    while len(compressed) % 8 != 0:
        compressed.append("uint:1=0")
    return compressed.bytes
47
48
def decompress(kompressed: bytes) -> str:
    """
    Decompress a previously compressed stream of bytes back into
    its original form.

    The stream is read 5 bits at a time.  Codes 1..26 map to the
    letters 'a'..'z'; any other non-zero code is looked up in
    special_characters.  Decoding stops at the first zero code or
    when the input is exhausted.

    Args:
        kompressed: bytes previously produced by compress().

    Returns:
        The decoded text.

    >>> import binascii
    >>> decompress(binascii.unhexlify(b'a2133da67b0ee859d0'))
    'this is a test'

    >>> decompress(binascii.unhexlify(b'98df4a00'))
    'scott'

    """
    letters = []
    compressed = bitstring.BitArray(kompressed)
    for chunk in compressed.cut(5):
        # cut() also yields a final chunk shorter than 5 bits when the
        # stream length is not a multiple of 5.  That tail can only be
        # byte-alignment padding, never a character code.
        if len(chunk) < 5:
            break
        code = chunk.uint
        if code == 0:
            # A zero code is padding: the text ends here.
            break
        if 1 <= code <= 26:
            letters.append(chr(code - 1 + ord('a')))
        else:
            letters.append(special_characters.inverse[code][0])
    return ''.join(letters)
75
76
if __name__ == '__main__':
    # Running this file as a script executes the docstring examples.
    from doctest import testmod

    testmod()