From: Scott Gasch Date: Mon, 27 Sep 2021 15:47:39 +0000 (-0700) Subject: Compress small text messages by just restricting the alphabet. Adds bidict. X-Git-Url: https://wannabe.guru.org/gitweb/?a=commitdiff_plain;h=37d09d6ac30c8c66477149b7c73139c3e6782468;p=python_utils.git Compress small text messages by just restricting the alphabet. Adds bidict. --- diff --git a/bidict.py b/bidict.py new file mode 100644 index 0000000..5ba3fc3 --- /dev/null +++ b/bidict.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +class bidict(dict): + def __init__(self, *args, **kwargs): + super(bidict, self).__init__(*args, **kwargs) + self.inverse = {} + for key, value in self.items(): + self.inverse.setdefault(value, []).append(key) + + def __setitem__(self, key, value): + if key in self: + self.inverse[self[key]].remove(key) + super(bidict, self).__setitem__(key, value) + self.inverse.setdefault(value, []).append(key) + + def __delitem__(self, key): + self.inverse.setdefault(self[key], []).remove(key) + if self[key] in self.inverse and not self.inverse[self[key]]: + del self.inverse[self[key]] + super(bidict, self).__delitem__(key) diff --git a/letter_compress.py b/letter_compress.py new file mode 100644 index 0000000..01d40f9 --- /dev/null +++ b/letter_compress.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 + +import bitstring + +import bidict + + +special_characters = bidict.bidict( + { + ' ': 27, + '.': 28, + ',': 29, + "-": 30, + '"': 31, + } +) + + +def compress(uncompressed: str) -> bytes: + """ + Compress a word sequence into a stream of bytes. The compressed + form will be 5/8th the size of the original. Words can be lower + case letters or special_characters (above). + + >>> import binascii + >>> binascii.hexlify(compress('this is a test')) + b'99d12d225a06a6494c' + + """ + compressed = bitstring.BitArray() + for (n, letter) in enumerate(uncompressed): + if 'a' <= letter <= 'z': + bits = ord(letter) - ord('a') + 1 # 1..26 + else: + if letter not in special_characters: + raise Exception(f'"{uncompressed}" contains uncompressable char="{letter}"') + bits = special_characters[letter] + compressed.append(f"uint:5={bits}") + while len(compressed) % 8 != 0: + compressed.append("uint:1=0") + return compressed.bytes + + +def decompress(kompressed: bytes) -> str: + """ + Decompress a previously compressed stream of bytes back into + its original form. + + >>> import binascii + >>> decompress(binascii.unhexlify(b'99d12d225a06a6494c')) + 'this is a test' + + """ + decompressed = '' + compressed = bitstring.BitArray(kompressed) + for chunk in compressed.cut(5): + chunk = chunk.uint + if chunk == 0: + break + elif 1 <= chunk <= 26: + letter = chr(chunk - 1 + ord('a')) + else: + letter = special_characters.inverse[chunk][0] + decompressed += letter + return decompressed + + +if __name__ == '__main__': + import doctest + doctest.testmod() diff --git a/tests/letter_compress_test.py b/tests/letter_compress_test.py new file mode 100755 index 0000000..a466277 --- /dev/null +++ b/tests/letter_compress_test.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 + +import random +import math +import unittest + +import bootstrap +import letter_compress +import unittest_utils as uu + + +class TestLetterCompress(unittest.TestCase): + + def test_with_random_strings(self): + alphabet = 'abcdefghijklmnopqrstuvwxyz .,"-' + for n in range(20): + message = "" + for letter in random.choices(alphabet, k=random.randrange(10, 5000)): + message += letter + mlen = len(message) + compressed = letter_compress.compress(message) + clen = len(compressed) + self.assertEqual(math.ceil(mlen * 5.0 / 8.0), clen) + decompressed = letter_compress.decompress(compressed) + self.assertEqual(decompressed, message, f'The bad message string was "{message}"') + + +if __name__ == '__main__': + bootstrap.initialize(unittest.main)()