Compress small text messages by just restricting the alphabet. Adds bidict.
authorScott Gasch <[email protected]>
Mon, 27 Sep 2021 15:47:39 +0000 (08:47 -0700)
committerScott Gasch <[email protected]>
Mon, 27 Sep 2021 15:47:39 +0000 (08:47 -0700)
bidict.py [new file with mode: 0644]
letter_compress.py [new file with mode: 0644]
tests/letter_compress_test.py [new file with mode: 0755]

diff --git a/bidict.py b/bidict.py
new file mode 100644 (file)
index 0000000..5ba3fc3
--- /dev/null
+++ b/bidict.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+
+class bidict(dict):
+    def __init__(self, *args, **kwargs):
+        super(bidict, self).__init__(*args, **kwargs)
+        self.inverse = {}
+        for key, value in self.items():
+            self.inverse.setdefault(value, []).append(key)
+
+    def __setitem__(self, key, value):
+        if key in self:
+            self.inverse[self[key]].remove(key)
+        super(bidict, self).__setitem__(key, value)
+        self.inverse.setdefault(value, []).append(key)
+
+    def __delitem__(self, key):
+        self.inverse.setdefault(self[key], []).remove(key)
+        if self[key] in self.inverse and not self.inverse[self[key]]:
+            del self.inverse[self[key]]
+        super(bidict, self).__delitem__(key)
diff --git a/letter_compress.py b/letter_compress.py
new file mode 100644 (file)
index 0000000..01d40f9
--- /dev/null
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+
+import bitstring
+
+import bidict
+
+
+special_characters = bidict.bidict(
+    {
+        ' ': 27,
+        '.': 28,
+        ',': 29,
+        "-": 30,
+        '"': 31,
+    }
+)
+
+
+def compress(uncompressed: str) -> bytes:
+    """
+    Compress a word sequence into a stream of bytes.  The compressed
+    form will be 5/8th the size of the original.  Words can be lower
+    case letters or special_characters (above).
+
+    >>> import binascii
+    >>> binascii.hexlify(compress('this is a test'))
+    b'99d12d225a06a6494c'
+
+    """
+    compressed = bitstring.BitArray()
+    for (n, letter) in enumerate(uncompressed):
+        if 'a' <= letter <= 'z':
+            bits = ord(letter) - ord('a') + 1   # 1..26
+        else:
+            if letter not in special_characters:
+                raise Exception(f'"{uncompressed}" contains uncompressable char="{letter}"')
+            bits = special_characters[letter]
+        compressed.append(f"uint:5={bits}")
+    while len(compressed) % 8 != 0:
+        compressed.append("uint:1=0")
+    return compressed.bytes
+
+
+def decompress(kompressed: bytes) -> str:
+    """
+    Decompress a previously compressed stream of bytes back into
+    its original form.
+
+    >>> import binascii
+    >>> decompress(binascii.unhexlify(b'99d12d225a06a6494c'))
+    'this is a test'
+
+    """
+    decompressed = ''
+    compressed = bitstring.BitArray(kompressed)
+    for chunk in compressed.cut(5):
+        chunk = chunk.uint
+        if chunk == 0:
+            break
+        elif 1 <= chunk <= 26:
+            letter = chr(chunk - 1 + ord('a'))
+        else:
+            letter = special_characters.inverse[chunk][0]
+        decompressed += letter
+    return decompressed
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
diff --git a/tests/letter_compress_test.py b/tests/letter_compress_test.py
new file mode 100755 (executable)
index 0000000..a466277
--- /dev/null
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+import random
+import math
+import unittest
+
+import bootstrap
+import letter_compress
+import unittest_utils as uu
+
+
+class TestLetterCompress(unittest.TestCase):
+
+    def test_with_random_strings(self):
+        alphabet = 'abcdefghijklmnopqrstuvwxyz .,"-'
+        for n in range(20):
+            message = ""
+            for letter in random.choices(alphabet, k=random.randrange(10, 5000)):
+                message += letter
+            mlen = len(message)
+            compressed = letter_compress.compress(message)
+            clen = len(compressed)
+            self.assertEqual(math.ceil(mlen * 5.0 / 8.0), clen)
+            decompressed = letter_compress.decompress(compressed)
+            self.assertEqual(decompressed, message, f'The bad message string was "{message}"')
+
+
+if __name__ == '__main__':
+    bootstrap.initialize(unittest.main)()