# Source code for pyutils.compress.letter_compress

#!/usr/bin/env python3

# © Copyright 2021-2023, Scott Gasch

"""
This is a simple, honestly, toy compression scheme that uses a custom
alphabet of 32 characters which can each be represented in five bits
instead of eight.  It therefore reduces the size of data composed of
only those letters by 37.5% (to 5/8 of the original) without loss.
"""

import bitstring

from pyutils.collectionz.bidict import BiDict

# Punctuation supported by the 5-bit alphabet, keyed by character:
#   " " -> 27, "." -> 28, "," -> 29, "-" -> 30, '"' -> 31
# Code points 1..26 are the letters a..z and 0 is treated as a stop
# marker by decompress().  The table is a BiDict because compress()
# looks entries up by character while decompress() looks them up by
# code point (via ``.inverse``).
special_characters = BiDict(
    {symbol: code for code, symbol in enumerate(' .,-"', start=27)}
)


def compress(uncompressed: str) -> bytes:
    """Pack a string of lower case letters and punctuation into bytes.

    Every input character is encoded as one 5-bit code point: 1..26
    for "a".."z", or the value listed in special_characters for the
    supported punctuation.  The bit stream is then zero-padded up to a
    whole number of bytes, so the output is roughly 5/8th the size of
    the input.

    Args:
        uncompressed: the uncompressed string to be compressed

    Returns:
        the compressed bytes

    Raises:
        ValueError: uncompressed text contains illegal character

    >>> import binascii
    >>> binascii.hexlify(compress('this is a test'))
    b'a2133da67b0ee859d0'

    >>> binascii.hexlify(compress('scot'))
    b'98df40'

    >>> binascii.hexlify(compress('scott'))     # Note the last byte
    b'98df4a00'
    """
    packed = bitstring.BitArray()
    letter_base = ord("a") - 1  # so that "a" -> 1 ... "z" -> 26

    for symbol in uncompressed:
        if "a" <= symbol <= "z":
            code = ord(symbol) - letter_base
        elif symbol in special_characters:
            code = special_characters[symbol]
        else:
            raise ValueError(
                f'"{uncompressed}" contains uncompressable char="{symbol}"'
            )
        packed.append(f"uint:5={code}")

    # Zero-fill the final partial byte (if any) so .bytes is well defined.
    padding = -len(packed) % 8
    if padding:
        packed.append(f"uint:{padding}=0")
    return packed.bytes


def decompress(compressed: bytes) -> str:
    """
    Decompress a previously compressed stream of bytes back into its
    original form.

    The input is read five bits at a time.  Code points 1..26 decode
    to "a".."z", 27..31 decode via special_characters, and code point
    0 (which compress never emits for a character) ends decoding.

    Args:
        compressed: the compressed data to decompress

    Returns:
        The decompressed string

    >>> import binascii
    >>> decompress(binascii.unhexlify(b'a2133da67b0ee859d0'))
    'this is a test'

    >>> decompress(binascii.unhexlify(b'98df4a00'))
    'scott'
    """
    bits = bitstring.BitArray(compressed)

    # There are compressed messages that legitimately end with the
    # byte 0x00.  The message "scott" is an example; compressed it is
    # 0x98df4a00.  It's 5 characters long which means there are 5 x 5
    # bits of compressed info (25 bits, just over 3 bytes).  The last
    # (25th) bit in the stream happens to be a zero.  The compress code
    # padded out the compressed message by adding seven more zeros to
    # complete the partial 4th byte.  In the 4th byte, however, one
    # bit is information and seven are padding.
    #
    # It's likely that this API's client code may treat a zero byte as
    # a termination character and not regard it as a legitimate part
    # of the message.  This is a bug in that client code, to be clear.
    #
    # However, it's a bug we can work around:
    #
    # Here, I'm appending an extra 0x00 byte to the compressed message
    # passed in.  If the client code dropped the last 0x00 byte (and,
    # with it, some of the legitimate message bits) by treating it as
    # a termination mark, this 0x00 will replace it (and the missing
    # message bits).  If the client code didn't drop the last 0x00 (or
    # if the compressed message didn't end in 0x00), adding an extra
    # 0x00 is a no op because the codepoint 0b00000 is a "stop" message
    # so we'll ignore the extras.
    bits.append("uint:8=0")

    # Collect the decoded characters and join once at the end instead
    # of growing a string with += on every iteration.
    letters = []
    for chunk in bits.cut(5):
        code = chunk.uint
        if code == 0:
            # Stop marker / padding: nothing meaningful follows.
            break
        if 1 <= code <= 26:
            letters.append(chr(code - 1 + ord("a")))
        else:
            letters.append(special_characters.inverse[code][0])
    return "".join(letters)


if __name__ == "__main__":
    # When run as a script, execute the doctests embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()