Source code for pyutils.compress.letter_compress

#!/usr/bin/env python3

# © Copyright 2021-2023, Scott Gasch

"""
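This is a simple, admittedly toy, compression scheme that uses a custom
alphabet of 32 codes (the letters a-z, five punctuation characters and
a reserved stop code) which can each be represented in five bits
instead of eight.  It therefore reduces the size of data composed of
only those characters by roughly 37% (to 5/8ths of the original size,
plus up to seven bits of padding) without loss.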
"""

import bitstring

from pyutils.collectionz.bidict import BiDict

# Codes for the non-letter characters of the 5-bit alphabet.  compress()
# derives codes 1..26 for 'a'..'z' arithmetically and decompress() treats
# code 0 as a stop marker, so these five entries occupy the remaining
# 5-bit values 27..31.  A BiDict is used because compress() looks up
# character -> code while decompress() goes code -> character through
# `.inverse` (whose entries it indexes with [0]; presumably a list of
# keys per value -- confirm against BiDict).
special_characters = BiDict(
    {
        " ": 27,
        ".": 28,
        ",": 29,
        "-": 30,
        '"': 31,
    }
)


def compress(uncompressed: str) -> bytes:
    """Compress a word sequence into a stream of bytes.  The compressed
    form will be 5/8th the size of the original (rounded up to a whole
    number of bytes).  Words can be lower case letters or
    special_characters (above).

    Each character becomes one 5-bit unsigned code: 'a'..'z' map to
    1..26 and each key of special_characters maps to its stored value.
    The codes are concatenated and the tail is zero-padded to a byte
    boundary.

    Args:
        uncompressed: the uncompressed string to be compressed

    Returns:
        the compressed bytes

    Raises:
        ValueError: uncompressed text contains illegal character

    >>> import binascii
    >>> binascii.hexlify(compress('this is a test'))
    b'a2133da67b0ee859d0'

    >>> binascii.hexlify(compress('scot'))
    b'98df40'

    >>> binascii.hexlify(compress('scott'))  # Note the last byte
    b'98df4a00'

    """
    compressed = bitstring.BitArray()
    for letter in uncompressed:
        if "a" <= letter <= "z":
            # Letters start at 1, not 0: decompress() reads code 0 as
            # "end of message".
            code = ord(letter) - ord("a") + 1  # 1..26
        elif letter in special_characters:
            code = special_characters[letter]
        else:
            raise ValueError(
                f'"{uncompressed}" contains uncompressable char="{letter}"'
            )
        compressed.append(f"uint:5={code}")

    # Zero-fill the final partial byte (0..7 bits) in a single append.
    padding = -len(compressed) % 8
    if padding:
        compressed.append(f"uint:{padding}=0")
    return compressed.bytes


def decompress(compressed: bytes) -> str:
    """
    Decompress a previously compressed stream of bytes back into
    its original form.

    The input is read as consecutive 5-bit unsigned codes.  Codes
    1..26 decode to 'a'..'z', other non-zero codes are looked up in
    special_characters.inverse, and the first zero code ends the
    message.

    Args:
        compressed: the compressed data to decompress

    Returns:
        The decompressed string

    >>> import binascii
    >>> decompress(binascii.unhexlify(b'a2133da67b0ee859d0'))
    'this is a test'

    >>> decompress(binascii.unhexlify(b'98df4a00'))
    'scott'

    """
    bits = bitstring.BitArray(compressed)

    # There are compressed messages that legitimately end with the
    # byte 0x00.  The message "scott" is an example; compressed it is
    # 0x98df4a00.  It's 5 characters long which means there are 5 x 5
    # bits of compressed info (25 bits, just over 3 bytes).  The last
    # (25th) bit in the stream happens to be a zero.  The compress code
    # padded out the compressed message by adding seven more zeros to
    # complete the partial 4th byte.  In the 4th byte, however, one
    # bit is information and seven are padding.
    #
    # It's likely that this API's client code may treat a zero byte as
    # a termination character and not regard it as a legitimate part
    # of the message.  This is a bug in that client code, to be clear.
    #
    # However, it's a bug we can work around by appending an extra
    # 0x00 byte to the compressed message passed in.  If the client
    # code dropped the last 0x00 byte (and, with it, some of the
    # legitimate message bits) by treating it as a termination mark,
    # this 0x00 replaces it (and the missing message bits).  If the
    # client code didn't drop the last 0x00 (or if the compressed
    # message didn't end in 0x00), the extra 0x00 is a no-op because
    # the code 0b00000 means "stop", so the extras are ignored.
    bits.append("uint:8=0")

    # Collect the decoded characters and join once at the end rather
    # than growing a string with += on every iteration.
    letters = []
    for chunk in bits.cut(5):
        code = chunk.uint
        if code == 0:
            break

        if 1 <= code <= 26:
            letters.append(chr(code - 1 + ord("a")))
        else:
            letters.append(special_characters.inverse[code][0])
    return "".join(letters)


if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()