src/pyutils/compress/letter_compress.py

   1 #!/usr/bin/env python3
   2
   3 # © Copyright 2021-2023, Scott Gasch
   4
   5 """
   6 This is a simple, honestly, toy compression scheme that uses a custom
   7 alphabet of 32 characters which can each be represented in six bits
   8 instead of eight.  It therefore reduces the size of data composed of
   9 only those letters by 25% without loss.
  10 """
  11
  12 import bitstring
  13
  14 from pyutils.collectionz.bidict import BiDict
  15
  16 special_characters = BiDict(
  17     {
  18         " ": 27,
  19         ".": 28,
  20         ",": 29,
  21         "-": 30,
  22         '"': 31,
  23     }
  24 )
  25
  26
  27 def compress(uncompressed: str) -> bytes:
  28     """Compress a word sequence into a stream of bytes.  The compressed
  29     form will be 5/8th the size of the original.  Words can be lower
  30     case letters or special_characters (above).
  31
  32     Args:
  33         uncompressed: the uncompressed string to be compressed
  34
  35     Returns:
  36         the compressed bytes
  37
  38     Raises:
  39         ValueError: uncompressed text contains illegal character
  40
  41     >>> import binascii
  42     >>> binascii.hexlify(compress('this is a test'))
  43     b'a2133da67b0ee859d0'
  44
  45     >>> binascii.hexlify(compress('scot'))
  46     b'98df40'
  47
  48     >>> binascii.hexlify(compress('scott'))  # Note the last byte
  49     b'98df4a00'
  50
  51     """
  52     compressed = bitstring.BitArray()
  53     for letter in uncompressed:
  54         if "a" <= letter <= "z":
  55             bits = ord(letter) - ord("a") + 1  # 1..26
  56         else:
  57             if letter not in special_characters:
  58                 raise ValueError(
  59                     f'"{uncompressed}" contains uncompressable char="{letter}"'
  60                 )
  61             bits = special_characters[letter]
  62         compressed.append(f"uint:5={bits}")
  63     while len(compressed) % 8 != 0:
  64         compressed.append("uint:1=0")
  65     return compressed.bytes
  66
  67
  68 def decompress(compressed: bytes) -> str:
  69     """
  70     Decompress a previously compressed stream of bytes back into
  71     its original form.
  72
  73     Args:
  74         compressed: the compressed data to decompress
  75
  76     Returns:
  77         The decompressed string
  78
  79     >>> import binascii
  80     >>> decompress(binascii.unhexlify(b'a2133da67b0ee859d0'))
  81     'this is a test'
  82
  83     >>> decompress(binascii.unhexlify(b'98df4a00'))
  84     'scott'
  85
  86     """
  87     decompressed = ""
  88     kompressed = bitstring.BitArray(compressed)
  89
  90     # There are compressed messages that legitimately end with the
  91     # byte 0x00.  The message "scott" is an example; compressed it is
  92     # 0x98df4a00.  It's 5 characters long which means there are 5 x 5
  93     # bits of compressed info (25 bits, just over 3 bytes).  The last
  94     # (25th) bit in the steam happens to be a zero.  The compress code
  95     # padded out the compressed message by adding seven more zeros to
  96     # complete the partial 4th byte.  In the 4th byte, however, one
  97     # bit is information and seven are padding.
  98     #
  99     # It's likely that this API's client code may treat a zero byte as
 100     # a termination character and not regard it as a legitimate part
 101     # of the message.  This is a bug in that client code, to be clear.
 102     #
 103     # However, it's a bug we can work around:
 104     #
 105     # Here, I'm appending an extra 0x00 byte to the compressed message
 106     # passed in.  If the client code dropped the last 0x00 byte (and,
 107     # with it, some of the legitimate message bits) by treating it as
 108     # a termination mark, this 0x00 will replace it (and the missing
 109     # message bits).  If the client code didn't drop the last 0x00 (or
 110     # if the compressed message didn't end in 0x00), adding an extra
 111     # 0x00 is a no op because the codepoint 0b00000 is a "stop" message
 112     # so we'll ignore the extras.
 113     kompressed.append("uint:8=0")
 114
 115     for chunk in kompressed.cut(5):
 116         chunk = chunk.uint
 117         if chunk == 0:
 118             break
 119
 120         if 1 <= chunk <= 26:
 121             letter = chr(chunk - 1 + ord("a"))
 122         else:
 123             letter = special_characters.inverse[chunk][0]
 124         decompressed += letter
 125     return decompressed
 126
 127
 128 if __name__ == "__main__":
 129     import doctest
 130
 131     doctest.testmod()