From d82c8377ce394cad812dc0d53829f7465b3f3f4e Mon Sep 17 00:00:00 2001 From: Scott Gasch Date: Wed, 29 Sep 2021 09:03:57 -0700 Subject: [PATCH] Workaround likely client bug in letter_compress. Update tests in bst. Add pip_install.sh. --- .gitignore | 1 - collect/bst.py | 4 ++-- letter_compress.py | 32 ++++++++++++++++++++++++++++---- pip_install.sh | 11 +++++++++++ 4 files changed, 41 insertions(+), 7 deletions(-) create mode 100755 pip_install.sh diff --git a/.gitignore b/.gitignore index 4ea5b14..28e68dd 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,3 @@ dateparse/dateparse_utilsLexer.tokens dateparse/dateparse_utilsListener.py dateparse/dateparse_utilsParser.py dateparse/duration_utils.g4 -pip_install.sh diff --git a/collect/bst.py b/collect/bst.py index 94570f4..8e95fa2 100644 --- a/collect/bst.py +++ b/collect/bst.py @@ -120,8 +120,8 @@ class BinaryTree(object): 75 85 - >>> t.__delitem__(22) - True + >>> del t[22] # Note: bool result is discarded + >>> for value in t.iterate_inorder(): ... print(value) 13 diff --git a/letter_compress.py b/letter_compress.py index 378ecbc..d5a4d60 100644 --- a/letter_compress.py +++ b/letter_compress.py @@ -16,8 +16,7 @@ special_characters = bidict( def compress(uncompressed: str) -> bytes: - """ - Compress a word sequence into a stream of bytes. The compressed + """Compress a word sequence into a stream of bytes. The compressed form will be 5/8th the size of the original. Words can be lower case letters or special_characters (above). @@ -28,7 +27,7 @@ def compress(uncompressed: str) -> bytes: >>> binascii.hexlify(compress('scot')) b'98df40' - >>> binascii.hexlify(compress('scott')) + >>> binascii.hexlify(compress('scott')) # Note the last byte b'98df4a00' """ @@ -61,9 +60,34 @@ def decompress(kompressed: bytes) -> str: """ decompressed = '' compressed = bitstring.BitArray(kompressed) + + # There are compressed messages that legitimately end with the + # byte 0x00. 
The message "scott" is an example; compressed it is + # 0x98df4a00. It's 5 characters long which means there are 5 x 5 + # bits of compressed info (25 bits, just over 3 bytes). The last + # (25th) bit in the stream happens to be a zero. The compress code + # padded out the compressed message by adding seven more zeros to + # complete the partial 4th byte. In the 4th byte, however, one + # bit is information and seven are padding. + # + # It's likely that this API's client code will treat a zero byte as + # a termination character and not regard it as part of the + # message. This is a bug in the client code. + # + # However, it's a bug we can work around: + + # Here, I'm appending an extra 0x00 byte to the compressed message + # passed in. If the client code dropped the last 0x00 byte (and, + # with it, some of the legitimate message bits) by treating it as + # a termination mark, this 0x00 will replace it (and the missing + # message bits). If the client code didn't drop the last 0x00 (or + # if the compressed message didn't end in 0x00), adding an extra + # 0x00 is a no-op because the codepoint 0b00000 is a "stop" message + # so we'll ignore the extras. + compressed.append("uint:8=0") + for chunk in compressed.cut(5): chunk = chunk.uint - print(f'0x{chunk:x}') if chunk == 0: break elif 1 <= chunk <= 26: diff --git a/pip_install.sh b/pip_install.sh new file mode 100755 index 0000000..9d40902 --- /dev/null +++ b/pip_install.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -e + +python3 -m ensurepip --upgrade +for x in pip wheel aiohttp antlr4-python3-runtime astral bitstring python-dateutil \ + grpcio holidays cloudpickle dill numpy protobuf psutil pyserial pytype \ + pychromecast requests SpeechRecognition sklearn scikit-learn nltk; do + echo "--- Installing ${x} ---" + pip install -U ${x} +done -- 2.47.1