X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=string_utils.py;h=d75c6ba1aca2c559ed4254d535747c54f4719bf5;hb=e8fbbb7306430478dec55d2c963eed116d8330cc;hp=45cf5aab7ac7f5202346745de733c792c984214d;hpb=b843703134a166013518c707fa5a77373f1bf0bf;p=python_utils.git diff --git a/string_utils.py b/string_utils.py index 45cf5aa..d75c6ba 100644 --- a/string_utils.py +++ b/string_utils.py @@ -1,18 +1,50 @@ #!/usr/bin/env python3 +# -*- coding: utf-8 -*- -import contextlib +"""The MIT License (MIT) + +Copyright (c) 2016-2020 Davide Zanotti +Modifications Copyright (c) 2021-2022 Scott Gasch + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +This class is based on: https://github.com/daveoncode/python-string-utils. 
+""" + +import base64 +import contextlib # type: ignore import datetime import io -from itertools import zip_longest import json import logging +import numbers import random import re import string -from typing import Any, Callable, Iterable, List, Optional import unicodedata +import warnings +from itertools import zip_longest +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple from uuid import uuid4 +import list_utils + logger = logging.getLogger(__name__) NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$") @@ -47,19 +79,13 @@ EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING)) EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING)) -CAMEL_CASE_TEST_RE = re.compile( - r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$" -) +CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$") CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])") -SNAKE_CASE_TEST_RE = re.compile( - r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE -) +SNAKE_CASE_TEST_RE = re.compile(r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE) -SNAKE_CASE_TEST_DASH_RE = re.compile( - r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE -) +SNAKE_CASE_TEST_DASH_RE = re.compile(r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE) SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])") @@ -74,13 +100,9 @@ CREDIT_CARDS = { "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"), } -JSON_WRAPPER_RE = re.compile( - r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL -) +JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL) -UUID_RE = re.compile( - r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE -) +UUID_RE = re.compile(r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE) UUID_HEX_OK_RE = re.compile( r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$", @@ -89,15 +111,17 @@ 
UUID_HEX_OK_RE = re.compile( SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$") +ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}") + IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE) -MAC_ADDRESS_RE = re.compile( - r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE -) +ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE) -WORDS_COUNT_RE = re.compile( - r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE -) +MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE) + +ANYWHERE_MAC_ADDRESS_RE = re.compile(r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE) + +WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE) HTML_RE = re.compile( r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?())?||)", @@ -111,50 +135,116 @@ HTML_TAG_ONLY_RE = re.compile( SPACES_RE = re.compile(r"\s") -NO_LETTERS_OR_NUMBERS_RE = re.compile( - r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE -) +NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE) MARGIN_RE = re.compile(r"^[^\S\r\n]+") ESCAPE_SEQUENCE_RE = re.compile(r"\[[^A-Za-z]*[A-Za-z]") NUM_SUFFIXES = { - "Pb": (1024 ** 5), - "P": (1024 ** 5), - "Tb": (1024 ** 4), - "T": (1024 ** 4), - "Gb": (1024 ** 3), - "G": (1024 ** 3), - "Mb": (1024 ** 2), - "M": (1024 ** 2), - "Kb": (1024 ** 1), - "K": (1024 ** 1), + "Pb": (1024**5), + "P": (1024**5), + "Tb": (1024**4), + "T": (1024**4), + "Gb": (1024**3), + "G": (1024**3), + "Mb": (1024**2), + "M": (1024**2), + "Kb": (1024**1), + "K": (1024**1), } def is_none_or_empty(in_str: Optional[str]) -> bool: + """ + Returns true if the input string is either None or an empty string. 
+ + >>> is_none_or_empty("") + True + >>> is_none_or_empty(None) + True + >>> is_none_or_empty(" \t ") + True + >>> is_none_or_empty('Test') + False + """ return in_str is None or len(in_str.strip()) == 0 def is_string(obj: Any) -> bool: """ Checks if an object is a string. + + >>> is_string('test') + True + >>> is_string(123) + False + >>> is_string(100.3) + False + >>> is_string([1, 2, 3]) + False """ return isinstance(obj, str) def is_empty_string(in_str: Any) -> bool: + return is_empty(in_str) + + +def is_empty(in_str: Any) -> bool: + """ + Checks if input is a string and empty or only whitespace. + + >>> is_empty('') + True + >>> is_empty(' \t\t ') + True + >>> is_empty('test') + False + >>> is_empty(100.88) + False + >>> is_empty([1, 2, 3]) + False + """ return is_string(in_str) and in_str.strip() == "" def is_full_string(in_str: Any) -> bool: + """ + Checks that input is a string and is not empty ('') or only whitespace. + + >>> is_full_string('test!') + True + >>> is_full_string('') + False + >>> is_full_string(' ') + False + >>> is_full_string(100.999) + False + >>> is_full_string({"a": 1, "b": 2}) + False + """ return is_string(in_str) and in_str.strip() != "" def is_number(in_str: str) -> bool: """ Checks if a string is a valid number. + + >>> is_number(100.5) + Traceback (most recent call last): + ... + ValueError: 100.5 + >>> is_number("100.5") + True + >>> is_number("test") + False + >>> is_number("99") + True + >>> is_number([1, 2, 3]) + Traceback (most recent call last): + ... + ValueError: [1, 2, 3] """ if not is_string(in_str): raise ValueError(in_str) @@ -167,38 +257,103 @@ def is_integer_number(in_str: str) -> bool: An integer may be signed or unsigned or use a "scientific notation". - *Examples:* - - >>> is_integer('42') # returns true - >>> is_integer('42.0') # returns false + >>> is_integer_number('42') + True + >>> is_integer_number('42.0') + False """ return ( - (is_number(in_str) and "." 
not in in_str) or - is_hexidecimal_integer_number(in_str) or - is_octal_integer_number(in_str) or - is_binary_integer_number(in_str) + (is_number(in_str) and "." not in in_str) + or is_hexidecimal_integer_number(in_str) + or is_octal_integer_number(in_str) + or is_binary_integer_number(in_str) ) def is_hexidecimal_integer_number(in_str: str) -> bool: + """ + Checks whether a string is a hex integer number. + + >>> is_hexidecimal_integer_number('0x12345') + True + >>> is_hexidecimal_integer_number('0x1A3E') + True + >>> is_hexidecimal_integer_number('1234') # Needs 0x + False + >>> is_hexidecimal_integer_number('-0xff') + True + >>> is_hexidecimal_integer_number('test') + False + >>> is_hexidecimal_integer_number(12345) # Not a string + Traceback (most recent call last): + ... + ValueError: 12345 + >>> is_hexidecimal_integer_number(101.4) + Traceback (most recent call last): + ... + ValueError: 101.4 + >>> is_hexidecimal_integer_number(0x1A3E) + Traceback (most recent call last): + ... + ValueError: 6718 + """ if not is_string(in_str): raise ValueError(in_str) return HEX_NUMBER_RE.match(in_str) is not None def is_octal_integer_number(in_str: str) -> bool: + """ + Checks whether a string is an octal number. + + >>> is_octal_integer_number('0o777') + True + >>> is_octal_integer_number('-0O115') + True + >>> is_octal_integer_number('0xFF') # Not octal, needs 0o + False + >>> is_octal_integer_number('7777') # Needs 0o + False + >>> is_octal_integer_number('test') + False + """ if not is_string(in_str): raise ValueError(in_str) return OCT_NUMBER_RE.match(in_str) is not None def is_binary_integer_number(in_str: str) -> bool: + """ + Returns whether a string contains a binary number. 
+ + >>> is_binary_integer_number('0b10111') + True + >>> is_binary_integer_number('-0b111') + True + >>> is_binary_integer_number('0B10101') + True + >>> is_binary_integer_number('0b10102') + False + >>> is_binary_integer_number('0xFFF') + False + >>> is_binary_integer_number('test') + False + """ if not is_string(in_str): raise ValueError(in_str) return BIN_NUMBER_RE.match(in_str) is not None def to_int(in_str: str) -> int: + """Returns the integral value of the string or raises on error. + + >>> to_int('1234') + 1234 + >>> to_int('test') + Traceback (most recent call last): + ... + ValueError: invalid literal for int() with base 10: 'test' + """ if not is_string(in_str): raise ValueError(in_str) if is_binary_integer_number(in_str): @@ -216,41 +371,54 @@ def is_decimal_number(in_str: str) -> bool: A decimal may be signed or unsigned or use a "scientific notation". - >>> is_decimal('42.0') # returns true - >>> is_decimal('42') # returns false + >>> is_decimal_number('42.0') + True + >>> is_decimal_number('42') + False """ return is_number(in_str) and "." in in_str def strip_escape_sequences(in_str: str) -> str: + """ + Remove escape sequences in the input string. + + >>> strip_escape_sequences('this is a test!') + 'this is a test!' + """ in_str = ESCAPE_SEQUENCE_RE.sub("", in_str) return in_str -def add_thousands_separator( - in_str: str, - *, - separator_char = ',', - places = 3 -) -> str: - if isinstance(in_str, int): +def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str: + """ + Add thousands separator to a numeric string. Also handles numbers. + + >>> add_thousands_separator('12345678') + '12,345,678' + >>> add_thousands_separator(12345678) + '12,345,678' + >>> add_thousands_separator(12345678.99) + '12,345,678.99' + >>> add_thousands_separator('test') + Traceback (most recent call last): + ... 
+ ValueError: test + + """ + if isinstance(in_str, numbers.Number): in_str = f'{in_str}' if is_number(in_str): - return _add_thousands_separator( - in_str, - separator_char = separator_char, - places = places - ) + return _add_thousands_separator(in_str, separator_char=separator_char, places=places) raise ValueError(in_str) -def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str: +def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str: decimal_part = "" if '.' in in_str: (in_str, decimal_part) = in_str.split('.') tmp = [iter(in_str[::-1])] * places - ret = separator_char.join( - "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1] + ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1] if len(decimal_part) > 0: ret += '.' ret += decimal_part @@ -263,11 +431,12 @@ def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool: """ Check if a string is a valid url. - *Examples:* - - >>> is_url('http://www.mysite.com') # returns true - >>> is_url('https://mysite.com') # returns true - >>> is_url('.mysite.com') # returns false + >>> is_url('http://www.mysite.com') + True + >>> is_url('https://mysite.com') + True + >>> is_url('.mysite.com') + False """ if not is_full_string(in_str): return False @@ -285,16 +454,12 @@ def is_email(in_str: Any) -> bool: Reference: https://tools.ietf.org/html/rfc3696#section-3 - *Examples:* - - >>> is_email('my.email@the-provider.com') # returns true - >>> is_email('@gmail.com') # returns false + >>> is_email('my.email@the-provider.com') + True + >>> is_email('@gmail.com') + False """ - if ( - not is_full_string(in_str) - or len(in_str) > 320 - or in_str.startswith(".") - ): + if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."): return False try: @@ -304,12 +469,7 @@ def is_email(in_str: Any) -> bool: # head's size must be <= 64, tail <= 255, head must not start # with a dot or contain multiple consecutive 
dots. - if ( - len(head) > 64 - or len(tail) > 255 - or head.endswith(".") - or (".." in head) - ): + if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head): return False # removes escaped spaces, so that later on the test regex will @@ -331,6 +491,11 @@ def is_email(in_str: Any) -> bool: def suffix_string_to_number(in_str: str) -> Optional[int]: """Take a string like "33Gb" and convert it into a number (of bytes) like 34603008. Return None if the input string is not valid. + + >>> suffix_string_to_number('1Mb') + 1048576 + >>> suffix_string_to_number('13.1Gb') + 14066017894 """ def suffix_capitalize(s: str) -> str: @@ -352,13 +517,21 @@ def suffix_string_to_number(in_str: str) -> Optional[int]: if multiplier is not None: r = rest[x] if is_integer_number(r): - return int(r) * multiplier + return to_int(r) * multiplier + if is_decimal_number(r): + return int(float(r) * multiplier) return None def number_to_suffix_string(num: int) -> Optional[str]: """Take a number (of bytes) and returns a string like "43.8Gb". Returns none if the input is invalid. 
+ + >>> number_to_suffix_string(14066017894) + '13.1Gb' + >>> number_to_suffix_string(1024 * 1024) + '1.0Mb' + """ d = 0.0 suffix = None @@ -413,9 +586,7 @@ def is_camel_case(in_str: Any) -> bool: - it contains both lowercase and uppercase letters - it does not start with a number """ - return ( - is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None - ) + return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None def is_snake_case(in_str: Any, *, separator: str = "_") -> bool: @@ -427,17 +598,23 @@ def is_snake_case(in_str: Any, *, separator: str = "_") -> bool: - it's composed only by lowercase/uppercase letters and digits - it contains at least one underscore (or provided separator) - it does not start with a number + + >>> is_snake_case('this_is_a_test') + True + >>> is_snake_case('___This_Is_A_Test_1_2_3___') + True + >>> is_snake_case('this-is-a-test') + False + >>> is_snake_case('this-is-a-test', separator='-') + True + """ if is_full_string(in_str): re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE} - re_template = ( - r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)" - ) + re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)" r = re_map.get( separator, - re.compile( - re_template.format(sign=re.escape(separator)), re.IGNORECASE - ), + re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE), ) return r.match(in_str) is not None return False @@ -447,11 +624,12 @@ def is_json(in_str: Any) -> bool: """ Check if a string is a valid json. 
- *Examples:* - - >>> is_json('{"name": "Peter"}') # returns true - >>> is_json('[1, 2, 3]') # returns true - >>> is_json('{nope}') # returns false + >>> is_json('{"name": "Peter"}') + True + >>> is_json('[1, 2, 3]') + True + >>> is_json('{nope}') + False """ if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None: try: @@ -465,11 +643,12 @@ def is_uuid(in_str: Any, allow_hex: bool = False) -> bool: """ Check if a string is a valid UUID. - *Example:* - - >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf') # returns true - >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf') # returns false - >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True) # returns true + >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf') + True + >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf') + False + >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True) + True """ # string casting is used to allow UUID itself as input data type s = str(in_str) @@ -482,11 +661,12 @@ def is_ip_v4(in_str: Any) -> bool: """ Checks if a string is a valid ip v4. - *Examples:* - - >>> is_ip_v4('255.200.100.75') # returns true - >>> is_ip_v4('nope') # returns false (not an ip) - >>> is_ip_v4('255.200.100.999') # returns false (999 is out of range) + >>> is_ip_v4('255.200.100.75') + True + >>> is_ip_v4('nope') + False + >>> is_ip_v4('255.200.100.999') # 999 out of range + False """ if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None: return False @@ -501,11 +681,14 @@ def is_ip_v4(in_str: Any) -> bool: def extract_ip_v4(in_str: Any) -> Optional[str]: """ Extracts the IPv4 chunk of a string or None. 
+ + >>> extract_ip_v4(' The secret IP address: 127.0.0.1 (use it wisely) ') + '127.0.0.1' + >>> extract_ip_v4('Your mom dresses you funny.') """ if not is_full_string(in_str): return None - in_str.strip() - m = SHALLOW_IP_V4_RE.match(in_str) + m = ANYWHERE_IP_V4_RE.search(in_str) if m is not None: return m.group(0) return None @@ -515,10 +698,10 @@ def is_ip_v6(in_str: Any) -> bool: """ Checks if a string is a valid ip v6. - *Examples:* - - >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334') # returns true - >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?') # returns false (invalid "?") + >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334') + True + >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?') # invalid "?" + False """ return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None @@ -526,11 +709,14 @@ def is_ip_v6(in_str: Any) -> bool: def extract_ip_v6(in_str: Any) -> Optional[str]: """ Extract IPv6 chunk or None. + + >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334') + '2001:db8:85a3:0000:0000:8a2e:370:7334' + >>> extract_ip_v6("(and she's ugly too, btw)") """ if not is_full_string(in_str): return None - in_str.strip() - m = IP_V6_RE.match(in_str) + m = ANYWHERE_IP_V6_RE.search(in_str) if m is not None: return m.group(0) return None @@ -540,17 +726,29 @@ def is_ip(in_str: Any) -> bool: """ Checks if a string is a valid ip (either v4 or v6). - *Examples:* - - >>> is_ip('255.200.100.75') # returns true - >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334') # returns true - >>> is_ip('1.2.3') # returns false + >>> is_ip('255.200.100.75') + True + >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334') + True + >>> is_ip('1.2.3') + False + >>> is_ip('1.2.3.999') + False """ return is_ip_v6(in_str) or is_ip_v4(in_str) def extract_ip(in_str: Any) -> Optional[str]: - """Extract the IP address or None.""" + """ + Extract the IP address or None. 
+ + >>> extract_ip('Attacker: 255.200.100.75') + '255.200.100.75' + >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334') + '2001:db8:85a3:0000:0000:8a2e:370:7334' + >>> extract_ip('1.2.3') + + """ ip = extract_ip_v4(in_str) if ip is None: ip = extract_ip_v6(in_str) @@ -558,16 +756,35 @@ def extract_ip(in_str: Any) -> Optional[str]: def is_mac_address(in_str: Any) -> bool: - """Return True if in_str is a valid MAC address false otherwise.""" + """Return True if in_str is a valid MAC address false otherwise. + + >>> is_mac_address("34:29:8F:12:0D:2F") + True + >>> is_mac_address('34:29:8f:12:0d:2f') + True + >>> is_mac_address('34-29-8F-12-0D-2F') + True + >>> is_mac_address("test") + False + """ return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]: - """Extract the MAC address from in_str""" + """ + Extract the MAC address from in_str. + + >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F') + '34:29:8F:12:0D:2F' + + >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]') + 'd8:5d:e2:34:54:86' + + """ if not is_full_string(in_str): return None in_str.strip() - m = MAC_ADDRESS_RE.match(in_str) + m = ANYWHERE_MAC_ADDRESS_RE.search(in_str) if m is not None: mac = m.group(0) mac.replace(":", separator) @@ -580,16 +797,11 @@ def is_slug(in_str: Any, separator: str = "-") -> bool: """ Checks if a given string is a slug (as created by `slugify()`). - *Examples:* - - >>> is_slug('my-blog-post-title') # returns true - >>> is_slug('My blog post title') # returns false + >>> is_slug('my-blog-post-title') + True + >>> is_slug('My blog post title') + False - :param in_str: String to check. - :type in_str: str - :param separator: Join sign used by the slug. - :type separator: str - :return: True if slug, false otherwise. 
""" if not is_full_string(in_str): return False @@ -604,10 +816,11 @@ def contains_html(in_str: str) -> bool: By design, this function matches ANY type of tag, so don't expect to use it as an HTML validator, its goal is to detect "malicious" or undesired tags in the text. - *Examples:* + >>> contains_html('my string is bold') + True + >>> contains_html('my string is not bold') + False - >>> contains_html('my string is bold') # returns true - >>> contains_html('my string is not bold') # returns false """ if not is_string(in_str): raise ValueError(in_str) @@ -623,27 +836,27 @@ def words_count(in_str: str) -> int: Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop" will be 4 not 1 (even if there are no spaces in the string). - *Examples:* + >>> words_count('hello world') + 2 + >>> words_count('one,two,three.stop') + 4 - >>> words_count('hello world') # returns 2 - >>> words_count('one,two,three.stop') # returns 4 """ if not is_string(in_str): raise ValueError(in_str) return len(WORDS_COUNT_RE.findall(in_str)) -def generate_uuid(as_hex: bool = False) -> str: +def generate_uuid(omit_dashes: bool = False) -> str: """ Generated an UUID string (using `uuid.uuid4()`). - *Examples:* + generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b' + generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b' - >>> uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b' - >>> uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b' """ uid = uuid4() - if as_hex: + if omit_dashes: return uid.hex return str(uid) @@ -653,9 +866,8 @@ def generate_random_alphanumeric_string(size: int) -> str: Returns a string of the specified size containing random characters (uppercase/lowercase ascii letters and digits). 
- *Example:* + random_string(9) # possible output: "cx3QQbzYg" - >>> random_string(9) # possible output: "cx3QQbzYg" """ if size < 1: raise ValueError("size must be >= 1") @@ -667,6 +879,10 @@ def generate_random_alphanumeric_string(size: int) -> str: def reverse(in_str: str) -> str: """ Returns the string with its chars reversed. + + >>> reverse('test') + 'tset' + """ if not is_string(in_str): raise ValueError(in_str) @@ -677,14 +893,17 @@ def camel_case_to_snake_case(in_str, *, separator="_"): """ Convert a camel case string into a snake case one. (The original string is returned if is not a valid camel case string) + + >>> camel_case_to_snake_case('MacAddressExtractorFactory') + 'mac_address_extractor_factory' + >>> camel_case_to_snake_case('Luke Skywalker') + 'Luke Skywalker' """ if not is_string(in_str): raise ValueError(in_str) if not is_camel_case(in_str): return in_str - return CAMEL_CASE_REPLACE_RE.sub( - lambda m: m.group(1) + separator, in_str - ).lower() + return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower() def snake_case_to_camel_case( @@ -693,6 +912,11 @@ def snake_case_to_camel_case( """ Convert a snake case string into a camel case one. (The original string is returned if is not a valid snake case string) + + >>> snake_case_to_camel_case('this_is_a_test') + 'ThisIsATest' + >>> snake_case_to_camel_case('Han Solo') + 'Han Solo' """ if not is_string(in_str): raise ValueError(in_str) @@ -705,12 +929,22 @@ def snake_case_to_camel_case( def to_char_list(in_str: str) -> List[str]: + """Convert a string into a list of chars. + + >>> to_char_list('test') + ['t', 'e', 's', 't'] + """ if not is_string(in_str): return [] return list(in_str) def from_char_list(in_list: List[str]) -> str: + """Convert a char list into a string. 
+ + >>> from_char_list(['t', 'e', 's', 't']) + 'test' + """ return "".join(in_list) @@ -731,10 +965,10 @@ def strip_html(in_str: str, keep_tag_content: bool = False) -> str: """ Remove html code contained into the given string. - *Examples:* - - >>> strip_html('test: click here') # returns 'test: ' - >>> strip_html('test: click here', keep_tag_content=True) # returns 'test: click here' + >>> strip_html('test: click here') + 'test: ' + >>> strip_html('test: click here', keep_tag_content=True) + 'test: click here' """ if not is_string(in_str): raise ValueError(in_str) @@ -744,14 +978,14 @@ def strip_html(in_str: str, keep_tag_content: bool = False) -> str: def asciify(in_str: str) -> str: """ - Force string content to be ascii-only by translating all non-ascii chars into the closest possible representation - (eg: ó -> o, Ë -> E, ç -> c...). - - **Bear in mind**: Some chars may be lost if impossible to translate. + Force string content to be ascii-only by translating all non-ascii + chars into the closest possible representation (eg: ó -> o, Ë -> + E, ç -> c...). - *Example:* + N.B. Some chars may be lost if impossible to translate. - >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË') # returns 'eeuuooaaeynAAACIINOE' + >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË') + 'eeuuooaaeynAAACIINOE' """ if not is_string(in_str): raise ValueError(in_str) @@ -780,10 +1014,10 @@ def slugify(in_str: str, *, separator: str = "-") -> str: - all chars are encoded as ascii (by using `asciify()`) - is safe for URL - *Examples:* - - >>> slugify('Top 10 Reasons To Love Dogs!!!') # returns: 'top-10-reasons-to-love-dogs' - >>> slugify('Mönstér Mägnët') # returns 'monster-magnet' + >>> slugify('Top 10 Reasons To Love Dogs!!!') + 'top-10-reasons-to-love-dogs' + >>> slugify('Mönstér Mägnët') + 'monster-magnet' """ if not is_string(in_str): raise ValueError(in_str) @@ -803,7 +1037,8 @@ def to_bool(in_str: str) -> bool: """ Turns a string into a boolean based on its content (CASE INSENSITIVE). 
- A positive boolean (True) is returned if the string value is one of the following: + A positive boolean (True) is returned if the string value is one + of the following: - "true" - "1" @@ -811,71 +1046,113 @@ def to_bool(in_str: str) -> bool: - "y" Otherwise False is returned. + + >>> to_bool('True') + True + + >>> to_bool('1') + True + + >>> to_bool('yes') + True + + >>> to_bool('no') + False + + >>> to_bool('huh?') + False + + >>> to_bool('on') + True + """ if not is_string(in_str): raise ValueError(in_str) - return in_str.lower() in ("true", "1", "yes", "y", "t") + return in_str.lower() in ("true", "1", "yes", "y", "t", "on") def to_date(in_str: str) -> Optional[datetime.date]: - import dateparse.dateparse_utils as dp + """ + Parses a date string. See DateParser docs for details. + """ + import dateparse.dateparse_utils as du + try: - d = dp.DateParser() + d = du.DateParser() # type: ignore d.parse(in_str) return d.get_date() - except dp.ParseException: - logger.warning(f'Unable to parse date {in_str}.') + except du.ParseException: # type: ignore + msg = f'Unable to parse date {in_str}.' + logger.warning(msg) return None def valid_date(in_str: str) -> bool: + """ + True if the string represents a valid date. + """ import dateparse.dateparse_utils as dp + try: - d = dp.DateParser() + d = dp.DateParser() # type: ignore _ = d.parse(in_str) return True - except dp.ParseException: - logger.warning(f'Unable to parse date {in_str}.') + except dp.ParseException: # type: ignore + msg = f'Unable to parse date {in_str}.' + logger.warning(msg) return False def to_datetime(in_str: str) -> Optional[datetime.datetime]: + """ + Parses a datetime string. See DateParser docs for more info. 
+ """ import dateparse.dateparse_utils as dp + try: - d = dp.DateParser() + d = dp.DateParser() # type: ignore dt = d.parse(in_str) - if type(dt) == datetime.datetime: + if isinstance(dt, datetime.datetime): return dt except ValueError: - logger.warning(f'Unable to parse datetime {in_str}.') + msg = f'Unable to parse datetime {in_str}.' + logger.warning(msg) return None def valid_datetime(in_str: str) -> bool: + """ + True if the string represents a valid datetime. + """ _ = to_datetime(in_str) if _ is not None: return True - logger.warning(f'Unable to parse datetime {in_str}.') + msg = f'Unable to parse datetime {in_str}.' + logger.warning(msg) return False -def dedent(in_str: str) -> str: +def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str: """ - Removes tab indentation from multi line strings (inspired by analogous Scala function). + Squeeze runs of more than one character_to_squeeze into one. + + >>> squeeze(' this is a test ') + ' this is a test ' - *Example:* + >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|') + 'one|!|two|!|three' - >>> strip_margin(''' - >>> line 1 - >>> line 2 - >>> line 3 - >>> ''') - >>> # returns: - >>> ''' - >>> line 1 - >>> line 2 - >>> line 3 - >>> ''' + """ + return re.sub( + r'(' + re.escape(character_to_squeeze) + r')+', + character_to_squeeze, + in_str, + ) + + +def dedent(in_str: str) -> str: + """ + Removes tab indentation from multi line strings (inspired by analogous Scala function). """ if not is_string(in_str): raise ValueError(in_str) @@ -885,6 +1162,13 @@ def dedent(in_str: str) -> str: def indent(in_str: str, amount: int) -> str: + """ + Indents string by prepending amount spaces. 
+ + >>> indent('This is a test', 4) + ' This is a test' + + """ if not is_string(in_str): raise ValueError(in_str) line_separator = '\n' @@ -893,6 +1177,7 @@ def indent(in_str: str, amount: int) -> str: def sprintf(*args, **kwargs) -> str: + """String printf, like in C""" ret = "" sep = kwargs.pop("sep", None) @@ -924,9 +1209,19 @@ def sprintf(*args, **kwargs) -> str: class SprintfStdout(object): + """ + A context manager that captures outputs to stdout. + + with SprintfStdout() as buf: + print("test") + print(buf()) + + 'test\n' + """ + def __init__(self) -> None: self.destination = io.StringIO() - self.recorder = None + self.recorder: contextlib.redirect_stdout def __enter__(self) -> Callable[[], str]: self.recorder = contextlib.redirect_stdout(self.destination) @@ -939,19 +1234,167 @@ class SprintfStdout(object): return None # don't suppress exceptions +def capitalize_first_letter(txt: str) -> str: + """Capitalize the first letter of a string. + + >>> capitalize_first_letter('test') + 'Test' + >>> capitalize_first_letter("ALREADY!") + 'ALREADY!' + + """ + return txt[0].upper() + txt[1:] + + +def it_they(n: int) -> str: + """It or they? + + >>> it_they(1) + 'it' + >>> it_they(100) + 'they' + + """ + if n == 1: + return "it" + return "they" + + def is_are(n: int) -> str: + """Is or are? + + >>> is_are(1) + 'is' + >>> is_are(2) + 'are' + + """ if n == 1: return "is" return "are" def pluralize(n: int) -> str: + """Add an s? + + >>> pluralize(15) + 's' + >>> count = 1 + >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.') + There is 1 file. + >>> count = 4 + >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.') + There are 4 files. + + """ if n == 1: return "" return "s" +def make_contractions(txt: str) -> str: + """Glue words together to form contractions. + + >>> make_contractions('It is nice today.') + "It's nice today." + + >>> make_contractions('I can not even...') + "I can't even..." 
+ + >>> make_contractions('She could not see!') + "She couldn't see!" + + >>> make_contractions('But she will not go.') + "But she won't go." + + >>> make_contractions('Verily, I shall not.') + "Verily, I shan't." + + >>> make_contractions('No you cannot.') + "No you can't." + + >>> make_contractions('I said you can not go.') + "I said you can't go." + + """ + + first_second = [ + ( + [ + 'are', + 'could', + 'did', + 'has', + 'have', + 'is', + 'must', + 'should', + 'was', + 'were', + 'would', + ], + ['(n)o(t)'], + ), + ( + [ + "I", + "you", + "he", + "she", + "it", + "we", + "they", + "how", + "why", + "when", + "where", + "who", + "there", + ], + ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'], + ), + ] + + # Special cases: can't, shan't and won't. + txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE) + txt = re.sub(r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE) + txt = re.sub( + r'\b(w)ill\s*(n)(o)(t)\b', + r"\1\3\2'\4", + txt, + count=0, + flags=re.IGNORECASE, + ) + + for first_list, second_list in first_second: + for first in first_list: + for second in second_list: + # Disallow there're/where're. They're valid English + # but sound weird. + if (first in ('there', 'where')) and second == 'a(re)': + continue + + pattern = fr'\b({first})\s+{second}\b' + if second == '(n)o(t)': + replacement = r"\1\2'\3" + else: + replacement = r"\1'\2" + txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE) + + return txt + + def thify(n: int) -> str: + """Return the proper cardinal suffix for a number. + + >>> thify(1) + 'st' + >>> thify(33) + 'rd' + >>> thify(16) + 'th' + + """ digit = str(n) assert is_integer_number(digit) digit = digit[-1:] @@ -966,13 +1409,22 @@ def thify(n: int) -> str: def ngrams(txt: str, n: int): + """Return the ngrams from a string. 
def ngrams(txt: str, n: int):
    """Return the ngrams from a string.

    The text is split on whitespace and every group produced by
    ngrams_presplit is yielded as one space-separated string.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    """
    for ngram in ngrams_presplit(txt.split(), n):
        yield ' '.join(ngram)


def ngrams_presplit(words: Sequence[str], n: int):
    """Return the ngrams of an already-split sequence of words.

    This is a thin wrapper that delegates to list_utils.ngrams; the
    shape of what it returns is whatever that helper produces.
    """
    return list_utils.ngrams(words, n)


def trigrams(txt: str):
    """Return the 3-grams of txt; shorthand for ngrams(txt, 3)."""
    return ngrams(txt, 3)


def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.  The column_specs argument is an iterable collection of
    numeric sequences that indicate one or more column numbers to
    copy.

    Args:
        input_lines: the columns to pick from, indexed from zero.
        column_specs: one sequence of column indexes per output item.
        delim: text placed between the columns joined into one item.

    Returns:
        A list with one string per spec, in spec order.  Column text
        is copied verbatim, even when it starts or ends with
        characters that also occur in delim.

    Raises:
        IndexError: if a spec names a column that does not exist.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']

    >>> shuffle_columns_into_list(cols, [ [0, 2] ], delim='-')
    ['-rwxr-xr-x-scott']

    """
    # Column specs map input lines' columns into outputs.
    # [col1, col2...]
    #
    # join() puts delim only *between* columns, so nothing has to be
    # stripped afterwards (stripping would also eat real column data).
    return [delim.join(input_lines[n] for n in spec) for spec in column_specs]


def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim='',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: the columns to pick from, indexed from zero.
        column_specs: (key, column indexes) pairs, one per output
            entry.
        delim: text placed between the columns joined into one value.

    Returns:
        A dict mapping each key to its joined columns.  If a key is
        repeated, the last spec wins.  Column text is copied verbatim.

    Raises:
        IndexError: if a spec names a column that does not exist.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}

    """
    # Column specs map input lines' columns into outputs.
    # "key", [col1, col2...]
    return {
        key: delim.join(input_lines[n] for n in columns)
        for key, columns in column_specs
    }


def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Interpolate a string with data from a dict.

    txt is expanded with str.format(**values) and the result is passed
    through this module's sprintf with end=''.

    Raises:
        KeyError: if txt references a field that is not in values.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'

    """
    return sprintf(txt.format(**values), end='')


def to_ascii(x: str) -> bytes:
    """Encode as ascii bytes string.

    Despite the annotation, bytes are accepted too and are returned
    unchanged.

    Raises:
        UnicodeEncodeError: if x is a str with non-ASCII characters.
        TypeError: if x is neither str nor bytes.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'

    """
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    raise TypeError('to_ascii works with strings and bytes')


def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode txt to bytes and then encode those bytes with the
    standard 64-character alphabet using base64.encodebytes.

    The result therefore has a newline after every 76 characters of
    output and always ends with a newline.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    """
    return base64.encodebytes(txt.encode(encoding, errors))


# Byte values of the standard base64 alphabet (padding excluded).
_BASE64_ALPHABET = frozenset(
    (string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/').encode('ascii')
)


def is_base64(txt: str) -> bool:
    """Determine whether a string looks base64 encoded with the
    standard alphabet (A-Z, a-z, 0-9, '+' and '/').

    Surrounding whitespace and embedded line breaks are ignored, and
    up to two trailing '=' padding characters are accepted when they
    bring the length to a multiple of four, so everything to_base64
    produces is recognized.  Both str and bytes are accepted.  This is
    a character-level check only; it does not attempt to decode.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64(to_base64('hi'))    # So is '=' padding.
    True

    """
    try:
        raw = to_ascii(txt.strip())
    except UnicodeEncodeError:
        # Non-ASCII text cannot be base64.
        return False

    # encodebytes() wraps long output; the line breaks carry no data.
    raw = raw.replace(b'\n', b'').replace(b'\r', b'')
    body = raw.rstrip(b'=')
    padding = len(raw) - len(body)
    if padding > 2 or (padding and len(raw) % 4 != 0):
        return False
    return all(char in _BASE64_ALPHABET for char in body)
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert base64 encoded bytes back to a normal string.

    The bytes are decoded with base64.decodebytes and the result is
    then decoded to str using encoding / errors.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    """
    return base64.decodebytes(b64).decode(encoding, errors)


def chunk(txt: str, chunk_size):
    """Chunk up a string into pieces of chunk_size characters.

    This is a generator, so the checks below run when iteration
    starts.  If the length of txt is not a multiple of chunk_size a
    warning is logged and issued via the warnings module, and the
    final piece is shorter than the others.

    Raises:
        ValueError: if chunk_size is smaller than one.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be a positive integer, not {chunk_size}')
    if len(txt) % chunk_size != 0:
        msg = f"String to chunk's length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})"
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for start in range(0, len(txt), chunk_size):
        yield txt[start : start + chunk_size]


def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
    """Encode txt and then render every byte as eight binary digits.
    Note: only bitstrings with delimiter='' are interpretable by
    from_bitstring.

    Args:
        txt: the text to convert; bytes are accepted and used as-is.
        delimiter: text placed between the 8-digit groups.
        encoding: codec used to turn a str into bytes.
        errors: error handler passed to str.encode.

    Returns:
        Eight binary digits per encoded byte, most significant bit
        first.  Leading zero bytes are kept and an empty input gives
        an empty string.

    Raises:
        TypeError: if txt is neither str nor bytes.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    """
    if isinstance(txt, bytes):
        data = txt
    elif isinstance(txt, str):
        data = txt.encode(encoding, errors)
    else:
        raise TypeError('to_bitstring works with strings and bytes')
    # Formatting byte by byte (rather than via one big integer) keeps
    # leading zero bytes and needs no padding arithmetic.
    return delimiter.join(f'{byte:08b}' for byte in data)


def is_bitstring(txt: str) -> bool:
    """Is this a bitstring?

    The answer is whatever this module's is_binary_integer_number says
    about txt with a '0b' prefix added.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    """
    return is_binary_integer_number(f'0b{txt}')


def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    Args:
        bits: binary digits without delimiters; surrounding whitespace
            is ignored.
        encoding: codec used to decode the recovered bytes.
        errors: error handler passed to bytes.decode.

    Returns:
        The decoded text.  Leading zero bytes are preserved, and an
        empty bitstring decodes to an empty string.

    Raises:
        ValueError: if bits is not a valid base-2 number.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    >>> from_bitstring('0000000001100001')
    '\\x00a'

    """
    digits = bits.strip()
    if not digits:
        return ''
    n = int(digits, 2)
    if set(digits) <= {'0', '1'}:
        # Size the buffer from the digits supplied so that leading zero
        # bytes survive the trip through int.
        bit_count = len(digits)
    else:
        # int() also accepts forms such as '0b101' or '1_0'; for those
        # the digit count is unreliable, so fall back to the magnitude.
        bit_count = n.bit_length()
    # Always emit at least one byte: an all-zero bitstring is a NUL.
    byte_count = max(1, (bit_count + 7) // 8)
    return n.to_bytes(byte_count, 'big').decode(encoding, errors)


def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple for sorting purposes.

    Returns:
        The numeric fields of the address as a tuple of ints, or None
        (after logging a warning) when this module's is_ip_v4 rejects
        txt.  Note that None does not compare with tuples, so sorting
        a list that contains a rejected entry raises TypeError.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']

    """
    if not is_ip_v4(txt):
        # Report through the module logger instead of writing to stdout.
        logger.warning("not IP: %s", txt)
        return None
    return tuple(int(field) for field in txt.split('.'))


def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Chunk up a file path so that parent/ancestor paths sort before
    children/descendant paths.

    The path is split on '/' and empty components (from leading,
    trailing or doubled slashes) are dropped.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']

    """
    return tuple(part for part in volume.split('/') if part)


def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Execute several replace operations in a row.

    Every character of replace_set is replaced by replacement, one
    character at a time and in the order given, so text introduced by
    an earlier replacement is visible to the later ones.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'

    """
    for char in replace_set:
        in_str = in_str.replace(char, replacement)
    return in_str


if __name__ == '__main__':
    import doctest

    doctest.testmod()