#!/usr/bin/env python3 # -*- coding: utf-8 -*- """The MIT License (MIT) Copyright (c) 2016-2020 Davide Zanotti Modifications Copyright (c) 2021-2022 Scott Gasch Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. This class is based on: https://github.com/daveoncode/python-string-utils. """ import base64 import contextlib # type: ignore import datetime import io import json import logging import numbers import random import re import string import unicodedata import warnings from itertools import zip_longest from typing import ( Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, ) from uuid import uuid4 import list_utils logger = logging.getLogger(__name__) NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$") HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$") OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$") BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$") URLS_RAW_STRING = ( r"([a-z-]+://)" # scheme r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password r"(www\.)?" # www. r"((?]*/?>)(.*?())?||)", re.IGNORECASE | re.MULTILINE | re.DOTALL, ) HTML_TAG_ONLY_RE = re.compile( r"(<([a-z]+:)?[a-z]+[^>]*/?>|||)", re.IGNORECASE | re.MULTILINE | re.DOTALL, ) SPACES_RE = re.compile(r"\s") NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE) MARGIN_RE = re.compile(r"^[^\S\r\n]+") ESCAPE_SEQUENCE_RE = re.compile(r"\[[^A-Za-z]*[A-Za-z]") NUM_SUFFIXES = { "Pb": (1024**5), "P": (1024**5), "Tb": (1024**4), "T": (1024**4), "Gb": (1024**3), "G": (1024**3), "Mb": (1024**2), "M": (1024**2), "Kb": (1024**1), "K": (1024**1), } def is_none_or_empty(in_str: Optional[str]) -> bool: """ Returns true if the input string is either None or an empty string. >>> is_none_or_empty("") True >>> is_none_or_empty(None) True >>> is_none_or_empty(" \t ") True >>> is_none_or_empty('Test') False """ return in_str is None or len(in_str.strip()) == 0 def is_string(obj: Any) -> bool: """ Checks if an object is a string. >>> is_string('test') True >>> is_string(123) False >>> is_string(100.3) False >>> is_string([1, 2, 3]) False """ return isinstance(obj, str) def is_empty_string(in_str: Any) -> bool: return is_empty(in_str) def is_empty(in_str: Any) -> bool: """ Checks if input is a string and empty or only whitespace. >>> is_empty('') True >>> is_empty(' \t\t ') True >>> is_empty('test') False >>> is_empty(100.88) False >>> is_empty([1, 2, 3]) False """ return is_string(in_str) and in_str.strip() == "" def is_full_string(in_str: Any) -> bool: """ Checks that input is a string and is not empty ('') or only whitespace. >>> is_full_string('test!') True >>> is_full_string('') False >>> is_full_string(' ') False >>> is_full_string(100.999) False >>> is_full_string({"a": 1, "b": 2}) False """ return is_string(in_str) and in_str.strip() != "" def is_number(in_str: str) -> bool: """ Checks if a string is a valid number. >>> is_number(100.5) Traceback (most recent call last): ... ValueError: 100.5 >>> is_number("100.5") True >>> is_number("test") False >>> is_number("99") True >>> is_number([1, 2, 3]) Traceback (most recent call last): ... ValueError: [1, 2, 3] """ if not is_string(in_str): raise ValueError(in_str) return NUMBER_RE.match(in_str) is not None def is_integer_number(in_str: str) -> bool: """ Checks whether the given string represents an integer or not. An integer may be signed or unsigned or use a "scientific notation". >>> is_integer_number('42') True >>> is_integer_number('42.0') False """ return ( (is_number(in_str) and "." not in in_str) or is_hexidecimal_integer_number(in_str) or is_octal_integer_number(in_str) or is_binary_integer_number(in_str) ) def is_hexidecimal_integer_number(in_str: str) -> bool: """ Checks whether a string is a hex integer number. >>> is_hexidecimal_integer_number('0x12345') True >>> is_hexidecimal_integer_number('0x1A3E') True >>> is_hexidecimal_integer_number('1234') # Needs 0x False >>> is_hexidecimal_integer_number('-0xff') True >>> is_hexidecimal_integer_number('test') False >>> is_hexidecimal_integer_number(12345) # Not a string Traceback (most recent call last): ... ValueError: 12345 >>> is_hexidecimal_integer_number(101.4) Traceback (most recent call last): ... ValueError: 101.4 >>> is_hexidecimal_integer_number(0x1A3E) Traceback (most recent call last): ... ValueError: 6718 """ if not is_string(in_str): raise ValueError(in_str) return HEX_NUMBER_RE.match(in_str) is not None def is_octal_integer_number(in_str: str) -> bool: """ Checks whether a string is an octal number. >>> is_octal_integer_number('0o777') True >>> is_octal_integer_number('-0O115') True >>> is_octal_integer_number('0xFF') # Not octal, needs 0o False >>> is_octal_integer_number('7777') # Needs 0o False >>> is_octal_integer_number('test') False """ if not is_string(in_str): raise ValueError(in_str) return OCT_NUMBER_RE.match(in_str) is not None def is_binary_integer_number(in_str: str) -> bool: """ Returns whether a string contains a binary number. >>> is_binary_integer_number('0b10111') True >>> is_binary_integer_number('-0b111') True >>> is_binary_integer_number('0B10101') True >>> is_binary_integer_number('0b10102') False >>> is_binary_integer_number('0xFFF') False >>> is_binary_integer_number('test') False """ if not is_string(in_str): raise ValueError(in_str) return BIN_NUMBER_RE.match(in_str) is not None def to_int(in_str: str) -> int: """Returns the integral value of the string or raises on error. >>> to_int('1234') 1234 >>> to_int('test') Traceback (most recent call last): ... ValueError: invalid literal for int() with base 10: 'test' """ if not is_string(in_str): raise ValueError(in_str) if is_binary_integer_number(in_str): return int(in_str, 2) if is_octal_integer_number(in_str): return int(in_str, 8) if is_hexidecimal_integer_number(in_str): return int(in_str, 16) return int(in_str) def is_decimal_number(in_str: str) -> bool: """ Checks whether the given string represents a decimal or not. A decimal may be signed or unsigned or use a "scientific notation". >>> is_decimal_number('42.0') True >>> is_decimal_number('42') False """ return is_number(in_str) and "." in in_str def strip_escape_sequences(in_str: str) -> str: """ Remove escape sequences in the input string. >>> strip_escape_sequences('this is a test!') 'this is a test!' """ in_str = ESCAPE_SEQUENCE_RE.sub("", in_str) return in_str def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str: """ Add thousands separator to a numeric string. Also handles numbers. >>> add_thousands_separator('12345678') '12,345,678' >>> add_thousands_separator(12345678) '12,345,678' >>> add_thousands_separator(12345678.99) '12,345,678.99' >>> add_thousands_separator('test') Traceback (most recent call last): ... ValueError: test """ if isinstance(in_str, numbers.Number): in_str = f'{in_str}' if is_number(in_str): return _add_thousands_separator(in_str, separator_char=separator_char, places=places) raise ValueError(in_str) def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str: decimal_part = "" if '.' in in_str: (in_str, decimal_part) = in_str.split('.') tmp = [iter(in_str[::-1])] * places ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1] if len(decimal_part) > 0: ret += '.' ret += decimal_part return ret # Full url example: # scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value¶m2=value2#hash def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool: """ Check if a string is a valid url. >>> is_url('http://www.mysite.com') True >>> is_url('https://mysite.com') True >>> is_url('.mysite.com') False """ if not is_full_string(in_str): return False valid = URL_RE.match(in_str) is not None if allowed_schemes: return valid and any([in_str.startswith(s) for s in allowed_schemes]) return valid def is_email(in_str: Any) -> bool: """ Check if a string is a valid email. Reference: https://tools.ietf.org/html/rfc3696#section-3 >>> is_email('my.email@the-provider.com') True >>> is_email('@gmail.com') False """ if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."): return False try: # we expect 2 tokens, one before "@" and one after, otherwise # we have an exception and the email is not valid. head, tail = in_str.split("@") # head's size must be <= 64, tail <= 255, head must not start # with a dot or contain multiple consecutive dots. if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head): return False # removes escaped spaces, so that later on the test regex will # accept the string. head = head.replace("\\ ", "") if head.startswith('"') and head.endswith('"'): head = head.replace(" ", "")[1:-1] return EMAIL_RE.match(head + "@" + tail) is not None except ValueError: # borderline case in which we have multiple "@" signs but the # head part is correctly escaped. if ESCAPED_AT_SIGN.search(in_str) is not None: # replace "@" with "a" in the head return is_email(ESCAPED_AT_SIGN.sub("a", in_str)) return False def suffix_string_to_number(in_str: str) -> Optional[int]: """Take a string like "33Gb" and convert it into a number (of bytes) like 34603008. Return None if the input string is not valid. >>> suffix_string_to_number('1Mb') 1048576 >>> suffix_string_to_number('13.1Gb') 14066017894 """ def suffix_capitalize(s: str) -> str: if len(s) == 1: return s.upper() elif len(s) == 2: return f"{s[0].upper()}{s[1].lower()}" return suffix_capitalize(s[0:1]) if is_string(in_str): if is_integer_number(in_str): return to_int(in_str) suffixes = [in_str[-2:], in_str[-1:]] rest = [in_str[:-2], in_str[:-1]] for x in range(len(suffixes)): s = suffixes[x] s = suffix_capitalize(s) multiplier = NUM_SUFFIXES.get(s, None) if multiplier is not None: r = rest[x] if is_integer_number(r): return to_int(r) * multiplier if is_decimal_number(r): return int(float(r) * multiplier) return None def number_to_suffix_string(num: int) -> Optional[str]: """Take a number (of bytes) and returns a string like "43.8Gb". Returns none if the input is invalid. >>> number_to_suffix_string(14066017894) '13.1Gb' >>> number_to_suffix_string(1024 * 1024) '1.0Mb' """ d = 0.0 suffix = None for (sfx, size) in NUM_SUFFIXES.items(): if num >= size: d = num / size suffix = sfx break if suffix is not None: return f"{d:.1f}{suffix}" else: return f'{num:d}' def is_credit_card(in_str: Any, card_type: str = None) -> bool: """ Checks if a string is a valid credit card number. If card type is provided then it checks against that specific type only, otherwise any known credit card number will be accepted. Supported card types are the following: - VISA - MASTERCARD - AMERICAN_EXPRESS - DINERS_CLUB - DISCOVER - JCB """ if not is_full_string(in_str): return False if card_type is not None: if card_type not in CREDIT_CARDS: raise KeyError( f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}' ) return CREDIT_CARDS[card_type].match(in_str) is not None for c in CREDIT_CARDS: if CREDIT_CARDS[c].match(in_str) is not None: return True return False def is_camel_case(in_str: Any) -> bool: """ Checks if a string is formatted as camel case. A string is considered camel case when: - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9]) - it contains both lowercase and uppercase letters - it does not start with a number """ return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None def is_snake_case(in_str: Any, *, separator: str = "_") -> bool: """ Checks if a string is formatted as "snake case". A string is considered snake case when: - it's composed only by lowercase/uppercase letters and digits - it contains at least one underscore (or provided separator) - it does not start with a number >>> is_snake_case('this_is_a_test') True >>> is_snake_case('___This_Is_A_Test_1_2_3___') True >>> is_snake_case('this-is-a-test') False >>> is_snake_case('this-is-a-test', separator='-') True """ if is_full_string(in_str): re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE} re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)" r = re_map.get( separator, re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE), ) return r.match(in_str) is not None return False def is_json(in_str: Any) -> bool: """ Check if a string is a valid json. >>> is_json('{"name": "Peter"}') True >>> is_json('[1, 2, 3]') True >>> is_json('{nope}') False """ if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None: try: return isinstance(json.loads(in_str), (dict, list)) except (TypeError, ValueError, OverflowError): pass return False def is_uuid(in_str: Any, allow_hex: bool = False) -> bool: """ Check if a string is a valid UUID. >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf') True >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf') False >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True) True """ # string casting is used to allow UUID itself as input data type s = str(in_str) if allow_hex: return UUID_HEX_OK_RE.match(s) is not None return UUID_RE.match(s) is not None def is_ip_v4(in_str: Any) -> bool: """ Checks if a string is a valid ip v4. >>> is_ip_v4('255.200.100.75') True >>> is_ip_v4('nope') False >>> is_ip_v4('255.200.100.999') # 999 out of range False """ if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None: return False # checks that each entry in the ip is in the valid range (0 to 255) for token in in_str.split("."): if not 0 <= int(token) <= 255: return False return True def extract_ip_v4(in_str: Any) -> Optional[str]: """ Extracts the IPv4 chunk of a string or None. >>> extract_ip_v4(' The secret IP address: 127.0.0.1 (use it wisely) ') '127.0.0.1' >>> extract_ip_v4('Your mom dresses you funny.') """ if not is_full_string(in_str): return None m = ANYWHERE_IP_V4_RE.search(in_str) if m is not None: return m.group(0) return None def is_ip_v6(in_str: Any) -> bool: """ Checks if a string is a valid ip v6. >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334') True >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?') # invalid "?" False """ return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None def extract_ip_v6(in_str: Any) -> Optional[str]: """ Extract IPv6 chunk or None. >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334') '2001:db8:85a3:0000:0000:8a2e:370:7334' >>> extract_ip_v6("(and she's ugly too, btw)") """ if not is_full_string(in_str): return None m = ANYWHERE_IP_V6_RE.search(in_str) if m is not None: return m.group(0) return None def is_ip(in_str: Any) -> bool: """ Checks if a string is a valid ip (either v4 or v6). >>> is_ip('255.200.100.75') True >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334') True >>> is_ip('1.2.3') False >>> is_ip('1.2.3.999') False """ return is_ip_v6(in_str) or is_ip_v4(in_str) def extract_ip(in_str: Any) -> Optional[str]: """ Extract the IP address or None. >>> extract_ip('Attacker: 255.200.100.75') '255.200.100.75' >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334') '2001:db8:85a3:0000:0000:8a2e:370:7334' >>> extract_ip('1.2.3') """ ip = extract_ip_v4(in_str) if ip is None: ip = extract_ip_v6(in_str) return ip def is_mac_address(in_str: Any) -> bool: """Return True if in_str is a valid MAC address false otherwise. >>> is_mac_address("34:29:8F:12:0D:2F") True >>> is_mac_address('34:29:8f:12:0d:2f') True >>> is_mac_address('34-29-8F-12-0D-2F') True >>> is_mac_address("test") False """ return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]: """ Extract the MAC address from in_str. >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F') '34:29:8F:12:0D:2F' >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]') 'd8:5d:e2:34:54:86' """ if not is_full_string(in_str): return None in_str.strip() m = ANYWHERE_MAC_ADDRESS_RE.search(in_str) if m is not None: mac = m.group(0) mac.replace(":", separator) mac.replace("-", separator) return mac return None def is_slug(in_str: Any, separator: str = "-") -> bool: """ Checks if a given string is a slug (as created by `slugify()`). >>> is_slug('my-blog-post-title') True >>> is_slug('My blog post title') False """ if not is_full_string(in_str): return False rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$" return re.match(rex, in_str) is not None def contains_html(in_str: str) -> bool: """ Checks if the given string contains HTML/XML tags. By design, this function matches ANY type of tag, so don't expect to use it as an HTML validator, its goal is to detect "malicious" or undesired tags in the text. >>> contains_html('my string is bold') True >>> contains_html('my string is not bold') False """ if not is_string(in_str): raise ValueError(in_str) return HTML_RE.search(in_str) is not None def words_count(in_str: str) -> int: """ Returns the number of words contained into the given string. This method is smart, it does consider only sequence of one or more letter and/or numbers as "words", so a string like this: "! @ # % ... []" will return zero! Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop" will be 4 not 1 (even if there are no spaces in the string). >>> words_count('hello world') 2 >>> words_count('one,two,three.stop') 4 """ if not is_string(in_str): raise ValueError(in_str) return len(WORDS_COUNT_RE.findall(in_str)) def word_count(in_str: str) -> int: return words_count(in_str) def generate_uuid(omit_dashes: bool = False) -> str: """ Generated an UUID string (using `uuid.uuid4()`). generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b' generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b' """ uid = uuid4() if omit_dashes: return uid.hex return str(uid) def generate_random_alphanumeric_string(size: int) -> str: """ Returns a string of the specified size containing random characters (uppercase/lowercase ascii letters and digits). >>> random.seed(22) >>> generate_random_alphanumeric_string(9) '96ipbNClS' """ if size < 1: raise ValueError("size must be >= 1") chars = string.ascii_letters + string.digits buffer = [random.choice(chars) for _ in range(size)] return from_char_list(buffer) def reverse(in_str: str) -> str: """ Returns the string with its chars reversed. >>> reverse('test') 'tset' """ if not is_string(in_str): raise ValueError(in_str) return in_str[::-1] def camel_case_to_snake_case(in_str, *, separator="_"): """ Convert a camel case string into a snake case one. (The original string is returned if is not a valid camel case string) >>> camel_case_to_snake_case('MacAddressExtractorFactory') 'mac_address_extractor_factory' >>> camel_case_to_snake_case('Luke Skywalker') 'Luke Skywalker' """ if not is_string(in_str): raise ValueError(in_str) if not is_camel_case(in_str): return in_str return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower() def snake_case_to_camel_case( in_str: str, *, upper_case_first: bool = True, separator: str = "_" ) -> str: """ Convert a snake case string into a camel case one. (The original string is returned if is not a valid snake case string) >>> snake_case_to_camel_case('this_is_a_test') 'ThisIsATest' >>> snake_case_to_camel_case('Han Solo') 'Han Solo' """ if not is_string(in_str): raise ValueError(in_str) if not is_snake_case(in_str, separator=separator): return in_str tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)] if not upper_case_first: tokens[0] = tokens[0].lower() return from_char_list(tokens) def to_char_list(in_str: str) -> List[str]: """Convert a string into a list of chars. >>> to_char_list('test') ['t', 'e', 's', 't'] """ if not is_string(in_str): return [] return list(in_str) def from_char_list(in_list: List[str]) -> str: """Convert a char list into a string. >>> from_char_list(['t', 'e', 's', 't']) 'test' """ return "".join(in_list) def shuffle(in_str: str) -> str: """Return a new string containing same chars of the given one but in a randomized order. """ if not is_string(in_str): raise ValueError(in_str) # turn the string into a list of chars chars = to_char_list(in_str) random.shuffle(chars) return from_char_list(chars) def scramble(in_str: str) -> str: return shuffle(in_str) def strip_html(in_str: str, keep_tag_content: bool = False) -> str: """ Remove html code contained into the given string. >>> strip_html('test: click here') 'test: ' >>> strip_html('test: click here', keep_tag_content=True) 'test: click here' """ if not is_string(in_str): raise ValueError(in_str) r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE return r.sub("", in_str) def asciify(in_str: str) -> str: """ Force string content to be ascii-only by translating all non-ascii chars into the closest possible representation (eg: ó -> o, Ë -> E, ç -> c...). N.B. Some chars may be lost if impossible to translate. >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË') 'eeuuooaaeynAAACIINOE' """ if not is_string(in_str): raise ValueError(in_str) # "NFKD" is the algorithm which is able to successfully translate # the most of non-ascii chars. normalized = unicodedata.normalize("NFKD", in_str) # encode string forcing ascii and ignore any errors # (unrepresentable chars will be stripped out) ascii_bytes = normalized.encode("ascii", "ignore") # turns encoded bytes into an utf-8 string return ascii_bytes.decode("utf-8") def slugify(in_str: str, *, separator: str = "-") -> str: """ Converts a string into a "slug" using provided separator. The returned string has the following properties: - it has no spaces - all letters are in lower case - all punctuation signs and non alphanumeric chars are removed - words are divided using provided separator - all chars are encoded as ascii (by using `asciify()`) - is safe for URL >>> slugify('Top 10 Reasons To Love Dogs!!!') 'top-10-reasons-to-love-dogs' >>> slugify('Mönstér Mägnët') 'monster-magnet' """ if not is_string(in_str): raise ValueError(in_str) # replace any character that is NOT letter or number with spaces out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip() # replace spaces with join sign out = SPACES_RE.sub(separator, out) # normalize joins (remove duplicates) out = re.sub(re.escape(separator) + r"+", separator, out) return asciify(out) def to_bool(in_str: str) -> bool: """ Turns a string into a boolean based on its content (CASE INSENSITIVE). A positive boolean (True) is returned if the string value is one of the following: - "true" - "1" - "yes" - "y" Otherwise False is returned. >>> to_bool('True') True >>> to_bool('1') True >>> to_bool('yes') True >>> to_bool('no') False >>> to_bool('huh?') False >>> to_bool('on') True """ if not is_string(in_str): raise ValueError(in_str) return in_str.lower() in ("true", "1", "yes", "y", "t", "on") def to_date(in_str: str) -> Optional[datetime.date]: """ Parses a date string. See DateParser docs for details. """ import dateparse.dateparse_utils as du try: d = du.DateParser() # type: ignore d.parse(in_str) return d.get_date() except du.ParseException: # type: ignore msg = f'Unable to parse date {in_str}.' logger.warning(msg) return None def valid_date(in_str: str) -> bool: """ True if the string represents a valid date. """ import dateparse.dateparse_utils as dp try: d = dp.DateParser() # type: ignore _ = d.parse(in_str) return True except dp.ParseException: # type: ignore msg = f'Unable to parse date {in_str}.' logger.warning(msg) return False def to_datetime(in_str: str) -> Optional[datetime.datetime]: """ Parses a datetime string. See DateParser docs for more info. """ import dateparse.dateparse_utils as dp try: d = dp.DateParser() # type: ignore dt = d.parse(in_str) if isinstance(dt, datetime.datetime): return dt except ValueError: msg = f'Unable to parse datetime {in_str}.' logger.warning(msg) return None def valid_datetime(in_str: str) -> bool: """ True if the string represents a valid datetime. """ _ = to_datetime(in_str) if _ is not None: return True msg = f'Unable to parse datetime {in_str}.' logger.warning(msg) return False def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str: """ Squeeze runs of more than one character_to_squeeze into one. >>> squeeze(' this is a test ') ' this is a test ' >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|') 'one|!|two|!|three' """ return re.sub( r'(' + re.escape(character_to_squeeze) + r')+', character_to_squeeze, in_str, ) def dedent(in_str: str) -> str: """ Removes tab indentation from multi line strings (inspired by analogous Scala function). """ if not is_string(in_str): raise ValueError(in_str) line_separator = '\n' lines = [MARGIN_RE.sub('', line) for line in in_str.split(line_separator)] return line_separator.join(lines) def indent(in_str: str, amount: int) -> str: """ Indents string by prepending amount spaces. >>> indent('This is a test', 4) ' This is a test' """ if not is_string(in_str): raise ValueError(in_str) line_separator = '\n' lines = [" " * amount + line for line in in_str.split(line_separator)] return line_separator.join(lines) def sprintf(*args, **kwargs) -> str: """String printf, like in C""" ret = "" sep = kwargs.pop("sep", None) if sep is not None: if not isinstance(sep, str): raise TypeError("sep must be None or a string") end = kwargs.pop("end", None) if end is not None: if not isinstance(end, str): raise TypeError("end must be None or a string") if kwargs: raise TypeError("invalid keyword arguments to sprint()") if sep is None: sep = " " if end is None: end = "\n" for i, arg in enumerate(args): if i: ret += sep if isinstance(arg, str): ret += arg else: ret += str(arg) ret += end return ret def strip_ansi_sequences(in_str: str) -> str: """Strips ANSI sequences out of strings. >>> import ansi as a >>> s = a.fg('blue') + 'blue!' + a.reset() >>> len(s) # '\x1b[38;5;21mblue!\x1b[m' 18 >>> len(strip_ansi_sequences(s)) 5 >>> strip_ansi_sequences(s) 'blue!' """ return re.sub(r'\x1b\[[\d+;]*[a-z]', '', in_str) class SprintfStdout(contextlib.AbstractContextManager): """ A context manager that captures outputs to stdout. with SprintfStdout() as buf: print("test") print(buf()) 'test\n' """ def __init__(self) -> None: self.destination = io.StringIO() self.recorder: contextlib.redirect_stdout def __enter__(self) -> Callable[[], str]: self.recorder = contextlib.redirect_stdout(self.destination) self.recorder.__enter__() return lambda: self.destination.getvalue() def __exit__(self, *args) -> Literal[False]: self.recorder.__exit__(*args) self.destination.seek(0) return False def capitalize_first_letter(txt: str) -> str: """Capitalize the first letter of a string. >>> capitalize_first_letter('test') 'Test' >>> capitalize_first_letter("ALREADY!") 'ALREADY!' """ return txt[0].upper() + txt[1:] def it_they(n: int) -> str: """It or they? >>> it_they(1) 'it' >>> it_they(100) 'they' """ if n == 1: return "it" return "they" def is_are(n: int) -> str: """Is or are? >>> is_are(1) 'is' >>> is_are(2) 'are' """ if n == 1: return "is" return "are" def pluralize(n: int) -> str: """Add an s? >>> pluralize(15) 's' >>> count = 1 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.') There is 1 file. >>> count = 4 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.') There are 4 files. """ if n == 1: return "" return "s" def make_contractions(txt: str) -> str: """Glue words together to form contractions. >>> make_contractions('It is nice today.') "It's nice today." >>> make_contractions('I can not even...') "I can't even..." >>> make_contractions('She could not see!') "She couldn't see!" >>> make_contractions('But she will not go.') "But she won't go." >>> make_contractions('Verily, I shall not.') "Verily, I shan't." >>> make_contractions('No you cannot.') "No you can't." >>> make_contractions('I said you can not go.') "I said you can't go." """ first_second = [ ( [ 'are', 'could', 'did', 'has', 'have', 'is', 'must', 'should', 'was', 'were', 'would', ], ['(n)o(t)'], ), ( [ "I", "you", "he", "she", "it", "we", "they", "how", "why", "when", "where", "who", "there", ], ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'], ), ] # Special cases: can't, shan't and won't. txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE) txt = re.sub(r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE) txt = re.sub( r'\b(w)ill\s*(n)(o)(t)\b', r"\1\3\2'\4", txt, count=0, flags=re.IGNORECASE, ) for first_list, second_list in first_second: for first in first_list: for second in second_list: # Disallow there're/where're. They're valid English # but sound weird. if (first in ('there', 'where')) and second == 'a(re)': continue pattern = fr'\b({first})\s+{second}\b' if second == '(n)o(t)': replacement = r"\1\2'\3" else: replacement = r"\1'\2" txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE) return txt def thify(n: int) -> str: """Return the proper cardinal suffix for a number. >>> thify(1) 'st' >>> thify(33) 'rd' >>> thify(16) 'th' """ digit = str(n) assert is_integer_number(digit) digit = digit[-1:] if digit == "1": return "st" elif digit == "2": return "nd" elif digit == "3": return "rd" else: return "th" def ngrams(txt: str, n: int): """Return the ngrams from a string. >>> [x for x in ngrams('This is a test', 2)] ['This is', 'is a', 'a test'] """ words = txt.split() for ngram in ngrams_presplit(words, n): ret = '' for word in ngram: ret += f'{word} ' yield ret.strip() def ngrams_presplit(words: Sequence[str], n: int): return list_utils.ngrams(words, n) def bigrams(txt: str): return ngrams(txt, 2) def trigrams(txt: str): return ngrams(txt, 3) def shuffle_columns_into_list( input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim='' ) -> Iterable[str]: """Helper to shuffle / parse columnar data and return the results as a list. The column_specs argument is an iterable collection of numeric sequences that indicate one or more column numbers to copy. >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split() >>> shuffle_columns_into_list( ... cols, ... [ [8], [2, 3], [5, 6, 7] ], ... delim=' ', ... ) ['acl_test.py', 'scott wheel', 'Jul 9 11:34'] """ out = [] # Column specs map input lines' columns into outputs. # [col1, col2...] for spec in column_specs: hunk = '' for n in spec: hunk = hunk + delim + input_lines[n] hunk = hunk.strip(delim) out.append(hunk) return out def shuffle_columns_into_dict( input_lines: Sequence[str], column_specs: Iterable[Tuple[str, Iterable[int]]], delim='', ) -> Dict[str, str]: """Helper to shuffle / parse columnar data and return the results as a dict. >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split() >>> shuffle_columns_into_dict( ... cols, ... [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ], ... delim=' ', ... ) {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'} """ out = {} # Column specs map input lines' columns into outputs. # "key", [col1, col2...] for spec in column_specs: hunk = '' for n in spec[1]: hunk = hunk + delim + input_lines[n] hunk = hunk.strip(delim) out[spec[0]] = hunk return out def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str: """Interpolate a string with data from a dict. >>> interpolate_using_dict('This is a {adjective} {noun}.', ... {'adjective': 'good', 'noun': 'example'}) 'This is a good example.' """ return sprintf(txt.format(**values), end='') def to_ascii(x: str): """Encode as ascii bytes string. >>> to_ascii('test') b'test' >>> to_ascii(b'1, 2, 3') b'1, 2, 3' """ if isinstance(x, str): return x.encode('ascii') if isinstance(x, bytes): return x raise Exception('to_ascii works with strings and bytes') def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes: """Encode txt and then encode the bytes with a 64-character alphabet. This is compatible with uudecode. >>> to_base64('hello?') b'aGVsbG8/\\n' """ return base64.encodebytes(txt.encode(encoding, errors)) def is_base64(txt: str) -> bool: """Determine whether a string is base64 encoded (with Python's standard base64 alphabet which is the same as what uuencode uses). >>> is_base64('test') # all letters in the b64 alphabet True >>> is_base64('another test, how do you like this one?') False >>> is_base64(b'aGVsbG8/\\n') # Ending newline is ok. True """ a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/' alphabet = set(a.encode('ascii')) for char in to_ascii(txt.strip()): if char not in alphabet: return False return True def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str: """Convert base64 encoded string back to normal strings. >>> from_base64(b'aGVsbG8/\\n') 'hello?' """ return base64.decodebytes(b64).decode(encoding, errors) def chunk(txt: str, chunk_size): """Chunk up a string. >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8)) '01001101 11000101 10101010 10101010 10011111 10101000' """ if len(txt) % chunk_size != 0: msg = f'String to chunk\'s length ({len(txt)} is not an even multiple of chunk_size ({chunk_size})' logger.warning(msg) warnings.warn(msg, stacklevel=2) for x in range(0, len(txt), chunk_size): yield txt[x : x + chunk_size] def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str: """Encode txt and then chop it into bytes. Note: only bitstrings with delimiter='' are interpretable by from_bitstring. >>> to_bitstring('hello?') '011010000110010101101100011011000110111100111111' >>> to_bitstring('test', delimiter=' ') '01110100 01100101 01110011 01110100' >>> to_bitstring(b'test') '01110100011001010111001101110100' """ etxt = to_ascii(txt) bits = bin(int.from_bytes(etxt, 'big')) bits = bits[2:] return delimiter.join(chunk(bits.zfill(8 * ((len(bits) + 7) // 8)), 8)) def is_bitstring(txt: str) -> bool: """Is this a bitstring? >>> is_bitstring('011010000110010101101100011011000110111100111111') True >>> is_bitstring('1234') False """ return is_binary_integer_number(f'0b{txt}') def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str: """Convert from bitstring back to bytes then decode into a str. >>> from_bitstring('011010000110010101101100011011000110111100111111') 'hello?' """ n = int(bits, 2) return n.to_bytes((n.bit_length() + 7) // 8, 'big').decode(encoding, errors) or '\0' def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]: """Turn an IPv4 address into a tuple for sorting purposes. >>> ip_v4_sort_key('10.0.0.18') (10, 0, 0, 18) >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9'] >>> sorted(ips, key=lambda x: ip_v4_sort_key(x)) ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1'] """ if not is_ip_v4(txt): print(f"not IP: {txt}") return None return tuple(int(x) for x in txt.split('.')) def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]: """Chunk up a file path so that parent/ancestor paths sort before children/descendant paths. >>> path_ancestors_before_descendants_sort_key('/usr/local/bin') ('usr', 'local', 'bin') >>> paths = ['/usr/local', '/usr/local/bin', '/usr'] >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x)) ['/usr', '/usr/local', '/usr/local/bin'] """ return tuple(x for x in volume.split('/') if len(x) > 0) def replace_all(in_str: str, replace_set: str, replacement: str) -> str: """Execute several replace operations in a row. >>> s = 'this_is a-test!' >>> replace_all(s, ' _-!', '') 'thisisatest' """ for char in replace_set: in_str = in_str.replace(char, replacement) return in_str def replace_nth(in_str: str, source: str, target: str, nth: int): """Replaces the nth occurrance of a substring within a string. >>> replace_nth('this is a test', ' ', '-', 3) 'this is a-test' """ where = [m.start() for m in re.finditer(source, in_str)][nth - 1] before = in_str[:where] after = in_str[where:] after = after.replace(source, target, 1) return before + after if __name__ == '__main__': import doctest doctest.testmod()