X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=string_utils.py;h=4127079fc0a1b5670e676986421fad26009a3733;hb=a9bdfd8fc9f84b7b2c09a57cd12ba32259e84d1c;hp=ec662d234d8afa788cce2779e51f51622bc2862e;hpb=b22b39493c5b6c747b16e9430f3833bb8869cef6;p=python_utils.git diff --git a/string_utils.py b/string_utils.py index ec662d2..4127079 100644 --- a/string_utils.py +++ b/string_utils.py @@ -40,7 +40,17 @@ import string import unicodedata import warnings from itertools import zip_longest -from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Literal, + Optional, + Sequence, + Tuple, +) from uuid import uuid4 import list_utils @@ -157,7 +167,12 @@ NUM_SUFFIXES = { def is_none_or_empty(in_str: Optional[str]) -> bool: """ - Returns true if the input string is either None or an empty string. + Args: + in_str: the string to test + + Returns: + True if the input string is either None or an empty string, + False otherwise. >>> is_none_or_empty("") True @@ -173,7 +188,11 @@ def is_none_or_empty(in_str: Optional[str]) -> bool: def is_string(obj: Any) -> bool: """ - Checks if an object is a string. + Args: + in_str: the object to test + + Returns: + True if the object is a string and False otherwise. >>> is_string('test') True @@ -188,12 +207,23 @@ def is_string(obj: Any) -> bool: def is_empty_string(in_str: Any) -> bool: + """ + Args: + in_str: the string to test + + Returns: + True if the string is empty and False otherwise. + """ return is_empty(in_str) def is_empty(in_str: Any) -> bool: """ - Checks if input is a string and empty or only whitespace. + Args: + in_str: the string to test + + Returns: + True if the string is empty and false otherwise. >>> is_empty('') True @@ -211,7 +241,12 @@ def is_empty(in_str: Any) -> bool: def is_full_string(in_str: Any) -> bool: """ - Checks that input is a string and is not empty ('') or only whitespace. 
+ Args: + in_str: the object to test + + Returns: + True if the object is a string and is not empty ('') and + is not only composed of whitespace. >>> is_full_string('test!') True @@ -229,7 +264,12 @@ def is_full_string(in_str: Any) -> bool: def is_number(in_str: str) -> bool: """ - Checks if a string is a valid number. + Args: + in_str: the string to test + + Returns: + True if the string contains a valid numeric value and + False otherwise. >>> is_number(100.5) Traceback (most recent call last): @@ -253,9 +293,13 @@ def is_number(in_str: str) -> bool: def is_integer_number(in_str: str) -> bool: """ - Checks whether the given string represents an integer or not. + Args: + in_str: the string to test - An integer may be signed or unsigned or use a "scientific notation". + Returns: + True if the string contains a valid (signed or unsigned, + decimal, hex, or octal, regular or scientific) integral + expression and False otherwise. >>> is_integer_number('42') True @@ -272,7 +316,11 @@ def is_integer_number(in_str: str) -> bool: def is_hexidecimal_integer_number(in_str: str) -> bool: """ - Checks whether a string is a hex integer number. + Args: + in_str: the string to test + + Returns: + True if the string is a hex integer number and False otherwise. >>> is_hexidecimal_integer_number('0x12345') True @@ -304,7 +352,11 @@ def is_hexidecimal_integer_number(in_str: str) -> bool: def is_octal_integer_number(in_str: str) -> bool: """ - Checks whether a string is an octal number. + Args: + in_str: the string to test + + Returns: + True if the string is a valid octal integral number and False otherwise. >>> is_octal_integer_number('0o777') True @@ -324,7 +376,11 @@ def is_octal_integer_number(in_str: str) -> bool: def is_binary_integer_number(in_str: str) -> bool: """ - Returns whether a string contains a binary number. + Args: + in_str: the string to test + + Returns: + True if the string contains a binary integral number and False otherwise.
>>> is_binary_integer_number('0b10111') True @@ -345,7 +401,12 @@ def is_binary_integer_number(in_str: str) -> bool: def to_int(in_str: str) -> int: - """Returns the integral value of the string or raises on error. + """ + Args: + in_str: the string to convert + + Returns: + The integral value of the string or raises on error. >>> to_int('1234') 1234 @@ -367,9 +428,17 @@ def to_int(in_str: str) -> int: def is_decimal_number(in_str: str) -> bool: """ - Checks whether the given string represents a decimal or not. + Args: + in_str: the string to check + + Returns: + True if the given string represents a decimal or False + otherwise. A decimal may be signed or unsigned or use + a "scientific notation". - A decimal may be signed or unsigned or use a "scientific notation". + .. note:: + We do not consider integers without a decimal point + to be decimals; they return False (see example). >>> is_decimal_number('42.0') True @@ -381,7 +450,16 @@ def is_decimal_number(in_str: str) -> bool: def strip_escape_sequences(in_str: str) -> str: """ - Remove escape sequences in the input string. + Args: + in_str: the string to strip of escape sequences. + + Returns: + in_str with escape sequences removed. + + .. note:: + What is considered to be an "escape sequence" is defined + by a regular expression. While this gets common ones, + there may exist valid sequences that it doesn't match. >>> strip_escape_sequences('this is a test!') 'this is a test!' @@ -392,7 +470,13 @@ def strip_escape_sequences(in_str: str) -> str: def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str: """ - Add thousands separator to a numeric string. Also handles numbers. + Args: + in_str: string or number to which to add thousands separator(s) + separator_char: the separator character to add (defaults to comma) + places: add a separator every N places (defaults to three) + + Returns: + A numeric string with thousands separators added appropriately. 
>>> add_thousands_separator('12345678') '12,345,678' @@ -425,11 +509,18 @@ def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> st return ret -# Full url example: -# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool: """ - Check if a string is a valid url. + Args: + in_str: the string to test + allowed_schemes: an optional list of allowed schemes (e.g. + ['http', 'https', 'ftp']). If passed, only URLs that + begin with one of the schemes passed will be considered + to be valid. Otherwise, any scheme:// will be considered + valid. + + Returns: + True if in_str contains a valid URL and False otherwise. >>> is_url('http://www.mysite.com') True @@ -437,6 +528,8 @@ def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool: True >>> is_url('.mysite.com') False + >>> is_url('scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash') + True """ if not is_full_string(in_str): return False @@ -450,9 +543,12 @@ def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool: def is_email(in_str: Any) -> bool: """ - Check if a string is a valid email. + Args: + in_str: the email address to check - Reference: https://tools.ietf.org/html/rfc3696#section-3 + Returns: True if the in_str contains a valid email (as defined by + https://tools.ietf.org/html/rfc3696#section-3) or False + otherwise. >>> is_email('my.email@the-provider.com') True @@ -489,8 +585,14 @@ def is_email(in_str: Any) -> bool: def suffix_string_to_number(in_str: str) -> Optional[int]: - """Take a string like "33Gb" and convert it into a number (of bytes) - like 34603008. Return None if the input string is not valid. + """Takes a string like "33Mb" and converts it into a number (of bytes) + like 34603008.
+ + Args: + in_str: the string with a suffix to be interpreted and removed. + + Returns: + An integer number of bytes or None to indicate an error. >>> suffix_string_to_number('1Mb') 1048576 @@ -525,13 +627,18 @@ def suffix_string_to_number(in_str: str) -> Optional[int]: def number_to_suffix_string(num: int) -> Optional[str]: """Take a number (of bytes) and returns a string like "43.8Gb". - Returns none if the input is invalid. + + Args: + num: an integer number of bytes + + Returns: + A string with a suffix representing num bytes concisely or + None to indicate an error. >>> number_to_suffix_string(14066017894) '13.1Gb' >>> number_to_suffix_string(1024 * 1024) '1.0Mb' - """ d = 0.0 suffix = None @@ -548,18 +655,23 @@ def number_to_suffix_string(num: int) -> Optional[str]: def is_credit_card(in_str: Any, card_type: str = None) -> bool: """ - Checks if a string is a valid credit card number. - If card type is provided then it checks against that specific type only, - otherwise any known credit card number will be accepted. + Args: + in_str: a string to check + card_type: if provided, contains the card type to validate + with. Otherwise, all known credit card number types will + be accepted. - Supported card types are the following: + Supported card types are the following: - - VISA - - MASTERCARD - - AMERICAN_EXPRESS - - DINERS_CLUB - - DISCOVER - - JCB + * VISA + * MASTERCARD + * AMERICAN_EXPRESS + * DINERS_CLUB + * DISCOVER + * JCB + + Returns: + True if in_str is a valid credit card number. """ if not is_full_string(in_str): return False @@ -578,26 +690,31 @@ def is_credit_card(in_str: Any, card_type: str = None) -> bool: def is_camel_case(in_str: Any) -> bool: """ - Checks if a string is formatted as camel case. + Args: + in_str: the string to test - A string is considered camel case when: + Returns: + True if the string is formatted as camel case and False otherwise. 
+ A string is considered camel case when: - - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9]) - - it contains both lowercase and uppercase letters - - it does not start with a number + * it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9]) + * it contains both lowercase and uppercase letters + * it does not start with a number """ return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None def is_snake_case(in_str: Any, *, separator: str = "_") -> bool: """ - Checks if a string is formatted as "snake case". + Args: + in_str: the string to test - A string is considered snake case when: + Returns: True if the string is snake case and False otherwise. A + string is considered snake case when: - - it's composed only by lowercase/uppercase letters and digits - - it contains at least one underscore (or provided separator) - - it does not start with a number + * it's composed only by lowercase/uppercase letters and digits + * it contains at least one underscore (or provided separator) + * it does not start with a number >>> is_snake_case('this_is_a_test') True @@ -607,7 +724,6 @@ def is_snake_case(in_str: Any, *, separator: str = "_") -> bool: False >>> is_snake_case('this-is-a-test', separator='-') True - """ if is_full_string(in_str): re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE} @@ -622,7 +738,11 @@ def is_snake_case(in_str: Any, *, separator: str = "_") -> bool: def is_json(in_str: Any) -> bool: """ - Check if a string is a valid json. + Args: + in_str: the string to test + + Returns: + True if the in_str contains valid JSON and False otherwise. >>> is_json('{"name": "Peter"}') True @@ -641,7 +761,11 @@ def is_json(in_str: Any) -> bool: def is_uuid(in_str: Any, allow_hex: bool = False) -> bool: """ - Check if a string is a valid UUID. + Args: + in_str: the string to test + + Returns: + True if the in_str contains a valid UUID and False otherwise. 
>>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf') True @@ -659,7 +783,11 @@ def is_uuid(in_str: Any, allow_hex: bool = False) -> bool: def is_ip_v4(in_str: Any) -> bool: """ - Checks if a string is a valid ip v4. + Args: + in_str: the string to test + + Returns: + True if in_str contains a valid IPv4 address and False otherwise. >>> is_ip_v4('255.200.100.75') True @@ -680,7 +808,12 @@ def is_ip_v4(in_str: Any) -> bool: def extract_ip_v4(in_str: Any) -> Optional[str]: """ - Extracts the IPv4 chunk of a string or None. + Args: + in_str: the string to extract an IPv4 address from. + + Returns: + The first extracted IPv4 address from in_str or None if + none were found or an error occurred. >>> extract_ip_v4(' The secret IP address: 127.0.0.1 (use it wisely) ') '127.0.0.1' @@ -696,7 +829,11 @@ def extract_ip_v4(in_str: Any) -> Optional[str]: def is_ip_v6(in_str: Any) -> bool: """ - Checks if a string is a valid ip v6. + Args: + in_str: the string to test. + + Returns: + True if in_str contains a valid IPv6 address and False otherwise. >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334') True @@ -708,7 +845,12 @@ def is_ip_v6(in_str: Any) -> bool: def extract_ip_v6(in_str: Any) -> Optional[str]: """ - Extract IPv6 chunk or None. + Args: + in_str: the string from which to extract an IPv6 address. + + Returns: + The first IPv6 address found in in_str or None if no address + was found or an error occurred. >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334') '2001:db8:85a3:0000:0000:8a2e:370:7334' @@ -724,7 +866,12 @@ def extract_ip_v6(in_str: Any) -> Optional[str]: def is_ip(in_str: Any) -> bool: """ - Checks if a string is a valid ip (either v4 or v6). + Args: + in_str: the string to test. + + Returns: + True if in_str contains a valid IP address (either IPv4 or + IPv6). >>> is_ip('255.200.100.75') True @@ -740,14 +887,18 @@ def is_ip(in_str: Any) -> bool: def extract_ip(in_str: Any) -> Optional[str]: """ - Extract the IP address or None. 
+ Args: + in_str: the string from which to extract an IP address. + + Returns: + The first IP address (IPv4 or IPv6) found in in_str or + None to indicate none found or an error condition. >>> extract_ip('Attacker: 255.200.100.75') '255.200.100.75' >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334') '2001:db8:85a3:0000:0000:8a2e:370:7334' >>> extract_ip('1.2.3') - """ ip = extract_ip_v4(in_str) if ip is None: @@ -756,7 +907,12 @@ def extract_ip(in_str: Any) -> Optional[str]: def is_mac_address(in_str: Any) -> bool: - """Return True if in_str is a valid MAC address false otherwise. + """ + Args: + in_str: the string to test + + Returns: + True if in_str is a valid MAC address and False otherwise. >>> is_mac_address("34:29:8F:12:0D:2F") True @@ -772,14 +928,18 @@ def is_mac_address(in_str: Any) -> bool: def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]: """ - Extract the MAC address from in_str. + Args: + in_str: the string from which to extract a MAC address. + + Returns: + The first MAC address found in in_str or None to indicate no + match or an error. >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F') '34:29:8F:12:0D:2F' >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]') 'd8:5d:e2:34:54:86' - """ if not is_full_string(in_str): return None @@ -795,13 +955,16 @@ def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]: def is_slug(in_str: Any, separator: str = "-") -> bool: """ - Checks if a given string is a slug (as created by `slugify()`). + Args: + in_str: string to test + + Returns: + True if in_str is a slug string and False otherwise. >>> is_slug('my-blog-post-title') True >>> is_slug('My blog post title') False - """ if not is_full_string(in_str): return False @@ -811,10 +974,18 @@ def is_slug(in_str: Any, separator: str = "-") -> bool: def contains_html(in_str: str) -> bool: """ - Checks if the given string contains HTML/XML tags.
+ Args: + in_str: the string to check for tags in + + Returns: + True if the given string contains HTML/XML tags and False + otherwise. - By design, this function matches ANY type of tag, so don't expect to use it - as an HTML validator, its goal is to detect "malicious" or undesired tags in the text. + .. warning:: + By design, this function matches ANY type of tag, so don't expect + to use it as an HTML validator. It's a quick sanity check at + best. See something like BeautifulSoup for a more full-featured + HTML parser. >>> contains_html('my string is <strong>bold</strong>') True @@ -829,31 +1000,67 @@ def contains_html(in_str: str) -> bool: def words_count(in_str: str) -> int: """ - Returns the number of words contained into the given string. + Args: + in_str: the string to count words in + + Returns: + The number of words contained in the given string. - This method is smart, it does consider only sequence of one or more letter and/or numbers - as "words", so a string like this: "! @ # % ... []" will return zero! - Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop" - will be 4 not 1 (even if there are no spaces in the string). + .. note:: + + This method is "smart" in that it does consider only sequences + of one or more letter and/or numbers to be "words". Thus a + string like this: "! @ # % ... []" will return zero. Moreover + it is aware of punctuation, so the count for a string like + "one,two,three.stop" will be 4 not 1 (even if there are no spaces + in the string). >>> words_count('hello world') 2 >>> words_count('one,two,three.stop') 4 - """ if not is_string(in_str): raise ValueError(in_str) return len(WORDS_COUNT_RE.findall(in_str)) +def word_count(in_str: str) -> int: + """ + Args: + in_str: the string to count words in + + Returns: + The number of words contained in the given string. + + .. note:: + + This method is "smart" in that it does consider only sequences + of one or more letter and/or numbers to be "words".
Thus a + string like this: "! @ # % ... []" will return zero. Moreover + it is aware of punctuation, so the count for a string like + "one,two,three.stop" will be 4 not 1 (even if there are no spaces + in the string). + + >>> word_count('hello world') + 2 + >>> word_count('one,two,three.stop') + 4 + """ + return words_count(in_str) + + def generate_uuid(omit_dashes: bool = False) -> str: """ - Generated an UUID string (using `uuid.uuid4()`). + Args: + omit_dashes: should we omit the dashes in the generated UUID? + + Returns: + A generated UUID string (using `uuid.uuid4()`) with or without + dashes per the omit_dashes arg. generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b' generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b' - """ uid = uuid4() if omit_dashes: @@ -863,11 +1070,16 @@ def generate_uuid(omit_dashes: bool = False) -> str: def generate_random_alphanumeric_string(size: int) -> str: """ - Returns a string of the specified size containing random - characters (uppercase/lowercase ascii letters and digits). + Args: + size: number of characters to generate - random_string(9) # possible output: "cx3QQbzYg" + Returns: + A string of the specified size containing random characters + (uppercase/lowercase ascii letters and digits). + >>> random.seed(22) + >>> generate_random_alphanumeric_string(9) + '96ipbNClS' """ if size < 1: raise ValueError("size must be >= 1") @@ -878,11 +1090,14 @@ def generate_random_alphanumeric_string(size: int) -> str: def reverse(in_str: str) -> str: """ - Returns the string with its chars reversed. + Args: + in_str: the string to reverse + + Returns: + The reversed (character by character) string. >>> reverse('test') 'tset' - """ if not is_string(in_str): raise ValueError(in_str) @@ -891,8 +1106,13 @@ def reverse(in_str: str) -> str: def camel_case_to_snake_case(in_str, *, separator="_"): """ - Convert a camel case string into a snake case one.
- (The original string is returned if is not a valid camel case string) + Args: + in_str: the camel case string to convert + + Returns: + A snake case string equivalent to the camel case input or the + original string if it is not a valid camel case string or some + other error occurs. >>> camel_case_to_snake_case('MacAddressExtractorFactory') 'mac_address_extractor_factory' @@ -910,8 +1130,13 @@ def snake_case_to_camel_case( in_str: str, *, upper_case_first: bool = True, separator: str = "_" ) -> str: """ - Convert a snake case string into a camel case one. - (The original string is returned if is not a valid snake case string) + Args: + in_str: the snake case string to convert + + Returns: + A camel case string that is equivalent to the snake case string + provided or the original string back again if it is not valid + snake case or another error occurs. >>> snake_case_to_camel_case('this_is_a_test') 'ThisIsATest' @@ -929,7 +1154,12 @@ def snake_case_to_camel_case( def to_char_list(in_str: str) -> List[str]: - """Convert a string into a list of chars. + """ + Args: + in_str: the string to split into a char list + + Returns: + A list of strings of length one each. >>> to_char_list('test') ['t', 'e', 's', 't'] @@ -940,7 +1170,13 @@ def to_char_list(in_str: str) -> List[str]: def from_char_list(in_list: List[str]) -> str: - """Convert a char list into a string. + """ + Args: + in_list: A list of characters to convert into a string. + + Returns: + The string resulting from gluing the characters in in_list + together. >>> from_char_list(['t', 'e', 's', 't']) 'test' @@ -948,22 +1184,61 @@ def from_char_list(in_list: List[str]) -> str: return "".join(in_list) -def shuffle(in_str: str) -> str: - """Return a new string containing same chars of the given one but in - a randomized order. 
+def shuffle(in_str: str) -> Optional[str]: """ - if not is_string(in_str): - raise ValueError(in_str) + Args: + in_str: a string to shuffle randomly by character + + Returns: + A new string containing same chars of the given one but in + a randomized order. Note that in rare cases this could result + in the same original string as no check is done. Returns + None to indicate error conditions. - # turn the string into a list of chars + >>> random.seed(22) + >>> shuffle('awesome') + 'meosaew' + """ + if not is_string(in_str): + return None chars = to_char_list(in_str) random.shuffle(chars) return from_char_list(chars) +def scramble(in_str: str) -> Optional[str]: + """ + Args: + in_str: a string to shuffle randomly by character + + Returns: + A new string containing same chars of the given one but in + a randomized order. Note that in rare cases this could result + in the same original string as no check is done. Returns + None to indicate error conditions. + + >>> random.seed(22) + >>> scramble('awesome') + 'meosaew' + """ + return shuffle(in_str) + + def strip_html(in_str: str, keep_tag_content: bool = False) -> str: """ - Remove html code contained into the given string. + Args: + in_str: the string to strip tags from + keep_tag_content: should we keep the inner contents of tags? + + Returns: + A string with all HTML tags removed (optionally with tag contents + preserved). + + .. note:: + This method uses simple regular expressions to strip tags and is + not a full fledged HTML parser by any means. Consider using + something like BeautifulSoup if your needs are more than this + simple code can fulfill. >>> strip_html('test: click here') 'test: ' @@ -978,11 +1253,17 @@ def strip_html(in_str: str, keep_tag_content: bool = False) -> str: def asciify(in_str: str) -> str: """ - Force string content to be ascii-only by translating all non-ascii - chars into the closest possible representation (eg: ó -> o, Ë -> - E, ç -> c...). + Args: + in_str: the string to asciify. 
+ + Returns: + An output string roughly equivalent to the original string + whose content is ASCII-only. This is accomplished + by translating all non-ascii chars into their closest possible + ASCII representation (eg: ó -> o, Ë -> E, ç -> c...). - N.B. Some chars may be lost if impossible to translate. + .. warning:: + Some chars may be lost if impossible to translate. >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË') 'eeuuooaaeynAAACIINOE' @@ -1004,15 +1285,20 @@ def asciify(in_str: str) -> str: def slugify(in_str: str, *, separator: str = "-") -> str: """ - Converts a string into a "slug" using provided separator. - The returned string has the following properties: + Args: + in_str: the string to slugify + separator: the character to use during slugification (default + is a dash) + + Returns: + The converted string. The returned string has the following properties: - - it has no spaces - - all letters are in lower case - - all punctuation signs and non alphanumeric chars are removed - - words are divided using provided separator - - all chars are encoded as ascii (by using `asciify()`) - - is safe for URL + * it has no spaces + * all letters are in lower case + * all punctuation signs and non alphanumeric chars are removed + * words are divided using provided separator + * all chars are encoded as ascii (by using :meth:`asciify`) + * is safe for URL >>> slugify('Top 10 Reasons To Love Dogs!!!') 'top-10-reasons-to-love-dogs' @@ -1035,17 +1321,22 @@ def slugify(in_str: str, *, separator: str = "-") -> str: def to_bool(in_str: str) -> bool: """ - Turns a string into a boolean based on its content (CASE INSENSITIVE). + Args: + in_str: the string to convert to boolean - A positive boolean (True) is returned if the string value is one - of the following: + Returns: + A boolean equivalent of the original string based on its contents. + All conversion is case insensitive.
A positive boolean (True) is + returned if the string value is any of the following: - - "true" - - "1" - - "yes" - - "y" + * "true" + * "t" + * "1" + * "yes" + * "y" + * "on" - Otherwise False is returned. + Otherwise False is returned. >>> to_bool('True') True @@ -1064,7 +1355,6 @@ def to_bool(in_str: str) -> bool: >>> to_bool('on') True - """ if not is_string(in_str): raise ValueError(in_str) @@ -1073,31 +1363,58 @@ def to_bool(in_str: str) -> bool: def to_date(in_str: str) -> Optional[datetime.date]: """ - Parses a date string. See DateParser docs for details. + Args: + in_str: the string to convert into a date + + Returns: + The datetime.date the string contained or None to indicate + an error. This parser is relatively clever; see + :class:`python_modules.dateparse.dateparse_utils` docs for + details. + + >>> to_date('9/11/2001') + datetime.date(2001, 9, 11) + >>> to_date('xyzzy') """ - import dateparse.dateparse_utils as dp + import dateparse.dateparse_utils as du try: - d = dp.DateParser() + d = du.DateParser() # type: ignore d.parse(in_str) return d.get_date() - except dp.ParseException: + except du.ParseException: # type: ignore msg = f'Unable to parse date {in_str}.' logger.warning(msg) return None -def valid_date(in_str: str) -> bool: +def is_valid_date(in_str: str) -> bool: """ - True if the string represents a valid date. + Args: + in_str: the string to check + + Returns: + True if the string represents a valid date that we can recognize + and False otherwise. This parser is relatively clever; see + :class:`python_modules.dateparse.dateparse_utils` docs for + details. 
+ + >>> is_valid_date('1/2/2022') + True + >>> is_valid_date('christmas') + True + >>> is_valid_date('next wednesday') + True + >>> is_valid_date('xyzzy') + False """ import dateparse.dateparse_utils as dp try: - d = dp.DateParser() + d = dp.DateParser() # type: ignore _ = d.parse(in_str) return True - except dp.ParseException: + except dp.ParseException: # type: ignore msg = f'Unable to parse date {in_str}.' logger.warning(msg) return False @@ -1105,16 +1422,26 @@ def valid_date(in_str: str) -> bool: def to_datetime(in_str: str) -> Optional[datetime.datetime]: """ - Parses a datetime string. See DateParser docs for more info. + Args: + in_str: string to parse into a datetime + + Returns: + A python datetime parsed from in_str or None to indicate + an error. This parser is relatively clever; see + :class:`python_modules.dateparse.dateparse_utils` docs for + details. + + >>> to_datetime('7/20/1969 02:56 GMT') + datetime.datetime(1969, 7, 20, 2, 56, tzinfo=) """ import dateparse.dateparse_utils as dp try: - d = dp.DateParser() + d = dp.DateParser() # type: ignore dt = d.parse(in_str) - if type(dt) == datetime.datetime: + if isinstance(dt, datetime.datetime): return dt - except ValueError: + except Exception: msg = f'Unable to parse datetime {in_str}.' logger.warning(msg) return None @@ -1122,7 +1449,23 @@ def to_datetime(in_str: str) -> Optional[datetime.datetime]: def valid_datetime(in_str: str) -> bool: """ - True if the string represents a valid datetime. + Args: + in_str: the string to check + + Returns: + True if in_str contains a valid datetime and False otherwise. + This parser is relatively clever; see + :class:`python_modules.dateparse.dateparse_utils` docs for + details. 
+ + >>> valid_datetime('next wednesday at noon') + True + >>> valid_datetime('3 weeks ago at midnight') + True + >>> valid_datetime('next easter at 5:00 am') + True + >>> valid_datetime('sometime soon') + False """ _ = to_datetime(in_str) if _ is not None: @@ -1134,7 +1477,13 @@ def valid_datetime(in_str: str) -> bool: def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str: """ - Squeeze runs of more than one character_to_squeeze into one. + Args: + in_str: the string to squeeze + character_to_squeeze: the character to remove runs of + more than one in a row (default = space) + + Returns: A "squeezed string" where runs of more than one + character_to_squeeze into one. >>> squeeze(' this is a test ') ' this is a test ' @@ -1150,12 +1499,23 @@ def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str: ) -def dedent(in_str: str) -> str: +def dedent(in_str: str) -> Optional[str]: """ - Removes tab indentation from multi line strings (inspired by analogous Scala function). + Args: + in_str: the string to dedent + + Returns: + A string with tab indentation removed or None on error. + + .. note:: + + Inspired by analogous Scala function. + + >>> dedent('\t\ttest\\n\t\ting') + 'test\\ning' """ if not is_string(in_str): - raise ValueError(in_str) + return None line_separator = '\n' lines = [MARGIN_RE.sub('', line) for line in in_str.split(line_separator)] return line_separator.join(lines) @@ -1163,11 +1523,15 @@ def dedent(in_str: str) -> str: def indent(in_str: str, amount: int) -> str: """ - Indents string by prepending amount spaces. + Args: + in_str: the string to indent + amount: count of spaces to indent each line by + + Returns: + An indented string created by prepending amount spaces. 
>>> indent('This is a test', 4) ' This is a test' - """ if not is_string(in_str): raise ValueError(in_str) @@ -1177,7 +1541,15 @@ def indent(in_str: str, amount: int) -> str: def sprintf(*args, **kwargs) -> str: - """String printf, like in C""" + """ + Args: + This function uses the same syntax as the builtin print + function. + + Returns: + An interpolated string capturing print output, like man(3) + :code:sprintf. + """ ret = "" sep = kwargs.pop("sep", None) @@ -1208,15 +1580,45 @@ def sprintf(*args, **kwargs) -> str: return ret -class SprintfStdout(object): +def strip_ansi_sequences(in_str: str) -> str: """ - A context manager that captures outputs to stdout. + Args: + in_str: the string to strip - with SprintfStdout() as buf: - print("test") - print(buf()) + Returns: + in_str with recognized ANSI escape sequences removed. + + .. warning:: + This method works by using a regular expression. + It works for all ANSI escape sequences I've tested with but + may miss some; caveat emptor. + + >>> import ansi as a + >>> s = a.fg('blue') + 'blue!' + a.reset() + >>> len(s) # '\x1b[38;5;21mblue!\x1b[m' + 18 + >>> len(strip_ansi_sequences(s)) + 5 + >>> strip_ansi_sequences(s) + 'blue!' + + """ + return re.sub(r'\x1b\[[\d+;]*[a-z]', '', in_str) + + +class SprintfStdout(contextlib.AbstractContextManager): + """ + A context manager that captures outputs to stdout to a buffer + without printing them. + + >>> with SprintfStdout() as buf: + ... print("test") + ... print("1, 2, 3") + ... 
+ >>> print(buf(), end='') + test + 1, 2, 3 - 'test\n' """ def __init__(self) -> None: @@ -1228,14 +1630,19 @@ class SprintfStdout(object): self.recorder.__enter__() return lambda: self.destination.getvalue() - def __exit__(self, *args) -> None: + def __exit__(self, *args) -> Literal[False]: self.recorder.__exit__(*args) self.destination.seek(0) - return None # don't suppress exceptions + return False + +def capitalize_first_letter(in_str: str) -> str: + """ + Args: + in_str: the string to capitalize -def capitalize_first_letter(txt: str) -> str: - """Capitalize the first letter of a string. + Returns: + in_str with the first character capitalized. >>> capitalize_first_letter('test') 'Test' @@ -1243,17 +1650,27 @@ def capitalize_first_letter(txt: str) -> str: 'ALREADY!' """ - return txt[0].upper() + txt[1:] + return in_str[0].upper() + in_str[1:] def it_they(n: int) -> str: - """It or they? + """ + Args: + n: how many of them are there? + + Returns: + 'it' if n is one or 'they' otherwise. + + Suggested usage:: + + n = num_files_saved_to_tmp() + print(f'Saved file{pluralize(n)} successfully.') + print(f'{it_they(n)} {is_are(n)} located in /tmp.') >>> it_they(1) 'it' >>> it_they(100) 'they' - """ if n == 1: return "it" @@ -1261,7 +1678,18 @@ def it_they(n: int) -> str: def is_are(n: int) -> str: - """Is or are? + """ + Args: + n: how many of them are there? + + Returns: + 'is' if n is one or 'are' otherwise. + + Suggested usage:: + + n = num_files_saved_to_tmp() + print(f'Saved file{pluralize(n)} successfully.') + print(f'{it_they(n)} {is_are(n)} located in /tmp.') >>> is_are(1) 'is' @@ -1275,7 +1703,18 @@ def is_are(n: int) -> str: def pluralize(n: int) -> str: - """Add an s? + """ + Args: + n: how many of them are there? + + Returns: + 's' if n is greater than one otherwise ''.
+ + Suggested usage:: + + n = num_files_saved_to_tmp() + print(f'Saved file{pluralize(n)} successfully.') + print(f'{it_they(n)} {is_are(n)} located in /tmp.') >>> pluralize(15) 's' @@ -1285,7 +1724,6 @@ def pluralize(n: int) -> str: >>> count = 4 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.') There are 4 files. - """ if n == 1: return "" @@ -1293,7 +1731,20 @@ def pluralize(n: int) -> str: def make_contractions(txt: str) -> str: - """Glue words together to form contractions. + """This code glues words in txt together to form (English) + contractions. + + Args: + txt: the input text to be contractionized. + + Returns: + Output text identical to original input except that any + recognized contractions are formed. + + .. note:: + The order in which we create contractions is defined by the + implementation and what I thought made more sense when writing + this code. >>> make_contractions('It is nice today.') "It's nice today." @@ -1315,7 +1766,6 @@ def make_contractions(txt: str) -> str: >>> make_contractions('I said you can not go.') "I said you can't go." - """ first_second = [ @@ -1371,7 +1821,7 @@ def make_contractions(txt: str) -> str: for second in second_list: # Disallow there're/where're. They're valid English # but sound weird. - if (first == 'there' or first == 'where') and second == 'a(re)': + if (first in ('there', 'where')) and second == 'a(re)': continue pattern = fr'\b({first})\s+{second}\b' @@ -1385,7 +1835,21 @@ def make_contractions(txt: str) -> str: def thify(n: int) -> str: - """Return the proper cardinal suffix for a number. + """ + Args: + n: how many of them are there? + + Returns: + The proper ordinal suffix for a number. 
+ + Suggested usage:: + + attempt_count = 0 + while True: + attempt_count += 1 + if try_the_thing(): + break + print(f'The {attempt_count}{thify(attempt_count)} attempt failed, trying again.') >>> thify(1) 'st' @@ -1393,7 +1857,6 @@ def thify(n: int) -> str: 'rd' >>> thify(16) 'th' - """ digit = str(n) assert is_integer_number(digit) @@ -1409,11 +1872,16 @@ def thify(n: int) -> str: def ngrams(txt: str, n: int): - """Return the ngrams from a string. + """ + Args: + txt: the string from which to create ngrams + n: how many words per ngram created? + + Returns: + Generates the ngrams from the input string. >>> [x for x in ngrams('This is a test', 2)] ['This is', 'is a', 'a test'] - """ words = txt.split() for ngram in ngrams_presplit(words, n): @@ -1424,14 +1892,19 @@ def ngrams(txt: str, n: int): def ngrams_presplit(words: Sequence[str], n: int): + """ + Same as :meth:`ngrams` but with the string pre-split. + """ return list_utils.ngrams(words, n) def bigrams(txt: str): + """Generates the bigrams (n=2) of the given string.""" return ngrams(txt, 2) def trigrams(txt: str): + """Generates the trigrams (n=3) of the given string.""" return ngrams(txt, 3) @@ -1439,29 +1912,40 @@ def shuffle_columns_into_list( input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim='' ) -> Iterable[str]: """Helper to shuffle / parse columnar data and return the results as a - list. The column_specs argument is an iterable collection of - numeric sequences that indicate one or more column numbers to - copy. + list. + + Args: + input_lines: A sequence of strings that represents text that + has been broken into columns by the caller + column_specs: an iterable collection of numeric sequences that + indicate one or more column numbers to copy to form the Nth + position in the output list. See example below. + delim: for column_specs that indicate we should copy more than + one column from the input into this position, use delim to + separate source data. Defaults to ''. 
+ + Returns: + A list of strings created by following the instructions set forth + in column_specs. >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split() >>> shuffle_columns_into_list( ... cols, ... [ [8], [2, 3], [5, 6, 7] ], - ... delim=' ', + ... delim='!', ... ) - ['acl_test.py', 'scott wheel', 'Jul 9 11:34'] - + ['acl_test.py', 'scott!wheel', 'Jul!9!11:34'] """ out = [] # Column specs map input lines' columns into outputs. # [col1, col2...] for spec in column_specs: - chunk = '' + hunk = '' for n in spec: - chunk = chunk + delim + input_lines[n] - chunk = chunk.strip(delim) - out.append(chunk) + hunk = hunk + delim + input_lines[n] + hunk = hunk.strip(delim) + out.append(hunk) return out @@ -1473,70 +1957,100 @@ def shuffle_columns_into_dict( """Helper to shuffle / parse columnar data and return the results as a dict. + Args: + input_lines: a sequence of strings that represents text that + has been broken into columns by the caller + column_specs: instructions for what dictionary keys to apply + to individual or compound input column data. See example + below. + delim: when forming compound output data by gluing more than + one input column together, use this character to separate + the source data. Defaults to ''. + + Returns: + A dict formed by applying the column_specs instructions. + >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split() >>> shuffle_columns_into_dict( ... cols, ... [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ], - ... delim=' ', + ... delim='!', ... ) - {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'} - + {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'} """ out = {} # Column specs map input lines' columns into outputs. # "key", [col1, col2...] 
for spec in column_specs: - chunk = '' + hunk = '' for n in spec[1]: - chunk = chunk + delim + input_lines[n] - chunk = chunk.strip(delim) - out[spec[0]] = chunk + hunk = hunk + delim + input_lines[n] + hunk = hunk.strip(delim) + out[spec[0]] = hunk return out def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str: - """Interpolate a string with data from a dict. + """ + Interpolate a string with data from a dict. + + Args: + txt: the template string containing {name} style placeholders + values: a dict mapping each placeholder name to its replacement text. >>> interpolate_using_dict('This is a {adjective} {noun}.', ... {'adjective': 'good', 'noun': 'example'}) 'This is a good example.' - """ return sprintf(txt.format(**values), end='') -def to_ascii(x: str): - """Encode as ascii bytes string. +def to_ascii(txt: str): + """ + Args: + txt: the input data to encode + + Returns: + txt encoded as an ASCII byte string. >>> to_ascii('test') b'test' >>> to_ascii(b'1, 2, 3') b'1, 2, 3' - """ - if type(x) is str: - return x.encode('ascii') - if type(x) is bytes: - return x + if isinstance(txt, str): + return txt.encode('ascii') + if isinstance(txt, bytes): + return txt raise Exception('to_ascii works with strings and bytes') def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes: - """Encode txt and then encode the bytes with a 64-character - alphabet. This is compatible with uudecode. + """ + Args: + txt: the input data to encode + + Returns: + txt encoded with a 64-character alphabet. Similar to and compatible + with uuencode/uudecode. >>> to_base64('hello?') b'aGVsbG8/\\n' - """ return base64.encodebytes(txt.encode(encoding, errors)) def is_base64(txt: str) -> bool: - """Determine whether a string is base64 encoded (with Python's standard - base64 alphabet which is the same as what uuencode uses). + """ + Args: + txt: the string to check + + Returns: + True if txt is a valid base64 encoded string. 
This assumes + txt was encoded with Python's standard base64 alphabet which + is the same as what uuencode/uudecode uses. >>> is_base64('test') # all letters in the b64 alphabet True @@ -1557,21 +2071,31 @@ def is_base64(txt: str) -> bool: def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str: - """Convert base64 encoded string back to normal strings. + """ + Args: + b64: bytestring of base64-encoded data to decode / convert. + + Returns: + The decoded form of b64 as a normal python string. Similar to + and compatible with uuencode / uudecode. >>> from_base64(b'aGVsbG8/\\n') 'hello?' - """ return base64.decodebytes(b64).decode(encoding, errors) -def chunk(txt: str, chunk_size): - """Chunk up a string. +def chunk(txt: str, chunk_size: int): + """ + Args: + txt: a string to be chunked into equally sized pieces. + chunk_size: the size of each chunk to make + + Returns: + The original string chunked into equally sized pieces. >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8)) '01001101 11000101 10101010 10101010 10011111 10101000' - """ if len(txt) % chunk_size != 0: msg = f'String to chunk\'s length ({len(txt)} is not an even multiple of chunk_size ({chunk_size})' @@ -1581,9 +2105,16 @@ def chunk(txt: str, chunk_size): yield txt[x : x + chunk_size] -def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str: - """Encode txt and then chop it into bytes. Note: only bitstrings - with delimiter='' are interpretable by from_bitstring. +def to_bitstring(txt: str, *, delimiter='') -> str: + """ + Args: + txt: the string to convert into a bitstring + delimiter: character to insert between adjacent bytes. Note that + only bitstrings with delimiter='' are interpretable by + :meth:`from_bitstring`. + + Returns: + txt converted to ascii/binary and then chopped into bytes. 
>>> to_bitstring('hello?') '011010000110010101101100011011000110111100111111' @@ -1593,7 +2124,6 @@ def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatep >>> to_bitstring(b'test') '01110100011001010111001101110100' - """ etxt = to_ascii(txt) bits = bin(int.from_bytes(etxt, 'big')) @@ -1602,31 +2132,50 @@ def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatep def is_bitstring(txt: str) -> bool: - """Is this a bitstring? + """ + Args: + txt: the string to check + + Returns: + True if txt is a recognized bitstring and False otherwise. + Note that if delimiter is non-empty this code will not + recognize the bitstring. >>> is_bitstring('011010000110010101101100011011000110111100111111') True >>> is_bitstring('1234') False - """ return is_binary_integer_number(f'0b{txt}') def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str: - """Convert from bitstring back to bytes then decode into a str. + """ + Args: + bits: the bitstring to convert back into a python string + encoding: the encoding to use + + Returns: + The regular python string represented by bits. Note that this + code does not work with to_bitstring when delimiter is non-empty. >>> from_bitstring('011010000110010101101100011011000110111100111111') 'hello?' - """ n = int(bits, 2) return n.to_bytes((n.bit_length() + 7) // 8, 'big').decode(encoding, errors) or '\0' def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]: - """Turn an IPv4 address into a tuple for sorting purposes. + """ + Args: + txt: an IP address to chunk up for sorting purposes + + Returns: + A tuple of IP components arranged such that the sorting of + IP addresses using a normal comparator will do something sane + and desirable. 
>>> ip_v4_sort_key('10.0.0.18') (10, 0, 0, 18) @@ -1634,17 +2183,22 @@ def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]: >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9'] >>> sorted(ips, key=lambda x: ip_v4_sort_key(x)) ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1'] - """ if not is_ip_v4(txt): print(f"not IP: {txt}") return None - return tuple([int(x) for x in txt.split('.')]) + return tuple(int(x) for x in txt.split('.')) def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]: - """Chunk up a file path so that parent/ancestor paths sort before - children/descendant paths. + """ + Args: + volume: the string to chunk up for sorting purposes + + Returns: + A tuple of volume's components such that the sorting of + volumes using a normal comparator places parent/ancestor + paths before children/descendant paths. >>> path_ancestors_before_descendants_sort_key('/usr/local/bin') ('usr', 'local', 'bin') @@ -1652,24 +2206,52 @@ def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]: >>> paths = ['/usr/local', '/usr/local/bin', '/usr'] >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x)) ['/usr', '/usr/local', '/usr/local/bin'] - """ - return tuple([x for x in volume.split('/') if len(x) > 0]) + return tuple(x for x in volume.split('/') if len(x) > 0) def replace_all(in_str: str, replace_set: str, replacement: str) -> str: - """Execute several replace operations in a row. + """ + Execute several replace operations in a row. + + Args: + in_str: the string in which to replace characters + replace_set: the set of target characters to replace + replacement: the character to replace any member of replace_set + with + + Returns: + The string with replacements executed. >>> s = 'this_is a-test!' 
>>> replace_all(s, ' _-!', '') 'thisisatest' - """ for char in replace_set: in_str = in_str.replace(char, replacement) return in_str +def replace_nth(in_str: str, source: str, target: str, nth: int): + """ + Replaces the nth occurrence of a substring within a string. + + Args: + in_str: the string in which to run the replacement + source: the substring to replace (matched as a regular expression when locating occurrences) + target: the replacement text + nth: which occurrence of source to replace (1-based) + + >>> replace_nth('this is a test', ' ', '-', 3) + 'this is a-test' + """ + where = [m.start() for m in re.finditer(source, in_str)][nth - 1] + before = in_str[:where] + after = in_str[where:] + after = after.replace(source, target, 1) + return before + after + + if __name__ == '__main__': import doctest