X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=src%2Fpyutils%2Fstring_utils.py;h=a016d075647ee8e371cd1f946e0d648f006dd230;hb=HEAD;hp=1e791eb2e3774d80c5f74d37e1795f13e2fc5742;hpb=e9266a3af23eda8cee1ac0a97471c71e4248671b;p=pyutils.git diff --git a/src/pyutils/string_utils.py b/src/pyutils/string_utils.py index 1e791eb..a016d07 100644 --- a/src/pyutils/string_utils.py +++ b/src/pyutils/string_utils.py @@ -50,6 +50,7 @@ from typing import ( Any, Callable, Dict, + Generator, Iterable, List, Literal, @@ -355,6 +356,9 @@ def is_number(in_str: str) -> bool: True if the string contains a valid numberic value and False otherwise. + Raises: + TypeError: the input argument isn't a string + See also :meth:`is_integer_number`, :meth:`is_decimal_number`, :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`, etc... @@ -362,7 +366,7 @@ def is_number(in_str: str) -> bool: >>> is_number(100.5) Traceback (most recent call last): ... - ValueError: 100.5 + TypeError: 100.5 >>> is_number("100.5") True >>> is_number("test") @@ -372,10 +376,10 @@ def is_number(in_str: str) -> bool: >>> is_number([1, 2, 3]) Traceback (most recent call last): ... - ValueError: [1, 2, 3] + TypeError: [1, 2, 3] """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) return NUMBER_RE.match(in_str) is not None @@ -414,6 +418,9 @@ def is_hexidecimal_integer_number(in_str: str) -> bool: Returns: True if the string is a hex integer number and False otherwise. + Raises: + TypeError: the input argument isn't a string + See also :meth:`is_integer_number`, :meth:`is_decimal_number`, :meth:`is_octal_integer_number`, :meth:`is_binary_integer_number`, etc... @@ -430,18 +437,18 @@ def is_hexidecimal_integer_number(in_str: str) -> bool: >>> is_hexidecimal_integer_number(12345) # Not a string Traceback (most recent call last): ... - ValueError: 12345 + TypeError: 12345 >>> is_hexidecimal_integer_number(101.4) Traceback (most recent call last): ... - ValueError: 101.4 + TypeError: 101.4 >>> is_hexidecimal_integer_number(0x1A3E) Traceback (most recent call last): ... - ValueError: 6718 + TypeError: 6718 """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) return HEX_NUMBER_RE.match(in_str) is not None @@ -453,6 +460,9 @@ def is_octal_integer_number(in_str: str) -> bool: Returns: True if the string is a valid octal integral number and False otherwise. + Raises: + TypeError: the input argument isn't a string + See also :meth:`is_integer_number`, :meth:`is_decimal_number`, :meth:`is_hexidecimal_integer_number`, :meth:`is_binary_integer_number`, etc... @@ -469,7 +479,7 @@ def is_octal_integer_number(in_str: str) -> bool: False """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) return OCT_NUMBER_RE.match(in_str) is not None @@ -481,6 +491,9 @@ def is_binary_integer_number(in_str: str) -> bool: Returns: True if the string contains a binary integral number and False otherwise. + Raises: + TypeError: the input argument isn't a string + See also :meth:`is_integer_number`, :meth:`is_decimal_number`, :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`, etc... @@ -499,7 +512,7 @@ def is_binary_integer_number(in_str: str) -> bool: False """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) return BIN_NUMBER_RE.match(in_str) is not None @@ -509,7 +522,10 @@ def to_int(in_str: str) -> int: in_str: the string to convert Returns: - The integral value of the string or raises on error. + The integral value of the string. + + Raises: + TypeError: the input argument isn't a string See also :meth:`is_integer_number`, :meth:`is_decimal_number`, :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`, @@ -527,9 +543,13 @@ def to_int(in_str: str) -> int: Traceback (most recent call last): ... ValueError: invalid literal for int() with base 10: 'test' + >>> to_int(123) + Traceback (most recent call last): + ... + TypeError: 123 """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) if is_binary_integer_number(in_str): return int(in_str, 2) if is_octal_integer_number(in_str): @@ -549,6 +569,9 @@ def number_string_to_integer(in_str: str) -> int: Returns: The integer whose value was parsed from in_str. + Raises: + ValueError: unable to parse a chunk of the number string + See also :meth:`integer_to_number_string`. .. warning:: @@ -702,6 +725,9 @@ def add_thousands_separator( Returns: A numeric string with thousands separators added appropriately. + Raises: + ValueError: a non-numeric string argument is presented + >>> add_thousands_separator('12345678') '12,345,678' >>> add_thousands_separator(12345678) @@ -730,7 +756,7 @@ def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> st (in_str, decimal_part) = in_str.split('.') tmp = [iter(in_str[::-1])] * places ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1] - if len(decimal_part) > 0: + if decimal_part: ret += '.' ret += decimal_part return ret @@ -802,7 +828,7 @@ def is_email(in_str: Any) -> bool: head = head.replace(" ", "")[1:-1] return EMAIL_RE.match(head + "@" + tail) is not None - except ValueError: + except (TypeError, ValueError): # borderline case in which we have multiple "@" signs but the # head part is correctly escaped. if ESCAPED_AT_SIGN.search(in_str) is not None: @@ -827,6 +853,11 @@ def suffix_string_to_number(in_str: str) -> Optional[int]: 1048576 >>> suffix_string_to_number('13.1Gb') 14066017894 + >>> suffix_string_to_number('12345') + 12345 + >>> x = suffix_string_to_number('a lot') + >>> x is None + True """ def suffix_capitalize(s: str) -> str: @@ -904,6 +935,9 @@ def is_credit_card(in_str: Any, card_type: str = None) -> bool: Returns: True if in_str is a valid credit card number. + Raises: + KeyError: card_type is invalid + .. warning:: This code is not verifying the authenticity of the credit card (i.e. not checking whether it's a real card that can be charged); rather @@ -1196,6 +1230,7 @@ def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]: """ Args: in_str: the string from which to extract a MAC address. + separator: the MAC address hex byte separator to use. Returns: The first MAC address found in in_str or None to indicate no @@ -1252,6 +1287,9 @@ def contains_html(in_str: str) -> bool: True if the given string contains HTML/XML tags and False otherwise. + Raises: + TypeError: the input argument isn't a string + See also :meth:`strip_html`. .. warning:: @@ -1267,7 +1305,7 @@ def contains_html(in_str: str) -> bool: """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) return HTML_RE.search(in_str) is not None @@ -1279,6 +1317,9 @@ def words_count(in_str: str) -> int: Returns: The number of words contained in the given string. + Raises: + TypeError: the input argument isn't a string + .. note:: This method is "smart" in that it does consider only sequences of one or more letter and/or numbers to be "words". Thus a @@ -1293,7 +1334,7 @@ def words_count(in_str: str) -> int: 4 """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) return len(WORDS_COUNT_RE.findall(in_str)) @@ -1350,6 +1391,9 @@ def generate_random_alphanumeric_string(size: int) -> str: A string of the specified size containing random characters (uppercase/lowercase ascii letters and digits). + Raises: + ValueError: size < 1 + See also :meth:`asciify`, :meth:`generate_uuid`. >>> random.seed(22) @@ -1371,11 +1415,14 @@ def reverse(in_str: str) -> str: Returns: The reversed (chracter by character) string. + Raises: + TypeError: the input argument isn't a string + >>> reverse('test') 'tset' """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) return in_str[::-1] @@ -1390,6 +1437,9 @@ def camel_case_to_snake_case(in_str: str, *, separator: str = "_"): original string if it is not a valid camel case string or some other error occurs. + Raises: + TypeError: the input argument isn't a string + See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`. >>> camel_case_to_snake_case('MacAddressExtractorFactory') @@ -1398,7 +1448,7 @@ def camel_case_to_snake_case(in_str: str, *, separator: str = "_"): 'Luke Skywalker' """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) if not is_camel_case(in_str): return in_str return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower() @@ -1418,6 +1468,9 @@ def snake_case_to_camel_case( provided or the original string back again if it is not valid snake case or another error occurs. + Raises: + TypeError: the input argument isn't a string + See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`. >>> snake_case_to_camel_case('this_is_a_test') @@ -1426,7 +1479,7 @@ def snake_case_to_camel_case( 'Han Solo' """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) if not is_snake_case(in_str, separator=separator): return in_str tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)] @@ -1522,6 +1575,9 @@ def strip_html(in_str: str, keep_tag_content: bool = False) -> str: A string with all HTML tags removed (optionally with tag contents preserved). + Raises: + TypeError: the input argument isn't a string + See also :meth:`contains_html`. .. note:: @@ -1536,7 +1592,7 @@ def strip_html(in_str: str, keep_tag_content: bool = False) -> str: 'test: click here' """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE return r.sub("", in_str) @@ -1552,6 +1608,9 @@ def asciify(in_str: str) -> str: by translating all non-ascii chars into their closest possible ASCII representation (eg: ó -> o, Ë -> E, ç -> c...). + Raises: + TypeError: the input argument isn't a string + See also :meth:`to_ascii`, :meth:`generate_random_alphanumeric_string`. .. warning:: @@ -1561,7 +1620,7 @@ def asciify(in_str: str) -> str: 'eeuuooaaeynAAACIINOE' """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) # "NFKD" is the algorithm which is able to successfully translate # the most of non-ascii chars. @@ -1592,6 +1651,9 @@ def slugify(in_str: str, *, separator: str = "-") -> str: * all chars are encoded as ascii (by using :meth:`asciify`) * is safe for URL + Raises: + TypeError: the input argument isn't a string + See also :meth:`is_slug` and :meth:`asciify`. >>> slugify('Top 10 Reasons To Love Dogs!!!') @@ -1600,7 +1662,7 @@ def slugify(in_str: str, *, separator: str = "-") -> str: 'monster-magnet' """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) # replace any character that is NOT letter or number with spaces out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip() @@ -1632,6 +1694,9 @@ def to_bool(in_str: str) -> bool: Otherwise False is returned. + Raises: + TypeError: the input argument isn't a string + See also :mod:`pyutils.argparse_utils`. >>> to_bool('True') @@ -1653,7 +1718,7 @@ def to_bool(in_str: str) -> bool: True """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) return in_str.lower() in set(["true", "1", "yes", "y", "t", "on"]) @@ -1754,8 +1819,7 @@ def is_valid_date(in_str: str) -> bool: _ = d.parse(in_str) return True except dp.ParseException: # type: ignore - msg = f'Unable to parse date {in_str}.' - logger.warning(msg) + pass return False @@ -1783,8 +1847,7 @@ def to_datetime(in_str: str) -> Optional[datetime.datetime]: if isinstance(dt, datetime.datetime): return dt except Exception: - msg = f'Unable to parse datetime {in_str}.' - logger.warning(msg) + pass return None @@ -1810,8 +1873,6 @@ def valid_datetime(in_str: str) -> bool: _ = to_datetime(in_str) if _ is not None: return True - msg = f'Unable to parse datetime {in_str}.' - logger.warning(msg) return False @@ -1868,13 +1929,16 @@ def indent(in_str: str, amount: int) -> str: Returns: An indented string created by prepending amount spaces. + Raises: + TypeError: the input argument isn't a string + See also :meth:`dedent`. >>> indent('This is a test', 4) ' This is a test' """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) line_separator = '\n' lines = [" " * amount + line for line in in_str.split(line_separator)] return line_separator.join(lines) @@ -2218,7 +2282,56 @@ def thify(n: int) -> str: return "th" -def ngrams(txt: str, n: int): +get_cardinal_suffix = thify + + +def add_cardinal_suffix(n: int): + """ + Args: + n: the number to return as a string with a cardinal suffix. + + Returns: + A string containing the number with its cardinal suffix. + + >>> add_cardinal_suffix(123) + '123rd' + + >>> add_cardinal_suffix(1) + '1st' + + >>> add_cardinal_suffix(0) + '0th' + + >>> add_cardinal_suffix(-123) + '-123rd' + """ + return f'{n}{get_cardinal_suffix(n)}' + + +def remove_cardinal_suffix(txt: str) -> Optional[str]: + """ + Args: + txt: the number with cardinal suffix to strip. + + Returns: + The same string with its cardinal suffix removed or None on error. + + >>> remove_cardinal_suffix('123rd') + '123' + + >>> remove_cardinal_suffix('-10th') + '-10' + + >>> remove_cardinal_suffix('1ero') is None + True + """ + suffix = txt[-2:] + if suffix in set(['st', 'nd', 'rd', 'th']): + return txt[:-2] + return None + + +def ngrams(txt: str, n: int) -> Generator[str, str, None]: """ Args: txt: the string to create ngrams using @@ -2240,7 +2353,9 @@ def ngrams(txt: str, n: int): yield ret.strip() -def ngrams_presplit(words: Sequence[str], n: int): +def ngrams_presplit( + words: Sequence[str], n: int +) -> Generator[Sequence[str], str, None]: """ Same as :meth:`ngrams` but with the string pre-split. @@ -2249,7 +2364,7 @@ def ngrams_presplit(words: Sequence[str], n: int): return list_utils.ngrams(words, n) -def bigrams(txt: str): +def bigrams(txt: str) -> Generator[str, str, None]: """Generates the bigrams (n=2) of the given string. See also :meth:`ngrams`, :meth:`trigrams`. @@ -2260,7 +2375,7 @@ def bigrams(txt: str): return ngrams(txt, 2) -def trigrams(txt: str): +def trigrams(txt: str) -> Generator[str, str, None]: """Generates the trigrams (n=3) of the given string. See also :meth:`ngrams`, :meth:`bigrams`. @@ -2380,6 +2495,9 @@ def to_ascii(txt: str): Returns: txt encoded as an ASCII byte string. + Raises: + TypeError: the input argument isn't a string or bytes + See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`to_bytes`, :meth:`generate_random_alphanumeric_string`, :meth:`asciify`. @@ -2393,7 +2511,7 @@ def to_ascii(txt: str): return txt.encode('ascii') if isinstance(txt, bytes): return txt - raise Exception('to_ascii works with strings and bytes') + raise TypeError('to_ascii works with strings and bytes') def to_base64(