X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=src%2Fpyutils%2Fstring_utils.py;h=a016d075647ee8e371cd1f946e0d648f006dd230;hb=HEAD;hp=f6056d0f69e5097355bc018d62c9640c8a2377c5;hpb=993b0992473c12294ed659e52b532e1c8cf9cd1e;p=pyutils.git diff --git a/src/pyutils/string_utils.py b/src/pyutils/string_utils.py index f6056d0..a016d07 100644 --- a/src/pyutils/string_utils.py +++ b/src/pyutils/string_utils.py @@ -27,7 +27,7 @@ SOFTWARE. This class is based on: https://github.com/daveoncode/python-string-utils. See `NOTICE -<[https://wannabe.guru.org/gitweb/?p=pyutils.git;a=blob_plain;f=NOTICE;hb=HEAD>`_ +`__ in the root of this module for a detailed enumeration of what work is Davide's and what work was added by Scott. @@ -50,6 +50,7 @@ from typing import ( Any, Callable, Dict, + Generator, Iterable, List, Literal, @@ -83,9 +84,9 @@ URLS_RAW_STRING = ( r"(#\S*)?" # hash ) -URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE) +URL_RE = re.compile(rf"^{URLS_RAW_STRING}$", re.IGNORECASE) -URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE) +URLS_RE = re.compile(rf"({URLS_RAW_STRING})", re.IGNORECASE) ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@') @@ -93,9 +94,9 @@ EMAILS_RAW_STRING = ( r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}" ) -EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING)) +EMAIL_RE = re.compile(rf"^{EMAILS_RAW_STRING}$") -EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING)) +EMAILS_RE = re.compile(rf"({EMAILS_RAW_STRING})") CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$") @@ -165,7 +166,7 @@ NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE MARGIN_RE = re.compile(r"^[^\S\r\n]+") -ESCAPE_SEQUENCE_RE = re.compile(r"\[[^A-Za-z]*[A-Za-z]") +ESCAPE_SEQUENCE_RE = re.compile(r"\x1B\[[^A-Za-z]*[A-Za-z]") NUM_SUFFIXES = { "Pb": (1024**5), @@ -262,7 +263,7 @@ def is_none_or_empty(in_str: Optional[str]) -> bool: return in_str is None or len(in_str.strip()) == 0 -def is_string(obj: Any) -> bool: +def is_string(in_str: Any) -> bool: """ Args: in_str: the object to test @@ -281,7 +282,7 @@ def is_string(obj: Any) -> bool: >>> is_string([1, 2, 3]) False """ - return isinstance(obj, str) + return isinstance(in_str, str) def is_empty_string(in_str: Any) -> bool: @@ -355,6 +356,9 @@ def is_number(in_str: str) -> bool: True if the string contains a valid numberic value and False otherwise. + Raises: + TypeError: the input argument isn't a string + See also :meth:`is_integer_number`, :meth:`is_decimal_number`, :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`, etc... @@ -362,7 +366,7 @@ def is_number(in_str: str) -> bool: >>> is_number(100.5) Traceback (most recent call last): ... - ValueError: 100.5 + TypeError: 100.5 >>> is_number("100.5") True >>> is_number("test") @@ -372,10 +376,10 @@ def is_number(in_str: str) -> bool: >>> is_number([1, 2, 3]) Traceback (most recent call last): ... - ValueError: [1, 2, 3] + TypeError: [1, 2, 3] """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) return NUMBER_RE.match(in_str) is not None @@ -414,6 +418,9 @@ def is_hexidecimal_integer_number(in_str: str) -> bool: Returns: True if the string is a hex integer number and False otherwise. + Raises: + TypeError: the input argument isn't a string + See also :meth:`is_integer_number`, :meth:`is_decimal_number`, :meth:`is_octal_integer_number`, :meth:`is_binary_integer_number`, etc... @@ -430,18 +437,18 @@ def is_hexidecimal_integer_number(in_str: str) -> bool: >>> is_hexidecimal_integer_number(12345) # Not a string Traceback (most recent call last): ... - ValueError: 12345 + TypeError: 12345 >>> is_hexidecimal_integer_number(101.4) Traceback (most recent call last): ... - ValueError: 101.4 + TypeError: 101.4 >>> is_hexidecimal_integer_number(0x1A3E) Traceback (most recent call last): ... - ValueError: 6718 + TypeError: 6718 """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) return HEX_NUMBER_RE.match(in_str) is not None @@ -453,6 +460,9 @@ def is_octal_integer_number(in_str: str) -> bool: Returns: True if the string is a valid octal integral number and False otherwise. + Raises: + TypeError: the input argument isn't a string + See also :meth:`is_integer_number`, :meth:`is_decimal_number`, :meth:`is_hexidecimal_integer_number`, :meth:`is_binary_integer_number`, etc... @@ -469,7 +479,7 @@ def is_octal_integer_number(in_str: str) -> bool: False """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) return OCT_NUMBER_RE.match(in_str) is not None @@ -481,6 +491,9 @@ def is_binary_integer_number(in_str: str) -> bool: Returns: True if the string contains a binary integral number and False otherwise. + Raises: + TypeError: the input argument isn't a string + See also :meth:`is_integer_number`, :meth:`is_decimal_number`, :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`, etc... @@ -499,7 +512,7 @@ def is_binary_integer_number(in_str: str) -> bool: False """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) return BIN_NUMBER_RE.match(in_str) is not None @@ -509,7 +522,10 @@ def to_int(in_str: str) -> int: in_str: the string to convert Returns: - The integral value of the string or raises on error. + The integral value of the string. + + Raises: + TypeError: the input argument isn't a string See also :meth:`is_integer_number`, :meth:`is_decimal_number`, :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`, @@ -527,9 +543,13 @@ def to_int(in_str: str) -> int: Traceback (most recent call last): ... ValueError: invalid literal for int() with base 10: 'test' + >>> to_int(123) + Traceback (most recent call last): + ... + TypeError: 123 """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) if is_binary_integer_number(in_str): return int(in_str, 2) if is_octal_integer_number(in_str): @@ -549,6 +569,9 @@ def number_string_to_integer(in_str: str) -> int: Returns: The integer whose value was parsed from in_str. + Raises: + ValueError: unable to parse a chunk of the number string + See also :meth:`integer_to_number_string`. .. warning:: @@ -568,19 +591,19 @@ def number_string_to_integer(in_str: str) -> int: ... ValueError: Unknown word: xyzzy """ - if type(in_str) == int: + if isinstance(in_str, int): return int(in_str) current = result = 0 in_str = in_str.replace('-', ' ') - for word in in_str.split(): - if word not in NUM_WORDS: - if is_integer_number(word): - current += int(word) + for w in in_str.split(): + if w not in NUM_WORDS: + if is_integer_number(w): + current += int(w) continue else: - raise ValueError("Unknown word: " + word) - scale, increment = NUM_WORDS[word] + raise ValueError("Unknown word: " + w) + scale, increment = NUM_WORDS[w] current = current * scale + increment if scale > 100: result += current @@ -683,14 +706,16 @@ def strip_escape_sequences(in_str: str) -> str: by a regular expression. While this gets common ones, there may exist valid sequences that it doesn't match. - >>> strip_escape_sequences('this is a test!') + >>> strip_escape_sequences('\x1B[12;11;22mthis is a test!') 'this is a test!' """ in_str = ESCAPE_SEQUENCE_RE.sub("", in_str) return in_str -def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str: +def add_thousands_separator( + in_str: str, *, separator_char: str = ',', places: int = 3 +) -> str: """ Args: in_str: string or number to which to add thousands separator(s) @@ -700,6 +725,9 @@ def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str Returns: A numeric string with thousands separators added appropriately. + Raises: + ValueError: a non-numeric string argument is presented + >>> add_thousands_separator('12345678') '12,345,678' >>> add_thousands_separator(12345678) @@ -728,7 +756,7 @@ def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> st (in_str, decimal_part) = in_str.split('.') tmp = [iter(in_str[::-1])] * places ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1] - if len(decimal_part) > 0: + if decimal_part: ret += '.' ret += decimal_part return ret @@ -800,7 +828,7 @@ def is_email(in_str: Any) -> bool: head = head.replace(" ", "")[1:-1] return EMAIL_RE.match(head + "@" + tail) is not None - except ValueError: + except (TypeError, ValueError): # borderline case in which we have multiple "@" signs but the # head part is correctly escaped. if ESCAPED_AT_SIGN.search(in_str) is not None: @@ -825,6 +853,11 @@ def suffix_string_to_number(in_str: str) -> Optional[int]: 1048576 >>> suffix_string_to_number('13.1Gb') 14066017894 + >>> suffix_string_to_number('12345') + 12345 + >>> x = suffix_string_to_number('a lot') + >>> x is None + True """ def suffix_capitalize(s: str) -> str: @@ -902,6 +935,9 @@ def is_credit_card(in_str: Any, card_type: str = None) -> bool: Returns: True if in_str is a valid credit card number. + Raises: + KeyError: card_type is invalid + .. warning:: This code is not verifying the authenticity of the credit card (i.e. not checking whether it's a real card that can be charged); rather @@ -946,6 +982,7 @@ def is_snake_case(in_str: Any, *, separator: str = "_") -> bool: """ Args: in_str: the string to test + separator: the snake case separator character to use Returns: True if the string is snake case and False otherwise. A string is considered snake case when: @@ -1003,6 +1040,7 @@ def is_uuid(in_str: Any, allow_hex: bool = False) -> bool: """ Args: in_str: the string to test + allow_hex: should we allow hexidecimal digits in valid uuids? Returns: True if the in_str contains a valid UUID and False otherwise. @@ -1192,6 +1230,7 @@ def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]: """ Args: in_str: the string from which to extract a MAC address. + separator: the MAC address hex byte separator to use. Returns: The first MAC address found in in_str or None to indicate no @@ -1221,6 +1260,7 @@ def is_slug(in_str: Any, separator: str = "-") -> bool: """ Args: in_str: string to test + separator: the slug character to use Returns: True if in_str is a slug string and False otherwise. @@ -1247,6 +1287,9 @@ def contains_html(in_str: str) -> bool: True if the given string contains HTML/XML tags and False otherwise. + Raises: + TypeError: the input argument isn't a string + See also :meth:`strip_html`. .. warning:: @@ -1262,7 +1305,7 @@ def contains_html(in_str: str) -> bool: """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) return HTML_RE.search(in_str) is not None @@ -1274,6 +1317,9 @@ def words_count(in_str: str) -> int: Returns: The number of words contained in the given string. + Raises: + TypeError: the input argument isn't a string + .. note:: This method is "smart" in that it does consider only sequences of one or more letter and/or numbers to be "words". Thus a @@ -1288,7 +1334,7 @@ def words_count(in_str: str) -> int: 4 """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) return len(WORDS_COUNT_RE.findall(in_str)) @@ -1345,6 +1391,9 @@ def generate_random_alphanumeric_string(size: int) -> str: A string of the specified size containing random characters (uppercase/lowercase ascii letters and digits). + Raises: + ValueError: size < 1 + See also :meth:`asciify`, :meth:`generate_uuid`. >>> random.seed(22) @@ -1366,24 +1415,31 @@ def reverse(in_str: str) -> str: Returns: The reversed (chracter by character) string. + Raises: + TypeError: the input argument isn't a string + >>> reverse('test') 'tset' """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) return in_str[::-1] -def camel_case_to_snake_case(in_str, *, separator="_"): +def camel_case_to_snake_case(in_str: str, *, separator: str = "_"): """ Args: in_str: the camel case string to convert + separator: the snake case separator character to use Returns: A snake case string equivalent to the camel case input or the original string if it is not a valid camel case string or some other error occurs. + Raises: + TypeError: the input argument isn't a string + See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`. >>> camel_case_to_snake_case('MacAddressExtractorFactory') @@ -1392,7 +1448,7 @@ def camel_case_to_snake_case(in_str, *, separator="_"): 'Luke Skywalker' """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) if not is_camel_case(in_str): return in_str return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower() @@ -1404,12 +1460,17 @@ def snake_case_to_camel_case( """ Args: in_str: the snake case string to convert + upper_case_first: should we capitalize the first letter? + separator: the separator character to use Returns: A camel case string that is equivalent to the snake case string provided or the original string back again if it is not valid snake case or another error occurs. + Raises: + TypeError: the input argument isn't a string + See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`. >>> snake_case_to_camel_case('this_is_a_test') @@ -1418,7 +1479,7 @@ def snake_case_to_camel_case( 'Han Solo' """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) if not is_snake_case(in_str, separator=separator): return in_str tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)] @@ -1514,6 +1575,9 @@ def strip_html(in_str: str, keep_tag_content: bool = False) -> str: A string with all HTML tags removed (optionally with tag contents preserved). + Raises: + TypeError: the input argument isn't a string + See also :meth:`contains_html`. .. note:: @@ -1528,7 +1592,7 @@ def strip_html(in_str: str, keep_tag_content: bool = False) -> str: 'test: click here' """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE return r.sub("", in_str) @@ -1544,6 +1608,9 @@ def asciify(in_str: str) -> str: by translating all non-ascii chars into their closest possible ASCII representation (eg: ó -> o, Ë -> E, ç -> c...). + Raises: + TypeError: the input argument isn't a string + See also :meth:`to_ascii`, :meth:`generate_random_alphanumeric_string`. .. warning:: @@ -1553,7 +1620,7 @@ def asciify(in_str: str) -> str: 'eeuuooaaeynAAACIINOE' """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) # "NFKD" is the algorithm which is able to successfully translate # the most of non-ascii chars. @@ -1584,6 +1651,9 @@ def slugify(in_str: str, *, separator: str = "-") -> str: * all chars are encoded as ascii (by using :meth:`asciify`) * is safe for URL + Raises: + TypeError: the input argument isn't a string + See also :meth:`is_slug` and :meth:`asciify`. >>> slugify('Top 10 Reasons To Love Dogs!!!') @@ -1592,7 +1662,7 @@ def slugify(in_str: str, *, separator: str = "-") -> str: 'monster-magnet' """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) # replace any character that is NOT letter or number with spaces out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip() @@ -1624,6 +1694,9 @@ def to_bool(in_str: str) -> bool: Otherwise False is returned. + Raises: + TypeError: the input argument isn't a string + See also :mod:`pyutils.argparse_utils`. >>> to_bool('True') @@ -1645,8 +1718,8 @@ def to_bool(in_str: str) -> bool: True """ if not is_string(in_str): - raise ValueError(in_str) - return in_str.lower() in ("true", "1", "yes", "y", "t", "on") + raise TypeError(in_str) + return in_str.lower() in set(["true", "1", "yes", "y", "t", "on"]) def to_date(in_str: str) -> Optional[datetime.date]: @@ -1657,24 +1730,23 @@ def to_date(in_str: str) -> Optional[datetime.date]: Returns: The datetime.date the string contained or None to indicate an error. This parser is relatively clever; see - :class:`datetimez.dateparse_utils` docs for details. + :class:`datetimes.dateparse_utils` docs for details. - See also: :mod:`pyutils.datetimez.dateparse_utils`, :meth:`extract_date`, + See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`extract_date`, :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`. >>> to_date('9/11/2001') datetime.date(2001, 9, 11) >>> to_date('xyzzy') """ - import pyutils.datetimez.dateparse_utils as du + import pyutils.datetimes.dateparse_utils as du try: d = du.DateParser() # type: ignore d.parse(in_str) return d.get_date() except du.ParseException: # type: ignore - msg = f'Unable to parse date {in_str}.' - logger.warning(msg) + pass return None @@ -1687,7 +1759,7 @@ def extract_date(in_str: Any) -> Optional[datetime.datetime]: Returns: a datetime if date was found, otherwise None - See also: :mod:`pyutils.datetimez.dateparse_utils`, :meth:`to_date`, + See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`, :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`. >>> extract_date("filename.txt dec 13, 2022") @@ -1698,7 +1770,7 @@ def extract_date(in_str: Any) -> Optional[datetime.datetime]: """ import itertools - import pyutils.datetimez.dateparse_utils as du + import pyutils.datetimes.dateparse_utils as du d = du.DateParser() # type: ignore chunks = in_str.split() @@ -1710,7 +1782,7 @@ def extract_date(in_str: Any) -> Optional[datetime.datetime]: ): try: expr = " ".join(ngram) - logger.debug(f"Trying {expr}") + logger.debug("Trying %s", expr) if d.parse(expr): return d.get_datetime() except du.ParseException: # type: ignore @@ -1726,9 +1798,9 @@ def is_valid_date(in_str: str) -> bool: Returns: True if the string represents a valid date that we can recognize and False otherwise. This parser is relatively clever; see - :class:`datetimez.dateparse_utils` docs for details. + :class:`datetimes.dateparse_utils` docs for details. - See also: :mod:`pyutils.datetimez.dateparse_utils`, :meth:`to_date`, + See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`, :meth:`extract_date`, :meth:`to_datetime`, :meth:`valid_datetime`. >>> is_valid_date('1/2/2022') @@ -1740,15 +1812,14 @@ def is_valid_date(in_str: str) -> bool: >>> is_valid_date('xyzzy') False """ - import pyutils.datetimez.dateparse_utils as dp + import pyutils.datetimes.dateparse_utils as dp try: d = dp.DateParser() # type: ignore _ = d.parse(in_str) return True except dp.ParseException: # type: ignore - msg = f'Unable to parse date {in_str}.' - logger.warning(msg) + pass return False @@ -1760,15 +1831,15 @@ def to_datetime(in_str: str) -> Optional[datetime.datetime]: Returns: A python datetime parsed from in_str or None to indicate an error. This parser is relatively clever; see - :class:`datetimez.dateparse_utils` docs for details. + :class:`datetimes.dateparse_utils` docs for details. - See also: :mod:`pyutils.datetimez.dateparse_utils`, :meth:`to_date`, + See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`, :meth:`extract_date`, :meth:`valid_datetime`. >>> to_datetime('7/20/1969 02:56 GMT') datetime.datetime(1969, 7, 20, 2, 56, tzinfo=) """ - import pyutils.datetimez.dateparse_utils as dp + import pyutils.datetimes.dateparse_utils as dp try: d = dp.DateParser() # type: ignore @@ -1776,8 +1847,7 @@ def to_datetime(in_str: str) -> Optional[datetime.datetime]: if isinstance(dt, datetime.datetime): return dt except Exception: - msg = f'Unable to parse datetime {in_str}.' - logger.warning(msg) + pass return None @@ -1789,7 +1859,7 @@ def valid_datetime(in_str: str) -> bool: Returns: True if in_str contains a valid datetime and False otherwise. This parser is relatively clever; see - :class:`datetimez.dateparse_utils` docs for details. + :class:`datetimes.dateparse_utils` docs for details. >>> valid_datetime('next wednesday at noon') True @@ -1803,8 +1873,6 @@ def valid_datetime(in_str: str) -> bool: _ = to_datetime(in_str) if _ is not None: return True - msg = f'Unable to parse datetime {in_str}.' - logger.warning(msg) return False @@ -1861,13 +1929,16 @@ def indent(in_str: str, amount: int) -> str: Returns: An indented string created by prepending amount spaces. + Raises: + TypeError: the input argument isn't a string + See also :meth:`dedent`. >>> indent('This is a test', 4) ' This is a test' """ if not is_string(in_str): - raise ValueError(in_str) + raise TypeError(in_str) line_separator = '\n' lines = [" " * amount + line for line in in_str.split(line_separator)] return line_separator.join(lines) @@ -1894,8 +1965,8 @@ def _sprintf(*args, **kwargs) -> str: sep = " " if end is None: end = "\n" - for i, arg in enumerate(args): - if i: + for n, arg in enumerate(args): + if n: ret += sep if isinstance(arg, str): ret += arg @@ -2159,7 +2230,7 @@ def make_contractions(txt: str) -> str: for second in second_list: # Disallow there're/where're. They're valid English # but sound weird. - if (first in ('there', 'where')) and second == 'a(re)': + if (first in set(['there', 'where'])) and second == 'a(re)': continue pattern = fr'\b({first})\s+{second}\b' @@ -2211,7 +2282,56 @@ def thify(n: int) -> str: return "th" -def ngrams(txt: str, n: int): +get_cardinal_suffix = thify + + +def add_cardinal_suffix(n: int): + """ + Args: + n: the number to return as a string with a cardinal suffix. + + Returns: + A string containing the number with its cardinal suffix. + + >>> add_cardinal_suffix(123) + '123rd' + + >>> add_cardinal_suffix(1) + '1st' + + >>> add_cardinal_suffix(0) + '0th' + + >>> add_cardinal_suffix(-123) + '-123rd' + """ + return f'{n}{get_cardinal_suffix(n)}' + + +def remove_cardinal_suffix(txt: str) -> Optional[str]: + """ + Args: + txt: the number with cardinal suffix to strip. + + Returns: + The same string with its cardinal suffix removed or None on error. + + >>> remove_cardinal_suffix('123rd') + '123' + + >>> remove_cardinal_suffix('-10th') + '-10' + + >>> remove_cardinal_suffix('1ero') is None + True + """ + suffix = txt[-2:] + if suffix in set(['st', 'nd', 'rd', 'th']): + return txt[:-2] + return None + + +def ngrams(txt: str, n: int) -> Generator[str, str, None]: """ Args: txt: the string to create ngrams using @@ -2228,12 +2348,14 @@ def ngrams(txt: str, n: int): words = txt.split() for ngram in ngrams_presplit(words, n): ret = '' - for word in ngram: - ret += f'{word} ' + for w in ngram: + ret += f'{w} ' yield ret.strip() -def ngrams_presplit(words: Sequence[str], n: int): +def ngrams_presplit( + words: Sequence[str], n: int +) -> Generator[Sequence[str], str, None]: """ Same as :meth:`ngrams` but with the string pre-split. @@ -2242,7 +2364,7 @@ def ngrams_presplit(words: Sequence[str], n: int): return list_utils.ngrams(words, n) -def bigrams(txt: str): +def bigrams(txt: str) -> Generator[str, str, None]: """Generates the bigrams (n=2) of the given string. See also :meth:`ngrams`, :meth:`trigrams`. @@ -2253,7 +2375,7 @@ def bigrams(txt: str): return ngrams(txt, 2) -def trigrams(txt: str): +def trigrams(txt: str) -> Generator[str, str, None]: """Generates the trigrams (n=3) of the given string. See also :meth:`ngrams`, :meth:`bigrams`. @@ -2262,7 +2384,7 @@ def trigrams(txt: str): def shuffle_columns_into_list( - input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim='' + input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim: str = '' ) -> Iterable[str]: """Helper to shuffle / parse columnar data and return the results as a list. @@ -2307,7 +2429,7 @@ def shuffle_columns_into_list( def shuffle_columns_into_dict( input_lines: Sequence[str], column_specs: Iterable[Tuple[str, Iterable[int]]], - delim='', + delim: str = '', ) -> Dict[str, str]: """Helper to shuffle / parse columnar data and return the results as a dict. @@ -2373,6 +2495,9 @@ def to_ascii(txt: str): Returns: txt encoded as an ASCII byte string. + Raises: + TypeError: the input argument isn't a string or bytes + See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`to_bytes`, :meth:`generate_random_alphanumeric_string`, :meth:`asciify`. @@ -2386,13 +2511,17 @@ def to_ascii(txt: str): return txt.encode('ascii') if isinstance(txt, bytes): return txt - raise Exception('to_ascii works with strings and bytes') + raise TypeError('to_ascii works with strings and bytes') -def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes: +def to_base64( + txt: str, *, encoding: str = 'utf-8', errors: str = 'surrogatepass' +) -> bytes: """ Args: txt: the input data to encode + encoding: the encoding to use during conversion + errors: how to handle encoding errors Returns: txt encoded with a 64-chracter alphabet. Similar to and compatible @@ -2437,10 +2566,14 @@ def is_base64(txt: str) -> bool: return True -def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str: +def from_base64( + b64: bytes, encoding: str = 'utf-8', errors: str = 'surrogatepass' +) -> str: """ Args: b64: bytestring of 64-bit encoded data to decode / convert. + encoding: the encoding to use during conversion + errors: how to handle encoding errors Returns: The decoded form of b64 as a normal python string. Similar to @@ -2474,7 +2607,7 @@ def chunk(txt: str, chunk_size: int): yield txt[x : x + chunk_size] -def to_bitstring(txt: str, *, delimiter='') -> str: +def to_bitstring(txt: str, *, delimiter: str = '') -> str: """ Args: txt: the string to convert into a bitstring @@ -2525,11 +2658,14 @@ def is_bitstring(txt: str) -> bool: return is_binary_integer_number(f'0b{txt}') -def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str: +def from_bitstring( + bits: str, encoding: str = 'utf-8', errors: str = 'surrogatepass' +) -> str: """ Args: bits: the bitstring to convert back into a python string - encoding: the encoding to use + encoding: the encoding to use during conversion + errors: how to handle encoding errors Returns: The regular python string represented by bits. Note that this