This class is based on:
https://github.com/daveoncode/python-string-utils. See `NOTICE
-<[https://wannabe.guru.org/gitweb/?p=pyutils.git;a=blob_plain;f=NOTICE;hb=HEAD>`_
+<https://wannabe.guru.org/gitweb/?p=pyutils.git;a=blob_plain;f=NOTICE;hb=HEAD>`__
in the root of this module for a detailed enumeration of what work is
Davide's and what work was added by Scott.
Any,
Callable,
Dict,
+ Generator,
Iterable,
List,
Literal,
r"(#\S*)?" # hash
)
-URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
+URL_RE = re.compile(rf"^{URLS_RAW_STRING}$", re.IGNORECASE)
-URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
+URLS_RE = re.compile(rf"({URLS_RAW_STRING})", re.IGNORECASE)
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
)
-EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
+EMAIL_RE = re.compile(rf"^{EMAILS_RAW_STRING}$")
-EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
+EMAILS_RE = re.compile(rf"({EMAILS_RAW_STRING})")
CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")
MARGIN_RE = re.compile(r"^[^\S\r\n]+")
-ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
+ESCAPE_SEQUENCE_RE = re.compile(r"\x1B\[[^A-Za-z]*[A-Za-z]")
NUM_SUFFIXES = {
"Pb": (1024**5),
return in_str is None or len(in_str.strip()) == 0
-def is_string(obj: Any) -> bool:
+def is_string(in_str: Any) -> bool:
"""
Args:
in_str: the object to test
>>> is_string([1, 2, 3])
False
"""
- return isinstance(obj, str)
+ return isinstance(in_str, str)
def is_empty_string(in_str: Any) -> bool:
True if the string contains a valid numeric value and
False otherwise.
+ Raises:
+ TypeError: the input argument isn't a string
+
See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
:meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
etc...
>>> is_number(100.5)
Traceback (most recent call last):
...
- ValueError: 100.5
+ TypeError: 100.5
>>> is_number("100.5")
True
>>> is_number("test")
>>> is_number([1, 2, 3])
Traceback (most recent call last):
...
- ValueError: [1, 2, 3]
+ TypeError: [1, 2, 3]
"""
if not is_string(in_str):
- raise ValueError(in_str)
+ raise TypeError(in_str)
return NUMBER_RE.match(in_str) is not None
Returns:
True if the string is a hex integer number and False otherwise.
+ Raises:
+ TypeError: the input argument isn't a string
+
See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
:meth:`is_octal_integer_number`, :meth:`is_binary_integer_number`, etc...
>>> is_hexidecimal_integer_number(12345) # Not a string
Traceback (most recent call last):
...
- ValueError: 12345
+ TypeError: 12345
>>> is_hexidecimal_integer_number(101.4)
Traceback (most recent call last):
...
- ValueError: 101.4
+ TypeError: 101.4
>>> is_hexidecimal_integer_number(0x1A3E)
Traceback (most recent call last):
...
- ValueError: 6718
+ TypeError: 6718
"""
if not is_string(in_str):
- raise ValueError(in_str)
+ raise TypeError(in_str)
return HEX_NUMBER_RE.match(in_str) is not None
Returns:
True if the string is a valid octal integral number and False otherwise.
+ Raises:
+ TypeError: the input argument isn't a string
+
See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
:meth:`is_hexidecimal_integer_number`, :meth:`is_binary_integer_number`,
etc...
False
"""
if not is_string(in_str):
- raise ValueError(in_str)
+ raise TypeError(in_str)
return OCT_NUMBER_RE.match(in_str) is not None
Returns:
True if the string contains a binary integral number and False otherwise.
+ Raises:
+ TypeError: the input argument isn't a string
+
See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
:meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
etc...
False
"""
if not is_string(in_str):
- raise ValueError(in_str)
+ raise TypeError(in_str)
return BIN_NUMBER_RE.match(in_str) is not None
in_str: the string to convert
Returns:
- The integral value of the string or raises on error.
+ The integral value of the string.
+
+ Raises:
+ TypeError: the input argument isn't a string
See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
:meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
Traceback (most recent call last):
...
ValueError: invalid literal for int() with base 10: 'test'
+ >>> to_int(123)
+ Traceback (most recent call last):
+ ...
+ TypeError: 123
"""
if not is_string(in_str):
- raise ValueError(in_str)
+ raise TypeError(in_str)
if is_binary_integer_number(in_str):
return int(in_str, 2)
if is_octal_integer_number(in_str):
Returns:
The integer whose value was parsed from in_str.
+ Raises:
+ ValueError: unable to parse a chunk of the number string
+
See also :meth:`integer_to_number_string`.
.. warning::
...
ValueError: Unknown word: xyzzy
"""
- if type(in_str) == int:
- return in_str
+ if isinstance(in_str, int):
+ return int(in_str)
current = result = 0
in_str = in_str.replace('-', ' ')
- for word in in_str.split():
- if word not in NUM_WORDS:
- if is_integer_number(word):
- current += int(word)
+ for w in in_str.split():
+ if w not in NUM_WORDS:
+ if is_integer_number(w):
+ current += int(w)
continue
else:
- raise ValueError("Unknown word: " + word)
- scale, increment = NUM_WORDS[word]
+ raise ValueError("Unknown word: " + w)
+ scale, increment = NUM_WORDS[w]
current = current * scale + increment
if scale > 100:
result += current
by a regular expression. While this gets common ones,
there may exist valid sequences that it doesn't match.
- >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
+ >>> strip_escape_sequences('\x1B[12;11;22mthis is a test!')
'this is a test!'
"""
in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
return in_str
-def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
+def add_thousands_separator(
+ in_str: str, *, separator_char: str = ',', places: int = 3
+) -> str:
"""
Args:
in_str: string or number to which to add thousands separator(s)
Returns:
A numeric string with thousands separators added appropriately.
+ Raises:
+ ValueError: a non-numeric string argument is presented
+
>>> add_thousands_separator('12345678')
'12,345,678'
>>> add_thousands_separator(12345678)
(in_str, decimal_part) = in_str.split('.')
tmp = [iter(in_str[::-1])] * places
ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
- if len(decimal_part) > 0:
+ if decimal_part:
ret += '.'
ret += decimal_part
return ret
head = head.replace(" ", "")[1:-1]
return EMAIL_RE.match(head + "@" + tail) is not None
- except ValueError:
+ except (TypeError, ValueError):
# borderline case in which we have multiple "@" signs but the
# head part is correctly escaped.
if ESCAPED_AT_SIGN.search(in_str) is not None:
1048576
>>> suffix_string_to_number('13.1Gb')
14066017894
+ >>> suffix_string_to_number('12345')
+ 12345
+ >>> x = suffix_string_to_number('a lot')
+ >>> x is None
+ True
"""
def suffix_capitalize(s: str) -> str:
Returns:
True if in_str is a valid credit card number.
+ Raises:
+ KeyError: card_type is invalid
+
.. warning::
This code is not verifying the authenticity of the credit card (i.e.
not checking whether it's a real card that can be charged); rather
"""
Args:
in_str: the string to test
+ separator: the snake case separator character to use
Returns: True if the string is snake case and False otherwise. A
string is considered snake case when:
"""
Args:
in_str: the string to test
+ allow_hex: should we allow hexadecimal digits in valid uuids?
Returns:
True if the in_str contains a valid UUID and False otherwise.
"""
Args:
in_str: the string from which to extract a MAC address.
+ separator: the MAC address hex byte separator to use.
Returns:
The first MAC address found in in_str or None to indicate no
"""
Args:
in_str: string to test
+ separator: the slug character to use
Returns:
True if in_str is a slug string and False otherwise.
True if the given string contains HTML/XML tags and False
otherwise.
+ Raises:
+ TypeError: the input argument isn't a string
+
See also :meth:`strip_html`.
.. warning::
"""
if not is_string(in_str):
- raise ValueError(in_str)
+ raise TypeError(in_str)
return HTML_RE.search(in_str) is not None
Returns:
The number of words contained in the given string.
+ Raises:
+ TypeError: the input argument isn't a string
+
.. note::
This method is "smart" in that it does consider only sequences
of one or more letter and/or numbers to be "words". Thus a
4
"""
if not is_string(in_str):
- raise ValueError(in_str)
+ raise TypeError(in_str)
return len(WORDS_COUNT_RE.findall(in_str))
A string of the specified size containing random characters
(uppercase/lowercase ascii letters and digits).
+ Raises:
+ ValueError: size < 1
+
See also :meth:`asciify`, :meth:`generate_uuid`.
>>> random.seed(22)
Returns:
The reversed (character by character) string.
+ Raises:
+ TypeError: the input argument isn't a string
+
>>> reverse('test')
'tset'
"""
if not is_string(in_str):
- raise ValueError(in_str)
+ raise TypeError(in_str)
return in_str[::-1]
-def camel_case_to_snake_case(in_str, *, separator="_"):
+def camel_case_to_snake_case(in_str: str, *, separator: str = "_"):
"""
Args:
in_str: the camel case string to convert
+ separator: the snake case separator character to use
Returns:
A snake case string equivalent to the camel case input or the
original string if it is not a valid camel case string or some
other error occurs.
+ Raises:
+ TypeError: the input argument isn't a string
+
See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.
>>> camel_case_to_snake_case('MacAddressExtractorFactory')
'Luke Skywalker'
"""
if not is_string(in_str):
- raise ValueError(in_str)
+ raise TypeError(in_str)
if not is_camel_case(in_str):
return in_str
return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower()
"""
Args:
in_str: the snake case string to convert
+ upper_case_first: should we capitalize the first letter?
+ separator: the separator character to use
Returns:
A camel case string that is equivalent to the snake case string
provided or the original string back again if it is not valid
snake case or another error occurs.
+ Raises:
+ TypeError: the input argument isn't a string
+
See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.
>>> snake_case_to_camel_case('this_is_a_test')
'Han Solo'
"""
if not is_string(in_str):
- raise ValueError(in_str)
+ raise TypeError(in_str)
if not is_snake_case(in_str, separator=separator):
return in_str
tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
A string with all HTML tags removed (optionally with tag contents
preserved).
+ Raises:
+ TypeError: the input argument isn't a string
+
See also :meth:`contains_html`.
.. note::
'test: click here'
"""
if not is_string(in_str):
- raise ValueError(in_str)
+ raise TypeError(in_str)
r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
return r.sub("", in_str)
by translating all non-ascii chars into their closest possible
ASCII representation (eg: ó -> o, Ë -> E, ç -> c...).
+ Raises:
+ TypeError: the input argument isn't a string
+
See also :meth:`to_ascii`, :meth:`generate_random_alphanumeric_string`.
.. warning::
'eeuuooaaeynAAACIINOE'
"""
if not is_string(in_str):
- raise ValueError(in_str)
+ raise TypeError(in_str)
# "NFKD" is the algorithm which is able to successfully translate
# the most of non-ascii chars.
* all chars are encoded as ascii (by using :meth:`asciify`)
* is safe for URL
+ Raises:
+ TypeError: the input argument isn't a string
+
See also :meth:`is_slug` and :meth:`asciify`.
>>> slugify('Top 10 Reasons To Love Dogs!!!')
'monster-magnet'
"""
if not is_string(in_str):
- raise ValueError(in_str)
+ raise TypeError(in_str)
# replace any character that is NOT letter or number with spaces
out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()
Otherwise False is returned.
+ Raises:
+ TypeError: the input argument isn't a string
+
See also :mod:`pyutils.argparse_utils`.
>>> to_bool('True')
True
"""
if not is_string(in_str):
- raise ValueError(in_str)
- return in_str.lower() in ("true", "1", "yes", "y", "t", "on")
+ raise TypeError(in_str)
+ return in_str.lower() in set(["true", "1", "yes", "y", "t", "on"])
def to_date(in_str: str) -> Optional[datetime.date]:
Returns:
The datetime.date the string contained or None to indicate
an error. This parser is relatively clever; see
- :class:`datetimez.dateparse_utils` docs for details.
+ :class:`datetimes.dateparse_utils` docs for details.
- See also: :mod:`pyutils.datetimez.dateparse_utils`, :meth:`extract_date`,
+ See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`extract_date`,
:meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.
>>> to_date('9/11/2001')
datetime.date(2001, 9, 11)
>>> to_date('xyzzy')
"""
- import pyutils.datetimez.dateparse_utils as du
+ import pyutils.datetimes.dateparse_utils as du
try:
d = du.DateParser() # type: ignore
d.parse(in_str)
return d.get_date()
except du.ParseException: # type: ignore
- msg = f'Unable to parse date {in_str}.'
- logger.warning(msg)
+ pass
return None
Returns:
a datetime if date was found, otherwise None
- See also: :mod:`pyutils.datetimez.dateparse_utils`, :meth:`to_date`,
+ See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
:meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.
>>> extract_date("filename.txt dec 13, 2022")
"""
import itertools
- import pyutils.datetimez.dateparse_utils as du
+ import pyutils.datetimes.dateparse_utils as du
d = du.DateParser() # type: ignore
chunks = in_str.split()
):
try:
expr = " ".join(ngram)
- logger.debug(f"Trying {expr}")
+ logger.debug("Trying %s", expr)
if d.parse(expr):
return d.get_datetime()
except du.ParseException: # type: ignore
Returns:
True if the string represents a valid date that we can recognize
and False otherwise. This parser is relatively clever; see
- :class:`datetimez.dateparse_utils` docs for details.
+ :class:`datetimes.dateparse_utils` docs for details.
- See also: :mod:`pyutils.datetimez.dateparse_utils`, :meth:`to_date`,
+ See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
:meth:`extract_date`, :meth:`to_datetime`, :meth:`valid_datetime`.
>>> is_valid_date('1/2/2022')
>>> is_valid_date('xyzzy')
False
"""
- import pyutils.datetimez.dateparse_utils as dp
+ import pyutils.datetimes.dateparse_utils as dp
try:
d = dp.DateParser() # type: ignore
_ = d.parse(in_str)
return True
except dp.ParseException: # type: ignore
- msg = f'Unable to parse date {in_str}.'
- logger.warning(msg)
+ pass
return False
Returns:
A python datetime parsed from in_str or None to indicate
an error. This parser is relatively clever; see
- :class:`datetimez.dateparse_utils` docs for details.
+ :class:`datetimes.dateparse_utils` docs for details.
- See also: :mod:`pyutils.datetimez.dateparse_utils`, :meth:`to_date`,
+ See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
:meth:`extract_date`, :meth:`valid_datetime`.
>>> to_datetime('7/20/1969 02:56 GMT')
datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
"""
- import pyutils.datetimez.dateparse_utils as dp
+ import pyutils.datetimes.dateparse_utils as dp
try:
d = dp.DateParser() # type: ignore
if isinstance(dt, datetime.datetime):
return dt
except Exception:
- msg = f'Unable to parse datetime {in_str}.'
- logger.warning(msg)
+ pass
return None
Returns:
True if in_str contains a valid datetime and False otherwise.
This parser is relatively clever; see
- :class:`datetimez.dateparse_utils` docs for details.
+ :class:`datetimes.dateparse_utils` docs for details.
>>> valid_datetime('next wednesday at noon')
True
_ = to_datetime(in_str)
if _ is not None:
return True
- msg = f'Unable to parse datetime {in_str}.'
- logger.warning(msg)
return False
Returns:
An indented string created by prepending amount spaces.
+ Raises:
+ TypeError: the input argument isn't a string
+
See also :meth:`dedent`.
>>> indent('This is a test', 4)
' This is a test'
"""
if not is_string(in_str):
- raise ValueError(in_str)
+ raise TypeError(in_str)
line_separator = '\n'
lines = [" " * amount + line for line in in_str.split(line_separator)]
return line_separator.join(lines)
sep = " "
if end is None:
end = "\n"
- for i, arg in enumerate(args):
- if i:
+ for n, arg in enumerate(args):
+ if n:
ret += sep
if isinstance(arg, str):
ret += arg
for second in second_list:
# Disallow there're/where're. They're valid English
# but sound weird.
- if (first in ('there', 'where')) and second == 'a(re)':
+ if (first in set(['there', 'where'])) and second == 'a(re)':
continue
pattern = fr'\b({first})\s+{second}\b'
return "th"
-def ngrams(txt: str, n: int):
+get_cardinal_suffix = thify
+
+
+def add_cardinal_suffix(n: int):
+ """
+ Args:
+ n: the number to return as a string with a cardinal suffix.
+
+ Returns:
+ A string containing the number with its cardinal suffix.
+
+ >>> add_cardinal_suffix(123)
+ '123rd'
+
+ >>> add_cardinal_suffix(1)
+ '1st'
+
+ >>> add_cardinal_suffix(0)
+ '0th'
+
+ >>> add_cardinal_suffix(-123)
+ '-123rd'
+ """
+ return f'{n}{get_cardinal_suffix(n)}'
+
+
+def remove_cardinal_suffix(txt: str) -> Optional[str]:
+ """
+ Args:
+ txt: the number with cardinal suffix to strip.
+
+ Returns:
+ The same string with its cardinal suffix removed, or None if txt does not end in 'st', 'nd', 'rd' or 'th'.
+
+ >>> remove_cardinal_suffix('123rd')
+ '123'
+
+ >>> remove_cardinal_suffix('-10th')
+ '-10'
+
+ >>> remove_cardinal_suffix('1ero') is None
+ True
+ """
+ suffix = txt[-2:]
+ if suffix in set(['st', 'nd', 'rd', 'th']):
+ return txt[:-2]
+ return None
+
+
+def ngrams(txt: str, n: int) -> Generator[str, str, None]:
"""
Args:
txt: the string to create ngrams using
words = txt.split()
for ngram in ngrams_presplit(words, n):
ret = ''
- for word in ngram:
- ret += f'{word} '
+ for w in ngram:
+ ret += f'{w} '
yield ret.strip()
-def ngrams_presplit(words: Sequence[str], n: int):
+def ngrams_presplit(
+ words: Sequence[str], n: int
+) -> Generator[Sequence[str], str, None]:
"""
Same as :meth:`ngrams` but with the string pre-split.
return list_utils.ngrams(words, n)
-def bigrams(txt: str):
+def bigrams(txt: str) -> Generator[str, str, None]:
"""Generates the bigrams (n=2) of the given string.
See also :meth:`ngrams`, :meth:`trigrams`.
return ngrams(txt, 2)
-def trigrams(txt: str):
+def trigrams(txt: str) -> Generator[str, str, None]:
"""Generates the trigrams (n=3) of the given string.
See also :meth:`ngrams`, :meth:`bigrams`.
def shuffle_columns_into_list(
- input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
+ input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim: str = ''
) -> Iterable[str]:
"""Helper to shuffle / parse columnar data and return the results as a
list.
def shuffle_columns_into_dict(
input_lines: Sequence[str],
column_specs: Iterable[Tuple[str, Iterable[int]]],
- delim='',
+ delim: str = '',
) -> Dict[str, str]:
"""Helper to shuffle / parse columnar data and return the results
as a dict.
Returns:
txt encoded as an ASCII byte string.
+ Raises:
+ TypeError: the input argument isn't a string or bytes
+
See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`to_bytes`,
:meth:`generate_random_alphanumeric_string`, :meth:`asciify`.
return txt.encode('ascii')
if isinstance(txt, bytes):
return txt
- raise Exception('to_ascii works with strings and bytes')
+ raise TypeError('to_ascii works with strings and bytes')
-def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
+def to_base64(
+ txt: str, *, encoding: str = 'utf-8', errors: str = 'surrogatepass'
+) -> bytes:
"""
Args:
txt: the input data to encode
+ encoding: the encoding to use during conversion
+ errors: how to handle encoding errors
Returns:
txt encoded with a 64-character alphabet. Similar to and compatible
return True
-def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
+def from_base64(
+ b64: bytes, encoding: str = 'utf-8', errors: str = 'surrogatepass'
+) -> str:
"""
Args:
b64: bytestring of base64-encoded data to decode / convert.
+ encoding: the encoding to use during conversion
+ errors: how to handle encoding errors
Returns:
The decoded form of b64 as a normal python string. Similar to
yield txt[x : x + chunk_size]
-def to_bitstring(txt: str, *, delimiter='') -> str:
+def to_bitstring(txt: str, *, delimiter: str = '') -> str:
"""
Args:
txt: the string to convert into a bitstring
return is_binary_integer_number(f'0b{txt}')
-def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
+def from_bitstring(
+ bits: str, encoding: str = 'utf-8', errors: str = 'surrogatepass'
+) -> str:
"""
Args:
bits: the bitstring to convert back into a python string
- encoding: the encoding to use
+ encoding: the encoding to use during conversion
+ errors: how to handle encoding errors
Returns:
The regular python string represented by bits. Note that this