def capitalize_first_letter(txt: str) -> str:
    """Capitalize the first letter of a string.

    Args:
        txt: the string to capitalize; may be empty.

    Returns:
        txt with its first character upper-cased and the remainder
        left untouched.  An empty string is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''

    """
    # Slice instead of indexing txt[0] so that an empty string does not
    # raise IndexError.
    return txt[:1].upper() + txt[1:]


def it_they(n: int) -> str:
    """It or they?

    Args:
        n: the count of things being referred to.

    Returns:
        'it' when n is exactly 1, otherwise 'they'.

    >>> it_they(1)
    'it'
    >>> it_they(100)
    'they'

    """
    if n == 1:
        return "it"
    return "they"


def make_contractions(txt: str) -> str:
    """Glue words together to form contractions.

    Matching is case-insensitive; the letters kept in the result are
    copied from the input, so the original capitalization is preserved.

    Args:
        txt: the text in which to form contractions.

    Returns:
        txt with every recognized word pair replaced by its contraction.

    >>> make_contractions('It is nice today.')
    "It's nice today."

    >>> make_contractions('I can not even...')
    "I can't even..."

    >>> make_contractions('She could not see!')
    "She couldn't see!"

    >>> make_contractions('But she will not go.')
    "But she won't go."

    >>> make_contractions('Verily, I shall not.')
    "Verily, I shan't."

    >>> make_contractions('No you cannot.')
    "No you can't."

    >>> make_contractions('I said you can not go.')
    "I said you can't go."

    """

    # Each entry pairs a list of leading words with a list of trailing-word
    # patterns.  The parenthesized part of a trailing pattern is the piece
    # of that word that survives in the contraction.
    first_second = [
        (
            [
                'are',
                'could',
                'did',
                'has',
                'have',
                'is',
                'must',
                'should',
                'was',
                'were',
                'would',
            ],
            ['(n)o(t)'],
        ),
        (
            [
                "I",
                "you",
                "he",
                "she",
                "it",
                "we",
                "they",
                "how",
                "why",
                "when",
                "where",
                "who",
                "there",
            ],
            ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
        ),
    ]

    # Special cases: irregular negations that do not follow the generic
    # "<word>n't" shape.  They run first so the generic rules below never
    # see "will not" / "shall not" / "can not".
    txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
    txt = re.sub(
        r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE
    )
    txt = re.sub(
        r'\b(w)ill\s*(n)(o)(t)\b', r"\1\3\2'\4", txt, count=0, flags=re.IGNORECASE
    )

    for first_list, second_list in first_second:
        for first in first_list:
            for second in second_list:
                # Disallow there're.  It's valid English but sounds weird.
                if first == 'there' and second == 'a(re)':
                    continue

                pattern = fr'\b({first})\s+{second}\b'
                if second == '(n)o(t)':
                    # "<word> not" -> "<word>n't": keep the n, drop the o.
                    replacement = r"\1\2'\3"
                else:
                    # "<word> <aux>" -> "<word>'<kept tail of aux>"
                    replacement = r"\1'\2"
                txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)

    return txt
@@ -1338,7 +1444,7 @@ def trigrams(txt: str): def shuffle_columns_into_list( - input_lines: Iterable[str], column_specs: Iterable[Iterable[int]], delim='' + input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim='' ) -> Iterable[str]: """Helper to shuffle / parse columnar data and return the results as a list. The column_specs argument is an iterable collection of @@ -1368,7 +1474,7 @@ def shuffle_columns_into_list( def shuffle_columns_into_dict( - input_lines: Iterable[str], + input_lines: Sequence[str], column_specs: Iterable[Tuple[str, Iterable[int]]], delim='', ) -> Dict[str, str]: @@ -1425,7 +1531,7 @@ def to_ascii(x: str): raise Exception('to_ascii works with strings and bytes') -def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> str: +def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes: """Encode txt and then encode the bytes with a 64-character alphabet. This is compatible with uudecode. @@ -1458,7 +1564,7 @@ def is_base64(txt: str) -> bool: return True -def from_base64(b64: str, encoding='utf-8', errors='surrogatepass') -> str: +def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str: """Convert base64 encoded string back to normal strings. >>> from_base64(b'aGVsbG8/\\n') @@ -1529,7 +1635,7 @@ def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str: return n.to_bytes((n.bit_length() + 7) // 8, 'big').decode(encoding, errors) or '\0' -def ip_v4_sort_key(txt: str) -> Tuple[int]: +def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]: """Turn an IPv4 address into a tuple for sorting purposes. 
>>> ip_v4_sort_key('10.0.0.18') @@ -1546,7 +1652,7 @@ def ip_v4_sort_key(txt: str) -> Tuple[int]: return tuple([int(x) for x in txt.split('.')]) -def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str]: +def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]: """Chunk up a file path so that parent/ancestor paths sort before children/descendant paths.