Contractions and stuff in string_utils.
[python_utils.git] / string_utils.py
index 9f67207cdf11af0772a6f3872a17fa5a1be67306..1ed9b4ad10790685290838cd30a2322d59b1b373 100644 (file)
@@ -27,29 +27,20 @@ This class is based on: https://github.com/daveoncode/python-string-utils.
 """
 
 import base64
-import contextlib
+import contextlib  # type: ignore
 import datetime
 import io
-from itertools import zip_longest
 import json
 import logging
 import numbers
 import random
 import re
 import string
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    List,
-    Optional,
-    Sequence,
-    Tuple,
-)
 import unicodedata
-from uuid import uuid4
 import warnings
+from itertools import zip_longest
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
+from uuid import uuid4
 
 import list_utils
 
@@ -1095,7 +1086,7 @@ def to_date(in_str: str) -> Optional[datetime.date]:
     """
     Parses a date string.  See DateParser docs for details.
     """
-    import dateparse.dateparse_utils as dp
+    import dateparse.dateparse_utils as dp  # type: ignore
 
     try:
         d = dp.DateParser()
@@ -1241,7 +1232,7 @@ class SprintfStdout(object):
 
     def __init__(self) -> None:
         self.destination = io.StringIO()
-        self.recorder = None
+        self.recorder: contextlib.redirect_stdout
 
     def __enter__(self) -> Callable[[], str]:
         self.recorder = contextlib.redirect_stdout(self.destination)
@@ -1254,6 +1245,32 @@ class SprintfStdout(object):
         return None  # don't suppress exceptions
 
 
+def capitalize_first_letter(txt: str) -> str:
+    """Capitalize the first letter of a string; '' is returned unchanged.
+
+    >>> capitalize_first_letter('test')
+    'Test'
+    >>> capitalize_first_letter("ALREADY!")
+    'ALREADY!'
+
+    """
+    return txt[:1].upper() + txt[1:]  # slice, not index: '' must not raise
+
+
+def it_they(n: int) -> str:
+    """Choose the pronoun that agrees with a count of n things.
+
+    Returns "it" when n is exactly 1 and "they" for every other value.
+
+    >>> it_they(1)
+    'it'
+    >>> it_they(100)
+    'they'
+
+    """
+    return "it" if n == 1 else "they"
+
+
 def is_are(n: int) -> str:
     """Is or are?
 
@@ -1286,6 +1303,95 @@ def pluralize(n: int) -> str:
     return "s"
 
 
+def make_contractions(txt: str) -> str:
+    """Rewrite common two-word phrases in txt as English contractions.
+
+    Matching ignores case, and every letter of a contraction is copied from
+    the input, so the input's capitalization carries over to the result.
+    Whitespace between the two words is consumed by the contraction.
+
+    >>> make_contractions('It is nice today.')
+    "It's nice today."
+
+    >>> make_contractions('I can    not even...')
+    "I can't even..."
+
+    >>> make_contractions('She could not see!')
+    "She couldn't see!"
+
+    >>> make_contractions('But she will not go.')
+    "But she won't go."
+
+    >>> make_contractions('Verily, I shall not.')
+    "Verily, I shan't."
+
+    >>> make_contractions('No you cannot.')
+    "No you can't."
+
+    >>> make_contractions('I said you can not go.')
+    "I said you can't go."
+
+    """
+
+    # Irregular negations whose spelling changes.  These run first so that,
+    # e.g., "she will not" becomes "she won't" rather than "she'll not".
+    irregular = (
+        (r'\b(can)\s*no(t)\b', r"\1'\2"),
+        (r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3"),
+        (r'\b(w)ill\s*(n)(o)(t)\b', r"\1\3\2'\4"),
+    )
+    for pattern, template in irregular:
+        txt = re.sub(pattern, template, txt, count=0, flags=re.IGNORECASE)
+
+    # Verbs that absorb a following "not", e.g. "is not" -> "isn't".
+    negatable = (
+        'are',
+        'could',
+        'did',
+        'has',
+        'have',
+        'is',
+        'must',
+        'should',
+        'was',
+        'were',
+        'would',
+    )
+    # Words that absorb a following auxiliary verb, e.g. "it is" -> "it's".
+    leaders = (
+        "I",
+        "you",
+        "he",
+        "she",
+        "it",
+        "we",
+        "they",
+        "how",
+        "why",
+        "when",
+        "where",
+        "who",
+        "there",
+    )
+    followers = ('woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)')
+
+    # Each rule: (first words, second-word patterns, replacement template).
+    rules = (
+        (negatable, ('(n)o(t)',), r"\1\2'\3"),
+        (leaders, followers, r"\1'\2"),
+    )
+    for firsts, seconds, template in rules:
+        for first in firsts:
+            for second in seconds:
+                # Disallow there're.  It's valid English but sounds weird.
+                if (first, second) == ('there', 'a(re)'):
+                    continue
+                pattern = fr'\b({first})\s+{second}\b'
+                txt = re.sub(pattern, template, txt, count=0, flags=re.IGNORECASE)
+
+    return txt
+
+
 def thify(n: int) -> str:
     """Return the proper cardinal suffix for a number.
 
@@ -1338,7 +1444,7 @@ def trigrams(txt: str):
 
 
 def shuffle_columns_into_list(
-    input_lines: Iterable[str], column_specs: Iterable[Iterable[int]], delim=''
+    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
 ) -> Iterable[str]:
     """Helper to shuffle / parse columnar data and return the results as a
     list.  The column_specs argument is an iterable collection of
@@ -1368,7 +1474,7 @@ def shuffle_columns_into_list(
 
 
 def shuffle_columns_into_dict(
-    input_lines: Iterable[str],
+    input_lines: Sequence[str],
     column_specs: Iterable[Tuple[str, Iterable[int]]],
     delim='',
 ) -> Dict[str, str]:
@@ -1425,7 +1531,7 @@ def to_ascii(x: str):
     raise Exception('to_ascii works with strings and bytes')
 
 
-def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> str:
+def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
     """Encode txt and then encode the bytes with a 64-character
     alphabet.  This is compatible with uudecode.
 
@@ -1458,7 +1564,7 @@ def is_base64(txt: str) -> bool:
     return True
 
 
-def from_base64(b64: str, encoding='utf-8', errors='surrogatepass') -> str:
+def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
     """Convert base64 encoded string back to normal strings.
 
     >>> from_base64(b'aGVsbG8/\\n')
@@ -1529,7 +1635,7 @@ def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
     return n.to_bytes((n.bit_length() + 7) // 8, 'big').decode(encoding, errors) or '\0'
 
 
-def ip_v4_sort_key(txt: str) -> Tuple[int]:
+def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
     """Turn an IPv4 address into a tuple for sorting purposes.
 
     >>> ip_v4_sort_key('10.0.0.18')
@@ -1546,7 +1652,7 @@ def ip_v4_sort_key(txt: str) -> Tuple[int]:
     return tuple([int(x) for x in txt.split('.')])
 
 
-def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str]:
+def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
     """Chunk up a file path so that parent/ancestor paths sort before
     children/descendant paths.