X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=string_utils.py;h=6f3cc90ed46f5c238b0887848c1cf7504ec3bcc0;hb=4d03debb5b84b5b3e096add468ecd87c55ed0f5f;hp=9a204660432693032c6dfef79722714d1e133a65;hpb=9eba12cba5641d6a0b988038694cbc2dd52800c5;p=python_utils.git

diff --git a/string_utils.py b/string_utils.py
index 9a20466..6f3cc90 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -30,26 +30,17 @@ import base64
 import contextlib  # type: ignore
 import datetime
 import io
-from itertools import zip_longest
 import json
 import logging
 import numbers
 import random
 import re
 import string
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    List,
-    Optional,
-    Sequence,
-    Tuple,
-)
 import unicodedata
-from uuid import uuid4
 import warnings
+from itertools import zip_longest
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
+from uuid import uuid4
 
 import list_utils
 
@@ -81,9 +72,7 @@ URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
 
 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
 
-EMAILS_RAW_STRING = (
-    r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
-)
+EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
 
 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
 
@@ -93,13 +82,9 @@ CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d
 
 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
 
-SNAKE_CASE_TEST_RE = re.compile(
-    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
-)
+SNAKE_CASE_TEST_RE = re.compile(r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE)
 
-SNAKE_CASE_TEST_DASH_RE = re.compile(
-    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
-)
+SNAKE_CASE_TEST_DASH_RE = re.compile(r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE)
 
 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
 
@@ -116,9 +101,7 @@ CREDIT_CARDS = {
 
 JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)
 
-UUID_RE = re.compile(
-    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
-)
+UUID_RE = re.compile(r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE)
 
 UUID_HEX_OK_RE = re.compile(
     r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
@@ -135,9 +118,7 @@ ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
 
 MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
 
-ANYWHERE_MAC_ADDRESS_RE = re.compile(
-    r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
-)
+ANYWHERE_MAC_ADDRESS_RE = re.compile(r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE)
 
 WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)
 
@@ -427,9 +408,7 @@ def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str
     if isinstance(in_str, numbers.Number):
         in_str = f'{in_str}'
     if is_number(in_str):
-        return _add_thousands_separator(
-            in_str, separator_char=separator_char, places=places
-        )
+        return _add_thousands_separator(in_str, separator_char=separator_char, places=places)
     raise ValueError(in_str)
 
 
@@ -1254,6 +1233,32 @@ class SprintfStdout(object):
         return None  # don't suppress exceptions
 
 
+def capitalize_first_letter(txt: str) -> str:
+    """Uppercase the first character of txt; an empty string is returned as is.
+
+    >>> capitalize_first_letter('test')
+    'Test'
+    >>> capitalize_first_letter("ALREADY!")
+    'ALREADY!'
+
+    """
+    return txt[:1].upper() + txt[1:]  # slicing, unlike txt[0], tolerates ''
+
+
+def it_they(n: int) -> str:
+    """Return the pronoun that agrees with a count of n: 'it' or 'they'.
+
+    >>> it_they(1)
+    'it'
+    >>> it_they(100)
+    'they'
+
+    """
+    # Only a count of exactly one is treated as singular.
+    pronoun = "it" if n == 1 else "they"
+    return pronoun
+
+
 def is_are(n: int) -> str:
     """Is or are?
 
@@ -1286,6 +1291,98 @@ def pluralize(n: int) -> str:
     return "s"
 
 
+def make_contractions(txt: str) -> str:
+    """Glue words together to form contractions (matching is case-insensitive).
+
+    >>> make_contractions('It is nice today.')
+    "It's nice today."
+
+    >>> make_contractions('I can    not even...')
+    "I can't even..."
+
+    >>> make_contractions('She could not see!')
+    "She couldn't see!"
+
+    >>> make_contractions('But she will not go.')
+    "But she won't go."
+
+    >>> make_contractions('Verily, I shall not.')
+    "Verily, I shan't."
+
+    >>> make_contractions('No you cannot.')
+    "No you can't."
+
+    >>> make_contractions('I said you can not go.')
+    "I said you can't go."
+
+    """
+    # (first words, second-word patterns); letters in parentheses are kept.
+    first_second = [
+        (
+            [
+                'are',
+                'could',
+                'did',
+                'has',
+                'have',
+                'is',
+                'must',
+                'should',
+                'was',
+                'were',
+                'would',
+            ],
+            ['(n)o(t)'],  # e.g. "is not" -> "isn't"
+        ),
+        (
+            [
+                "I",
+                "you",
+                "he",
+                "she",
+                "it",
+                "we",
+                "they",
+                "how",
+                "why",
+                "when",
+                "where",
+                "who",
+                "there",
+            ],
+            ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
+        ),
+    ]
+
+    # Irregular forms first: can't, shan't and won't (\s* also accepts "cannot").
+    txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
+    txt = re.sub(r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE)
+    txt = re.sub(
+        r'\b(w)ill\s*(n)(o)(t)\b',
+        r"\1\3\2'\4",  # reorder the captured letters: w + o + n + ' + t
+        txt,
+        count=0,
+        flags=re.IGNORECASE,
+    )
+
+    for first_list, second_list in first_second:
+        for first in first_list:
+            for second in second_list:
+                # Disallow there're/where're.  They're valid English
+                # but sound weird.
+                if (first == 'there' or first == 'where') and second == 'a(re)':
+                    continue
+
+                pattern = fr'\b({first})\s+{second}\b'  # whole words only
+                if second == '(n)o(t)':
+                    replacement = r"\1\2'\3"  # first word + n't
+                else:
+                    replacement = r"\1'\2"  # first word + ' + kept letters
+                txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)
+
+    return txt
+
+
 def thify(n: int) -> str:
     """Return the proper cardinal suffix for a number.
 
@@ -1483,9 +1580,7 @@ def chunk(txt: str, chunk_size):
         yield txt[x : x + chunk_size]
 
 
-def to_bitstring(
-    txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass'
-) -> str:
+def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
     """Encode txt and then chop it into bytes.  Note: only bitstrings
     with delimiter='' are interpretable by from_bitstring.