X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=string_utils.py;h=bca2b70d5cd18bc8bb1198782d356f2707c1cbd5;hb=b10d30a46e601c9ee1f843241f2d69a1f90f7a94;hp=b586ae1a7e82d62e92ba567b20e5a440254fe8b3;hpb=497fb9e21f45ec08e1486abaee6dfa7b20b8a691;p=python_utils.git

diff --git a/string_utils.py b/string_utils.py
index b586ae1..bca2b70 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -1,13 +1,20 @@
 #!/usr/bin/env python3
 
+import contextlib
+import datetime
+import io
+from itertools import zip_longest
 import json
+import logging
 import random
 import re
 import string
-from typing import Any, List, Optional
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
 import unicodedata
 from uuid import uuid4
 
+logger = logging.getLogger(__name__)
+
 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
 
 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
@@ -220,6 +227,36 @@ def strip_escape_sequences(in_str: str) -> str:
     return in_str
 
 
+def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
+    """
+    Insert separator_char between each group of `places` integer digits.
+
+    in_str may be a numeric string or an int (stringified first).  Anything
+    that is_number() rejects is refused.
+
+    Raises ValueError, carrying the rejected value, for non-numeric input.
+    """
+    candidate = f'{in_str}' if isinstance(in_str, int) else in_str
+    if not is_number(candidate):
+        raise ValueError(candidate)
+    return _add_thousands_separator(
+        candidate, separator_char=separator_char, places=places
+    )
+
+
+def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
+    """Group the integer digits of in_str (an already-validated number string).
+
+    A leading sign is set aside so "-123" cannot come out as "-,123", and a
+    fractional part, if present, is re-attached untouched.
+    """
+    sign = in_str[0] if in_str[:1] in ('+', '-') else ''
+    whole, dot, frac = in_str[len(sign):].partition('.')
+    tmp = [iter(whole[::-1])] * places  # one shared iterator => chunks of `places`
+    groups = ["".join(x)[::-1] for x in zip_longest(*tmp, fillvalue="")]
+    return sign + separator_char.join(reversed(groups)) + (dot + frac if frac else '')
+
+
 # Full url example:
 # scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
@@ -326,13 +363,14 @@ def number_to_suffix_string(num: int) -> Optional[str]:
     d = 0.0
     suffix = None
     for (sfx, size) in NUM_SUFFIXES.items():
-        if num > size:
+        if num >= size:
             d = num / size
             suffix = sfx
             break
     if suffix is not None:
         return f"{d:.1f}{suffix}"
-    return None
+    else:
+        return f'{num:d}'
 
 
 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
@@ -779,6 +817,48 @@ def to_bool(in_str: str) -> bool:
     return in_str.lower() in ("true", "1", "yes", "y", "t")
 
 
+def to_date(in_str: str) -> Optional[datetime.date]:
+    import dateparse.dateparse_utils as dp  # imported at call time, not module load
+    try:
+        parser = dp.DateParser()
+        parser.parse(in_str)  # return value unused; the date is read back below
+        return parser.get_date()
+    except dp.ParseException:  # unparseable input: warn, then fall through
+        logger.warning(f'Unable to parse date {in_str}.')
+    return None
+
+
+def valid_date(in_str: str) -> bool:
+    """Return True iff DateParser parses in_str without a ParseException."""
+    import dateparse.dateparse_utils as dp
+    try:
+        dp.DateParser().parse(in_str)  # result discarded; only success matters
+    except dp.ParseException:
+        logger.warning(f'Unable to parse date {in_str}.')
+        return False
+    return True
+
+
+def to_datetime(in_str: str) -> Optional[datetime.datetime]:
+    """Parse in_str; return the result if it is a datetime, otherwise None."""
+    import dateparse.dateparse_utils as dp
+    try:
+        dt = dp.DateParser().parse(in_str)
+        if isinstance(dt, datetime.datetime):
+            return dt
+    except (ValueError, dp.ParseException):  # same failure to_date() handles
+        logger.warning(f'Unable to parse datetime {in_str}.')
+    return None
+
+
+def valid_datetime(in_str: str) -> bool:
+    """Return True iff to_datetime() can turn in_str into a datetime."""
+    if to_datetime(in_str) is None:
+        logger.warning(f'Unable to parse datetime {in_str}.')
+        return False
+    return True
+
+
 def dedent(in_str: str) -> str:
     """
     Removes tab indentation from multi line strings (inspired by analogous Scala function).
@@ -841,3 +921,97 @@ def sprintf(*args, **kwargs) -> str:
             ret += str(arg)
     ret += end
     return ret
+
+
+class SprintfStdout(object):
+    """Context manager capturing stdout; `with` yields a getter for the text."""
+    def __init__(self) -> None:
+        self.destination = io.StringIO()
+        self.recorder = None
+
+    def __enter__(self) -> Callable[[], str]:
+        self.recorder = contextlib.redirect_stdout(self.destination)
+        self.recorder.__enter__()
+        return self.destination.getvalue
+
+    def __exit__(self, *args) -> None:
+        self.recorder.__exit__(*args)  # falls through to None: don't suppress exceptions
+        self.destination.seek(0)
+
+
+def is_are(n: int) -> str:
+    """Return the verb that agrees with a count of n ("is" only when n == 1)."""
+    # Zero and negative counts read as plural, same as anything above one.
+    return "is" if n == 1 else "are"
+
+
+def pluralize(n: int) -> str:
+    """Return the plural suffix for a count of n (empty only when n == 1)."""
+    # Zero and negative counts read as plural, same as anything above one.
+    return "" if n == 1 else "s"
+
+
+def thify(n: int) -> str:
+    """Return the English ordinal suffix for n ("st", "nd", "rd" or "th").
+
+    Numbers ending in 11, 12 or 13 are irregular and take "th" (11th, 112th)
+    even though their last digit alone would pick "st", "nd" or "rd".
+    """
+    digits = str(n)
+    assert is_integer_number(digits)
+    if digits[-2:] in ("11", "12", "13"):
+        return "th"
+    by_last_digit = {"1": "st", "2": "nd", "3": "rd"}
+    return by_last_digit.get(digits[-1:], "th")
+
+
+def ngrams(txt: str, n: int):
+    """Yield each run of n consecutive whitespace-separated words in txt."""
+    return ngrams_presplit(txt.split(), n)
+
+
+def ngrams_presplit(words: Iterable[str], n: int):
+    words = list(words)  # the slices below need a sequence, not a bare iterable
+    yield from (' '.join(ngram) for ngram in zip(*[words[i:] for i in range(n)]))
+
+
+def bigrams(txt: str):
+    return ngrams(txt, n=2)  # every pair of adjacent words in txt
+
+
+def trigrams(txt: str):
+    return ngrams(txt, n=3)  # every run of three adjacent words in txt
+
+
+def shuffle_columns(
+        txt: Iterable[str],
+        specs: Iterable[Iterable[int]],
+        delim=''
+) -> Iterable[str]:
+    """Build one string per spec by joining the chosen columns of txt.
+
+    txt must support integer indexing; each spec lists the indexes to join
+    (in order) with delim between them.  Returns the joined strings as a list.
+    Columns are joined, not concatenated-then-stripped, so delimiter characters
+    inside the first or last column are preserved.
+    """
+    return [delim.join(txt[n] for n in spec) for spec in specs]
+
+
+def shuffle_columns_into_dict(
+        txt: Iterable[str],
+        specs: Iterable[Tuple[str, Iterable[int]]],
+        delim=''
+) -> Dict[str, str]:
+    """Build a dict of joined columns; each spec is (name, column indexes).
+
+    Returns a dict mapping every name to its columns of txt joined with delim.
+    txt must support integer indexing.  Columns are joined rather than
+    concatenated-then-stripped, so delimiter characters inside the first or
+    last column are preserved.
+    """
+    return {spec[0]: delim.join(txt[n] for n in spec[1]) for spec in specs}
+
+
+def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
+    return sprintf(txt.format(**values), end='')  # fill txt's {key} fields from values