src/pyutils/string_utils.py

   1 #!/usr/bin/env python3
   2 # -*- coding: utf-8 -*-
   3
   4 """The MIT License (MIT)
   5
   6 Copyright (c) 2016-2020 Davide Zanotti
   7
   8 Modifications Copyright (c) 2021-2022 Scott Gasch
   9
  10 Permission is hereby granted, free of charge, to any person obtaining a copy
  11 of this software and associated documentation files (the "Software"), to deal
  12 in the Software without restriction, including without limitation the rights
  13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 copies of the Software, and to permit persons to whom the Software is
  15 furnished to do so, subject to the following conditions:
  16
  17 The above copyright notice and this permission notice shall be included in all
  18 copies or substantial portions of the Software.
  19
  20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  26 SOFTWARE.
  27
This module is based on:
  29 https://github.com/daveoncode/python-string-utils.  See `NOTICE
  30 <https://wannabe.guru.org/gitweb/?p=pyutils.git;a=blob_plain;f=NOTICE;hb=HEAD>`__
  31 in the root of this module for a detailed enumeration of what work is
  32 Davide's and what work was added by Scott.
  33
  34 """
  35
  36 import base64
  37 import contextlib  # type: ignore
  38 import datetime
  39 import io
  40 import json
  41 import logging
  42 import numbers
  43 import random
  44 import re
  45 import string
  46 import unicodedata
  47 import warnings
  48 from itertools import zip_longest
  49 from typing import (
  50     Any,
  51     Callable,
  52     Dict,
  53     Generator,
  54     Iterable,
  55     List,
  56     Literal,
  57     Optional,
  58     Sequence,
  59     Tuple,
  60 )
  61 from uuid import uuid4
  62
  63 from pyutils import list_utils
  64
  65 logger = logging.getLogger(__name__)
  66
  67 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  68
  69 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  70
  71 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  72
  73 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  74
# Building block for the URL patterns below; each fragment is one capture
# group describing one component of a URL.
# NOTE(review): in the query-string class, "%-=" is a character *range*
# (from '%' through '='), which is what admits characters such as '&';
# confirm this is intentional.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Anchored, case-insensitive: the whole string must be a single URL.
URL_RE = re.compile(rf"^{URLS_RAW_STRING}$", re.IGNORECASE)

# Unanchored, case-insensitive: finds URLs embedded in larger text.
URLS_RE = re.compile(rf"({URLS_RAW_STRING})", re.IGNORECASE)

# Matches a run of "@" that is followed later in the string by a double
# quote, or a backslash-escaped "@".
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# Building block for the email patterns below: local part, "@", then a
# domain ending in a 2-4 letter TLD.
# NOTE(review): the domain classes are lowercase-only and the patterns
# compiled from this string do not pass re.IGNORECASE, so uppercase domains
# will not match; confirm this is intended.
EMAILS_RAW_STRING = (
    r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
)

# Anchored: the whole string must be a single email address.
EMAIL_RE = re.compile(rf"^{EMAILS_RAW_STRING}$")

# Unanchored: finds email addresses embedded in larger text.
EMAILS_RE = re.compile(rf"({EMAILS_RAW_STRING})")

# Whole-string test: letters (and trailing digits) containing at least one
# lower->upper or upper->lower case transition.
CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

# Matches a lowercase letter or run of uppercase letters that is immediately
# followed by an uppercase letter (i.e. a camel case word boundary).
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Whole-string snake case test using "_" as the separator (case-insensitive).
SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

# Same as above with "-" as the separator.
# NOTE(review): unlike SNAKE_CASE_TEST_RE this has no leading "^"; it is only
# start-anchored when used with .match().
SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

# A "_" separator together with the lowercase letter or digit that follows it.
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

# A "-" separator together with the lowercase letter or digit that follows it.
SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")

# Card type name -> pattern describing the digit layout (prefix and length)
# accepted for that type.
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}

# Cheap pre-check: text wrapped in [...] or {...} with optional surrounding
# whitespace.  It does not validate the JSON itself.
JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)

# Canonical 8-4-4-4-12 UUID layout with mandatory dashes (case-insensitive).
UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# Same layout, but every dash is optional.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# Shape-only IPv4 check: four dot-separated groups of 1-3 digits.  The
# pattern itself does not range-check the groups.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

# Unanchored variant for finding an IPv4-shaped token inside larger text.
ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# Eight colon-separated groups of 0-4 characters each.
# NOTE(review): the groups use [a-z\d], which is wider than hex digits.
IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

# Unanchored variant for finding an IPv6-shaped token inside larger text.
ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)

# Six two-digit hex groups separated by ":" or "-".
MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)

# Unanchored variant for finding a MAC address inside larger text.
ANYWHERE_MAC_ADDRESS_RE = re.compile(
    r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)

# One "word": a run of letters/digits (no underscores) plus any surrounding
# non-word characters.
WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)

# An opening tag optionally followed by content up to a closing tag, or an
# HTML comment, or a doctype declaration.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Just the markup: an opening tag, a closing tag, a comment or a doctype,
# without the content between tags.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# A single whitespace character.
SPACES_RE = re.compile(r"\s")

# Runs of characters that are neither letters nor digits, or runs of "_".
NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Leading whitespace other than CR / LF.  No MULTILINE flag is set, so "^"
# only matches at the very start of the string.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")

# ESC "[" followed by any non-letters and terminated by a single letter.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1B\[[^A-Za-z]*[A-Za-z]")
 170
 171 NUM_SUFFIXES = {
 172     "Pb": (1024**5),
 173     "P": (1024**5),
 174     "Tb": (1024**4),
 175     "T": (1024**4),
 176     "Gb": (1024**3),
 177     "G": (1024**3),
 178     "Mb": (1024**2),
 179     "M": (1024**2),
 180     "Kb": (1024**1),
 181     "K": (1024**1),
 182 }
 183
 184 UNIT_WORDS = [
 185     "zero",
 186     "one",
 187     "two",
 188     "three",
 189     "four",
 190     "five",
 191     "six",
 192     "seven",
 193     "eight",
 194     "nine",
 195     "ten",
 196     "eleven",
 197     "twelve",
 198     "thirteen",
 199     "fourteen",
 200     "fifteen",
 201     "sixteen",
 202     "seventeen",
 203     "eighteen",
 204     "nineteen",
 205 ]
 206
 207 TENS_WORDS = [
 208     "",
 209     "",
 210     "twenty",
 211     "thirty",
 212     "forty",
 213     "fifty",
 214     "sixty",
 215     "seventy",
 216     "eighty",
 217     "ninety",
 218 ]
 219
 220 MAGNITUDE_SCALES = [
 221     "hundred",
 222     "thousand",
 223     "million",
 224     "billion",
 225     "trillion",
 226     "quadrillion",
 227 ]
 228
 229 NUM_WORDS = {}
 230 NUM_WORDS["and"] = (1, 0)
 231 for i, word in enumerate(UNIT_WORDS):
 232     NUM_WORDS[word] = (1, i)
 233 for i, word in enumerate(TENS_WORDS):
 234     NUM_WORDS[word] = (1, i * 10)
 235 for i, word in enumerate(MAGNITUDE_SCALES):
 236     if i == 0:
 237         NUM_WORDS[word] = (100, 0)
 238     else:
 239         NUM_WORDS[word] = (10 ** (i * 3), 0)
 240 NUM_WORDS['score'] = (20, 0)
 241
 242
 243 def is_none_or_empty(in_str: Optional[str]) -> bool:
 244     """
 245     Args:
 246         in_str: the string to test
 247
 248     Returns:
 249         True if the input string is either None or an empty string,
 250         False otherwise.
 251
 252     See also :meth:`is_string` and :meth:`is_empty_string`.
 253
 254     >>> is_none_or_empty("")
 255     True
 256     >>> is_none_or_empty(None)
 257     True
 258     >>> is_none_or_empty("   \t   ")
 259     True
 260     >>> is_none_or_empty('Test')
 261     False
 262     """
 263     return in_str is None or len(in_str.strip()) == 0
 264
 265
 266 def is_string(in_str: Any) -> bool:
 267     """
 268     Args:
 269         in_str: the object to test
 270
 271     Returns:
 272         True if the object is a string and False otherwise.
 273
 274     See also :meth:`is_empty_string`, :meth:`is_none_or_empty`.
 275
 276     >>> is_string('test')
 277     True
 278     >>> is_string(123)
 279     False
 280     >>> is_string(100.3)
 281     False
 282     >>> is_string([1, 2, 3])
 283     False
 284     """
 285     return isinstance(in_str, str)
 286
 287
 288 def is_empty_string(in_str: Any) -> bool:
 289     """
 290     Args:
 291         in_str: the string to test
 292
 293     Returns:
 294         True if the string is empty and False otherwise.
 295
 296     See also :meth:`is_none_or_empty`, :meth:`is_full_string`.
 297     """
 298     return is_empty(in_str)
 299
 300
 301 def is_empty(in_str: Any) -> bool:
 302     """
 303     Args:
 304         in_str: the string to test
 305
 306     Returns:
 307         True if the string is empty and false otherwise.
 308
 309     See also :meth:`is_none_or_empty`, :meth:`is_full_string`.
 310
 311     >>> is_empty('')
 312     True
 313     >>> is_empty('    \t\t    ')
 314     True
 315     >>> is_empty('test')
 316     False
 317     >>> is_empty(100.88)
 318     False
 319     >>> is_empty([1, 2, 3])
 320     False
 321     """
 322     return is_string(in_str) and in_str.strip() == ""
 323
 324
 325 def is_full_string(in_str: Any) -> bool:
 326     """
 327     Args:
 328         in_str: the object to test
 329
 330     Returns:
 331         True if the object is a string and is not empty ('') and
 332         is not only composed of whitespace.
 333
 334     See also :meth:`is_string`, :meth:`is_empty_string`, :meth:`is_none_or_empty`.
 335
 336     >>> is_full_string('test!')
 337     True
 338     >>> is_full_string('')
 339     False
 340     >>> is_full_string('      ')
 341     False
 342     >>> is_full_string(100.999)
 343     False
 344     >>> is_full_string({"a": 1, "b": 2})
 345     False
 346     """
 347     return is_string(in_str) and in_str.strip() != ""
 348
 349
 350 def is_number(in_str: str) -> bool:
 351     """
 352     Args:
 353         in_str: the string to test
 354
 355     Returns:
 356         True if the string contains a valid numberic value and
 357         False otherwise.
 358
 359     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 360     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 361     etc...
 362
 363     >>> is_number(100.5)
 364     Traceback (most recent call last):
 365     ...
 366     ValueError: 100.5
 367     >>> is_number("100.5")
 368     True
 369     >>> is_number("test")
 370     False
 371     >>> is_number("99")
 372     True
 373     >>> is_number([1, 2, 3])
 374     Traceback (most recent call last):
 375     ...
 376     ValueError: [1, 2, 3]
 377     """
 378     if not is_string(in_str):
 379         raise ValueError(in_str)
 380     return NUMBER_RE.match(in_str) is not None
 381
 382
 383 def is_integer_number(in_str: str) -> bool:
 384     """
 385     Args:
 386         in_str: the string to test
 387
 388     Returns:
 389         True if the string contains a valid (signed or unsigned,
 390         decimal, hex, or octal, regular or scientific) integral
 391         expression and False otherwise.
 392
 393     See also :meth:`is_number`, :meth:`is_decimal_number`,
 394     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 395     etc...
 396
 397     >>> is_integer_number('42')
 398     True
 399     >>> is_integer_number('42.0')
 400     False
 401     """
 402     return (
 403         (is_number(in_str) and "." not in in_str)
 404         or is_hexidecimal_integer_number(in_str)
 405         or is_octal_integer_number(in_str)
 406         or is_binary_integer_number(in_str)
 407     )
 408
 409
 410 def is_hexidecimal_integer_number(in_str: str) -> bool:
 411     """
 412     Args:
 413         in_str: the string to test
 414
 415     Returns:
 416         True if the string is a hex integer number and False otherwise.
 417
 418     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 419     :meth:`is_octal_integer_number`, :meth:`is_binary_integer_number`, etc...
 420
 421     >>> is_hexidecimal_integer_number('0x12345')
 422     True
 423     >>> is_hexidecimal_integer_number('0x1A3E')
 424     True
 425     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 426     False
 427     >>> is_hexidecimal_integer_number('-0xff')
 428     True
 429     >>> is_hexidecimal_integer_number('test')
 430     False
 431     >>> is_hexidecimal_integer_number(12345)  # Not a string
 432     Traceback (most recent call last):
 433     ...
 434     ValueError: 12345
 435     >>> is_hexidecimal_integer_number(101.4)
 436     Traceback (most recent call last):
 437     ...
 438     ValueError: 101.4
 439     >>> is_hexidecimal_integer_number(0x1A3E)
 440     Traceback (most recent call last):
 441     ...
 442     ValueError: 6718
 443     """
 444     if not is_string(in_str):
 445         raise ValueError(in_str)
 446     return HEX_NUMBER_RE.match(in_str) is not None
 447
 448
 449 def is_octal_integer_number(in_str: str) -> bool:
 450     """
 451     Args:
 452         in_str: the string to test
 453
 454     Returns:
 455         True if the string is a valid octal integral number and False otherwise.
 456
 457     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 458     :meth:`is_hexidecimal_integer_number`, :meth:`is_binary_integer_number`,
 459     etc...
 460
 461     >>> is_octal_integer_number('0o777')
 462     True
 463     >>> is_octal_integer_number('-0O115')
 464     True
 465     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 466     False
 467     >>> is_octal_integer_number('7777')  # Needs 0o
 468     False
 469     >>> is_octal_integer_number('test')
 470     False
 471     """
 472     if not is_string(in_str):
 473         raise ValueError(in_str)
 474     return OCT_NUMBER_RE.match(in_str) is not None
 475
 476
 477 def is_binary_integer_number(in_str: str) -> bool:
 478     """
 479     Args:
 480         in_str: the string to test
 481
 482     Returns:
 483         True if the string contains a binary integral number and False otherwise.
 484
 485     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 486     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 487     etc...
 488
 489     >>> is_binary_integer_number('0b10111')
 490     True
 491     >>> is_binary_integer_number('-0b111')
 492     True
 493     >>> is_binary_integer_number('0B10101')
 494     True
 495     >>> is_binary_integer_number('0b10102')
 496     False
 497     >>> is_binary_integer_number('0xFFF')
 498     False
 499     >>> is_binary_integer_number('test')
 500     False
 501     """
 502     if not is_string(in_str):
 503         raise ValueError(in_str)
 504     return BIN_NUMBER_RE.match(in_str) is not None
 505
 506
 507 def to_int(in_str: str) -> int:
 508     """
 509     Args:
 510         in_str: the string to convert
 511
 512     Returns:
 513         The integral value of the string or raises on error.
 514
 515     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 516     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 517     :meth:`is_binary_integer_number`, etc...
 518
 519     >>> to_int('1234')
 520     1234
 521     >>> to_int('0x1234')
 522     4660
 523     >>> to_int('0b01101')
 524     13
 525     >>> to_int('0o777')
 526     511
 527     >>> to_int('test')
 528     Traceback (most recent call last):
 529     ...
 530     ValueError: invalid literal for int() with base 10: 'test'
 531     """
 532     if not is_string(in_str):
 533         raise ValueError(in_str)
 534     if is_binary_integer_number(in_str):
 535         return int(in_str, 2)
 536     if is_octal_integer_number(in_str):
 537         return int(in_str, 8)
 538     if is_hexidecimal_integer_number(in_str):
 539         return int(in_str, 16)
 540     return int(in_str)
 541
 542
 543 def number_string_to_integer(in_str: str) -> int:
 544     """Convert a string containing a written-out number into an int.
 545
 546     Args:
 547         in_str: the string containing the long-hand written out integer number
 548             in English.  See examples below.
 549
 550     Returns:
 551         The integer whose value was parsed from in_str.
 552
 553     See also :meth:`integer_to_number_string`.
 554
 555     .. warning::
 556         This code only handles integers; it will not work with decimals / floats.
 557
 558     >>> number_string_to_integer("one hundred fifty two")
 559     152
 560
 561     >>> number_string_to_integer("ten billion two hundred million fifty four thousand three")
 562     10200054003
 563
 564     >>> number_string_to_integer("four-score and 7")
 565     87
 566
 567     >>> number_string_to_integer("fifty xyzzy three")
 568     Traceback (most recent call last):
 569     ...
 570     ValueError: Unknown word: xyzzy
 571     """
 572     if isinstance(in_str, int):
 573         return int(in_str)
 574
 575     current = result = 0
 576     in_str = in_str.replace('-', ' ')
 577     for w in in_str.split():
 578         if w not in NUM_WORDS:
 579             if is_integer_number(w):
 580                 current += int(w)
 581                 continue
 582             else:
 583                 raise ValueError("Unknown word: " + w)
 584         scale, increment = NUM_WORDS[w]
 585         current = current * scale + increment
 586         if scale > 100:
 587             result += current
 588             current = 0
 589     return result + current
 590
 591
 592 def integer_to_number_string(num: int) -> str:
 593     """
 594     Opposite of :meth:`number_string_to_integer`; converts a number to a written out
 595     longhand format in English.
 596
 597     Args:
 598         num: the integer number to convert
 599
 600     Returns:
 601         The long-hand written out English form of the number.  See examples below.
 602
 603     See also :meth:`number_string_to_integer`.
 604
 605     .. warning::
 606         This method does not handle decimals or floats, only ints.
 607
 608     >>> integer_to_number_string(9)
 609     'nine'
 610
 611     >>> integer_to_number_string(42)
 612     'forty two'
 613
 614     >>> integer_to_number_string(123219982)
 615     'one hundred twenty three million two hundred nineteen thousand nine hundred eighty two'
 616     """
 617
 618     if num < 20:
 619         return UNIT_WORDS[num]
 620     if num < 100:
 621         ret = TENS_WORDS[num // 10]
 622         leftover = num % 10
 623         if leftover != 0:
 624             ret += ' ' + UNIT_WORDS[leftover]
 625         return ret
 626
 627     # If num > 100 go find the highest chunk and convert that, then recursively
 628     # convert the rest.  NUM_WORDS contains items like 'thousand' -> (1000, 0).
 629     # The second item in the tuple is an increment that can be ignored; the first
 630     # is the numeric "scale" of the entry.  So find the greatest entry in NUM_WORDS
 631     # still less than num.  For 123,456 it would be thousand.  Then pull out the
 632     # 123, convert it, and append "thousand".  Then do the rest.
 633     scales = {}
 634     for name, val in NUM_WORDS.items():
 635         if val[0] <= num:
 636             scales[name] = val[0]
 637     scale = max(scales.items(), key=lambda _: _[1])
 638
 639     # scale[1] = numeric magnitude (e.g. 1000)
 640     # scale[0] = name (e.g. "thousand")
 641     ret = integer_to_number_string(num // scale[1]) + ' ' + scale[0]
 642     leftover = num % scale[1]
 643     if leftover != 0:
 644         ret += ' ' + integer_to_number_string(leftover)
 645     return ret
 646
 647
 648 def is_decimal_number(in_str: str) -> bool:
 649     """
 650     Args:
 651         in_str: the string to check
 652
 653     Returns:
 654         True if the given string represents a decimal or False
 655         otherwise.  A decimal may be signed or unsigned or use
 656         a "scientific notation".
 657
 658     See also :meth:`is_integer_number`.
 659
 660     .. note::
 661         We do not consider integers without a decimal point
 662         to be decimals; they return False (see example).
 663
 664     >>> is_decimal_number('42.0')
 665     True
 666     >>> is_decimal_number('42')
 667     False
 668     """
 669     return is_number(in_str) and "." in in_str
 670
 671
 672 def strip_escape_sequences(in_str: str) -> str:
 673     """
 674     Args:
 675         in_str: the string to strip of escape sequences.
 676
 677     Returns:
 678         in_str with escape sequences removed.
 679
 680     See also: :mod:`pyutils.ansi`.
 681
 682     .. note::
 683         What is considered to be an "escape sequence" is defined
 684         by a regular expression.  While this gets common ones,
 685         there may exist valid sequences that it doesn't match.
 686
 687     >>> strip_escape_sequences('\x1B[12;11;22mthis is a test!')
 688     'this is a test!'
 689     """
 690     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 691     return in_str
 692
 693
 694 def add_thousands_separator(
 695     in_str: str, *, separator_char: str = ',', places: int = 3
 696 ) -> str:
 697     """
 698     Args:
 699         in_str: string or number to which to add thousands separator(s)
 700         separator_char: the separator character to add (defaults to comma)
 701         places: add a separator every N places (defaults to three)
 702
 703     Returns:
 704         A numeric string with thousands separators added appropriately.
 705
 706     >>> add_thousands_separator('12345678')
 707     '12,345,678'
 708     >>> add_thousands_separator(12345678)
 709     '12,345,678'
 710     >>> add_thousands_separator(12345678.99)
 711     '12,345,678.99'
 712     >>> add_thousands_separator('test')
 713     Traceback (most recent call last):
 714     ...
 715     ValueError: test
 716
 717     """
 718     if isinstance(in_str, numbers.Number):
 719         in_str = f'{in_str}'
 720     if is_number(in_str):
 721         return _add_thousands_separator(
 722             in_str, separator_char=separator_char, places=places
 723         )
 724     raise ValueError(in_str)
 725
 726
 727 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 728     """Internal helper"""
 729     decimal_part = ""
 730     if '.' in in_str:
 731         (in_str, decimal_part) = in_str.split('.')
 732     tmp = [iter(in_str[::-1])] * places
 733     ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 734     if decimal_part:
 735         ret += '.'
 736         ret += decimal_part
 737     return ret
 738
 739
 740 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 741     """
 742     Args:
 743         in_str: the string to test
 744         allowed_schemes: an optional list of allowed schemes (e.g.
 745             ['http', 'https', 'ftp'].  If passed, only URLs that
 746             begin with the one of the schemes passed will be considered
 747             to be valid.  Otherwise, any scheme:// will be considered
 748             valid.
 749
 750     Returns:
 751         True if in_str contains a valid URL and False otherwise.
 752
 753     >>> is_url('http://www.mysite.com')
 754     True
 755     >>> is_url('https://mysite.com')
 756     True
 757     >>> is_url('.mysite.com')
 758     False
 759     >>> is_url('scheme://username:[email protected]:8042/folder/subfolder/file.extension?param=value&param2=value2#hash')
 760     True
 761     """
 762     if not is_full_string(in_str):
 763         return False
 764
 765     valid = URL_RE.match(in_str) is not None
 766
 767     if allowed_schemes:
 768         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 769     return valid
 770
 771
 772 def is_email(in_str: Any) -> bool:
 773     """
 774     Args:
 775         in_str: the email address to check
 776
 777     Returns: True if the in_str contains a valid email (as defined by
 778         https://tools.ietf.org/html/rfc3696#section-3) or False
 779         otherwise.
 780
 781     >>> is_email('[email protected]')
 782     True
 783     >>> is_email('@gmail.com')
 784     False
 785     """
 786     if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
 787         return False
 788
 789     try:
 790         # we expect 2 tokens, one before "@" and one after, otherwise
 791         # we have an exception and the email is not valid.
 792         head, tail = in_str.split("@")
 793
 794         # head's size must be <= 64, tail <= 255, head must not start
 795         # with a dot or contain multiple consecutive dots.
 796         if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
 797             return False
 798
 799         # removes escaped spaces, so that later on the test regex will
 800         # accept the string.
 801         head = head.replace("\\ ", "")
 802         if head.startswith('"') and head.endswith('"'):
 803             head = head.replace(" ", "")[1:-1]
 804         return EMAIL_RE.match(head + "@" + tail) is not None
 805
 806     except ValueError:
 807         # borderline case in which we have multiple "@" signs but the
 808         # head part is correctly escaped.
 809         if ESCAPED_AT_SIGN.search(in_str) is not None:
 810             # replace "@" with "a" in the head
 811             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 812         return False
 813
 814
 815 def suffix_string_to_number(in_str: str) -> Optional[int]:
 816     """Takes a string like "33Gb" and converts it into a number (of bytes)
 817     like 34603008.
 818
 819     Args:
 820         in_str: the string with a suffix to be interpreted and removed.
 821
 822     Returns:
 823         An integer number of bytes or None to indicate an error.
 824
 825     See also :meth:`number_to_suffix_string`.
 826
 827     >>> suffix_string_to_number('1Mb')
 828     1048576
 829     >>> suffix_string_to_number('13.1Gb')
 830     14066017894
 831     >>> suffix_string_to_number('12345')
 832     12345
 833     >>> x = suffix_string_to_number('a lot')
 834     >>> x is None
 835     True
 836     """
 837
 838     def suffix_capitalize(s: str) -> str:
 839         if len(s) == 1:
 840             return s.upper()
 841         elif len(s) == 2:
 842             return f"{s[0].upper()}{s[1].lower()}"
 843         return suffix_capitalize(s[0:1])
 844
 845     if is_string(in_str):
 846         if is_integer_number(in_str):
 847             return to_int(in_str)
 848         suffixes = [in_str[-2:], in_str[-1:]]
 849         rest = [in_str[:-2], in_str[:-1]]
 850         for x in range(len(suffixes)):
 851             s = suffixes[x]
 852             s = suffix_capitalize(s)
 853             multiplier = NUM_SUFFIXES.get(s, None)
 854             if multiplier is not None:
 855                 r = rest[x]
 856                 if is_integer_number(r):
 857                     return to_int(r) * multiplier
 858                 if is_decimal_number(r):
 859                     return int(float(r) * multiplier)
 860     return None
 861
 862
 863 def number_to_suffix_string(num: int) -> Optional[str]:
 864     """Take a number (of bytes) and returns a string like "43.8Gb".
 865
 866     Args:
 867         num: an integer number of bytes
 868
 869     Returns:
 870         A string with a suffix representing num bytes concisely or
 871         None to indicate an error.
 872
 873     See also: :meth:`suffix_string_to_number`.
 874
 875     >>> number_to_suffix_string(14066017894)
 876     '13.1Gb'
 877     >>> number_to_suffix_string(1024 * 1024)
 878     '1.0Mb'
 879     """
 880     d = 0.0
 881     suffix = None
 882     for (sfx, size) in NUM_SUFFIXES.items():
 883         if num >= size:
 884             d = num / size
 885             suffix = sfx
 886             break
 887     if suffix is not None:
 888         return f"{d:.1f}{suffix}"
 889     else:
 890         return f'{num:d}'
 891
 892
 893 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 894     """
 895     Args:
 896         in_str: a string to check
 897         card_type: if provided, contains the card type to validate
 898             with.  Otherwise, all known credit card number types will
 899             be accepted.
 900
 901             Supported card types are the following:
 902
 903             * VISA
 904             * MASTERCARD
 905             * AMERICAN_EXPRESS
 906             * DINERS_CLUB
 907             * DISCOVER
 908             * JCB
 909
 910     Returns:
 911         True if in_str is a valid credit card number.
 912
 913     .. warning::
 914         This code is not verifying the authenticity of the credit card (i.e.
 915         not checking whether it's a real card that can be charged); rather
 916         it's only checking that the number follows the "rules" for numbering
 917         established by credit card issuers.
 918
 919     """
 920     if not is_full_string(in_str):
 921         return False
 922
 923     if card_type is not None:
 924         if card_type not in CREDIT_CARDS:
 925             raise KeyError(
 926                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 927             )
 928         return CREDIT_CARDS[card_type].match(in_str) is not None
 929     for c in CREDIT_CARDS:
 930         if CREDIT_CARDS[c].match(in_str) is not None:
 931             return True
 932     return False
 933
 934
 935 def is_camel_case(in_str: Any) -> bool:
 936     """
 937     Args:
 938         in_str: the string to test
 939
 940     Returns:
 941         True if the string is formatted as camel case and False otherwise.
 942         A string is considered camel case when:
 943
 944         * it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 945         * it contains both lowercase and uppercase letters
 946         * it does not start with a number
 947
 948     See also :meth:`is_snake_case`, :meth:`is_slug`, and :meth:`camel_case_to_snake_case`.
 949     """
 950     return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 951
 952
 953 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 954     """
 955     Args:
 956         in_str: the string to test
 957         separator: the snake case separator character to use
 958
 959     Returns: True if the string is snake case and False otherwise.  A
 960         string is considered snake case when:
 961
 962         * it's composed only by lowercase/uppercase letters and digits
 963         * it contains at least one underscore (or provided separator)
 964         * it does not start with a number
 965
 966     See also :meth:`is_camel_case`, :meth:`is_slug`, and :meth:`snake_case_to_camel_case`.
 967
 968     >>> is_snake_case('this_is_a_test')
 969     True
 970     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 971     True
 972     >>> is_snake_case('this-is-a-test')
 973     False
 974     >>> is_snake_case('this-is-a-test', separator='-')
 975     True
 976     """
 977     if is_full_string(in_str):
 978         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 979         re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 980         r = re_map.get(
 981             separator,
 982             re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
 983         )
 984         return r.match(in_str) is not None
 985     return False
 986
 987
 988 def is_json(in_str: Any) -> bool:
 989     """
 990     Args:
 991         in_str: the string to test
 992
 993     Returns:
 994         True if the in_str contains valid JSON and False otherwise.
 995
 996     >>> is_json('{"name": "Peter"}')
 997     True
 998     >>> is_json('[1, 2, 3]')
 999     True
1000     >>> is_json('{nope}')
1001     False
1002     """
1003     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
1004         try:
1005             return isinstance(json.loads(in_str), (dict, list))
1006         except (TypeError, ValueError, OverflowError):
1007             pass
1008     return False
1009
1010
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Args:
        in_str: the value to test; it is converted with str() first so
            that UUID objects can be passed directly
        allow_hex: when True the value is checked against the more
            permissive hexadecimal pattern (see the dash-free example
            below) instead of the default UUID pattern

    Returns:
        True if the in_str contains a valid UUID and False otherwise.

    See also :meth:`generate_uuid`.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    candidate = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(candidate) is not None
1034
1035
def is_ip_v4(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str contains a valid IPv4 address and False otherwise.

    See also :meth:`extract_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('nope')
    False
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str):
        return False
    if SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # The regex test alone does not rule out values such as 999, so
    # every dot-separated token is range-checked (0 to 255) as well.
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
1062
1063
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string to extract an IPv4 address from.

    Returns:
        The first IPv4 address found in in_str, or None when in_str is
        not a usable string or contains no address.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if is_full_string(in_str):
        hit = ANYWHERE_IP_V4_RE.search(in_str)
        if hit is not None:
            return hit.group(0)
    return None
1086
1087
def is_ip_v6(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IPv6 address and False otherwise.

    See also :meth:`is_ip_v4`, :meth:`extract_ip_v4`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
1105
1106
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract an IPv6 address.

    Returns:
        The first IPv6 address found in in_str, or None when in_str is
        not a usable string or contains no address.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v4`,
    and :meth:`is_ip`.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if is_full_string(in_str):
        hit = ANYWHERE_IP_V6_RE.search(in_str)
        if hit is not None:
            return hit.group(0)
    return None
1129
1130
def is_ip(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IP address (either IPv4 or
        IPv6) and False otherwise.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3')
    False
    >>> is_ip('1.2.3.999')
    False
    """
    # IPv6 is tried first; IPv4 is only consulted when that fails.
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
1153
1154
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract an IP address.

    Returns:
        The first IPv4 address found in in_str; when there is none,
        the first IPv6 address found; None when neither search
        succeeds.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    # IPv4 takes precedence; IPv6 is only a fallback.
    v4 = extract_ip_v4(in_str)
    if v4 is not None:
        return v4
    return extract_ip_v6(in_str)
1177
1178
def is_mac_address(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str is a valid MAC address and False otherwise.

    See also :meth:`extract_mac_address`, :meth:`is_ip`, etc...

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
1199
1200
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract a MAC address.
        separator: the MAC address hex byte separator to use in the
            returned value; every ':' or '-' in the address that was
            found is replaced by it.

    Returns:
        The first MAC address found in in_str (rewritten to use
        separator) or None to indicate no match or an error.

    See also :meth:`is_mac_address`, :meth:`is_ip`, and :meth:`extract_ip`.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'

    >>> extract_mac_address('34:29:8F:12:0D:2F', separator='-')
    '34-29-8F-12-0D-2F'
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if found is None:
        return None

    # Strings are immutable, so the rewritten value has to be captured
    # and returned.  A single translate() pass handles both possible
    # input separators and stays correct even when the requested
    # separator itself contains ':' or '-'.
    to_requested_separator = str.maketrans({":": separator, "-": separator})
    return found.group(0).translate(to_requested_separator)
1229
1230
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Args:
        in_str: string to test
        separator: the slug character to use

    Returns:
        True if in_str is a slug string and False otherwise.  A slug
        here is made of runs of lowercase ASCII letters and digits,
        joined by one or more separators; it neither starts nor ends
        with a separator.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`slugify`.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False

    # "word (separators word)*" accepts the same strings as the older
    # "(word separators*?)* char" form for single-character separators,
    # but has no nested quantifiers over the same characters, so a
    # non-matching input cannot trigger exponential backtracking.  The
    # separator is grouped so that a multi-character separator is
    # repeated as a whole.
    sep = re.escape(separator)
    rex = r"^[a-z\d]+(?:(?:" + sep + r")+[a-z\d]+)*$"
    return re.match(rex, in_str) is not None
1251
1252
def contains_html(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check for tags in

    Returns:
        True if the given string contains HTML/XML tags and False
        otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`strip_html`.

    .. warning::
        By design, this function matches ANY type of tag, so don't expect
        to use it as an HTML validator.  It's a quick sanity check at
        best.  See something like BeautifulSoup for a more full-featured
        HTML parser.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False

    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
1279
1280
def words_count(in_str: str) -> int:
    """
    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    Raises:
        ValueError: if in_str is not a string.

    .. note::
        This method is "smart" in that it only considers sequences
        of one or more letters and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # One tick per non-overlapping match of the word pattern.
    return sum(1 for _ in WORDS_COUNT_RE.finditer(in_str))
1305
1306
def word_count(in_str: str) -> int:
    """
    Alias of :meth:`words_count`.

    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    .. note::
        This method is "smart" in that it only considers sequences
        of one or more letters and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> word_count('hello world')
    2
    >>> word_count('one,two,three.stop')
    4
    """
    total = words_count(in_str)
    return total
1329
1330
def generate_uuid(omit_dashes: bool = False) -> str:
    """
    Args:
        omit_dashes: should we omit the dashes in the generated UUID?

    Returns:
        A freshly generated UUID string (from `uuid.uuid4()`), either
        in the usual dashed form or, when omit_dashes is set, as the
        bare hex digits.

    See also :meth:`is_uuid`, :meth:`generate_random_alphanumeric_string`.

    generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
    generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
    """
    fresh = uuid4()
    return fresh.hex if omit_dashes else str(fresh)
1349
1350
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Args:
        size: number of characters to generate

    Returns:
        A string of the specified size containing random characters
        (uppercase/lowercase ascii letters and digits).

    Raises:
        ValueError: if size is less than one.

    See also :meth:`asciify`, :meth:`generate_uuid`.

    >>> random.seed(22)
    >>> generate_random_alphanumeric_string(9)
    '96ipbNClS'
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    # One random.choice() draw per output character, in order, so a
    # seeded generator yields a reproducible result.
    return "".join(random.choice(alphabet) for _ in range(size))
1371
1372
def reverse(in_str: str) -> str:
    """
    Args:
        in_str: the string to reverse

    Returns:
        The reversed (character by character) string.

    Raises:
        ValueError: if in_str is not a string.

    >>> reverse('test')
    'tset'
    """
    if is_string(in_str):
        return "".join(reversed(in_str))
    raise ValueError(in_str)
1387
1388
def camel_case_to_snake_case(in_str: str, *, separator: str = "_"):
    """
    Args:
        in_str: the camel case string to convert
        separator: the snake case separator character to use

    Returns:
        A lowercase snake case string equivalent to the camel case
        input, or the original string unchanged if it is not valid
        camel case.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    if is_camel_case(in_str):

        def _append_separator(match):
            # Keep the captured text and follow it with the separator.
            return match.group(1) + separator

        return CAMEL_CASE_REPLACE_RE.sub(_append_separator, in_str).lower()

    return in_str
1412
1413
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Args:
        in_str: the snake case string to convert
        upper_case_first: should we capitalize the first letter?
        separator: the separator character to use

    Returns:
        A camel case string that is equivalent to the snake case string
        provided, or the original string unchanged if it is not valid
        snake case.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str

    # Title-case every usable piece; pieces rejected by is_full_string
    # (e.g. those produced by repeated separators) are dropped.
    words = []
    for piece in in_str.split(separator):
        if is_full_string(piece):
            words.append(piece.title())

    if not upper_case_first:
        words[0] = words[0].lower()
    return "".join(words)
1443
1444
def to_char_list(in_str: str) -> List[str]:
    """
    Args:
        in_str: the string to split into a char list

    Returns:
        A list of strings of length one each; an empty list when
        in_str is not a string.

    See also :meth:`from_char_list`.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    return list(in_str) if is_string(in_str) else []
1461
1462
def from_char_list(in_list: List[str]) -> str:
    """
    Args:
        in_list: A list of characters (or, more generally, strings)
            to convert into a single string.

    Returns:
        The string resulting from concatenating the items of in_list
        in order, with nothing between them.

    See also :meth:`to_char_list`.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
1478
1479
def shuffle(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing the same characters as the given one
        but in a randomized order.  No check is made that the result
        differs from the input, so occasionally the original string
        may come back.  Returns None when in_str is not a string.

    >>> random.seed(22)
    >>> shuffle('awesome')
    'meosaew'
    """
    if not is_string(in_str):
        return None
    letters = list(in_str)
    random.shuffle(letters)  # in-place shuffle
    return "".join(letters)
1500
1501
def scramble(in_str: str) -> Optional[str]:
    """
    Alias of :meth:`shuffle`.

    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing the same characters as the given one
        but in a randomized order.  No check is made that the result
        differs from the input, so occasionally the original string
        may come back.  Returns None to indicate error conditions.

    See also :mod:`pyutils.unscrambler`.

    >>> random.seed(22)
    >>> scramble('awesome')
    'meosaew'
    """
    scrambled = shuffle(in_str)
    return scrambled
1520
1521
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Args:
        in_str: the string to strip tags from
        keep_tag_content: should we keep the inner contents of tags?

    Returns:
        A string with all HTML tags removed (optionally with tag contents
        preserved).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`contains_html`.

    .. note::
        This method uses simple regular expressions to strip tags and is
        not a full fledged HTML parser by any means.  Consider using
        something like BeautifulSoup if your needs are more than this
        simple code can fulfill.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if keep_tag_content:
        # Remove only the tags themselves.
        return HTML_TAG_ONLY_RE.sub("", in_str)
    return HTML_RE.sub("", in_str)
1549
1550
def asciify(in_str: str) -> str:
    """
    Args:
        in_str: the string to asciify.

    Returns:
        An output string roughly equivalent to the original string
        but containing ASCII characters only.  This is accomplished
        by translating non-ascii chars into their closest possible
        ASCII representation (eg: ó -> o, Ë -> E, ç -> c...).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`to_ascii`, :meth:`generate_random_alphanumeric_string`.

    .. warning::
        Some chars may be lost if impossible to translate.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # NFKD normalization is applied first; anything that still cannot
    # be encoded as ASCII afterwards is dropped by the "ignore" error
    # handler, and the surviving bytes are decoded back into a str.
    decomposed = unicodedata.normalize("NFKD", in_str)
    return decomposed.encode("ascii", "ignore").decode("utf-8")
1583
1584
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Args:
        in_str: the string to slugify
        separator: the character to use during slugification (default
            is a dash)

    Returns:
        The converted string.  The returned string has the following properties:

        * it has no spaces
        * all letters are in lower case
        * all punctuation signs and non alphanumeric chars are removed
        * words are divided using provided separator
        * all chars are encoded as ascii (by using :meth:`asciify`)
        * is safe for URL

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_slug` and :meth:`asciify`.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Step 1: lowercase, then blank out whatever
    # NO_LETTERS_OR_NUMBERS_RE matches and trim the ends.
    lowered = in_str.lower()
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", lowered).strip()

    # Step 2: turn the remaining whitespace into the separator.
    joined = SPACES_RE.sub(separator, spaced)

    # Step 3: collapse repeated separators into a single one.
    separator_runs = re.compile(re.escape(separator) + r"+")
    collapsed = separator_runs.sub(separator, joined)

    # Step 4: reduce the result to plain ASCII.
    return asciify(collapsed)
1621
1622
def to_bool(in_str: str) -> bool:
    """
    Args:
        in_str: the string to convert to boolean

    Returns:
        A boolean equivalent of the original string based on its contents.
        All conversion is case insensitive.  A positive boolean (True) is
        returned if the string value is any of the following:

        * "true"
        * "t"
        * "1"
        * "yes"
        * "y"
        * "on"

        Otherwise False is returned.

    Raises:
        ValueError: if in_str is not a string.

    See also :mod:`pyutils.argparse_utils`.

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    affirmative = {"true", "1", "yes", "y", "t", "on"}
    return in_str.lower() in affirmative
1665
1666
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Args:
        in_str: the string to convert into a date

    Returns:
        The datetime.date the string contained, or None when the
        parser raises a ParseException.  This parser is relatively
        clever; see :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`extract_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> to_date('9/11/2001')
    datetime.date(2001, 9, 11)
    >>> to_date('xyzzy')
    """
    import pyutils.datetimes.dateparse_utils as du

    try:
        parser = du.DateParser()  # type: ignore
        parser.parse(in_str)
        result = parser.get_date()
    except du.ParseException:  # type: ignore
        # Unparseable input is reported as None rather than raised.
        result = None
    return result
1693
1694
def extract_date(in_str: Any) -> Optional[datetime.datetime]:
    """Finds and extracts a date from the string, if possible.

    The input is split on whitespace and candidate phrases built from
    the resulting tokens are handed to the date parser one at a time;
    the first phrase that parses wins.

    Args:
        in_str: the string to extract a date from

    Returns:
        a datetime if date was found, otherwise None

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> extract_date("filename.txt    dec 13, 2022")
    datetime.datetime(2022, 12, 13, 0, 0)

    >>> extract_date("Dear Santa, please get me a pony.")

    """
    import itertools

    import pyutils.datetimes.dateparse_utils as du

    parser = du.DateParser()  # type: ignore
    words = in_str.split()

    # Candidates are requested for sizes 5, 4, 3 and then 2, in that
    # order (presumably n-token windows -- confirm against
    # list_utils.ngrams).  Size 1 is never requested.
    candidates = itertools.chain(
        *(list_utils.ngrams(words, size) for size in (5, 4, 3, 2))
    )
    for ngram in candidates:
        phrase = " ".join(ngram)
        logger.debug("Trying %s", phrase)
        try:
            if parser.parse(phrase):
                return parser.get_datetime()
        except du.ParseException:  # type: ignore
            # Not a date; move on to the next candidate.
            continue
    return None
1733
1734
def is_valid_date(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check

    Returns:
        True if the date parser accepts the string (i.e. parsing does
        not raise a ParseException) and False otherwise.  This parser
        is relatively clever; see :class:`datetimes.dateparse_utils`
        docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> is_valid_date('1/2/2022')
    True
    >>> is_valid_date('christmas')
    True
    >>> is_valid_date('next wednesday')
    True
    >>> is_valid_date('xyzzy')
    False
    """
    import pyutils.datetimes.dateparse_utils as dp

    try:
        # Only success or failure matters; the parsed value is unused.
        dp.DateParser().parse(in_str)  # type: ignore
    except dp.ParseException:  # type: ignore
        return False
    return True
1766
1767
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Args:
        in_str: string to parse into a datetime

    Returns:
        A python datetime parsed from in_str, or None when parsing
        raises any exception or does not produce a datetime.  This
        parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`valid_datetime`.

    >>> to_datetime('7/20/1969 02:56 GMT')
    datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
    """
    import pyutils.datetimes.dateparse_utils as dp

    try:
        parsed = dp.DateParser().parse(in_str)  # type: ignore
    except Exception:
        # Every failure, whatever its type, is reported as None.
        return None
    return parsed if isinstance(parsed, datetime.datetime) else None
1794
1795
def valid_datetime(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check

    Returns:
        True if :meth:`to_datetime` can turn in_str into a datetime
        and False otherwise.  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    >>> valid_datetime('next wednesday at noon')
    True
    >>> valid_datetime('3 weeks ago at midnight')
    True
    >>> valid_datetime('next easter at 5:00 am')
    True
    >>> valid_datetime('sometime soon')
    False
    """
    return to_datetime(in_str) is not None
1819
1820
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Args:
        in_str: the string to squeeze
        character_to_squeeze: the character (or substring) whose
            consecutive repetitions are collapsed (default = space).
            It is always treated literally, never as a regular
            expression or as a replacement template.

    Returns: A "squeezed string" where every run of one or more
        consecutive character_to_squeeze occurrences is replaced by
        a single occurrence.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    """
    pattern = r'(' + re.escape(character_to_squeeze) + r')+'
    # A callable replacement is inserted verbatim.  Passing the string
    # itself would make re.sub process backslash escapes in it, which
    # fails (or substitutes the wrong text) when squeezing backslashes.
    return re.sub(pattern, lambda _match: character_to_squeeze, in_str)
1843
1844
def dedent(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: the string to dedent

    Returns:
        The string with the leading margin (whatever MARGIN_RE
        matches) removed from every line, or None when in_str is
        not a string.

    See also :meth:`indent`.

    >>> dedent('\t\ttest\\n\t\ting')
    'test\\ning'
    """
    if not is_string(in_str):
        return None
    stripped_lines = []
    for line in in_str.split('\n'):
        stripped_lines.append(MARGIN_RE.sub('', line))
    return '\n'.join(stripped_lines)
1863
1864
def indent(in_str: str, amount: int) -> str:
    """
    Args:
        in_str: the string to indent
        amount: count of spaces to indent each line by

    Returns:
        An indented string created by prepending amount spaces to
        every newline-separated line of in_str.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`dedent`.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    padding = " " * amount
    return '\n'.join(padding + line for line in in_str.split('\n'))
1884
1885
1886 def _sprintf(*args, **kwargs) -> str:
1887     """Internal helper."""
1888     ret = ""
1889
1890     sep = kwargs.pop("sep", None)
1891     if sep is not None:
1892         if not isinstance(sep, str):
1893             raise TypeError("sep must be None or a string")
1894
1895     end = kwargs.pop("end", None)
1896     if end is not None:
1897         if not isinstance(end, str):
1898             raise TypeError("end must be None or a string")
1899
1900     if kwargs:
1901         raise TypeError("invalid keyword arguments to sprint()")
1902
1903     if sep is None:
1904         sep = " "
1905     if end is None:
1906         end = "\n"
1907     for n, arg in enumerate(args):
1908         if n:
1909             ret += sep
1910         if isinstance(arg, str):
1911             ret += arg
1912         else:
1913             ret += str(arg)
1914     ret += end
1915     return ret
1916
1917
def strip_ansi_sequences(in_str: str) -> str:
    """
    Args:
        in_str: the string to strip

    Returns:
        in_str with recognized ANSI escape sequences removed.  The
        sequences recognized are CSI control sequences: ESC, '[',
        optional parameter characters, optional intermediate
        characters and one final character.  This covers colors and
        text attributes as well as cursor movement, screen/line
        clearing and private-mode (e.g. show/hide cursor) sequences.

    See also :mod:`pyutils.ansi`.

    .. warning::
        This method works by using a regular expression.
        It works for all ANSI escape sequences I've tested with but
        may miss some (it does not handle non-CSI escapes such as
        OSC title-setting sequences); caveat emptor.

    >>> import ansi as a
    >>> s = a.fg('blue') + 'blue!' + a.reset()
    >>> len(s)   # '\x1b[38;5;21mblue!\x1b[m'
    18
    >>> len(strip_ansi_sequences(s))
    5
    >>> strip_ansi_sequences(s)
    'blue!'

    """
    # Parameter characters are 0x30-0x3F ('0' through '?'); '+' is
    # additionally tolerated there so that everything the previous,
    # narrower pattern removed is still removed.  Intermediates are
    # 0x20-0x2F and the final character is 0x40-0x7E, which admits
    # uppercase finals such as 'J', 'H' and 'K'.
    return re.sub(r'\x1b\[[0-?+]*[ -/]*[@-~]', '', in_str)
1944
1945
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that captures output written to stdout into an
    in-memory buffer instead of printing it.  Entering the context
    yields a zero-argument callable that returns everything captured
    so far.

    >>> with SprintfStdout() as buf:
    ...     print("test")
    ...     print("1, 2, 3")
    ...
    >>> print(buf(), end='')
    test
    1, 2, 3
    """

    def __init__(self) -> None:
        # In-memory sink that receives whatever is written to stdout
        # while the context is active.
        self.destination = io.StringIO()
        # The active stdout redirection; assigned in __enter__.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        redirection = contextlib.redirect_stdout(self.destination)
        self.recorder = redirection
        redirection.__enter__()

        def captured_text() -> str:
            return self.destination.getvalue()

        return captured_text

    def __exit__(self, *args) -> Literal[False]:
        # Undo the redirection, then rewind the buffer.
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Returning False lets any in-flight exception propagate.
        return False
1973
1974
def capitalize_first_letter(in_str: str) -> str:
    """
    Args:
        in_str: the string to capitalize

    Returns:
        in_str with the first character capitalized and the rest left
        untouched.  An empty string is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    # Slicing (rather than indexing) keeps the empty string from
    # raising an IndexError.
    return in_str[:1].upper() + in_str[1:]
1989
1990
def it_they(n: int) -> str:
    """
    Args:
        n: how many of them are there?

    Returns:
        'it' if n is one or 'they' otherwise.

    See also :meth:`is_are`, :meth:`pluralize`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> it_they(1)
    'it'
    >>> it_they(100)
    'they'
    """
    return "it" if n == 1 else "they"
2016
2017
def is_are(n: int) -> str:
    """
    Args:
        n: the number of things being talked about

    Returns:
        'is' when n is exactly one, 'are' for any other value.

    See also :meth:`it_they`, :meth:`pluralize`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
2044
2045
def pluralize(n: int) -> str:
    """
    Args:
        n: the number of things being talked about

    Returns:
        '' when n is exactly one, 's' for any other value (so zero
        things are plural too: "0 files").

    See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.
    """
    return "" if n == 1 else "s"
2075
2076
def make_contractions(txt: str) -> str:
    """Join adjacent words in txt into their (English) contracted form.

    Args:
        txt: the text in which to form contractions.

    Returns:
        A copy of txt in which every recognized word pair has been
        replaced by its contraction; everything else is unchanged.
        Matching is case-insensitive and the case of the retained
        letters is taken from the input.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`pluralize`,
    :meth:`thify`.

    .. note::
        Contractions are applied in a fixed, implementation-defined
        order: the irregular negations first (can't, shan't, won't),
        then "<verb> not", then "<subject> <auxiliary>".

    >>> make_contractions('It is nice today.')
    "It's nice today."

    >>> make_contractions('I can    not even...')
    "I can't even..."

    >>> make_contractions('She could not see!')
    "She couldn't see!"

    >>> make_contractions('But she will not go.')
    "But she won't go."

    >>> make_contractions('Verily, I shall not.')
    "Verily, I shan't."

    >>> make_contractions('No you cannot.')
    "No you can't."

    >>> make_contractions('I said you can not go.')
    "I said you can't go."
    """

    # Verbs that contract with a following "not" (are not -> aren't).
    negatable_verbs = (
        'are',
        'could',
        'did',
        'has',
        'have',
        'is',
        'must',
        'should',
        'was',
        'were',
        'would',
    )

    # Words that contract with a following auxiliary (she will -> she'll).
    subjects = (
        "I",
        "you",
        "he",
        "she",
        "it",
        "we",
        "they",
        "how",
        "why",
        "when",
        "where",
        "who",
        "there",
    )

    # Auxiliaries; the parenthesized group is the part kept after the
    # apostrophe.
    auxiliaries = (
        'woul(d)',
        'i(s)',
        'a(re)',
        'ha(s)',
        'ha(ve)',
        'ha(d)',
        'wi(ll)',
    )

    # Irregular negations that don't follow the generic "<verb>n't" rule.
    irregular_negations = (
        (r'\b(can)\s*no(t)\b', r"\1'\2"),
        (r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3"),
        (r'\b(w)ill\s*(n)(o)(t)\b', r"\1\3\2'\4"),
    )
    for pattern, replacement in irregular_negations:
        txt = re.sub(pattern, replacement, txt, flags=re.IGNORECASE)

    # Regular negations: "<verb> not" -> "<verb>n't".
    for verb in negatable_verbs:
        txt = re.sub(
            fr'\b({verb})\s+(n)o(t)\b', r"\1\2'\3", txt, flags=re.IGNORECASE
        )

    # "<subject> <auxiliary>" -> "<subject>'<tail of auxiliary>".
    for subject in subjects:
        for auxiliary in auxiliaries:
            # there're / where're are valid English but sound weird, so
            # they are deliberately left alone.
            if auxiliary == 'a(re)' and subject in ('there', 'where'):
                continue
            txt = re.sub(
                fr'\b({subject})\s+{auxiliary}\b',
                r"\1'\2",
                txt,
                flags=re.IGNORECASE,
            )

    return txt
2183
2184
def thify(n: int) -> str:
    """
    Args:
        n: the number whose suffix is wanted

    Returns:
        The English ordinal suffix for n: 'st', 'nd', 'rd' or 'th'.
        Numbers ending in 11, 12 or 13 take 'th' (11th, 112th), not
        the suffix their last digit alone would suggest.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`.

    Suggested usage::

        attempt_count = 0
        while True:
            attempt_count += 1
            if try_the_thing():
                break
            print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'
    >>> thify(21)
    'st'
    """
    digits = str(n)
    assert is_integer_number(digits)

    # The "teens" are irregular: 11th, 12th, 13th (and 111th, 212th, ...).
    # Looking at the textual last two characters also handles negatives.
    if digits[-2:] in ("11", "12", "13"):
        return "th"

    last_digit = digits[-1:]
    if last_digit == "1":
        return "st"
    if last_digit == "2":
        return "nd"
    if last_digit == "3":
        return "rd"
    return "th"
2222
2223
# Alternate public name bound to the very same function object as thify.
get_cardinal_suffix = thify
2225
2226
def add_cardinal_suffix(n: int):
    """
    Args:
        n: the number to render with its suffix.

    Returns:
        A string made of n immediately followed by the suffix that
        :meth:`get_cardinal_suffix` reports for it.

    >>> add_cardinal_suffix(123)
    '123rd'

    >>> add_cardinal_suffix(1)
    '1st'

    >>> add_cardinal_suffix(0)
    '0th'

    >>> add_cardinal_suffix(-123)
    '-123rd'
    """
    suffix = get_cardinal_suffix(n)
    return '{}{}'.format(n, suffix)
2248
2249
def remove_cardinal_suffix(txt: str) -> Optional[str]:
    """
    Args:
        txt: text that is expected to end in 'st', 'nd', 'rd' or 'th'.

    Returns:
        txt without its final two characters when those characters are
        one of the recognized suffixes; None otherwise.  Only the
        suffix is inspected, the rest of txt is not validated.

    >>> remove_cardinal_suffix('123rd')
    '123'

    >>> remove_cardinal_suffix('-10th')
    '-10'

    >>> remove_cardinal_suffix('1ero') is None
    True
    """
    if txt.endswith(('st', 'nd', 'rd', 'th')):
        return txt[:-2]
    return None
2271
2272
def ngrams(txt: str, n: int) -> Generator[str, str, None]:
    """
    Args:
        txt: the text to carve into ngrams; it is split on whitespace
        n: the number of words in each ngram

    Returns:
        Yields each ngram as a single string whose words are separated
        by one space.

    See also :meth:`ngrams_presplit`, :meth:`bigrams`, :meth:`trigrams`.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']
    """
    tokens = txt.split()
    for group in ngrams_presplit(tokens, n):
        # Tokens come from str.split() so they carry no whitespace of
        # their own; a plain join yields the final text.
        yield ' '.join(group)
2293
2294
def ngrams_presplit(
    words: Sequence[str], n: int
) -> Generator[Sequence[str], str, None]:
    """
    Same as :meth:`ngrams` but with the string pre-split.

    Args:
        words: the already-split words to build ngrams from
        n: how many words per ngram; forwarded unchanged

    Returns:
        Whatever ``list_utils.ngrams(words, n)`` returns; this function
        is a direct pass-through and does no work of its own.

    See also :meth:`ngrams`, :meth:`bigrams`, :meth:`trigrams`.
    """
    # Pure delegation: the windowing logic lives in list_utils.
    return list_utils.ngrams(words, n)
2304
2305
def bigrams(txt: str) -> Generator[str, str, None]:
    """Yields every two-word ngram of txt; shorthand for
    ``ngrams(txt, 2)``.

    See also :meth:`ngrams`, :meth:`trigrams`.

    >>> [x for x in bigrams('this is a test')]
    ['this is', 'is a', 'a test']
    """
    return ngrams(txt, n=2)
2315
2316
def trigrams(txt: str) -> Generator[str, str, None]:
    """Yields every three-word ngram of txt; shorthand for
    ``ngrams(txt, 3)``.

    See also :meth:`ngrams`, :meth:`bigrams`.
    """
    return ngrams(txt, n=3)
2323
2324
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim: str = ''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.

    Args:
        input_lines: A sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: an iterable collection of numeric sequences that
            indicate one or more column numbers to copy to form the Nth
            position in the output list.  See example below.
        delim: for column_specs that indicate we should copy more than
            one column from the input into this position, use delim to
            separate source data.  Defaults to ''.

    Returns:
        A list of strings created by following the instructions set
        forth in column_specs.  Column data is copied verbatim: delim
        is only ever inserted *between* columns, so data that itself
        begins or ends with delimiter characters is preserved.

    Raises:
        IndexError: if a column number is outside of input_lines.

    See also :meth:`shuffle_columns_into_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim='!',
    ... )
    ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']

    >>> shuffle_columns_into_list(['wow!', 'ok'], [[0], [1, 0]], delim='!')
    ['wow!', 'ok!wow!']
    """
    # Each spec lists the input columns that make up one output entry.
    # str.join places delim strictly between columns; the previous
    # prepend-then-strip(delim) approach also ate delimiter characters
    # that were part of the data.
    return [delim.join(input_lines[n] for n in spec) for spec in column_specs]
2366
2367
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim: str = '',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: a sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: instructions for what dictionary keys to apply
            to individual or compound input column data.  Each spec
            is a (key, column numbers) pair.  See example below.
        delim: when forming compound output data by gluing more than
            one input column together, use this string to separate
            the source data.  Defaults to ''.

    Returns:
        A dict formed by applying the column_specs instructions.
        Column data is copied verbatim: delim is only ever inserted
        *between* columns, so data that itself begins or ends with
        delimiter characters is preserved.  A key that appears more
        than once keeps the value from its last spec.

    Raises:
        IndexError: if a column number is outside of input_lines.

    See also :meth:`shuffle_columns_into_list`, :meth:`interpolate_using_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim='!',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}

    >>> shuffle_columns_into_dict(['wow!', 'ok'], [('k', [1, 0])], delim='!')
    {'k': 'ok!wow!'}
    """
    out: Dict[str, str] = {}

    # spec[0] is the output key, spec[1] the input columns to glue
    # together.  str.join places delim strictly between columns; the
    # previous prepend-then-strip(delim) approach also ate delimiter
    # characters that were part of the data.
    for spec in column_specs:
        out[spec[0]] = delim.join(input_lines[n] for n in spec[1])
    return out
2410
2411
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """
    Fill the ``{name}`` placeholders of a template with dict values.

    Args:
        txt: the template, written with :meth:`str.format` style
            ``{name}`` placeholders
        values: maps each placeholder name to its replacement text

    Returns:
        The result of handing the formatted template to ``_sprintf``
        with ``end=''``.

    See also :meth:`shuffle_columns_into_list`, :meth:`shuffle_columns_into_dict`.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    rendered = txt.format(**values)
    return _sprintf(rendered, end='')
2427
2428
def to_ascii(txt: str) -> bytes:
    """
    Args:
        txt: the input data to encode.  A bytes object is also
            accepted and is returned unchanged.

    Returns:
        txt encoded as an ASCII byte string.

    Raises:
        UnicodeEncodeError: if txt is a string containing non-ASCII
            characters.
        TypeError: if txt is neither a string nor bytes.

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`to_bytes`,
    :meth:`generate_random_alphanumeric_string`, :meth:`asciify`.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(txt, str):
        return txt.encode('ascii')
    if isinstance(txt, bytes):
        return txt
    # TypeError is a subclass of Exception, so callers that caught the
    # generic Exception raised here previously keep working.
    raise TypeError('to_ascii works with strings and bytes')
2451
2452
def to_base64(
    txt: str, *, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> bytes:
    """
    Args:
        txt: the text to encode
        encoding: the character encoding used to turn txt into bytes
        errors: the error handling scheme for that character encoding

    Returns:
        The bytes of txt encoded with the 64-character base64 alphabet,
        as produced by :func:`base64.encodebytes` (note the trailing
        newline in the example below).

    See also :meth:`is_base64`, :meth:`to_ascii`, :meth:`to_bitstring`,
    :meth:`from_base64`.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
2473
2474
def is_base64(txt: str) -> bool:
    """
    Args:
        txt: the string (or bytes) to check

    Returns:
        True if txt looks like base64 encoded data, i.e. it is made up
        solely of characters from Python's standard base64 alphabet,
        optionally followed by up to two '=' padding characters.
        Surrounding whitespace and the line breaks that
        :meth:`to_base64` inserts are ignored.  Text containing
        non-ASCII characters is never base64, so False is returned
        for it.

        This is an alphabet check only; the data is not actually
        decoded and its length is not validated.

    See also :meth:`to_base64`, :meth:`from_base64`.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # So is '=' padding.
    True

    """
    candidate = txt.strip()
    if isinstance(candidate, str):
        try:
            raw = candidate.encode('ascii')
        except UnicodeEncodeError:
            # Non-ASCII characters can't be part of the base64 alphabet.
            return False
    else:
        raw = bytes(candidate)

    # base64.encodebytes wraps long output with newlines; they carry no
    # data, so drop them before checking the alphabet.
    raw = raw.replace(b'\r', b'').replace(b'\n', b'')

    # '=' is only legal as trailing padding, and at most twice.
    payload = raw.rstrip(b'=')
    if len(raw) - len(payload) > 2:
        return False

    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = set(a.encode('ascii'))
    return all(byte in alphabet for byte in payload)
2503
2504
def from_base64(
    b64: bytes, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """
    Args:
        b64: a bytestring of base64-encoded data to decode
        encoding: the character encoding used to turn the decoded
            bytes back into text
        errors: the error handling scheme for that character encoding

    Returns:
        The decoded form of b64 as a normal python string; the inverse
        of :meth:`to_base64`.

    See also :meth:`to_base64`, :meth:`is_base64`.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    raw = base64.decodebytes(b64)
    return raw.decode(encoding, errors)
2524
2525
def chunk(txt: str, chunk_size: int) -> Generator[str, None, None]:
    """
    Args:
        txt: a string to be chunked into evenly spaced pieces.
        chunk_size: the size of each chunk to make

    Returns:
        Yields consecutive pieces of txt, each chunk_size characters
        long.  When len(txt) is not a multiple of chunk_size a warning
        is both logged and issued via :mod:`warnings`, and the final
        piece is shorter than the others.

        Because this is a generator, nothing (including the warning)
        happens until iteration starts.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    if len(txt) % chunk_size != 0:
        msg = (
            f"String to chunk's length ({len(txt)}) is not an even "
            f"multiple of chunk_size ({chunk_size})"
        )
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for start in range(0, len(txt), chunk_size):
        yield txt[start : start + chunk_size]
2544
2545
def to_bitstring(txt: str, *, delimiter: str = '') -> str:
    """
    Args:
        txt: the string to convert into a bitstring
        delimiter: character to insert between adjacent bytes.  Note that
            only bitstrings with delimiter='' are interpretable by
            :meth:`from_bitstring`.

    Returns:
        txt converted to ASCII bytes, with every byte rendered as
        exactly eight binary digits (so leading NUL bytes are kept).
        Empty input yields '00000000'.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    >>> to_bitstring(b'\\x00a')
    '0000000001100001'
    """
    etxt = to_ascii(txt)
    if not etxt:
        # Historical behavior: empty input is rendered as one zero byte.
        return '00000000'
    # Format byte by byte.  Going through one big integer (as before)
    # sized the output by the integer's bit length, which silently
    # dropped any leading NUL bytes.
    return delimiter.join(f'{byte:08b}' for byte in etxt)
2573
2574
def is_bitstring(txt: str) -> bool:
    """
    Args:
        txt: the string to check

    Returns:
        True when txt, once prefixed with '0b', is accepted by
        :meth:`is_binary_integer_number`; False otherwise.  Bitstrings
        produced with a non-empty delimiter are not recognized.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`to_bitstring`,
    :meth:`chunk`.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    prefixed = f'0b{txt}'
    return is_binary_integer_number(prefixed)
2595
2596
def from_bitstring(
    bits: str, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """
    Args:
        bits: the bitstring to convert back into a python string
        encoding: the encoding to use during conversion
        errors: how to handle encoding errors

    Returns:
        The regular python string represented by bits.  Every full
        group of eight digits yields one byte, so leading all-zero
        bytes are preserved.  A bitstring whose value is zero and that
        is shorter than eight digits yields a single NUL character.
        Note that this code does not work with to_bitstring when
        delimiter is non-empty.

    Raises:
        ValueError: if bits is not a valid base-2 number.

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    >>> from_bitstring('0000000001100001')
    '\\x00a'
    """
    n = int(bits, 2)
    # Size the output from the digits supplied, not only from the
    # integer's magnitude; otherwise leading zero bytes vanish.  The
    # floor division keeps a short '0b' prefix from adding a byte.
    num_bytes = max((n.bit_length() + 7) // 8, len(bits) // 8)
    return n.to_bytes(num_bytes, 'big').decode(encoding, errors) or '\0'
2618
2619
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """
    Args:
        txt: an IP address to chunk up for sorting purposes

    Returns:
        A tuple of the address's numeric components, so that sorting
        IP addresses with a normal comparator orders them numerically
        rather than lexically.  If txt is not recognized by
        :meth:`is_ip_v4`, a warning is logged and None is returned.

    See also :meth:`is_ip_v4`.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if not is_ip_v4(txt):
        # Report through the module logger rather than printing to
        # stdout from library code.
        logger.warning("not IP: %s", txt)
        return None
    return tuple(int(octet) for octet in txt.split('.'))
2643
2644
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """
    Args:
        volume: the '/'-separated path to break up for sorting purposes

    Returns:
        A tuple holding the non-empty components of volume, in order.
        Used as a sort key, it places every path ahead of the paths
        nested beneath it.

    See also :mod:`pyutils.files.file_utils`.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    # Leading, trailing and doubled slashes produce empty pieces; skip them.
    return tuple(part for part in volume.split('/') if part)
2665
2666
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """
    Replace every character of a set with the same replacement text.

    Args:
        in_str: the string in which to replace characters
        replace_set: the characters to replace, given as one string
        replacement: the text substituted for each member of
            replace_set

    See also :meth:`replace_nth`.

    Returns:
        The string after all replacements.  The characters are handled
        one after another, in the order they appear in replace_set,
        each pass working on the result of the previous one.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    result = in_str
    for unwanted in replace_set:
        result = result.replace(unwanted, replacement)
    return result
2689
2690
def replace_nth(in_str: str, source: str, target: str, nth: int) -> str:
    """
    Replaces the nth occurrence of a substring within a string.

    Args:
        in_str: the string in which to run the replacement
        source: the substring to replace.  It is always matched
            literally, even if it contains characters that are special
            in regular expressions.
        target: the replacement text
        nth: which occurrence of source to replace?  Counting starts
            at 1 and only non-overlapping occurrences are counted.

    Returns:
        in_str with the chosen occurrence of source replaced by target.

    Raises:
        IndexError: if in_str contains fewer than nth occurrences of
            source.

    See also :meth:`replace_all`.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'

    >>> replace_nth('a.b.c', '.', '-', 2)
    'a.b-c'
    """
    # Escape source so that the search agrees with the literal
    # replacement below; unescaped, a source such as '.' matched every
    # character and '+' was not even a valid pattern.
    starts = [m.start() for m in re.finditer(re.escape(source), in_str)]
    where = starts[nth - 1]
    return in_str[:where] + target + in_str[where + len(source) :]
2711
2712
if __name__ == '__main__':
    # When this file is executed directly (rather than imported), run
    # the doctest examples embedded in this module's docstrings.
    import doctest

    doctest.testmod()