src/pyutils/string_utils.py

   1 #!/usr/bin/env python3
   2 # -*- coding: utf-8 -*-
   3
   4 """The MIT License (MIT)
   5
   6 Copyright (c) 2016-2020 Davide Zanotti
   7
   8 Modifications Copyright (c) 2021-2022 Scott Gasch
   9
  10 Permission is hereby granted, free of charge, to any person obtaining a copy
  11 of this software and associated documentation files (the "Software"), to deal
  12 in the Software without restriction, including without limitation the rights
  13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 copies of the Software, and to permit persons to whom the Software is
  15 furnished to do so, subject to the following conditions:
  16
  17 The above copyright notice and this permission notice shall be included in all
  18 copies or substantial portions of the Software.
  19
  20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  26 SOFTWARE.
  27
  28 This class is based on:
  29 https://github.com/daveoncode/python-string-utils.  See `NOTICE
  30 <https://wannabe.guru.org/gitweb/?p=pyutils.git;a=blob_plain;f=NOTICE;hb=HEAD>`__
  31 in the root of this module for a detailed enumeration of what work is
  32 Davide's and what work was added by Scott.
  33
  34 """
  35
  36 import base64
  37 import contextlib  # type: ignore
  38 import datetime
  39 import io
  40 import json
  41 import logging
  42 import numbers
  43 import random
  44 import re
  45 import string
  46 import unicodedata
  47 import warnings
  48 from itertools import zip_longest
  49 from typing import (
  50     Any,
  51     Callable,
  52     Dict,
  53     Iterable,
  54     List,
  55     Literal,
  56     Optional,
  57     Sequence,
  58     Tuple,
  59 )
  60 from uuid import uuid4
  61
  62 from pyutils import list_utils
  63
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
  65
  66 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  67
  68 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  69
  70 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  71
  72 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  73
# Regex source for a URL, assembled from adjacent raw-string pieces (one
# piece per URL component, see the trailing comment on each line).  It
# carries no anchors of its own; the compiled patterns below add them.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Anchored at both ends: the whole string must be exactly one URL.
# Case-insensitive.
URL_RE = re.compile(rf"^{URLS_RAW_STRING}$", re.IGNORECASE)

# Unanchored variant wrapped in an extra capture group so a URL can be found
# anywhere inside a larger text.  Case-insensitive.
URLS_RE = re.compile(rf"({URLS_RAW_STRING})", re.IGNORECASE)
  89
# Matches either a backslash-escaped "@" (second alternative) or a run of "@"
# characters that is followed, with no double quote in between, by a closing
# double quote (first alternative).
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# Regex source for an e-mail address: a local part drawn from letters, digits
# and the listed punctuation, an "@", then a lowercase domain ending in a
# 2-4 letter suffix.  No anchors; the compiled patterns below add them.
EMAILS_RAW_STRING = (
    r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
)

# Anchored at both ends: the whole string must be one e-mail address.
EMAIL_RE = re.compile(rf"^{EMAILS_RAW_STRING}$")

# Unanchored, capturing variant for finding addresses inside larger text.
EMAILS_RE = re.compile(rf"({EMAILS_RAW_STRING})")
  99
# Whole-string test: letters first, at least one lower->upper or upper->lower
# transition, then optional letters/digits.
CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

# Captures a lowercase letter or a run of uppercase letters that is
# immediately followed by an uppercase letter (a camel case word boundary).
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Whole-string snake_case test using "_" as the separator: either letters
# (optionally followed by digits) then an underscore, or leading
# underscores followed by letters/digits.  Case-insensitive.
SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

# Same shape as SNAKE_CASE_TEST_RE but with "-" as the separator.  Note that
# this pattern has only the trailing "$" anchor, no leading "^".
SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

# Captures an underscore (group 1) and the lowercase letter or digit that
# follows it (group 2).
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

# Captures a dash (group 1) and the lowercase letter or digit that follows
# it (group 2).
SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
 115
# Card type name -> anchored regex describing that issuer's numbering
# format (leading digits and total length).  Digits only: no spaces or
# dashes are accepted, and no checksum is verified by these patterns.
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}
 124
# Cheap structural pre-check: text that (ignoring surrounding whitespace)
# starts with "[" or "{" and ends with "}" or "]".  The opening and closing
# brackets are not required to be of the same kind.
JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)

# Canonical 8-4-4-4-12 dash-separated UUID.  Note the character class is
# a-f plus digits, matched case-insensitively.
UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# Same as UUID_RE but every dash is optional, so the bare 32-digit hex form
# is accepted too.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# "Shallow" because it only checks the shape (four dot-separated groups of
# 1-3 digits); it does not check that each group is within 0-255.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

# Unanchored version of the shape check above, for searching inside text.
ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# Eight colon-separated groups of zero to four characters.  The character
# class is [a-z\d] (case-insensitive), i.e. it is not limited to hex digits.
IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

# Unanchored version of IP_V6_RE, for searching inside text.
ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)

# Six two-hex-digit groups separated by ":" or "-" (separators may be mixed).
MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)

# Unanchored version of MAC_ADDRESS_RE, for searching inside text.
ANYWHERE_MAC_ADDRESS_RE = re.compile(
    r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)
 149
# One "word": a run of word characters excluding underscore, together with
# any non-word characters immediately around it.
WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)

# An opening tag (optionally namespaced) plus, when present, everything up to
# a closing tag; or an HTML comment; or a doctype declaration.  DOTALL lets
# "." span newlines.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Like HTML_RE but matches individual tags only (opening or closing), never
# the content between them; also matches comments and doctype declarations.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# A single whitespace character.
SPACES_RE = re.compile(r"\s")

# A run of non-word characters, or a run of underscores.
NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Leading whitespace other than carriage return / line feed.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")

# ESC followed by "[", any number of non-letter characters, then one letter.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1B\[[^A-Za-z]*[A-Za-z]")
 169
 170 NUM_SUFFIXES = {
 171     "Pb": (1024**5),
 172     "P": (1024**5),
 173     "Tb": (1024**4),
 174     "T": (1024**4),
 175     "Gb": (1024**3),
 176     "G": (1024**3),
 177     "Mb": (1024**2),
 178     "M": (1024**2),
 179     "Kb": (1024**1),
 180     "K": (1024**1),
 181 }
 182
 183 UNIT_WORDS = [
 184     "zero",
 185     "one",
 186     "two",
 187     "three",
 188     "four",
 189     "five",
 190     "six",
 191     "seven",
 192     "eight",
 193     "nine",
 194     "ten",
 195     "eleven",
 196     "twelve",
 197     "thirteen",
 198     "fourteen",
 199     "fifteen",
 200     "sixteen",
 201     "seventeen",
 202     "eighteen",
 203     "nineteen",
 204 ]
 205
 206 TENS_WORDS = [
 207     "",
 208     "",
 209     "twenty",
 210     "thirty",
 211     "forty",
 212     "fifty",
 213     "sixty",
 214     "seventy",
 215     "eighty",
 216     "ninety",
 217 ]
 218
 219 MAGNITUDE_SCALES = [
 220     "hundred",
 221     "thousand",
 222     "million",
 223     "billion",
 224     "trillion",
 225     "quadrillion",
 226 ]
 227
 228 NUM_WORDS = {}
 229 NUM_WORDS["and"] = (1, 0)
 230 for i, word in enumerate(UNIT_WORDS):
 231     NUM_WORDS[word] = (1, i)
 232 for i, word in enumerate(TENS_WORDS):
 233     NUM_WORDS[word] = (1, i * 10)
 234 for i, word in enumerate(MAGNITUDE_SCALES):
 235     if i == 0:
 236         NUM_WORDS[word] = (100, 0)
 237     else:
 238         NUM_WORDS[word] = (10 ** (i * 3), 0)
 239 NUM_WORDS['score'] = (20, 0)
 240
 241
 242 def is_none_or_empty(in_str: Optional[str]) -> bool:
 243     """
 244     Args:
 245         in_str: the string to test
 246
 247     Returns:
 248         True if the input string is either None or an empty string,
 249         False otherwise.
 250
 251     See also :meth:`is_string` and :meth:`is_empty_string`.
 252
 253     >>> is_none_or_empty("")
 254     True
 255     >>> is_none_or_empty(None)
 256     True
 257     >>> is_none_or_empty("   \t   ")
 258     True
 259     >>> is_none_or_empty('Test')
 260     False
 261     """
 262     return in_str is None or len(in_str.strip()) == 0
 263
 264
 265 def is_string(in_str: Any) -> bool:
 266     """
 267     Args:
 268         in_str: the object to test
 269
 270     Returns:
 271         True if the object is a string and False otherwise.
 272
 273     See also :meth:`is_empty_string`, :meth:`is_none_or_empty`.
 274
 275     >>> is_string('test')
 276     True
 277     >>> is_string(123)
 278     False
 279     >>> is_string(100.3)
 280     False
 281     >>> is_string([1, 2, 3])
 282     False
 283     """
 284     return isinstance(in_str, str)
 285
 286
 287 def is_empty_string(in_str: Any) -> bool:
 288     """
 289     Args:
 290         in_str: the string to test
 291
 292     Returns:
 293         True if the string is empty and False otherwise.
 294
 295     See also :meth:`is_none_or_empty`, :meth:`is_full_string`.
 296     """
 297     return is_empty(in_str)
 298
 299
 300 def is_empty(in_str: Any) -> bool:
 301     """
 302     Args:
 303         in_str: the string to test
 304
 305     Returns:
 306         True if the string is empty and false otherwise.
 307
 308     See also :meth:`is_none_or_empty`, :meth:`is_full_string`.
 309
 310     >>> is_empty('')
 311     True
 312     >>> is_empty('    \t\t    ')
 313     True
 314     >>> is_empty('test')
 315     False
 316     >>> is_empty(100.88)
 317     False
 318     >>> is_empty([1, 2, 3])
 319     False
 320     """
 321     return is_string(in_str) and in_str.strip() == ""
 322
 323
 324 def is_full_string(in_str: Any) -> bool:
 325     """
 326     Args:
 327         in_str: the object to test
 328
 329     Returns:
 330         True if the object is a string and is not empty ('') and
 331         is not only composed of whitespace.
 332
 333     See also :meth:`is_string`, :meth:`is_empty_string`, :meth:`is_none_or_empty`.
 334
 335     >>> is_full_string('test!')
 336     True
 337     >>> is_full_string('')
 338     False
 339     >>> is_full_string('      ')
 340     False
 341     >>> is_full_string(100.999)
 342     False
 343     >>> is_full_string({"a": 1, "b": 2})
 344     False
 345     """
 346     return is_string(in_str) and in_str.strip() != ""
 347
 348
 349 def is_number(in_str: str) -> bool:
 350     """
 351     Args:
 352         in_str: the string to test
 353
 354     Returns:
 355         True if the string contains a valid numberic value and
 356         False otherwise.
 357
 358     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 359     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 360     etc...
 361
 362     >>> is_number(100.5)
 363     Traceback (most recent call last):
 364     ...
 365     ValueError: 100.5
 366     >>> is_number("100.5")
 367     True
 368     >>> is_number("test")
 369     False
 370     >>> is_number("99")
 371     True
 372     >>> is_number([1, 2, 3])
 373     Traceback (most recent call last):
 374     ...
 375     ValueError: [1, 2, 3]
 376     """
 377     if not is_string(in_str):
 378         raise ValueError(in_str)
 379     return NUMBER_RE.match(in_str) is not None
 380
 381
 382 def is_integer_number(in_str: str) -> bool:
 383     """
 384     Args:
 385         in_str: the string to test
 386
 387     Returns:
 388         True if the string contains a valid (signed or unsigned,
 389         decimal, hex, or octal, regular or scientific) integral
 390         expression and False otherwise.
 391
 392     See also :meth:`is_number`, :meth:`is_decimal_number`,
 393     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 394     etc...
 395
 396     >>> is_integer_number('42')
 397     True
 398     >>> is_integer_number('42.0')
 399     False
 400     """
 401     return (
 402         (is_number(in_str) and "." not in in_str)
 403         or is_hexidecimal_integer_number(in_str)
 404         or is_octal_integer_number(in_str)
 405         or is_binary_integer_number(in_str)
 406     )
 407
 408
 409 def is_hexidecimal_integer_number(in_str: str) -> bool:
 410     """
 411     Args:
 412         in_str: the string to test
 413
 414     Returns:
 415         True if the string is a hex integer number and False otherwise.
 416
 417     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 418     :meth:`is_octal_integer_number`, :meth:`is_binary_integer_number`, etc...
 419
 420     >>> is_hexidecimal_integer_number('0x12345')
 421     True
 422     >>> is_hexidecimal_integer_number('0x1A3E')
 423     True
 424     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 425     False
 426     >>> is_hexidecimal_integer_number('-0xff')
 427     True
 428     >>> is_hexidecimal_integer_number('test')
 429     False
 430     >>> is_hexidecimal_integer_number(12345)  # Not a string
 431     Traceback (most recent call last):
 432     ...
 433     ValueError: 12345
 434     >>> is_hexidecimal_integer_number(101.4)
 435     Traceback (most recent call last):
 436     ...
 437     ValueError: 101.4
 438     >>> is_hexidecimal_integer_number(0x1A3E)
 439     Traceback (most recent call last):
 440     ...
 441     ValueError: 6718
 442     """
 443     if not is_string(in_str):
 444         raise ValueError(in_str)
 445     return HEX_NUMBER_RE.match(in_str) is not None
 446
 447
 448 def is_octal_integer_number(in_str: str) -> bool:
 449     """
 450     Args:
 451         in_str: the string to test
 452
 453     Returns:
 454         True if the string is a valid octal integral number and False otherwise.
 455
 456     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 457     :meth:`is_hexidecimal_integer_number`, :meth:`is_binary_integer_number`,
 458     etc...
 459
 460     >>> is_octal_integer_number('0o777')
 461     True
 462     >>> is_octal_integer_number('-0O115')
 463     True
 464     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 465     False
 466     >>> is_octal_integer_number('7777')  # Needs 0o
 467     False
 468     >>> is_octal_integer_number('test')
 469     False
 470     """
 471     if not is_string(in_str):
 472         raise ValueError(in_str)
 473     return OCT_NUMBER_RE.match(in_str) is not None
 474
 475
 476 def is_binary_integer_number(in_str: str) -> bool:
 477     """
 478     Args:
 479         in_str: the string to test
 480
 481     Returns:
 482         True if the string contains a binary integral number and False otherwise.
 483
 484     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 485     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 486     etc...
 487
 488     >>> is_binary_integer_number('0b10111')
 489     True
 490     >>> is_binary_integer_number('-0b111')
 491     True
 492     >>> is_binary_integer_number('0B10101')
 493     True
 494     >>> is_binary_integer_number('0b10102')
 495     False
 496     >>> is_binary_integer_number('0xFFF')
 497     False
 498     >>> is_binary_integer_number('test')
 499     False
 500     """
 501     if not is_string(in_str):
 502         raise ValueError(in_str)
 503     return BIN_NUMBER_RE.match(in_str) is not None
 504
 505
 506 def to_int(in_str: str) -> int:
 507     """
 508     Args:
 509         in_str: the string to convert
 510
 511     Returns:
 512         The integral value of the string or raises on error.
 513
 514     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 515     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 516     :meth:`is_binary_integer_number`, etc...
 517
 518     >>> to_int('1234')
 519     1234
 520     >>> to_int('0x1234')
 521     4660
 522     >>> to_int('0b01101')
 523     13
 524     >>> to_int('0o777')
 525     511
 526     >>> to_int('test')
 527     Traceback (most recent call last):
 528     ...
 529     ValueError: invalid literal for int() with base 10: 'test'
 530     """
 531     if not is_string(in_str):
 532         raise ValueError(in_str)
 533     if is_binary_integer_number(in_str):
 534         return int(in_str, 2)
 535     if is_octal_integer_number(in_str):
 536         return int(in_str, 8)
 537     if is_hexidecimal_integer_number(in_str):
 538         return int(in_str, 16)
 539     return int(in_str)
 540
 541
 542 def number_string_to_integer(in_str: str) -> int:
 543     """Convert a string containing a written-out number into an int.
 544
 545     Args:
 546         in_str: the string containing the long-hand written out integer number
 547             in English.  See examples below.
 548
 549     Returns:
 550         The integer whose value was parsed from in_str.
 551
 552     See also :meth:`integer_to_number_string`.
 553
 554     .. warning::
 555         This code only handles integers; it will not work with decimals / floats.
 556
 557     >>> number_string_to_integer("one hundred fifty two")
 558     152
 559
 560     >>> number_string_to_integer("ten billion two hundred million fifty four thousand three")
 561     10200054003
 562
 563     >>> number_string_to_integer("four-score and 7")
 564     87
 565
 566     >>> number_string_to_integer("fifty xyzzy three")
 567     Traceback (most recent call last):
 568     ...
 569     ValueError: Unknown word: xyzzy
 570     """
 571     if isinstance(in_str, int):
 572         return int(in_str)
 573
 574     current = result = 0
 575     in_str = in_str.replace('-', ' ')
 576     for w in in_str.split():
 577         if w not in NUM_WORDS:
 578             if is_integer_number(w):
 579                 current += int(w)
 580                 continue
 581             else:
 582                 raise ValueError("Unknown word: " + w)
 583         scale, increment = NUM_WORDS[w]
 584         current = current * scale + increment
 585         if scale > 100:
 586             result += current
 587             current = 0
 588     return result + current
 589
 590
 591 def integer_to_number_string(num: int) -> str:
 592     """
 593     Opposite of :meth:`number_string_to_integer`; converts a number to a written out
 594     longhand format in English.
 595
 596     Args:
 597         num: the integer number to convert
 598
 599     Returns:
 600         The long-hand written out English form of the number.  See examples below.
 601
 602     See also :meth:`number_string_to_integer`.
 603
 604     .. warning::
 605         This method does not handle decimals or floats, only ints.
 606
 607     >>> integer_to_number_string(9)
 608     'nine'
 609
 610     >>> integer_to_number_string(42)
 611     'forty two'
 612
 613     >>> integer_to_number_string(123219982)
 614     'one hundred twenty three million two hundred nineteen thousand nine hundred eighty two'
 615     """
 616
 617     if num < 20:
 618         return UNIT_WORDS[num]
 619     if num < 100:
 620         ret = TENS_WORDS[num // 10]
 621         leftover = num % 10
 622         if leftover != 0:
 623             ret += ' ' + UNIT_WORDS[leftover]
 624         return ret
 625
 626     # If num > 100 go find the highest chunk and convert that, then recursively
 627     # convert the rest.  NUM_WORDS contains items like 'thousand' -> (1000, 0).
 628     # The second item in the tuple is an increment that can be ignored; the first
 629     # is the numeric "scale" of the entry.  So find the greatest entry in NUM_WORDS
 630     # still less than num.  For 123,456 it would be thousand.  Then pull out the
 631     # 123, convert it, and append "thousand".  Then do the rest.
 632     scales = {}
 633     for name, val in NUM_WORDS.items():
 634         if val[0] <= num:
 635             scales[name] = val[0]
 636     scale = max(scales.items(), key=lambda _: _[1])
 637
 638     # scale[1] = numeric magnitude (e.g. 1000)
 639     # scale[0] = name (e.g. "thousand")
 640     ret = integer_to_number_string(num // scale[1]) + ' ' + scale[0]
 641     leftover = num % scale[1]
 642     if leftover != 0:
 643         ret += ' ' + integer_to_number_string(leftover)
 644     return ret
 645
 646
 647 def is_decimal_number(in_str: str) -> bool:
 648     """
 649     Args:
 650         in_str: the string to check
 651
 652     Returns:
 653         True if the given string represents a decimal or False
 654         otherwise.  A decimal may be signed or unsigned or use
 655         a "scientific notation".
 656
 657     See also :meth:`is_integer_number`.
 658
 659     .. note::
 660         We do not consider integers without a decimal point
 661         to be decimals; they return False (see example).
 662
 663     >>> is_decimal_number('42.0')
 664     True
 665     >>> is_decimal_number('42')
 666     False
 667     """
 668     return is_number(in_str) and "." in in_str
 669
 670
 671 def strip_escape_sequences(in_str: str) -> str:
 672     """
 673     Args:
 674         in_str: the string to strip of escape sequences.
 675
 676     Returns:
 677         in_str with escape sequences removed.
 678
 679     See also: :mod:`pyutils.ansi`.
 680
 681     .. note::
 682         What is considered to be an "escape sequence" is defined
 683         by a regular expression.  While this gets common ones,
 684         there may exist valid sequences that it doesn't match.
 685
 686     >>> strip_escape_sequences('\x1B[12;11;22mthis is a test!')
 687     'this is a test!'
 688     """
 689     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 690     return in_str
 691
 692
 693 def add_thousands_separator(
 694     in_str: str, *, separator_char: str = ',', places: int = 3
 695 ) -> str:
 696     """
 697     Args:
 698         in_str: string or number to which to add thousands separator(s)
 699         separator_char: the separator character to add (defaults to comma)
 700         places: add a separator every N places (defaults to three)
 701
 702     Returns:
 703         A numeric string with thousands separators added appropriately.
 704
 705     >>> add_thousands_separator('12345678')
 706     '12,345,678'
 707     >>> add_thousands_separator(12345678)
 708     '12,345,678'
 709     >>> add_thousands_separator(12345678.99)
 710     '12,345,678.99'
 711     >>> add_thousands_separator('test')
 712     Traceback (most recent call last):
 713     ...
 714     ValueError: test
 715
 716     """
 717     if isinstance(in_str, numbers.Number):
 718         in_str = f'{in_str}'
 719     if is_number(in_str):
 720         return _add_thousands_separator(
 721             in_str, separator_char=separator_char, places=places
 722         )
 723     raise ValueError(in_str)
 724
 725
 726 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 727     """Internal helper"""
 728     decimal_part = ""
 729     if '.' in in_str:
 730         (in_str, decimal_part) = in_str.split('.')
 731     tmp = [iter(in_str[::-1])] * places
 732     ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 733     if len(decimal_part) > 0:
 734         ret += '.'
 735         ret += decimal_part
 736     return ret
 737
 738
 739 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 740     """
 741     Args:
 742         in_str: the string to test
 743         allowed_schemes: an optional list of allowed schemes (e.g.
 744             ['http', 'https', 'ftp'].  If passed, only URLs that
 745             begin with the one of the schemes passed will be considered
 746             to be valid.  Otherwise, any scheme:// will be considered
 747             valid.
 748
 749     Returns:
 750         True if in_str contains a valid URL and False otherwise.
 751
 752     >>> is_url('http://www.mysite.com')
 753     True
 754     >>> is_url('https://mysite.com')
 755     True
 756     >>> is_url('.mysite.com')
 757     False
 758     >>> is_url('scheme://username:[email protected]:8042/folder/subfolder/file.extension?param=value&param2=value2#hash')
 759     True
 760     """
 761     if not is_full_string(in_str):
 762         return False
 763
 764     valid = URL_RE.match(in_str) is not None
 765
 766     if allowed_schemes:
 767         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 768     return valid
 769
 770
 771 def is_email(in_str: Any) -> bool:
 772     """
 773     Args:
 774         in_str: the email address to check
 775
 776     Returns: True if the in_str contains a valid email (as defined by
 777         https://tools.ietf.org/html/rfc3696#section-3) or False
 778         otherwise.
 779
 780     >>> is_email('[email protected]')
 781     True
 782     >>> is_email('@gmail.com')
 783     False
 784     """
 785     if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
 786         return False
 787
 788     try:
 789         # we expect 2 tokens, one before "@" and one after, otherwise
 790         # we have an exception and the email is not valid.
 791         head, tail = in_str.split("@")
 792
 793         # head's size must be <= 64, tail <= 255, head must not start
 794         # with a dot or contain multiple consecutive dots.
 795         if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
 796             return False
 797
 798         # removes escaped spaces, so that later on the test regex will
 799         # accept the string.
 800         head = head.replace("\\ ", "")
 801         if head.startswith('"') and head.endswith('"'):
 802             head = head.replace(" ", "")[1:-1]
 803         return EMAIL_RE.match(head + "@" + tail) is not None
 804
 805     except ValueError:
 806         # borderline case in which we have multiple "@" signs but the
 807         # head part is correctly escaped.
 808         if ESCAPED_AT_SIGN.search(in_str) is not None:
 809             # replace "@" with "a" in the head
 810             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 811         return False
 812
 813
 814 def suffix_string_to_number(in_str: str) -> Optional[int]:
 815     """Takes a string like "33Gb" and converts it into a number (of bytes)
 816     like 34603008.
 817
 818     Args:
 819         in_str: the string with a suffix to be interpreted and removed.
 820
 821     Returns:
 822         An integer number of bytes or None to indicate an error.
 823
 824     See also :meth:`number_to_suffix_string`.
 825
 826     >>> suffix_string_to_number('1Mb')
 827     1048576
 828     >>> suffix_string_to_number('13.1Gb')
 829     14066017894
 830     """
 831
 832     def suffix_capitalize(s: str) -> str:
 833         if len(s) == 1:
 834             return s.upper()
 835         elif len(s) == 2:
 836             return f"{s[0].upper()}{s[1].lower()}"
 837         return suffix_capitalize(s[0:1])
 838
 839     if is_string(in_str):
 840         if is_integer_number(in_str):
 841             return to_int(in_str)
 842         suffixes = [in_str[-2:], in_str[-1:]]
 843         rest = [in_str[:-2], in_str[:-1]]
 844         for x in range(len(suffixes)):
 845             s = suffixes[x]
 846             s = suffix_capitalize(s)
 847             multiplier = NUM_SUFFIXES.get(s, None)
 848             if multiplier is not None:
 849                 r = rest[x]
 850                 if is_integer_number(r):
 851                     return to_int(r) * multiplier
 852                 if is_decimal_number(r):
 853                     return int(float(r) * multiplier)
 854     return None
 855
 856
 857 def number_to_suffix_string(num: int) -> Optional[str]:
 858     """Take a number (of bytes) and returns a string like "43.8Gb".
 859
 860     Args:
 861         num: an integer number of bytes
 862
 863     Returns:
 864         A string with a suffix representing num bytes concisely or
 865         None to indicate an error.
 866
 867     See also: :meth:`suffix_string_to_number`.
 868
 869     >>> number_to_suffix_string(14066017894)
 870     '13.1Gb'
 871     >>> number_to_suffix_string(1024 * 1024)
 872     '1.0Mb'
 873     """
 874     d = 0.0
 875     suffix = None
 876     for (sfx, size) in NUM_SUFFIXES.items():
 877         if num >= size:
 878             d = num / size
 879             suffix = sfx
 880             break
 881     if suffix is not None:
 882         return f"{d:.1f}{suffix}"
 883     else:
 884         return f'{num:d}'
 885
 886
 887 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 888     """
 889     Args:
 890         in_str: a string to check
 891         card_type: if provided, contains the card type to validate
 892             with.  Otherwise, all known credit card number types will
 893             be accepted.
 894
 895             Supported card types are the following:
 896
 897             * VISA
 898             * MASTERCARD
 899             * AMERICAN_EXPRESS
 900             * DINERS_CLUB
 901             * DISCOVER
 902             * JCB
 903
 904     Returns:
 905         True if in_str is a valid credit card number.
 906
 907     .. warning::
 908         This code is not verifying the authenticity of the credit card (i.e.
 909         not checking whether it's a real card that can be charged); rather
 910         it's only checking that the number follows the "rules" for numbering
 911         established by credit card issuers.
 912
 913     """
 914     if not is_full_string(in_str):
 915         return False
 916
 917     if card_type is not None:
 918         if card_type not in CREDIT_CARDS:
 919             raise KeyError(
 920                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 921             )
 922         return CREDIT_CARDS[card_type].match(in_str) is not None
 923     for c in CREDIT_CARDS:
 924         if CREDIT_CARDS[c].match(in_str) is not None:
 925             return True
 926     return False
 927
 928
 929 def is_camel_case(in_str: Any) -> bool:
 930     """
 931     Args:
 932         in_str: the string to test
 933
 934     Returns:
 935         True if the string is formatted as camel case and False otherwise.
 936         A string is considered camel case when:
 937
 938         * it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 939         * it contains both lowercase and uppercase letters
 940         * it does not start with a number
 941
 942     See also :meth:`is_snake_case`, :meth:`is_slug`, and :meth:`camel_case_to_snake_case`.
 943     """
 944     return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 945
 946
 947 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 948     """
 949     Args:
 950         in_str: the string to test
 951         separator: the snake case separator character to use
 952
 953     Returns: True if the string is snake case and False otherwise.  A
 954         string is considered snake case when:
 955
 956         * it's composed only by lowercase/uppercase letters and digits
 957         * it contains at least one underscore (or provided separator)
 958         * it does not start with a number
 959
 960     See also :meth:`is_camel_case`, :meth:`is_slug`, and :meth:`snake_case_to_camel_case`.
 961
 962     >>> is_snake_case('this_is_a_test')
 963     True
 964     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 965     True
 966     >>> is_snake_case('this-is-a-test')
 967     False
 968     >>> is_snake_case('this-is-a-test', separator='-')
 969     True
 970     """
 971     if is_full_string(in_str):
 972         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 973         re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 974         r = re_map.get(
 975             separator,
 976             re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
 977         )
 978         return r.match(in_str) is not None
 979     return False
 980
 981
 982 def is_json(in_str: Any) -> bool:
 983     """
 984     Args:
 985         in_str: the string to test
 986
 987     Returns:
 988         True if the in_str contains valid JSON and False otherwise.
 989
 990     >>> is_json('{"name": "Peter"}')
 991     True
 992     >>> is_json('[1, 2, 3]')
 993     True
 994     >>> is_json('{nope}')
 995     False
 996     """
 997     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 998         try:
 999             return isinstance(json.loads(in_str), (dict, list))
1000         except (TypeError, ValueError, OverflowError):
1001             pass
1002     return False
1003
1004
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Args:
        in_str: the value to test; it is converted with str() first so
            that a UUID object can be passed directly.
        allow_hex: when True the value is tested against
            UUID_HEX_OK_RE instead of UUID_RE; per the examples below
            this accepts the dash-less hexadecimal form.

    Returns:
        True if the in_str contains a valid UUID and False otherwise.

    See also :meth:`generate_uuid`.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    candidate = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(candidate) is not None
1028
1029
def is_ip_v4(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str contains a valid IPv4 address and False otherwise.

    See also :meth:`extract_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('nope')
    False
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if is_full_string(in_str) and SHALLOW_IP_V4_RE.match(in_str) is not None:
        # The shape looks right; now every dot-separated field must
        # also fall within 0..255.
        return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
    return False
1056
1057
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string to extract an IPv4 address from.

    Returns:
        The text of the first ANYWHERE_IP_V4_RE match found in in_str,
        or None if in_str is not a non-empty string or nothing matched.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if is_full_string(in_str):
        found = ANYWHERE_IP_V4_RE.search(in_str)
        if found is not None:
            return found.group(0)
    return None
1080
1081
def is_ip_v6(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IPv6 address and False otherwise.

    See also :meth:`is_ip_v4`, :meth:`extract_ip_v4`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    matched = IP_V6_RE.match(in_str)
    return matched is not None
1099
1100
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract an IPv6 address.

    Returns:
        The text of the first ANYWHERE_IP_V6_RE match found in in_str,
        or None if in_str is not a non-empty string or nothing matched.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v4`,
    and :meth:`is_ip`.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if is_full_string(in_str):
        found = ANYWHERE_IP_V6_RE.search(in_str)
        if found is not None:
            return found.group(0)
    return None
1123
1124
def is_ip(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test.

    Returns:
        True if in_str is accepted by either :meth:`is_ip_v6` or
        :meth:`is_ip_v4` (checked in that order) and False otherwise.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3')
    False
    >>> is_ip('1.2.3.999')
    False
    """
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
1147
1148
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract an IP address.

    Returns:
        The result of :meth:`extract_ip_v4` if it finds an address;
        otherwise the result of :meth:`extract_ip_v6`, which is None
        when no address of either kind is found.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4_address = extract_ip_v4(in_str)
    if v4_address is not None:
        return v4_address
    # No IPv4 address anywhere in the string; fall back to IPv6.
    return extract_ip_v6(in_str)
1171
1172
def is_mac_address(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str is a non-empty string matching MAC_ADDRESS_RE
        and False otherwise.

    See also :meth:`extract_mac_address`, :meth:`is_ip`, etc...

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    matched = MAC_ADDRESS_RE.match(in_str)
    return matched is not None
1193
1194
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract a MAC address.
        separator: the string to place between the octets of the
            returned address; any ":" or "-" in the matched text is
            replaced by it.

    Returns:
        The first MAC address found in in_str (normalized to use
        separator) or None to indicate no match or an error.

    See also :meth:`is_mac_address`, :meth:`is_ip`, and :meth:`extract_ip`.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F', separator='-')
    '34-29-8F-12-0D-2F'
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None
    # Strings are immutable, so the normalized copy must be returned (the
    # results of str.replace used to be discarded, which made `separator`
    # a no-op).  translate() rewrites both delimiters in a single pass so
    # a separator that itself contains ":" or "-" is not rewritten twice.
    return m.group(0).translate(str.maketrans({":": separator, "-": separator}))
1222
1223
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Args:
        in_str: string to test
        separator: the slug character to use

    Returns:
        True if in_str is a slug string and False otherwise.  A slug
        here is a string made only of lowercase ASCII letters, digits
        and the separator, that begins and ends with a letter or digit.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`slugify`.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False
    if not separator:
        # With no separator a slug is just a run of letters/digits.
        rex = r"^[a-z\d]+$"
    else:
        # One or more alphanumeric runs joined by runs of the separator.
        # This accepts the same strings as the previous
        # r"^([a-z\d]+SEP*?)*[a-z\d]$" for single-character separators,
        # but avoids nesting a quantified group inside another
        # quantifier, which backtracked exponentially on long inputs
        # that do not match.
        rex = r"^[a-z\d]+(?:(?:" + re.escape(separator) + r")+[a-z\d]+)*$"
    return re.match(rex, in_str) is not None
1244
1245
def contains_html(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check for tags in

    Returns:
        True if the given string contains HTML/XML tags and False
        otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`strip_html`.

    .. warning::
        By design, this function matches ANY type of tag, so don't expect
        to use it as an HTML validator.  It's a quick sanity check at
        best.  See something like BeautifulSoup for a more full-featured
        HTML parser.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False

    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
1272
1273
def words_count(in_str: str) -> int:
    """
    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string, i.e. the
        number of non-overlapping WORDS_COUNT_RE matches.

    Raises:
        ValueError: if in_str is not a string.

    .. note::
        Words are whatever WORDS_COUNT_RE matches, not simply
        whitespace-separated tokens.  As the second example below
        shows, punctuation separates words too, so a string like
        "one,two,three.stop" counts as 4 words even though it
        contains no spaces.

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if is_string(in_str):
        return len(WORDS_COUNT_RE.findall(in_str))
    raise ValueError(in_str)
1298
1299
def word_count(in_str: str) -> int:
    """
    Alias of :meth:`words_count`; the argument is passed straight
    through and the result is returned unchanged.

    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string, as
        computed by :meth:`words_count`.

    >>> word_count('hello world')
    2
    >>> word_count('one,two,three.stop')
    4
    """
    total = words_count(in_str)
    return total
1322
1323
def generate_uuid(omit_dashes: bool = False) -> str:
    """
    Args:
        omit_dashes: should we omit the dashes in the generated UUID?

    Returns:
        A freshly generated UUID (from `uuid4()`): its `hex` attribute
        when omit_dashes is True, otherwise its `str()` form.

    See also :meth:`is_uuid`, :meth:`generate_random_alphanumeric_string`.

    generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
    generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
    """
    fresh = uuid4()
    return fresh.hex if omit_dashes else str(fresh)
1342
1343
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Args:
        size: number of characters to generate

    Returns:
        A string of the specified size containing random characters
        (uppercase/lowercase ascii letters and digits), each drawn
        with `random.choice`.

    Raises:
        ValueError: if size is less than 1.

    See also :meth:`asciify`, :meth:`generate_uuid`.

    >>> random.seed(22)
    >>> generate_random_alphanumeric_string(9)
    '96ipbNClS'
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(size):
        picked.append(random.choice(alphabet))
    return from_char_list(picked)
1364
1365
def reverse(in_str: str) -> str:
    """
    Args:
        in_str: the string to reverse

    Returns:
        The reversed (character by character) string.

    Raises:
        ValueError: if in_str is not a string.

    >>> reverse('test')
    'tset'
    """
    if is_string(in_str):
        return "".join(reversed(in_str))
    raise ValueError(in_str)
1380
1381
def camel_case_to_snake_case(in_str: str, *, separator: str = "_"):
    """
    Args:
        in_str: the camel case string to convert
        separator: the snake case separator character to use

    Returns:
        A lowercased snake case string equivalent to the camel case
        input, or the original string unchanged if it is not
        recognized as camel case by :meth:`is_camel_case`.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    if is_camel_case(in_str):

        def _with_separator(match):
            # Keep the matched group and append the separator after it.
            return match.group(1) + separator

        return CAMEL_CASE_REPLACE_RE.sub(_with_separator, in_str).lower()
    return in_str
1405
1406
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Args:
        in_str: the snake case string to convert
        upper_case_first: should we capitalize the first letter?
        separator: the separator character to use

    Returns:
        A camel case string that is equivalent to the snake case string
        provided, or the original string back again if
        :meth:`is_snake_case` does not accept it.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    if is_snake_case(in_str, separator=separator):
        # Title-case every non-blank piece between separators.
        words = []
        for piece in in_str.split(separator):
            if is_full_string(piece):
                words.append(piece.title())
        if not upper_case_first:
            words[0] = words[0].lower()
        return from_char_list(words)
    return in_str
1436
1437
def to_char_list(in_str: str) -> List[str]:
    """
    Args:
        in_str: the string to split into a char list

    Returns:
        A list of strings of length one each, or an empty list if
        in_str is not a string.

    See also :meth:`from_char_list`.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    return list(in_str) if is_string(in_str) else []
1454
1455
def from_char_list(in_list: List[str]) -> str:
    """
    Args:
        in_list: A list of characters to convert into a string.

    Returns:
        The items of in_list concatenated, in order, with nothing
        between them.

    See also :meth:`to_char_list`.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
1471
1472
def shuffle(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing the same chars as the given one but
        reordered by `random.shuffle`.  No check is made that the
        result differs from the input.  Returns None if in_str is not
        a string.

    >>> random.seed(22)
    >>> shuffle('awesome')
    'meosaew'
    """
    if is_string(in_str):
        letters = to_char_list(in_str)
        random.shuffle(letters)
        return from_char_list(letters)
    return None
1493
1494
def scramble(in_str: str) -> Optional[str]:
    """
    Alias of :meth:`shuffle`; the argument is passed straight through
    and the result is returned unchanged.

    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        Whatever :meth:`shuffle` returns for in_str: a randomly
        reordered copy of the string, or None to indicate an error
        condition.

    See also :mod:`pyutils.unscrambler`.

    >>> random.seed(22)
    >>> scramble('awesome')
    'meosaew'
    """
    shuffled = shuffle(in_str)
    return shuffled
1513
1514
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Args:
        in_str: the string to strip tags from
        keep_tag_content: should we keep the inner contents of tags?
            When True only matches of HTML_TAG_ONLY_RE are removed;
            otherwise matches of HTML_RE are removed.

    Returns:
        A string with all HTML tags removed (optionally with tag contents
        preserved).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`contains_html`.

    .. note::
        This method uses simple regular expressions to strip tags and is
        not a full fledged HTML parser by any means.  Consider using
        something like BeautifulSoup if your needs are more than this
        simple code can fulfill.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if keep_tag_content:
        pattern = HTML_TAG_ONLY_RE
    else:
        pattern = HTML_RE
    return pattern.sub("", in_str)
1542
1543
def asciify(in_str: str) -> str:
    """
    Args:
        in_str: the string to asciify.

    Returns:
        An ASCII-only string roughly equivalent to the original.  The
        input is NFKD-normalized, which lets many non-ascii chars be
        translated to a close ASCII representation
        (eg: ó -> o, Ë -> E, ç -> c...).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`to_ascii`, :meth:`generate_random_alphanumeric_string`.

    .. warning::
        Some chars may be lost if impossible to translate.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Pipeline: NFKD-normalize, encode to ASCII dropping whatever
    # cannot be represented, then decode the surviving bytes back
    # into a str.
    return (
        unicodedata.normalize("NFKD", in_str)
        .encode("ascii", "ignore")
        .decode("utf-8")
    )
1576
1577
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Args:
        in_str: the string to slugify
        separator: the character to use during slugification (default
            is a dash)

    Returns:
        The converted string, produced by these steps in order:

        * the input is lower cased
        * every match of NO_LETTERS_OR_NUMBERS_RE becomes a space and
          the result is stripped of leading/trailing whitespace
        * every match of SPACES_RE becomes the separator
        * runs of the separator are collapsed
        * the result is passed through :meth:`asciify`

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_slug` and :meth:`asciify`.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    lowered = in_str.lower()

    # Anything that is not a letter or a number turns into a space.
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", lowered).strip()

    # Spaces turn into the join sign.
    joined = SPACES_RE.sub(separator, spaced)

    # Collapse repeated join signs.
    repeated_separator = re.escape(separator) + r"+"
    collapsed = re.sub(repeated_separator, separator, joined)
    return asciify(collapsed)
1614
1615
def to_bool(in_str: str) -> bool:
    """
    Args:
        in_str: the string to convert to boolean

    Returns:
        A boolean equivalent of the original string based on its contents.
        All conversion is case insensitive.  A positive boolean (True) is
        returned if the string value is any of the following:

        * "true"
        * "t"
        * "1"
        * "yes"
        * "y"
        * "on"

        Otherwise False is returned.

    Raises:
        ValueError: if in_str is not a string.

    See also :mod:`pyutils.argparse_utils`.

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    truthy_words = {"true", "1", "yes", "y", "t", "on"}
    return in_str.lower() in truthy_words
1658
1659
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Args:
        in_str: the string to convert into a date

    Returns:
        The datetime.date the string contained or None to indicate
        an error (a ParseException from the parser is logged as a
        warning and swallowed).  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`extract_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> to_date('9/11/2001')
    datetime.date(2001, 9, 11)
    >>> to_date('xyzzy')
    """
    import pyutils.datetimes.dateparse_utils as du

    try:
        parser = du.DateParser()  # type: ignore
        parser.parse(in_str)
        parsed_date = parser.get_date()
    except du.ParseException:  # type: ignore
        logger.warning(f'Unable to parse date {in_str}.')
        return None
    return parsed_date
1687
1688
def extract_date(in_str: Any) -> Optional[datetime.datetime]:
    """Finds and extracts a date from the string, if possible.

    The input is split on whitespace and every run of 5, then 4, then
    3, then 2 consecutive tokens is offered to the date parser; the
    first run that parses wins.

    Args:
        in_str: the string to extract a date from

    Returns:
        a datetime if date was found, otherwise None.  None is also
        returned when in_str is not a non-empty string.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> extract_date("filename.txt    dec 13, 2022")
    datetime.datetime(2022, 12, 13, 0, 0)

    >>> extract_date("Dear Santa, please get me a pony.")

    """
    import itertools

    import pyutils.datetimes.dateparse_utils as du

    # in_str is typed Any; bail out like the other extract_* helpers
    # instead of failing with AttributeError on .split() below.
    if not is_full_string(in_str):
        return None

    d = du.DateParser()  # type: ignore
    chunks = in_str.split()
    for ngram in itertools.chain(
        list_utils.ngrams(chunks, 5),
        list_utils.ngrams(chunks, 4),
        list_utils.ngrams(chunks, 3),
        list_utils.ngrams(chunks, 2),
    ):
        expr = " ".join(ngram)
        logger.debug("Trying %s", expr)
        try:
            if d.parse(expr):
                return d.get_datetime()
        except du.ParseException:  # type: ignore
            # This window is not a date; keep sliding.
            pass
    return None
1727
1728
def is_valid_date(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check

    Returns:
        True if the date parser accepts the string without raising a
        ParseException and False otherwise (the failure is logged as
        a warning).  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> is_valid_date('1/2/2022')
    True
    >>> is_valid_date('christmas')
    True
    >>> is_valid_date('next wednesday')
    True
    >>> is_valid_date('xyzzy')
    False
    """
    import pyutils.datetimes.dateparse_utils as dp

    try:
        # Only whether parsing raises matters; the parse result itself
        # is ignored.
        dp.DateParser().parse(in_str)  # type: ignore
    except dp.ParseException:  # type: ignore
        logger.warning(f'Unable to parse date {in_str}.')
        return False
    return True
1761
1762
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Args:
        in_str: string to parse into a datetime

    Returns:
        The value returned by the parser when it is a
        datetime.datetime; otherwise None.  Any exception raised
        while parsing is logged as a warning and also yields None.
        This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`valid_datetime`.

    >>> to_datetime('7/20/1969 02:56 GMT')
    datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
    """
    import pyutils.datetimes.dateparse_utils as dp

    try:
        parsed = dp.DateParser().parse(in_str)  # type: ignore
    except Exception:
        logger.warning(f'Unable to parse datetime {in_str}.')
        return None
    return parsed if isinstance(parsed, datetime.datetime) else None
1790
1791
def valid_datetime(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check

    Returns:
        True if :meth:`to_datetime` returns a datetime for in_str and
        False otherwise (in which case a warning is logged).
        This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    >>> valid_datetime('next wednesday at noon')
    True
    >>> valid_datetime('3 weeks ago at midnight')
    True
    >>> valid_datetime('next easter at 5:00 am')
    True
    >>> valid_datetime('sometime soon')
    False
    """
    if to_datetime(in_str) is None:
        logger.warning(f'Unable to parse datetime {in_str}.')
        return False
    return True
1817
1818
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Args:
        in_str: the string to squeeze
        character_to_squeeze: the character (or substring) to remove
            runs of more than one in a row (default = space).  It is
            treated literally, never as a regular expression.

    Returns: A "squeezed string" where every run of one or more
        consecutive character_to_squeeze is collapsed into a single
        occurrence.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    """
    # Pass the replacement as a callable: a plain replacement string is
    # processed as a template by re.sub, so a backslash in
    # character_to_squeeze would be misread as an escape (or raise
    # re.error).
    return re.sub(
        r'(?:' + re.escape(character_to_squeeze) + r')+',
        lambda _match: character_to_squeeze,
        in_str,
    )
1841
1842
def dedent(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: the string to dedent

    Returns:
        The string with every MARGIN_RE match removed from each of
        its newline-separated lines (the example below shows leading
        tabs being stripped), or None if in_str is not a string.

    See also :meth:`indent`.

    >>> dedent('\t\ttest\\n\t\ting')
    'test\\ning'
    """
    if not is_string(in_str):
        return None
    newline = '\n'
    return newline.join(
        MARGIN_RE.sub('', each_line) for each_line in in_str.split(newline)
    )
1861
1862
def indent(in_str: str, amount: int) -> str:
    """
    Args:
        in_str: the string to indent
        amount: count of spaces to indent each line by

    Returns:
        An indented string created by prepending amount spaces to
        every newline-separated line of in_str.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`dedent`.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    padding = " " * amount
    return newline.join(padding + each_line for each_line in in_str.split(newline))
1882
1883
1884 def _sprintf(*args, **kwargs) -> str:
1885     """Internal helper."""
1886     ret = ""
1887
1888     sep = kwargs.pop("sep", None)
1889     if sep is not None:
1890         if not isinstance(sep, str):
1891             raise TypeError("sep must be None or a string")
1892
1893     end = kwargs.pop("end", None)
1894     if end is not None:
1895         if not isinstance(end, str):
1896             raise TypeError("end must be None or a string")
1897
1898     if kwargs:
1899         raise TypeError("invalid keyword arguments to sprint()")
1900
1901     if sep is None:
1902         sep = " "
1903     if end is None:
1904         end = "\n"
1905     for n, arg in enumerate(args):
1906         if n:
1907             ret += sep
1908         if isinstance(arg, str):
1909             ret += arg
1910         else:
1911             ret += str(arg)
1912     ret += end
1913     return ret
1914
1915
def strip_ansi_sequences(in_str: str) -> str:
    """
    Args:
        in_str: the string to strip

    Returns:
        in_str with recognized ANSI escape sequences removed.

    See also :mod:`pyutils.ansi`.

    .. warning::
        This method works by using a regular expression that
        recognizes CSI sequences only, i.e. ESC + "[" followed by
        optional parameter bytes, optional intermediate bytes and one
        final byte.  Other kinds of escape sequences are left
        untouched; caveat emptor.

    >>> import ansi as a
    >>> s = a.fg('blue') + 'blue!' + a.reset()
    >>> len(s)   # '\x1b[38;5;21mblue!\x1b[m'
    18
    >>> len(strip_ansi_sequences(s))
    5
    >>> strip_ansi_sequences(s)
    'blue!'
    >>> strip_ansi_sequences('\\x1b[2Jcleared\\x1b[H')
    'cleared'

    """
    # CSI grammar: parameter bytes 0x30-0x3F ("0".."?"), intermediate
    # bytes 0x20-0x2F (" ".."/"), final byte 0x40-0x7E ("@".."~").  The
    # previous pattern only accepted lowercase final bytes and no "?"
    # parameters, so sequences such as ESC[2J, ESC[H or ESC[?25l were
    # left in the output.
    return re.sub(r'\x1b\[[0-?]*[ -/]*[@-~]', '', in_str)
1942
1943
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that captures outputs to stdout to a buffer
    without printing them.  The value bound by ``as`` is a zero-argument
    callable that returns everything captured so far.

    >>> with SprintfStdout() as buf:
    ...     print("test")
    ...     print("1, 2, 3")
    ...
    >>> print(buf(), end='')
    test
    1, 2, 3
    """

    def __init__(self) -> None:
        # In-memory buffer that receives everything written to stdout.
        self.destination = io.StringIO()
        # The active redirect; created on __enter__.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        """Start redirecting stdout and hand back a getter for the text."""
        redirect = contextlib.redirect_stdout(self.destination)
        redirect.__enter__()
        self.recorder = redirect
        return self.destination.getvalue

    def __exit__(self, *args) -> Literal[False]:
        """Stop redirecting, rewind the buffer; never swallows exceptions."""
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        return False
1971
1972
def capitalize_first_letter(in_str: str) -> str:
    """
    Args:
        in_str: the string to capitalize

    Returns:
        in_str with the first character capitalized; the rest of the
        string is left untouched.  An empty string is returned as is.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    # Slice rather than index so that an empty string yields '' instead
    # of raising IndexError.
    return in_str[:1].upper() + in_str[1:]
1987
1988
def it_they(n: int) -> str:
    """
    Args:
        n: how many of them are there?

    Returns:
        'it' if n is one or 'they' otherwise.

    See also :meth:`is_are`, :meth:`pluralize`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> it_they(1)
    'it'
    >>> it_they(100)
    'they'
    """
    return "it" if n == 1 else "they"
2014
2015
def is_are(n: int) -> str:
    """
    Pick the form of "to be" that agrees with a count.

    Args:
        n: how many of them are there?

    Returns:
        'is' when n is exactly one, 'are' for every other value.

    See also :meth:`it_they`, :meth:`pluralize`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
2042
2043
def pluralize(n: int) -> str:
    """
    Pick the plural suffix that agrees with a count.

    Args:
        n: how many of them are there?

    Returns:
        '' when n is exactly one, 's' for every other value (this
        includes zero: "0 files").

    See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.
    """
    return "" if n == 1 else "s"
2073
2074
def make_contractions(txt: str) -> str:
    """Glue adjacent words in txt together to form (English)
    contractions.

    Args:
        txt: the input text to be contractionized.

    Returns:
        A copy of txt in which every recognized word pair has been
        replaced by its contraction; everything else is left as is.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`pluralize`,
    :meth:`thify`.

    .. note::
        Matching is case insensitive.  Rules are applied in a fixed
        order: the irregular negations (can't, shan't, won't) first,
        then the regular "verb + not" negations, then "subject +
        auxiliary" pairs.

    >>> make_contractions('It is nice today.')
    "It's nice today."

    >>> make_contractions('I can    not even...')
    "I can't even..."

    >>> make_contractions('She could not see!')
    "She couldn't see!"

    >>> make_contractions('But she will not go.')
    "But she won't go."

    >>> make_contractions('Verily, I shall not.')
    "Verily, I shan't."

    >>> make_contractions('No you cannot.')
    "No you can't."

    >>> make_contractions('I said you can not go.')
    "I said you can't go."
    """

    # Special cases: can't, shan't and won't.  Each needs its own
    # pattern because letters are dropped or reordered.
    irregular_negations = (
        (r'\b(can)\s*no(t)\b', r"\1'\2"),
        (r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3"),
        (r'\b(w)ill\s*(n)(o)(t)\b', r"\1\3\2'\4"),
    )
    for pattern, replacement in irregular_negations:
        txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)

    negatable_verbs = [
        'are',
        'could',
        'did',
        'has',
        'have',
        'is',
        'must',
        'should',
        'was',
        'were',
        'would',
    ]
    subjects = [
        "I",
        "you",
        "he",
        "she",
        "it",
        "we",
        "they",
        "how",
        "why",
        "when",
        "where",
        "who",
        "there",
    ]
    auxiliaries = ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)']

    # Disallow there're/where're.  They're valid English but sound weird.
    awkward_with_are = ('there', 'where')

    # (first words, second-word patterns, replacement template).  The
    # parentheses inside each second-word pattern mark the letters that
    # survive in the contraction.
    rules = (
        (negatable_verbs, ['(n)o(t)'], r"\1\2'\3"),
        (subjects, auxiliaries, r"\1'\2"),
    )
    for leading_words, trailing_words, replacement in rules:
        for first in leading_words:
            for second in trailing_words:
                if second == 'a(re)' and first in awkward_with_are:
                    continue
                txt = re.sub(
                    fr'\b({first})\s+{second}\b',
                    replacement,
                    txt,
                    count=0,
                    flags=re.IGNORECASE,
                )

    return txt
2181
2182
def thify(n: int) -> str:
    """
    Args:
        n: the number that needs a suffix

    Returns:
        The proper English ordinal suffix for the number: 'st', 'nd',
        'rd' or 'th'.  Numbers whose last two digits are 11, 12 or 13
        take 'th' (11th, 112th) even though they end in 1, 2 or 3.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`.

    Suggested usage::

        attempt_count = 0
        while True:
            attempt_count += 1
            if try_the_thing():
                break
            print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'
    >>> thify(21)
    'st'
    """
    digits = str(n)
    # A genuine int always renders as an integer literal, so only other
    # inputs (e.g. numeric strings) need to be validated.
    if type(n) is not int:
        assert is_integer_number(digits)

    # The "teens" are irregular: 11th, 12th, 13th, 111th, ...
    if digits[-2:] in ("11", "12", "13"):
        return "th"

    suffixes = {"1": "st", "2": "nd", "3": "rd"}
    return suffixes.get(digits[-1:], "th")
2220
2221
def ngrams(txt: str, n: int):
    """
    Lazily produce the word n-grams of a string.

    Args:
        txt: the string to create ngrams using; it is split on
            whitespace to obtain the words
        n: how many words per ngram created?

    Returns:
        A generator that yields each ngram as a single string whose
        words are separated by one space.

    See also :meth:`ngrams_presplit`, :meth:`bigrams`, :meth:`trigrams`.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']
    """
    tokens = txt.split()
    for group in ngrams_presplit(tokens, n):
        yield ' '.join(f'{token}' for token in group).strip()
2242
2243
def ngrams_presplit(words: Sequence[str], n: int):
    """
    Same as :meth:`ngrams` but for text the caller has already split
    into words.

    Args:
        words: the sequence of words to build ngrams from
        n: how many words per ngram created?

    Returns:
        Whatever ``list_utils.ngrams(words, n)`` produces; this function
        simply delegates to it.

    See also :meth:`ngrams`, :meth:`bigrams`, :meth:`trigrams`.
    """
    grams = list_utils.ngrams(words, n)
    return grams
2251
2252
def bigrams(txt: str):
    """Generates the bigrams (two-word ngrams) of the given string.

    This is a convenience wrapper around :meth:`ngrams` with n fixed
    at 2.

    Args:
        txt: the string to create bigrams using

    See also :meth:`ngrams`, :meth:`trigrams`.

    >>> [x for x in bigrams('this is a test')]
    ['this is', 'is a', 'a test']
    """
    words_per_gram = 2
    return ngrams(txt, words_per_gram)
2262
2263
def trigrams(txt: str):
    """Generates the trigrams (three-word ngrams) of the given string.

    This is a convenience wrapper around :meth:`ngrams` with n fixed
    at 3.

    Args:
        txt: the string to create trigrams using

    See also :meth:`ngrams`, :meth:`bigrams`.
    """
    words_per_gram = 3
    return ngrams(txt, words_per_gram)
2270
2271
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim: str = ''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.

    Args:
        input_lines: A sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: an iterable collection of numeric sequences that
            indicate one or more column numbers to copy to form the Nth
            position in the output list.  See example below.
        delim: for column_specs that indicate we should copy more than
            one column from the input into this position, use delim to
            separate source data.  Defaults to ''.

    Returns:
        A list of strings created by following the instructions set forth
        in column_specs.  Column data is copied verbatim: delim is only
        ever inserted *between* columns, so data that itself begins or
        ends with the delimiter is preserved.

    Raises:
        IndexError: if a column number is out of range for input_lines.

    See also :meth:`shuffle_columns_into_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim='!',
    ... )
    ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']
    """
    # Each spec lists the input columns that make up one output entry.
    # Joining (rather than concatenating and then stripping delim) avoids
    # eating delimiter characters that are part of the data.
    return [delim.join(input_lines[n] for n in spec) for spec in column_specs]
2313
2314
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim: str = '',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: a sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: instructions for what dictionary keys to apply
            to individual or compound input column data.  See example
            below.
        delim: when forming compound output data by gluing more than
            one input column together, use this character to separate
            the source data.  Defaults to ''.

    Returns:
        A dict formed by applying the column_specs instructions.  Column
        data is copied verbatim: delim is only ever inserted *between*
        columns, so data that itself begins or ends with the delimiter
        is preserved.  If a key appears more than once the last spec
        wins.

    Raises:
        IndexError: if a column number is out of range for input_lines.

    See also :meth:`shuffle_columns_into_list`, :meth:`interpolate_using_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim='!',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}
    """
    # Each spec is ("key", [col1, col2...]).  Joining (rather than
    # concatenating and then stripping delim) avoids eating delimiter
    # characters that are part of the data.
    return {
        spec[0]: delim.join(input_lines[n] for n in spec[1]) for spec in column_specs
    }
2357
2358
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """
    Fill in the ``{name}`` placeholders of a template string with data
    from a dict.

    Args:
        txt: the mad libs template
        values: what you and your kids chose for each category, keyed
            by placeholder name.

    Returns:
        The template with every placeholder substituted via
        ``str.format``.

    Raises:
        KeyError: if txt names a placeholder that is missing from values.

    See also :meth:`shuffle_columns_into_list`, :meth:`shuffle_columns_into_dict`.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    filled_in = txt.format(**values)
    return _sprintf(filled_in, end='')
2374
2375
def to_ascii(txt: str) -> bytes:
    """
    Args:
        txt: the input data to encode.  A bytes object is also accepted
            and is returned unchanged.

    Returns:
        txt encoded as an ASCII byte string.

    Raises:
        TypeError: if txt is neither a str nor bytes.
        UnicodeEncodeError: if txt is a str containing non-ASCII
            characters.

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`to_bytes`,
    :meth:`generate_random_alphanumeric_string`, :meth:`asciify`.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(txt, str):
        return txt.encode('ascii')
    if isinstance(txt, bytes):
        return txt
    # TypeError is the conventional signal for a wrong argument type; it
    # is still an Exception, so existing `except Exception` callers work.
    raise TypeError('to_ascii works with strings and bytes')
2398
2399
def to_base64(
    txt: str, *, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> bytes:
    """
    Args:
        txt: the input data to encode
        encoding: the encoding to use when converting txt to bytes
        errors: how to handle encoding errors

    Returns:
        The bytes of txt encoded with the standard 64-character base64
        alphabet, as produced by ``base64.encodebytes`` (note the
        trailing newline in the example below).

    See also :meth:`is_base64`, :meth:`to_ascii`, :meth:`to_bitstring`,
    :meth:`from_base64`.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
2420
2421
def is_base64(txt: str) -> bool:
    """
    Args:
        txt: the string (or bytes) to check

    Returns:
        True if, after surrounding whitespace is removed, txt consists
        only of characters from Python's standard base64 alphabet
        (A-Z, a-z, 0-9, '+', '/'), optionally followed by one or two
        '=' padding characters.  Anything else, including non-ASCII
        input, yields False.

    See also :meth:`to_base64`, :meth:`from_base64`.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # Trailing padding is ok.
    True

    """
    stripped = txt.strip()
    if isinstance(stripped, (bytes, bytearray)):
        try:
            stripped = stripped.decode('ascii')
        except UnicodeDecodeError:
            # Bytes outside ASCII can never be base64 characters.
            return False

    # Base64 pads its final group with at most two '=' characters, and
    # only at the very end; padding with no data in front is meaningless.
    body = stripped.rstrip('=')
    padding = len(stripped) - len(body)
    if padding > 2 or (padding and not body):
        return False

    alphabet = set(
        string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    )
    return all(char in alphabet for char in body)
2450
2451
def from_base64(
    b64: bytes, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """
    Args:
        b64: bytestring of base64 encoded data to decode / convert.
        encoding: the encoding to use when turning the decoded bytes
            back into text
        errors: how to handle decoding errors

    Returns:
        The decoded form of b64 as a normal python string; this is the
        inverse of :meth:`to_base64`.

    See also :meth:`to_base64`, :meth:`is_base64`.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    raw = base64.decodebytes(b64)
    return raw.decode(encoding, errors)
2471
2472
def chunk(txt: str, chunk_size: int):
    """
    Args:
        txt: a string to be chunked into evenly sized pieces.
        chunk_size: the size of each chunk to make

    Returns:
        A generator yielding consecutive chunk_size-long slices of txt.
        If the length of txt is not a multiple of chunk_size a warning
        is both logged and issued via :mod:`warnings`, and the final
        piece yielded is shorter than the others.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    if len(txt) % chunk_size != 0:
        msg = (
            f'String to chunk\'s length ({len(txt)}) is not an even '
            f'multiple of chunk_size ({chunk_size})'
        )
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for start in range(0, len(txt), chunk_size):
        yield txt[start : start + chunk_size]
2491
2492
def to_bitstring(txt: str, *, delimiter: str = '') -> str:
    """
    Args:
        txt: the string to convert into a bitstring
        delimiter: character to insert between adjacent bytes.  Note that
            only bitstrings with delimiter='' are interpretable by
            :meth:`from_bitstring`.

    Returns:
        txt converted to ASCII bytes, each rendered as eight binary
        digits, most significant bit first.  Every byte is rendered,
        including leading NUL bytes.  For backward compatibility an
        empty input yields a single all-zero byte ('00000000').

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'
    """
    raw = to_ascii(txt)
    if not raw:
        # Legacy result, kept because int('', 2) in from_bitstring would
        # reject an empty bitstring.
        return '0' * 8
    # Formatting byte by byte (rather than via one big integer) keeps
    # leading zero bytes, which an integer round trip silently drops.
    return delimiter.join(f'{byte:08b}' for byte in raw)
2520
2521
def is_bitstring(txt: str) -> bool:
    """
    Args:
        txt: the string to check

    Returns:
        True if txt is a recognized bitstring and False otherwise.  The
        check is performed by prefixing txt with '0b' and asking
        :meth:`is_binary_integer_number` about the result.  Note that
        bitstrings produced with a non-empty delimiter will not be
        recognized.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`to_bitstring`,
    :meth:`chunk`.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    as_binary_literal = f'0b{txt}'
    return is_binary_integer_number(as_binary_literal)
2542
2543
def from_bitstring(
    bits: str, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """
    Args:
        bits: the bitstring to convert back into a python string
        encoding: the encoding to use during conversion
        errors: how to handle encoding errors

    Returns:
        The regular python string represented by bits.  Complete leading
        groups of eight zero bits are decoded as NUL characters rather
        than being dropped.  Note that this code does not work with
        to_bitstring when delimiter is non-empty.

    Raises:
        ValueError: if bits is not parseable as a base-2 integer (this
            includes the empty string).

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    >>> from_bitstring('0000000001100001')
    '\\x00a'
    """
    n = int(bits, 2)
    # Bytes needed to hold the numeric value alone...
    significant_bytes = (n.bit_length() + 7) // 8
    # ...versus whole bytes spelled out in the input.  Taking the larger
    # keeps leading zero bytes that the integer conversion forgets.
    spelled_out_bytes = len(bits) // 8
    byte_count = max(significant_bytes, spelled_out_bytes)
    return n.to_bytes(byte_count, 'big').decode(encoding, errors) or '\0'
2565
2566
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """
    Args:
        txt: an IP address to chunk up for sorting purposes

    Returns:
        A tuple of the address's numeric components, arranged such that
        the sorting of IP addresses using a normal comparator will do
        something sane and desirable.  If txt is not recognized by
        :meth:`is_ip_v4`, a warning is logged and None is returned; note
        that None does not compare against tuples, so sorting a
        collection that mixes valid and invalid addresses raises
        TypeError.

    See also :meth:`is_ip_v4`.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if not is_ip_v4(txt):
        # Report through the module logger rather than printing to
        # stdout from library code.
        logger.warning('not IP: %s', txt)
        return None
    return tuple(int(octet) for octet in txt.split('.'))
2590
2591
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """
    Args:
        volume: the '/'-separated path to chunk up for sorting purposes

    Returns:
        A tuple of volume's non-empty path components, so that sorting
        paths with a normal comparator places every ancestor directory
        ahead of its descendants.

    See also :mod:`pyutils.files.file_utils`.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    components = volume.split('/')
    # Leading, trailing and doubled slashes produce empty components;
    # filter(None, ...) discards them.
    return tuple(filter(None, components))
2612
2613
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """
    Execute several replace operations in a row, one per member of
    replace_set, each applied to the result of the previous one.

    Args:
        in_str: the string in which to replace characters
        replace_set: the set of target characters to replace
        replacement: the character to replace any member of replace_set
            with

    See also :meth:`replace_nth`.

    Returns:
        The string with replacements executed.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
2636
2637
def replace_nth(in_str: str, source: str, target: str, nth: int) -> str:
    """
    Replaces the nth occurrence of a substring within a string.

    Args:
        in_str: the string in which to run the replacement
        source: the substring to replace; it is matched literally, so
            characters such as '.' or '*' have no special meaning
        target: the replacement text
        nth: which occurrence of source to replace?  Counting starts
            at 1.

    Returns:
        A copy of in_str with only the nth occurrence of source replaced
        by target.

    Raises:
        IndexError: if in_str contains fewer than nth occurrences of
            source.

    See also :meth:`replace_all`.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'

    >>> replace_nth('a.b.c', '.', '-', 2)
    'a.b-c'
    """
    # Escape source so occurrences are located literally; an unescaped
    # metacharacter would locate the wrong position.
    starts = [m.start() for m in re.finditer(re.escape(source), in_str)]
    where = starts[nth - 1]
    return in_str[:where] + target + in_str[where + len(source) :]
2658
2659
# When this file is executed directly (rather than imported), run every
# doctest embedded in this module's docstrings.
if __name__ == '__main__':
    import doctest

    doctest.testmod()