src/pyutils/string_utils.py

   1 #!/usr/bin/env python3
   2 # -*- coding: utf-8 -*-
   3
   4 """The MIT License (MIT)
   5
   6 Copyright (c) 2016-2020 Davide Zanotti
   7
   8 Modifications Copyright (c) 2021-2022 Scott Gasch
   9
  10 Permission is hereby granted, free of charge, to any person obtaining a copy
  11 of this software and associated documentation files (the "Software"), to deal
  12 in the Software without restriction, including without limitation the rights
  13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 copies of the Software, and to permit persons to whom the Software is
  15 furnished to do so, subject to the following conditions:
  16
  17 The above copyright notice and this permission notice shall be included in all
  18 copies or substantial portions of the Software.
  19
  20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  26 SOFTWARE.
  27
  28 This class is based on:
  29 https://github.com/daveoncode/python-string-utils.  See `NOTICE
  30 <https://wannabe.guru.org/gitweb/?p=pyutils.git;a=blob_plain;f=NOTICE;hb=HEAD>`__
  31 in the root of this module for a detailed enumeration of what work is
  32 Davide's and what work was added by Scott.
  33
  34 """
  35
  36 import base64
  37 import contextlib  # type: ignore
  38 import datetime
  39 import io
  40 import json
  41 import logging
  42 import numbers
  43 import random
  44 import re
  45 import string
  46 import unicodedata
  47 import warnings
  48 from itertools import zip_longest
  49 from typing import (
  50     Any,
  51     Callable,
  52     Dict,
  53     Iterable,
  54     List,
  55     Literal,
  56     Optional,
  57     Sequence,
  58     Tuple,
  59 )
  60 from uuid import uuid4
  61
  62 from pyutils import list_utils
  63
  64 logger = logging.getLogger(__name__)
  65
  66 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  67
  68 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  69
  70 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  71
  72 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  73
# Pieces of the URL pattern, concatenated into one raw regex source string.
# It is compiled twice below: anchored (URL_RE) and unanchored (URLS_RE).
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    # NOTE(review): inside this class "%-=" is a character *range* ('%'
    # through '='), which is what lets '&' (and several other punctuation
    # characters) appear in a query string.  Do not "tidy" it without
    # adding those characters back explicitly.
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# The whole string must be a URL (anchored at both ends, case-insensitive).
URL_RE = re.compile(rf"^{URLS_RAW_STRING}$", re.IGNORECASE)

# Unanchored variant wrapped in a capture group; presumably used to find
# URLs embedded in larger text -- confirm against callers.
URLS_RE = re.compile(rf"({URLS_RAW_STRING})", re.IGNORECASE)

# Matches either a run of "@" that is followed (with no intervening double
# quote) by a closing double quote, or a backslash-escaped "@".
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# Local part (letters, digits and a set of punctuation), "@", then a domain
# made of lowercase letters/digits/hyphens ending in a 2-4 letter suffix.
# NOTE: the patterns built from this are compiled WITHOUT re.IGNORECASE, so
# the domain part only matches lowercase letters.
EMAILS_RAW_STRING = (
    r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
)

# The whole string must be an email address.
EMAIL_RE = re.compile(rf"^{EMAILS_RAW_STRING}$")

# Unanchored variant wrapped in a capture group.
EMAILS_RE = re.compile(rf"({EMAILS_RAW_STRING})")

# Letters/digits only, containing at least one lower->upper or upper->lower
# letter transition, and starting with a letter.
CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

# A lowercase letter, or a run of uppercase letters, that is immediately
# followed by an uppercase letter (the follower is only looked at, not consumed).
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Snake case with "_" as the separator: letters (then optional digits)
# followed by an underscore, or leading underscores followed by a
# letter/digit; case-insensitive and anchored at both ends.
SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

# Same shape with "-" as the separator.  NOTE: unlike SNAKE_CASE_TEST_RE
# this has no leading "^", so it is only start-anchored when used via
# .match().
SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

# A separator followed by a lowercase letter or digit, captured as two groups.
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")

# Digits-only number patterns keyed by card type name.  These check
# prefix and length only.
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}

# Text that starts with "[" or "{" and ends with "}" or "]" (ignoring
# surrounding whitespace).  The brackets are not required to be a matching
# pair; this is only a cheap shape check.
JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)

# Canonical dashed form: groups of 8-4-4-4-12 characters from [a-f0-9].
UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# Same as UUID_RE but every dash is optional, so the bare 32-character
# form is accepted too.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# Four dot-separated groups of 1-3 digits.  "Shallow": the numeric range of
# each group is not checked by the pattern itself.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

# Unanchored version of the above.
ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# Eight colon-separated groups of 0-4 characters.  NOTE(review): the class
# is [a-z0-9], so letters beyond "f" are accepted by this pattern.
IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

# Unanchored version of the above.
ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)

# Six groups of two hex digits separated by ":" or "-".
MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)

# Unanchored version of the above.
ANYWHERE_MAC_ADDRESS_RE = re.compile(
    r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)

# One "word": a run of word characters other than "_", together with any
# non-word characters around it.
WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)

# An opening tag optionally followed by content up to a closing tag, or an
# HTML comment, or a doctype declaration.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Tags only (opening, closing, comments, doctype); content between tags is
# not part of the match.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# A single whitespace character.
SPACES_RE = re.compile(r"\s")

# A run of non-word characters, or a run of underscores.
NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Leading whitespace other than CR/LF.  Compiled without re.MULTILINE, so
# "^" only matches at the very start of the string.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")

# ESC "[" followed by any non-letters and terminated by a single letter.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1B\[[^A-Za-z]*[A-Za-z]")
 167
 168 ESCAPE_SEQUENCE_RE = re.compile(r"\x1B\[[^A-Za-z]*[A-Za-z]")
 169
 170 NUM_SUFFIXES = {
 171     "Pb": (1024**5),
 172     "P": (1024**5),
 173     "Tb": (1024**4),
 174     "T": (1024**4),
 175     "Gb": (1024**3),
 176     "G": (1024**3),
 177     "Mb": (1024**2),
 178     "M": (1024**2),
 179     "Kb": (1024**1),
 180     "K": (1024**1),
 181 }
 182
 183 UNIT_WORDS = [
 184     "zero",
 185     "one",
 186     "two",
 187     "three",
 188     "four",
 189     "five",
 190     "six",
 191     "seven",
 192     "eight",
 193     "nine",
 194     "ten",
 195     "eleven",
 196     "twelve",
 197     "thirteen",
 198     "fourteen",
 199     "fifteen",
 200     "sixteen",
 201     "seventeen",
 202     "eighteen",
 203     "nineteen",
 204 ]
 205
 206 TENS_WORDS = [
 207     "",
 208     "",
 209     "twenty",
 210     "thirty",
 211     "forty",
 212     "fifty",
 213     "sixty",
 214     "seventy",
 215     "eighty",
 216     "ninety",
 217 ]
 218
 219 MAGNITUDE_SCALES = [
 220     "hundred",
 221     "thousand",
 222     "million",
 223     "billion",
 224     "trillion",
 225     "quadrillion",
 226 ]
 227
 228 NUM_WORDS = {}
 229 NUM_WORDS["and"] = (1, 0)
 230 for i, word in enumerate(UNIT_WORDS):
 231     NUM_WORDS[word] = (1, i)
 232 for i, word in enumerate(TENS_WORDS):
 233     NUM_WORDS[word] = (1, i * 10)
 234 for i, word in enumerate(MAGNITUDE_SCALES):
 235     if i == 0:
 236         NUM_WORDS[word] = (100, 0)
 237     else:
 238         NUM_WORDS[word] = (10 ** (i * 3), 0)
 239 NUM_WORDS['score'] = (20, 0)
 240
 241
 242 def is_none_or_empty(in_str: Optional[str]) -> bool:
 243     """
 244     Args:
 245         in_str: the string to test
 246
 247     Returns:
 248         True if the input string is either None or an empty string,
 249         False otherwise.
 250
 251     See also :meth:`is_string` and :meth:`is_empty_string`.
 252
 253     >>> is_none_or_empty("")
 254     True
 255     >>> is_none_or_empty(None)
 256     True
 257     >>> is_none_or_empty("   \t   ")
 258     True
 259     >>> is_none_or_empty('Test')
 260     False
 261     """
 262     return in_str is None or len(in_str.strip()) == 0
 263
 264
 265 def is_string(in_str: Any) -> bool:
 266     """
 267     Args:
 268         in_str: the object to test
 269
 270     Returns:
 271         True if the object is a string and False otherwise.
 272
 273     See also :meth:`is_empty_string`, :meth:`is_none_or_empty`.
 274
 275     >>> is_string('test')
 276     True
 277     >>> is_string(123)
 278     False
 279     >>> is_string(100.3)
 280     False
 281     >>> is_string([1, 2, 3])
 282     False
 283     """
 284     return isinstance(in_str, str)
 285
 286
def is_empty_string(in_str: Any) -> bool:
    """
    Alias for :meth:`is_empty`; this simply delegates to it.

    Args:
        in_str: the object to test

    Returns:
        True if in_str is a string that is empty or contains only
        whitespace, and False otherwise (including for non-strings).

    See also :meth:`is_none_or_empty`, :meth:`is_full_string`.

    >>> is_empty_string('')
    True
    >>> is_empty_string('test')
    False
    >>> is_empty_string(None)
    False
    """
    return is_empty(in_str)
 298
 299
 300 def is_empty(in_str: Any) -> bool:
 301     """
 302     Args:
 303         in_str: the string to test
 304
 305     Returns:
 306         True if the string is empty and false otherwise.
 307
 308     See also :meth:`is_none_or_empty`, :meth:`is_full_string`.
 309
 310     >>> is_empty('')
 311     True
 312     >>> is_empty('    \t\t    ')
 313     True
 314     >>> is_empty('test')
 315     False
 316     >>> is_empty(100.88)
 317     False
 318     >>> is_empty([1, 2, 3])
 319     False
 320     """
 321     return is_string(in_str) and in_str.strip() == ""
 322
 323
 324 def is_full_string(in_str: Any) -> bool:
 325     """
 326     Args:
 327         in_str: the object to test
 328
 329     Returns:
 330         True if the object is a string and is not empty ('') and
 331         is not only composed of whitespace.
 332
 333     See also :meth:`is_string`, :meth:`is_empty_string`, :meth:`is_none_or_empty`.
 334
 335     >>> is_full_string('test!')
 336     True
 337     >>> is_full_string('')
 338     False
 339     >>> is_full_string('      ')
 340     False
 341     >>> is_full_string(100.999)
 342     False
 343     >>> is_full_string({"a": 1, "b": 2})
 344     False
 345     """
 346     return is_string(in_str) and in_str.strip() != ""
 347
 348
 349 def is_number(in_str: str) -> bool:
 350     """
 351     Args:
 352         in_str: the string to test
 353
 354     Returns:
 355         True if the string contains a valid numberic value and
 356         False otherwise.
 357
 358     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 359     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 360     etc...
 361
 362     >>> is_number(100.5)
 363     Traceback (most recent call last):
 364     ...
 365     ValueError: 100.5
 366     >>> is_number("100.5")
 367     True
 368     >>> is_number("test")
 369     False
 370     >>> is_number("99")
 371     True
 372     >>> is_number([1, 2, 3])
 373     Traceback (most recent call last):
 374     ...
 375     ValueError: [1, 2, 3]
 376     """
 377     if not is_string(in_str):
 378         raise ValueError(in_str)
 379     return NUMBER_RE.match(in_str) is not None
 380
 381
 382 def is_integer_number(in_str: str) -> bool:
 383     """
 384     Args:
 385         in_str: the string to test
 386
 387     Returns:
 388         True if the string contains a valid (signed or unsigned,
 389         decimal, hex, or octal, regular or scientific) integral
 390         expression and False otherwise.
 391
 392     See also :meth:`is_number`, :meth:`is_decimal_number`,
 393     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 394     etc...
 395
 396     >>> is_integer_number('42')
 397     True
 398     >>> is_integer_number('42.0')
 399     False
 400     """
 401     return (
 402         (is_number(in_str) and "." not in in_str)
 403         or is_hexidecimal_integer_number(in_str)
 404         or is_octal_integer_number(in_str)
 405         or is_binary_integer_number(in_str)
 406     )
 407
 408
 409 def is_hexidecimal_integer_number(in_str: str) -> bool:
 410     """
 411     Args:
 412         in_str: the string to test
 413
 414     Returns:
 415         True if the string is a hex integer number and False otherwise.
 416
 417     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 418     :meth:`is_octal_integer_number`, :meth:`is_binary_integer_number`, etc...
 419
 420     >>> is_hexidecimal_integer_number('0x12345')
 421     True
 422     >>> is_hexidecimal_integer_number('0x1A3E')
 423     True
 424     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 425     False
 426     >>> is_hexidecimal_integer_number('-0xff')
 427     True
 428     >>> is_hexidecimal_integer_number('test')
 429     False
 430     >>> is_hexidecimal_integer_number(12345)  # Not a string
 431     Traceback (most recent call last):
 432     ...
 433     ValueError: 12345
 434     >>> is_hexidecimal_integer_number(101.4)
 435     Traceback (most recent call last):
 436     ...
 437     ValueError: 101.4
 438     >>> is_hexidecimal_integer_number(0x1A3E)
 439     Traceback (most recent call last):
 440     ...
 441     ValueError: 6718
 442     """
 443     if not is_string(in_str):
 444         raise ValueError(in_str)
 445     return HEX_NUMBER_RE.match(in_str) is not None
 446
 447
 448 def is_octal_integer_number(in_str: str) -> bool:
 449     """
 450     Args:
 451         in_str: the string to test
 452
 453     Returns:
 454         True if the string is a valid octal integral number and False otherwise.
 455
 456     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 457     :meth:`is_hexidecimal_integer_number`, :meth:`is_binary_integer_number`,
 458     etc...
 459
 460     >>> is_octal_integer_number('0o777')
 461     True
 462     >>> is_octal_integer_number('-0O115')
 463     True
 464     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 465     False
 466     >>> is_octal_integer_number('7777')  # Needs 0o
 467     False
 468     >>> is_octal_integer_number('test')
 469     False
 470     """
 471     if not is_string(in_str):
 472         raise ValueError(in_str)
 473     return OCT_NUMBER_RE.match(in_str) is not None
 474
 475
 476 def is_binary_integer_number(in_str: str) -> bool:
 477     """
 478     Args:
 479         in_str: the string to test
 480
 481     Returns:
 482         True if the string contains a binary integral number and False otherwise.
 483
 484     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 485     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 486     etc...
 487
 488     >>> is_binary_integer_number('0b10111')
 489     True
 490     >>> is_binary_integer_number('-0b111')
 491     True
 492     >>> is_binary_integer_number('0B10101')
 493     True
 494     >>> is_binary_integer_number('0b10102')
 495     False
 496     >>> is_binary_integer_number('0xFFF')
 497     False
 498     >>> is_binary_integer_number('test')
 499     False
 500     """
 501     if not is_string(in_str):
 502         raise ValueError(in_str)
 503     return BIN_NUMBER_RE.match(in_str) is not None
 504
 505
 506 def to_int(in_str: str) -> int:
 507     """
 508     Args:
 509         in_str: the string to convert
 510
 511     Returns:
 512         The integral value of the string or raises on error.
 513
 514     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 515     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 516     :meth:`is_binary_integer_number`, etc...
 517
 518     >>> to_int('1234')
 519     1234
 520     >>> to_int('0x1234')
 521     4660
 522     >>> to_int('0b01101')
 523     13
 524     >>> to_int('0o777')
 525     511
 526     >>> to_int('test')
 527     Traceback (most recent call last):
 528     ...
 529     ValueError: invalid literal for int() with base 10: 'test'
 530     """
 531     if not is_string(in_str):
 532         raise ValueError(in_str)
 533     if is_binary_integer_number(in_str):
 534         return int(in_str, 2)
 535     if is_octal_integer_number(in_str):
 536         return int(in_str, 8)
 537     if is_hexidecimal_integer_number(in_str):
 538         return int(in_str, 16)
 539     return int(in_str)
 540
 541
 542 def number_string_to_integer(in_str: str) -> int:
 543     """Convert a string containing a written-out number into an int.
 544
 545     Args:
 546         in_str: the string containing the long-hand written out integer number
 547             in English.  See examples below.
 548
 549     Returns:
 550         The integer whose value was parsed from in_str.
 551
 552     See also :meth:`integer_to_number_string`.
 553
 554     .. warning::
 555         This code only handles integers; it will not work with decimals / floats.
 556
 557     >>> number_string_to_integer("one hundred fifty two")
 558     152
 559
 560     >>> number_string_to_integer("ten billion two hundred million fifty four thousand three")
 561     10200054003
 562
 563     >>> number_string_to_integer("four-score and 7")
 564     87
 565
 566     >>> number_string_to_integer("fifty xyzzy three")
 567     Traceback (most recent call last):
 568     ...
 569     ValueError: Unknown word: xyzzy
 570     """
 571     if isinstance(in_str, int):
 572         return int(in_str)
 573
 574     current = result = 0
 575     in_str = in_str.replace('-', ' ')
 576     for w in in_str.split():
 577         if w not in NUM_WORDS:
 578             if is_integer_number(w):
 579                 current += int(w)
 580                 continue
 581             else:
 582                 raise ValueError("Unknown word: " + w)
 583         scale, increment = NUM_WORDS[w]
 584         current = current * scale + increment
 585         if scale > 100:
 586             result += current
 587             current = 0
 588     return result + current
 589
 590
 591 def integer_to_number_string(num: int) -> str:
 592     """
 593     Opposite of :meth:`number_string_to_integer`; converts a number to a written out
 594     longhand format in English.
 595
 596     Args:
 597         num: the integer number to convert
 598
 599     Returns:
 600         The long-hand written out English form of the number.  See examples below.
 601
 602     See also :meth:`number_string_to_integer`.
 603
 604     .. warning::
 605         This method does not handle decimals or floats, only ints.
 606
 607     >>> integer_to_number_string(9)
 608     'nine'
 609
 610     >>> integer_to_number_string(42)
 611     'forty two'
 612
 613     >>> integer_to_number_string(123219982)
 614     'one hundred twenty three million two hundred nineteen thousand nine hundred eighty two'
 615     """
 616
 617     if num < 20:
 618         return UNIT_WORDS[num]
 619     if num < 100:
 620         ret = TENS_WORDS[num // 10]
 621         leftover = num % 10
 622         if leftover != 0:
 623             ret += ' ' + UNIT_WORDS[leftover]
 624         return ret
 625
 626     # If num > 100 go find the highest chunk and convert that, then recursively
 627     # convert the rest.  NUM_WORDS contains items like 'thousand' -> (1000, 0).
 628     # The second item in the tuple is an increment that can be ignored; the first
 629     # is the numeric "scale" of the entry.  So find the greatest entry in NUM_WORDS
 630     # still less than num.  For 123,456 it would be thousand.  Then pull out the
 631     # 123, convert it, and append "thousand".  Then do the rest.
 632     scales = {}
 633     for name, val in NUM_WORDS.items():
 634         if val[0] <= num:
 635             scales[name] = val[0]
 636     scale = max(scales.items(), key=lambda _: _[1])
 637
 638     # scale[1] = numeric magnitude (e.g. 1000)
 639     # scale[0] = name (e.g. "thousand")
 640     ret = integer_to_number_string(num // scale[1]) + ' ' + scale[0]
 641     leftover = num % scale[1]
 642     if leftover != 0:
 643         ret += ' ' + integer_to_number_string(leftover)
 644     return ret
 645
 646
 647 def is_decimal_number(in_str: str) -> bool:
 648     """
 649     Args:
 650         in_str: the string to check
 651
 652     Returns:
 653         True if the given string represents a decimal or False
 654         otherwise.  A decimal may be signed or unsigned or use
 655         a "scientific notation".
 656
 657     See also :meth:`is_integer_number`.
 658
 659     .. note::
 660         We do not consider integers without a decimal point
 661         to be decimals; they return False (see example).
 662
 663     >>> is_decimal_number('42.0')
 664     True
 665     >>> is_decimal_number('42')
 666     False
 667     """
 668     return is_number(in_str) and "." in in_str
 669
 670
 671 def strip_escape_sequences(in_str: str) -> str:
 672     """
 673     Args:
 674         in_str: the string to strip of escape sequences.
 675
 676     Returns:
 677         in_str with escape sequences removed.
 678
 679     See also: :mod:`pyutils.ansi`.
 680
 681     .. note::
 682         What is considered to be an "escape sequence" is defined
 683         by a regular expression.  While this gets common ones,
 684         there may exist valid sequences that it doesn't match.
 685
 686     >>> strip_escape_sequences('\x1B[12;11;22mthis is a test!')
 687     'this is a test!'
 688     """
 689     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 690     return in_str
 691
 692
 693 def add_thousands_separator(
 694     in_str: str, *, separator_char: str = ',', places: int = 3
 695 ) -> str:
 696     """
 697     Args:
 698         in_str: string or number to which to add thousands separator(s)
 699         separator_char: the separator character to add (defaults to comma)
 700         places: add a separator every N places (defaults to three)
 701
 702     Returns:
 703         A numeric string with thousands separators added appropriately.
 704
 705     >>> add_thousands_separator('12345678')
 706     '12,345,678'
 707     >>> add_thousands_separator(12345678)
 708     '12,345,678'
 709     >>> add_thousands_separator(12345678.99)
 710     '12,345,678.99'
 711     >>> add_thousands_separator('test')
 712     Traceback (most recent call last):
 713     ...
 714     ValueError: test
 715
 716     """
 717     if isinstance(in_str, numbers.Number):
 718         in_str = f'{in_str}'
 719     if is_number(in_str):
 720         return _add_thousands_separator(
 721             in_str, separator_char=separator_char, places=places
 722         )
 723     raise ValueError(in_str)
 724
 725
 726 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 727     """Internal helper"""
 728     decimal_part = ""
 729     if '.' in in_str:
 730         (in_str, decimal_part) = in_str.split('.')
 731     tmp = [iter(in_str[::-1])] * places
 732     ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 733     if len(decimal_part) > 0:
 734         ret += '.'
 735         ret += decimal_part
 736     return ret
 737
 738
 739 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 740     """
 741     Args:
 742         in_str: the string to test
 743         allowed_schemes: an optional list of allowed schemes (e.g.
 744             ['http', 'https', 'ftp'].  If passed, only URLs that
 745             begin with the one of the schemes passed will be considered
 746             to be valid.  Otherwise, any scheme:// will be considered
 747             valid.
 748
 749     Returns:
 750         True if in_str contains a valid URL and False otherwise.
 751
 752     >>> is_url('http://www.mysite.com')
 753     True
 754     >>> is_url('https://mysite.com')
 755     True
 756     >>> is_url('.mysite.com')
 757     False
 758     >>> is_url('scheme://username:[email protected]:8042/folder/subfolder/file.extension?param=value&param2=value2#hash')
 759     True
 760     """
 761     if not is_full_string(in_str):
 762         return False
 763
 764     valid = URL_RE.match(in_str) is not None
 765
 766     if allowed_schemes:
 767         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 768     return valid
 769
 770
 771 def is_email(in_str: Any) -> bool:
 772     """
 773     Args:
 774         in_str: the email address to check
 775
 776     Returns: True if the in_str contains a valid email (as defined by
 777         https://tools.ietf.org/html/rfc3696#section-3) or False
 778         otherwise.
 779
 780     >>> is_email('[email protected]')
 781     True
 782     >>> is_email('@gmail.com')
 783     False
 784     """
 785     if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
 786         return False
 787
 788     try:
 789         # we expect 2 tokens, one before "@" and one after, otherwise
 790         # we have an exception and the email is not valid.
 791         head, tail = in_str.split("@")
 792
 793         # head's size must be <= 64, tail <= 255, head must not start
 794         # with a dot or contain multiple consecutive dots.
 795         if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
 796             return False
 797
 798         # removes escaped spaces, so that later on the test regex will
 799         # accept the string.
 800         head = head.replace("\\ ", "")
 801         if head.startswith('"') and head.endswith('"'):
 802             head = head.replace(" ", "")[1:-1]
 803         return EMAIL_RE.match(head + "@" + tail) is not None
 804
 805     except ValueError:
 806         # borderline case in which we have multiple "@" signs but the
 807         # head part is correctly escaped.
 808         if ESCAPED_AT_SIGN.search(in_str) is not None:
 809             # replace "@" with "a" in the head
 810             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 811         return False
 812
 813
 814 def suffix_string_to_number(in_str: str) -> Optional[int]:
 815     """Takes a string like "33Gb" and converts it into a number (of bytes)
 816     like 34603008.
 817
 818     Args:
 819         in_str: the string with a suffix to be interpreted and removed.
 820
 821     Returns:
 822         An integer number of bytes or None to indicate an error.
 823
 824     See also :meth:`number_to_suffix_string`.
 825
 826     >>> suffix_string_to_number('1Mb')
 827     1048576
 828     >>> suffix_string_to_number('13.1Gb')
 829     14066017894
 830     """
 831
 832     def suffix_capitalize(s: str) -> str:
 833         if len(s) == 1:
 834             return s.upper()
 835         elif len(s) == 2:
 836             return f"{s[0].upper()}{s[1].lower()}"
 837         return suffix_capitalize(s[0:1])
 838
 839     if is_string(in_str):
 840         if is_integer_number(in_str):
 841             return to_int(in_str)
 842         suffixes = [in_str[-2:], in_str[-1:]]
 843         rest = [in_str[:-2], in_str[:-1]]
 844         for x in range(len(suffixes)):
 845             s = suffixes[x]
 846             s = suffix_capitalize(s)
 847             multiplier = NUM_SUFFIXES.get(s, None)
 848             if multiplier is not None:
 849                 r = rest[x]
 850                 if is_integer_number(r):
 851                     return to_int(r) * multiplier
 852                 if is_decimal_number(r):
 853                     return int(float(r) * multiplier)
 854     return None
 855
 856
 857 def number_to_suffix_string(num: int) -> Optional[str]:
 858     """Take a number (of bytes) and returns a string like "43.8Gb".
 859
 860     Args:
 861         num: an integer number of bytes
 862
 863     Returns:
 864         A string with a suffix representing num bytes concisely or
 865         None to indicate an error.
 866
 867     See also: :meth:`suffix_string_to_number`.
 868
 869     >>> number_to_suffix_string(14066017894)
 870     '13.1Gb'
 871     >>> number_to_suffix_string(1024 * 1024)
 872     '1.0Mb'
 873     """
 874     d = 0.0
 875     suffix = None
 876     for (sfx, size) in NUM_SUFFIXES.items():
 877         if num >= size:
 878             d = num / size
 879             suffix = sfx
 880             break
 881     if suffix is not None:
 882         return f"{d:.1f}{suffix}"
 883     else:
 884         return f'{num:d}'
 885
 886
 887 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 888     """
 889     Args:
 890         in_str: a string to check
 891         card_type: if provided, contains the card type to validate
 892             with.  Otherwise, all known credit card number types will
 893             be accepted.
 894
 895             Supported card types are the following:
 896
 897             * VISA
 898             * MASTERCARD
 899             * AMERICAN_EXPRESS
 900             * DINERS_CLUB
 901             * DISCOVER
 902             * JCB
 903
 904     Returns:
 905         True if in_str is a valid credit card number.
 906
 907     .. warning::
 908         This code is not verifying the authenticity of the credit card (i.e.
 909         not checking whether it's a real card that can be charged); rather
 910         it's only checking that the number follows the "rules" for numbering
 911         established by credit card issuers.
 912
 913     """
 914     if not is_full_string(in_str):
 915         return False
 916
 917     if card_type is not None:
 918         if card_type not in CREDIT_CARDS:
 919             raise KeyError(
 920                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 921             )
 922         return CREDIT_CARDS[card_type].match(in_str) is not None
 923     for c in CREDIT_CARDS:
 924         if CREDIT_CARDS[c].match(in_str) is not None:
 925             return True
 926     return False
 927
 928
 929 def is_camel_case(in_str: Any) -> bool:
 930     """
 931     Args:
 932         in_str: the string to test
 933
 934     Returns:
 935         True if the string is formatted as camel case and False otherwise.
 936         A string is considered camel case when:
 937
 938         * it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 939         * it contains both lowercase and uppercase letters
 940         * it does not start with a number
 941
 942     See also :meth:`is_snake_case`, :meth:`is_slug`, and :meth:`camel_case_to_snake_case`.
 943     """
 944     return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 945
 946
 947 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 948     """
 949     Args:
 950         in_str: the string to test
 951         separator: the snake case separator character to use
 952
 953     Returns: True if the string is snake case and False otherwise.  A
 954         string is considered snake case when:
 955
 956         * it's composed only by lowercase/uppercase letters and digits
 957         * it contains at least one underscore (or provided separator)
 958         * it does not start with a number
 959
 960     See also :meth:`is_camel_case`, :meth:`is_slug`, and :meth:`snake_case_to_camel_case`.
 961
 962     >>> is_snake_case('this_is_a_test')
 963     True
 964     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 965     True
 966     >>> is_snake_case('this-is-a-test')
 967     False
 968     >>> is_snake_case('this-is-a-test', separator='-')
 969     True
 970     """
 971     if is_full_string(in_str):
 972         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 973         re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 974         r = re_map.get(
 975             separator,
 976             re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
 977         )
 978         return r.match(in_str) is not None
 979     return False
 980
 981
 982 def is_json(in_str: Any) -> bool:
 983     """
 984     Args:
 985         in_str: the string to test
 986
 987     Returns:
 988         True if the in_str contains valid JSON and False otherwise.
 989
 990     >>> is_json('{"name": "Peter"}')
 991     True
 992     >>> is_json('[1, 2, 3]')
 993     True
 994     >>> is_json('{nope}')
 995     False
 996     """
 997     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 998         try:
 999             return isinstance(json.loads(in_str), (dict, list))
1000         except (TypeError, ValueError, OverflowError):
1001             pass
1002     return False
1003
1004
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Args:
        in_str: the string (or UUID object) to test
        allow_hex: when True, match against UUID_HEX_OK_RE instead of
            UUID_RE, which (per the examples below) also accepts the
            dash-less hexadecimal form.

    Returns:
        True if the in_str contains a valid UUID and False otherwise.

    See also :meth:`generate_uuid`.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # Cast to str so that uuid.UUID instances can be passed directly.
    candidate = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(candidate) is not None
1028
1029
def is_ip_v4(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str contains a valid IPv4 address and False otherwise.

    See also :meth:`extract_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('nope')
    False
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str):
        return False
    if SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # The shallow regex only checks the overall shape; every dotted
    # component must additionally fall within 0..255.
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
1056
1057
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string to extract an IPv4 address from.

    Returns:
        The first match of ANYWHERE_IP_V4_RE in in_str, or None if
        nothing matched or in_str is not a full string.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if is_full_string(in_str):
        found = ANYWHERE_IP_V4_RE.search(in_str)
        if found is not None:
            return found.group(0)
    return None
1080
1081
def is_ip_v6(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IPv6 address and False otherwise.

    See also :meth:`is_ip_v4`, :meth:`extract_ip_v4`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
1099
1100
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract an IPv6 address.

    Returns:
        The first match of ANYWHERE_IP_V6_RE in in_str, or None if
        nothing matched or in_str is not a full string.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v4`,
    and :meth:`is_ip`.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if is_full_string(in_str):
        found = ANYWHERE_IP_V6_RE.search(in_str)
        if found is not None:
            return found.group(0)
    return None
1123
1124
def is_ip(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IP address (either IPv4 or
        IPv6) and False otherwise.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3')
    False
    >>> is_ip('1.2.3.999')
    False
    """
    # IPv6 is checked first; IPv4 is only consulted when that fails.
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
1147
1148
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract an IP address.

    Returns:
        An IP address found in in_str or None to indicate none found
        or an error condition.  An IPv4 address is looked for first;
        IPv6 extraction is attempted only when no IPv4 address is found.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4_address = extract_ip_v4(in_str)
    if v4_address is not None:
        return v4_address
    return extract_ip_v6(in_str)
1171
1172
def is_mac_address(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str is a valid MAC address and False otherwise.

    See also :meth:`extract_mac_address`, :meth:`is_ip`, etc...

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
1193
1194
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract a MAC address.
        separator: the character to place between the octets of the
            returned address; any ':' or '-' in the matched text is
            replaced by it.

    Returns:
        The first MAC address found in in_str (with its octet
        separators normalized to `separator`) or None to indicate no
        match or an error.

    See also :meth:`is_mac_address`, :meth:`is_ip`, and :meth:`extract_ip`.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F', separator='-')
    '34-29-8F-12-0D-2F'
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None

    # Strings are immutable, so the substitution result must be returned
    # (the previous code called str.replace and discarded the result).
    # A single pass with a callable replacement avoids re-replacing
    # characters of the separator itself and treats it literally.
    return re.sub(r"[:\-]", lambda _m: separator, m.group(0))
1222
1223
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Args:
        in_str: string to test
        separator: the slug character to use

    Returns:
        True if in_str is a slug string and False otherwise.  A slug
        consists of runs of lowercase letters/digits joined by one or
        more separators; it must start and end with a letter or digit.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`slugify`.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False

    # "word (sep+ word)*" accepts the same strings as the previous
    # "(word sep*?)* char" pattern for a single-character separator, but
    # has no nested quantifier over the same characters, so a
    # non-matching input cannot trigger exponential backtracking.  The
    # separator is grouped so multi-character separators repeat as a unit.
    sep = re.escape(separator)
    rex = r"^[a-z\d]+(?:(?:" + sep + r")+[a-z\d]+)*$"
    return re.match(rex, in_str) is not None
1244
1245
def contains_html(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check for tags in

    Returns:
        True if the given string contains HTML/XML tags and False
        otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`strip_html`.

    .. warning::
        By design, this function matches ANY type of tag, so don't expect
        to use it as an HTML validator.  It's a quick sanity check at
        best.  See something like BeautifulSoup for a more full-featured
        HTML parser.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False

    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
1272
1273
def words_count(in_str: str) -> int:
    """
    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    Raises:
        ValueError: if in_str is not a string.

    .. note::
        This method is "smart" in that it only considers sequences
        of one or more letters and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if is_string(in_str):
        matches = WORDS_COUNT_RE.findall(in_str)
        return len(matches)
    raise ValueError(in_str)
1298
1299
def word_count(in_str: str) -> int:
    """Alias for :meth:`words_count`.

    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string, exactly as
        computed by :meth:`words_count`.

    >>> word_count('hello world')
    2
    >>> word_count('one,two,three.stop')
    4
    """
    total = words_count(in_str)
    return total
1322
1323
def generate_uuid(omit_dashes: bool = False) -> str:
    """
    Args:
        omit_dashes: should we omit the dashes in the generated UUID?

    Returns:
        A freshly generated UUID string (using `uuid.uuid4()`), either
        in the canonical dashed form or, when omit_dashes is set, as
        the bare hex digits.

    See also :meth:`is_uuid`, :meth:`generate_random_alphanumeric_string`.

    generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
    generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
    """
    fresh = uuid4()
    return fresh.hex if omit_dashes else str(fresh)
1342
1343
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Args:
        size: number of characters to generate

    Returns:
        A string of the specified size containing random characters
        (uppercase/lowercase ascii letters and digits).

    Raises:
        ValueError: if size is less than one.

    See also :meth:`asciify`, :meth:`generate_uuid`.

    >>> random.seed(22)
    >>> generate_random_alphanumeric_string(9)
    '96ipbNClS'
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    # One random.choice call per character, in order, so seeded output
    # is reproducible.
    picked = []
    for _ in range(size):
        picked.append(random.choice(alphabet))
    return from_char_list(picked)
1364
1365
def reverse(in_str: str) -> str:
    """
    Args:
        in_str: the string to reverse

    Returns:
        The reversed (character by character) string.

    Raises:
        ValueError: if in_str is not a string.

    >>> reverse('test')
    'tset'
    """
    if is_string(in_str):
        return "".join(reversed(in_str))
    raise ValueError(in_str)
1380
1381
def camel_case_to_snake_case(in_str: str, *, separator: str = "_") -> str:
    """
    Args:
        in_str: the camel case string to convert
        separator: the snake case separator character to use

    Returns:
        A snake case string equivalent to the camel case input or the
        original string if it is not a valid camel case string.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str

    # Append the separator after each group matched by
    # CAMEL_CASE_REPLACE_RE, then lowercase the whole result.
    separated = CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str)
    return separated.lower()
1405
1406
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Args:
        in_str: the snake case string to convert
        upper_case_first: should we capitalize the first letter?
        separator: the separator character to use

    Returns:
        A camel case string that is equivalent to the snake case string
        provided or the original string back again if it is not valid
        snake case.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str

    # Title-case every non-blank piece between separators.
    words = []
    for piece in in_str.split(separator):
        if is_full_string(piece):
            words.append(piece.title())
    if not upper_case_first:
        words[0] = words[0].lower()
    return from_char_list(words)
1436
1437
def to_char_list(in_str: str) -> List[str]:
    """
    Args:
        in_str: the string to split into a char list

    Returns:
        A list of strings of length one each, or an empty list when
        in_str is not a string.

    See also :meth:`from_char_list`.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    return list(in_str) if is_string(in_str) else []
1454
1455
def from_char_list(in_list: List[str]) -> str:
    """
    Args:
        in_list: A list of characters (or strings) to glue together.

    Returns:
        The concatenation, in order, of every item of in_list.

    See also :meth:`to_char_list`.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
1471
1472
def shuffle(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing the same chars as the given one but in
        a randomized order.  Note that in rare cases this could result
        in the same original string as no check is done.  Returns
        None when in_str is not a string.

    >>> random.seed(22)
    >>> shuffle('awesome')
    'meosaew'
    """
    if is_string(in_str):
        letters = to_char_list(in_str)
        random.shuffle(letters)  # in-place permutation
        return from_char_list(letters)
    return None
1493
1494
def scramble(in_str: str) -> Optional[str]:
    """Alias for :meth:`shuffle`.

    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        Whatever :meth:`shuffle` returns for in_str: a new string with
        the same chars in a randomized order, or None to indicate an
        error condition.

    See also :mod:`pyutils.unscrambler`.

    >>> random.seed(22)
    >>> scramble('awesome')
    'meosaew'
    """
    scrambled = shuffle(in_str)
    return scrambled
1513
1514
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Args:
        in_str: the string to strip tags from
        keep_tag_content: should we keep the inner contents of tags?

    Returns:
        A string with all HTML tags removed (optionally with tag contents
        preserved).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`contains_html`.

    .. note::
        This method uses simple regular expressions to strip tags and is
        not a full fledged HTML parser by any means.  Consider using
        something like BeautifulSoup if your needs are more than this
        simple code can fulfill.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if keep_tag_content:
        pattern = HTML_TAG_ONLY_RE
    else:
        pattern = HTML_RE
    return pattern.sub("", in_str)
1542
1543
def asciify(in_str: str) -> str:
    """
    Args:
        in_str: the string to asciify.

    Returns:
        An output string roughly equivalent to the original string
        whose content is ascii-only.  This is accomplished by
        translating all non-ascii chars into their closest possible
        ASCII representation (eg: ó -> o, Ë -> E, ç -> c...).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`to_ascii`, :meth:`generate_random_alphanumeric_string`.

    .. warning::
        Some chars may be lost if impossible to translate.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # NFKD decomposition splits accented chars into a base char plus
    # combining marks.  Encoding to ascii with errors ignored then drops
    # anything unrepresentable, and the remaining bytes are decoded back
    # into a str.
    decomposed = unicodedata.normalize("NFKD", in_str)
    return decomposed.encode("ascii", "ignore").decode("utf-8")
1576
1577
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Args:
        in_str: the string to slugify
        separator: the character to use during slugification (default
            is a dash)

    Returns:
        The converted string.  The returned string has the following properties:

        * it has no spaces
        * all letters are in lower case
        * all punctuation signs and non alphanumeric chars are removed
        * words are divided using provided separator
        * all chars are encoded as ascii (by using :meth:`asciify`)
        * is safe for URL

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_slug` and :meth:`asciify`.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # replace spaces with join sign; a callable replacement inserts the
    # separator literally (a plain string would be parsed as a regex
    # replacement template, mangling any backslash it contains)
    out = SPACES_RE.sub(lambda _m: separator, out)

    # normalize joins (remove duplicates); the separator is grouped so
    # that a multi-character separator is de-duplicated as a unit
    # rather than only its final character being repeated
    out = re.sub(r"(?:" + re.escape(separator) + r")+", lambda _m: separator, out)
    return asciify(out)
1614
1615
def to_bool(in_str: str) -> bool:
    """
    Args:
        in_str: the string to convert to boolean

    Returns:
        A boolean equivalent of the original string based on its contents.
        All conversion is case insensitive.  A positive boolean (True) is
        returned if the string value is any of the following:

        * "true"
        * "t"
        * "1"
        * "yes"
        * "y"
        * "on"

        Otherwise False is returned.

    Raises:
        ValueError: if in_str is not a string.

    See also :mod:`pyutils.argparse_utils`.

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    affirmative = {"true", "t", "1", "yes", "y", "on"}
    lowered = in_str.lower()
    return lowered in affirmative
1658
1659
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Args:
        in_str: the string to convert into a date

    Returns:
        The datetime.date the string contained or None when the
        parser raises a ParseException.  This parser is relatively
        clever; see :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`extract_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> to_date('9/11/2001')
    datetime.date(2001, 9, 11)
    >>> to_date('xyzzy')
    """
    import pyutils.datetimes.dateparse_utils as du

    try:
        parser = du.DateParser()  # type: ignore
        parser.parse(in_str)
        return parser.get_date()
    except du.ParseException:  # type: ignore
        return None
1686
1687
def extract_date(in_str: Any) -> Optional[datetime.datetime]:
    """Finds and extracts a date from the string, if possible.

    The input is split on whitespace and the n-grams produced by
    list_utils.ngrams for n = 5, 4, 3 and then 2 are each handed to the
    date parser; the first one that parses wins.

    Args:
        in_str: the string to extract a date from

    Returns:
        a datetime if date was found, otherwise None

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> extract_date("filename.txt    dec 13, 2022")
    datetime.datetime(2022, 12, 13, 0, 0)

    >>> extract_date("Dear Santa, please get me a pony.")

    """
    import itertools

    import pyutils.datetimes.dateparse_utils as du

    parser = du.DateParser()  # type: ignore
    words = in_str.split()

    # Longest candidates first.
    windows = [list_utils.ngrams(words, size) for size in (5, 4, 3, 2)]
    for ngram in itertools.chain(*windows):
        try:
            phrase = " ".join(ngram)
            logger.debug("Trying %s", phrase)
            if parser.parse(phrase):
                return parser.get_datetime()
        except du.ParseException:  # type: ignore
            continue
    return None
1726
1727
def is_valid_date(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check

    Returns:
        True if the string represents a valid date that we can recognize
        (i.e. the parser does not raise a ParseException) and False
        otherwise.  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> is_valid_date('1/2/2022')
    True
    >>> is_valid_date('christmas')
    True
    >>> is_valid_date('next wednesday')
    True
    >>> is_valid_date('xyzzy')
    False
    """
    import pyutils.datetimes.dateparse_utils as dp

    try:
        dp.DateParser().parse(in_str)  # type: ignore
    except dp.ParseException:  # type: ignore
        return False
    return True
1759
1760
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Args:
        in_str: string to parse into a datetime

    Returns:
        A python datetime parsed from in_str or None to indicate
        an error (any exception raised while parsing is swallowed, and
        a parse result that is not a datetime also yields None).  This
        parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`valid_datetime`.

    >>> to_datetime('7/20/1969 02:56 GMT')
    datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
    """
    import pyutils.datetimes.dateparse_utils as dp

    try:
        parsed = dp.DateParser().parse(in_str)  # type: ignore
        return parsed if isinstance(parsed, datetime.datetime) else None
    except Exception:
        return None
1787
1788
def valid_datetime(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check

    Returns:
        True if in_str contains a valid datetime (that is, when
        :meth:`to_datetime` returns something other than None) and
        False otherwise.  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    >>> valid_datetime('next wednesday at noon')
    True
    >>> valid_datetime('3 weeks ago at midnight')
    True
    >>> valid_datetime('next easter at 5:00 am')
    True
    >>> valid_datetime('sometime soon')
    False
    """
    parsed = to_datetime(in_str)
    return parsed is not None
1812
1813
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Args:
        in_str: the string to squeeze
        character_to_squeeze: the character (or substring) to remove
            runs of more than one in a row (default = space)

    Returns: A "squeezed string" where runs of more than one
        character_to_squeeze are collapsed into one.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    """
    # A callable replacement inserts character_to_squeeze verbatim.
    # Passing it as a plain string would have it parsed as a regex
    # replacement template, which raises for e.g. a lone backslash.
    return re.sub(
        r'(?:' + re.escape(character_to_squeeze) + r')+',
        lambda _m: character_to_squeeze,
        in_str,
    )
1836
1837
def dedent(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: the string to dedent

    Returns:
        A string in which the leading margin (whatever MARGIN_RE
        matches, e.g. tab indentation) has been removed from every
        line, or None when in_str is not a string.

    See also :meth:`indent`.

    >>> dedent('\t\ttest\\n\t\ting')
    'test\\ning'
    """
    if not is_string(in_str):
        return None
    newline = '\n'
    stripped = []
    for line in in_str.split(newline):
        stripped.append(MARGIN_RE.sub('', line))
    return newline.join(stripped)
1856
1857
def indent(in_str: str, amount: int) -> str:
    """
    Args:
        in_str: the string to indent
        amount: count of spaces to indent each line by

    Returns:
        An indented string created by prepending amount spaces to
        every newline-separated line of in_str.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`dedent`.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    padding = " " * amount
    return newline.join(padding + line for line in in_str.split(newline))
1877
1878
def _sprintf(*args, **kwargs) -> str:
    """Internal helper: render args the way print() would, but return
    the resulting text instead of writing it anywhere.

    Args:
        *args: the values to render; non-strings are converted with str().
        **kwargs: only "sep" (default " ") and "end" (default newline)
            are accepted, and each must be None or a string.

    Returns:
        The args joined by sep and followed by end.

    Raises:
        TypeError: on a non-string sep/end or any other keyword argument.
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # Anything left over is an unsupported keyword.
    if kwargs:
        raise TypeError("invalid keyword arguments to sprint()")

    joiner = " " if sep is None else sep
    terminator = "\n" if end is None else end
    pieces = [arg if isinstance(arg, str) else str(arg) for arg in args]
    return joiner.join(pieces) + terminator
1909
1910
def strip_ansi_sequences(in_str: str) -> str:
    """
    Args:
        in_str: the string to strip

    Returns:
        in_str with recognized ANSI escape sequences removed.

    See also :mod:`pyutils.ansi`.

    .. warning::
        This method works by using a regular expression that matches
        CSI sequences (ESC '[' ... final byte).  Other kinds of escape
        sequences (e.g. OSC) are not removed; caveat emptor.

    >>> import ansi as a
    >>> s = a.fg('blue') + 'blue!' + a.reset()
    >>> len(s)   # '\x1b[38;5;21mblue!\x1b[m'
    18
    >>> len(strip_ansi_sequences(s))
    5
    >>> strip_ansi_sequences(s)
    'blue!'

    """
    # CSI grammar: ESC '[', parameter bytes (0x30-0x3F), intermediate
    # bytes (0x20-0x2F), then one final byte (0x40-0x7E).  The final
    # byte may be uppercase (e.g. ESC[2J, ESC[K), which a lowercase-only
    # class would miss.  '+' stays in the parameter class so everything
    # the previous pattern matched is still matched.
    return re.sub(r'\x1b\[[0-?+]*[ -/]*[@-~]', '', in_str)
1937
1938
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that redirects everything written to stdout
    into an in-memory buffer instead of printing it.  Entering the
    context yields a zero-argument callable that returns the text
    captured so far.

    >>> with SprintfStdout() as buf:
    ...     print("test")
    ...     print("1, 2, 3")
    ...
    >>> print(buf(), end='')
    test
    1, 2, 3
    """

    def __init__(self) -> None:
        # Buffer that receives the redirected output.
        self.destination = io.StringIO()
        # The active redirect; assigned in __enter__.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        redirect = contextlib.redirect_stdout(self.destination)
        self.recorder = redirect
        redirect.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *args) -> Literal[False]:
        # Restore stdout, then rewind the buffer; exceptions are never
        # suppressed.
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        return False
1966
1967
def capitalize_first_letter(in_str: str) -> str:
    """
    Args:
        in_str: the string to capitalize

    Returns:
        in_str with the first character capitalized; the rest of the
        string is left untouched.  An empty string is returned as is.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    # Slicing (rather than indexing) tolerates the empty string.
    return in_str[:1].upper() + in_str[1:]
1982
1983
def it_they(n: int) -> str:
    """
    Args:
        n: how many of them are there?

    Returns:
        'it' if n is one or 'they' otherwise.

    See also :meth:`is_are`, :meth:`pluralize`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> it_they(1)
    'it'
    >>> it_they(100)
    'they'
    """
    return "it" if n == 1 else "they"
2009
2010
def is_are(n: int) -> str:
    """
    Args:
        n: how many of them are there?

    Returns:
        The verb 'is' when n is exactly one; 'are' for every other
        value.

    See also :meth:`it_they`, :meth:`pluralize`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
2037
2038
def pluralize(n: int) -> str:
    """
    Args:
        n: how many of them are there?

    Returns:
        The empty string when n is exactly one; 's' for every other
        value (including zero).

    See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.
    """
    return "" if n == 1 else "s"
2068
2069
def make_contractions(txt: str) -> str:
    """Glue adjacent words in txt together to form (English)
    contractions.

    Matching is case-insensitive; the letters that survive in the
    output are copied from the input, so the original capitalization
    is kept.

    Args:
        txt: the input text to be contractionized.

    Returns:
        The input text with every recognized word pair replaced by its
        contraction; everything else is unchanged.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`pluralize`,
    :meth:`thify`.

    .. note::
        Rules are applied one after another in a fixed order defined
        by this implementation: the irregular negations first, then
        the regular "<verb> not" forms, then the "<subject> <verb>"
        forms.

    >>> make_contractions('It is nice today.')
    "It's nice today."

    >>> make_contractions('I can    not even...')
    "I can't even..."

    >>> make_contractions('She could not see!')
    "She couldn't see!"

    >>> make_contractions('But she will not go.')
    "But she won't go."

    >>> make_contractions('Verily, I shall not.')
    "Verily, I shan't."

    >>> make_contractions('No you cannot.')
    "No you can't."

    >>> make_contractions('I said you can not go.')
    "I said you can't go."
    """
    negatable_verbs = (
        'are',
        'could',
        'did',
        'has',
        'have',
        'is',
        'must',
        'should',
        'was',
        'were',
        'would',
    )
    subjects = (
        "I",
        "you",
        "he",
        "she",
        "it",
        "we",
        "they",
        "how",
        "why",
        "when",
        "where",
        "who",
        "there",
    )
    # The parenthesized part of each verb is what survives after the
    # apostrophe.
    verb_tails = ('woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)')
    # there're / where're are valid English but sound weird, so they
    # are never produced.
    never_with_are = ('there', 'where')

    # Ordered (pattern, replacement) table.  Irregular negations come
    # first: can't, shan't and won't.
    rules = [
        (r'\b(can)\s*no(t)\b', r"\1'\2"),
        (r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3"),
        (r'\b(w)ill\s*(n)(o)(t)\b', r"\1\3\2'\4"),
    ]

    # Regular negations: "<verb> not" -> "<verb>n't".
    for verb in negatable_verbs:
        rules.append((fr'\b({verb})\s+(n)o(t)\b', r"\1\2'\3"))

    # "<subject> <verb>" -> "<subject>'<tail>".
    for subject in subjects:
        for tail in verb_tails:
            if tail == 'a(re)' and subject in never_with_are:
                continue
            rules.append((fr'\b({subject})\s+{tail}\b', r"\1'\2"))

    for pattern, replacement in rules:
        txt = re.sub(pattern, replacement, txt, flags=re.IGNORECASE)
    return txt
2176
2177
def thify(n: int) -> str:
    """
    Args:
        n: the number that needs a suffix

    Returns:
        The proper English ordinal suffix for n: 'st', 'nd', 'rd' or
        'th'.  Numbers ending in 11, 12 or 13 take 'th' (11th, 112th)
        even though their last digit is 1, 2 or 3.

    Raises:
        AssertionError: if str(n) is rejected by is_integer_number.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`.

    Suggested usage::

        attempt_count = 0
        while True:
            attempt_count += 1
            if try_the_thing():
                break
            print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(12)
    'th'
    >>> thify(113)
    'th'
    >>> thify(21)
    'st'
    """
    digits = str(n)
    assert is_integer_number(digits)

    # The "teens" are irregular and must be checked before the last
    # digit is consulted.
    if digits[-2:] in ('11', '12', '13'):
        return "th"

    suffix_by_last_digit = {"1": "st", "2": "nd", "3": "rd"}
    return suffix_by_last_digit.get(digits[-1:], "th")
2215
2216
def ngrams(txt: str, n: int):
    """
    Args:
        txt: the string to create ngrams using
        n: how many words per ngram created?

    Returns:
        A generator that yields each ngram as a single string whose
        words are separated by one space.  The words are obtained by
        splitting txt on whitespace.

    See also :meth:`ngrams_presplit`, :meth:`bigrams`, :meth:`trigrams`.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']
    """
    for gram in ngrams_presplit(txt.split(), n):
        yield ' '.join(map(str, gram)).strip()
2237
2238
def ngrams_presplit(words: Sequence[str], n: int):
    """
    Variant of :meth:`ngrams` for callers that have already split
    their text into words.

    Args:
        words: the words to group into ngrams
        n: how many words per ngram created?

    Returns:
        Whatever ``list_utils.ngrams(words, n)`` produces; the work is
        delegated to that helper unchanged.

    See also :meth:`ngrams`, :meth:`bigrams`, :meth:`trigrams`.
    """
    grams = list_utils.ngrams(words, n)
    return grams
2246
2247
def bigrams(txt: str):
    """Produce the two-word ngrams of the given string; shorthand for
    ``ngrams(txt, 2)``.

    See also :meth:`ngrams`, :meth:`trigrams`.

    >>> [x for x in bigrams('this is a test')]
    ['this is', 'is a', 'a test']
    """
    words_per_gram = 2
    return ngrams(txt, words_per_gram)
2257
2258
def trigrams(txt: str):
    """Produce the three-word ngrams of the given string; shorthand
    for ``ngrams(txt, 3)``.

    See also :meth:`ngrams`, :meth:`bigrams`.
    """
    words_per_gram = 3
    return ngrams(txt, words_per_gram)
2265
2266
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim: str = ''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.

    Args:
        input_lines: A sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: an iterable collection of numeric sequences that
            indicate one or more column numbers to copy to form the Nth
            position in the output list.  See example below.
        delim: for column_specs that indicate we should copy more than
            one column from the input into this position, use delim to
            separate source data.  Defaults to ''.

    Returns:
        A list of strings created by following the instructions set forth
        in column_specs.  Column data is copied verbatim: delimiter
        characters that are part of a column's own text are preserved.

    Raises:
        IndexError: if a column number is outside of input_lines.

    See also :meth:`shuffle_columns_into_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim='!',
    ... )
    ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']

    >>> shuffle_columns_into_list(['wow!', 'ok'], [[1, 0]], delim='!')
    ['ok!wow!']
    """
    # Each spec lists the input columns that make up one output entry.
    # join() puts delim strictly *between* columns, so there is nothing
    # to strip afterwards and the column text is never altered.
    return [delim.join(input_lines[n] for n in spec) for spec in column_specs]
2308
2309
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim: str = '',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: a sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: instructions for what dictionary keys to apply
            to individual or compound input column data.  See example
            below.
        delim: when forming compound output data by gluing more than
            one input column together, use this string to separate
            the source data.  Defaults to ''.

    Returns:
        A dict formed by applying the column_specs instructions.
        Column data is copied verbatim: delimiter characters that are
        part of a column's own text are preserved.  If a key appears
        more than once in column_specs the last occurrence wins.

    Raises:
        IndexError: if a column number is outside of input_lines.

    See also :meth:`shuffle_columns_into_list`, :meth:`interpolate_using_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim='!',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}

    >>> shuffle_columns_into_dict(['wow!', 'ok'], [('x', [1, 0])], delim='!')
    {'x': 'ok!wow!'}
    """
    out: Dict[str, str] = {}

    # Each spec is ("key", [col1, col2...]).  join() puts delim strictly
    # *between* columns, so there is nothing to strip afterwards and the
    # column text is never altered.
    for spec in column_specs:
        out[spec[0]] = delim.join(input_lines[n] for n in spec[1])
    return out
2352
2353
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """
    Fill the ``{name}`` placeholders of a template with values looked
    up by name in a dict.

    Args:
        txt: the mad libs template
        values: what you and your kids chose for each category.

    Returns:
        The template formatted via ``str.format(**values)`` and then
        passed through ``_sprintf(..., end='')``.

    See also :meth:`shuffle_columns_into_list`, :meth:`shuffle_columns_into_dict`.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    filled_in = txt.format(**values)
    return _sprintf(filled_in, end='')
2369
2370
def to_ascii(txt: str) -> bytes:
    """
    Args:
        txt: the input data to encode.  A bytes object is also
            accepted and is returned unchanged.

    Returns:
        txt encoded as an ASCII byte string.

    Raises:
        TypeError: if txt is neither a str nor a bytes object.
        UnicodeEncodeError: if txt is a str containing non-ASCII
            characters.

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`to_bytes`,
    :meth:`generate_random_alphanumeric_string`, :meth:`asciify`.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(txt, str):
        return txt.encode('ascii')
    if isinstance(txt, bytes):
        return txt
    # TypeError is a subclass of Exception, so callers that caught the
    # generic Exception raised here previously keep working.
    raise TypeError('to_ascii works with strings and bytes')
2393
2394
def to_base64(
    txt: str, *, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> bytes:
    """
    Args:
        txt: the input data to encode
        encoding: the encoding to use when turning txt into bytes
        errors: how to handle encoding errors

    Returns:
        The base64 encoding of txt as a byte string, produced by
        :func:`base64.encodebytes` (so the output is broken into
        lines and ends with a newline).

    See also :meth:`is_base64`, :meth:`to_ascii`, :meth:`to_bitstring`,
    :meth:`from_base64`.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
2415
2416
def is_base64(txt: str) -> bool:
    """
    Args:
        txt: the string (or byte string) to check

    Returns:
        True if, after stripping surrounding whitespace, txt consists
        only of characters from Python's standard base64 alphabet
        (A-Z, a-z, 0-9, '+' and '/'), optionally followed by at most
        two '=' padding characters.  Line breaks embedded in the
        middle of txt are not accepted.  Text containing non-ASCII
        characters yields False.

    See also :meth:`to_base64`, :meth:`from_base64`.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # Trailing padding is ok.
    True

    """
    candidate = txt.strip()
    if isinstance(candidate, bytes):
        try:
            candidate = candidate.decode('ascii')
        except UnicodeDecodeError:
            # Bytes outside of ASCII can never be base64 characters.
            return False

    # '=' is only legal as trailing padding, and never more than two.
    unpadded = candidate.rstrip('=')
    if len(candidate) - len(unpadded) > 2:
        return False

    alphabet = set(
        string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    )
    return all(char in alphabet for char in unpadded)
2445
2446
def from_base64(
    b64: bytes, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """
    Args:
        b64: byte string of base64 encoded data to decode / convert.
        encoding: the encoding to use when turning the decoded bytes
            back into text
        errors: how to handle decoding errors

    Returns:
        The decoded form of b64 as a normal python string: the bytes
        recovered by :func:`base64.decodebytes`, decoded with the
        given encoding.

    See also :meth:`to_base64`, :meth:`is_base64`.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    decoded = base64.decodebytes(b64)
    return decoded.decode(encoding, errors)
2466
2467
def chunk(txt: str, chunk_size: int):
    """
    Args:
        txt: a string to be chunked into evenly spaced pieces.
        chunk_size: the size of each chunk to make

    Returns:
        A generator yielding consecutive chunk_size-long slices of
        txt.  When len(txt) is not a multiple of chunk_size, a warning
        is logged and issued via :mod:`warnings` once iteration
        begins, and the final slice is shorter than the others.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    if len(txt) % chunk_size != 0:
        msg = (
            f'String to chunk\'s length ({len(txt)}) is not an even multiple '
            f'of chunk_size ({chunk_size})'
        )
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for x in range(0, len(txt), chunk_size):
        yield txt[x : x + chunk_size]
2486
2487
def to_bitstring(txt: str, *, delimiter: str = '') -> str:
    """
    Args:
        txt: the string to convert into a bitstring
        delimiter: character to insert between adjacent bytes.  Note that
            only bitstrings with delimiter='' are interpretable by
            :meth:`from_bitstring`.

    Returns:
        txt converted to ASCII bytes (see :meth:`to_ascii`) with every
        byte rendered as eight binary digits, most significant bit
        first.  Every input byte is represented, including leading
        zero (NUL) bytes.  For empty input the historical result
        '00000000' is returned.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    >>> to_bitstring('\\x00A')
    '0000000001000001'
    """
    etxt = to_ascii(txt)
    if not etxt:
        # Kept for backward compatibility: empty input has always
        # produced a single all-zero byte.
        return '00000000'
    # Format byte by byte rather than via one big integer; the integer
    # route silently discards leading zero bytes.
    return delimiter.join(f'{byte:08b}' for byte in etxt)
2515
2516
def is_bitstring(txt: str) -> bool:
    """
    Args:
        txt: the string to check

    Returns:
        The verdict of ``is_binary_integer_number`` for txt with a
        ``0b`` prefix prepended: True if txt is a recognized bitstring
        and False otherwise.  Bitstrings produced with a non-empty
        delimiter are not recognized.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`to_bitstring`,
    :meth:`chunk`.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    prefixed = f'0b{txt}'
    return is_binary_integer_number(prefixed)
2537
2538
def from_bitstring(
    bits: str, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """
    Args:
        bits: the bitstring to convert back into a python string
        encoding: the encoding to use during conversion
        errors: how to handle encoding errors

    Returns:
        The regular python string represented by bits.  The bits are
        parsed as one base-2 integer, so leading zero bits (and hence
        leading NUL bytes) are not preserved; an all-zero bitstring
        yields a single NUL character.  Note that this code does not
        work with to_bitstring output when delimiter is non-empty.

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'
    """
    value = int(bits, 2)
    # Smallest number of whole bytes able to hold the integer.
    byte_count = (value.bit_length() + 7) // 8
    text = value.to_bytes(byte_count, 'big').decode(encoding, errors)
    if text:
        return text
    # Zero needs no bytes at all; report it as one NUL character.
    return '\0'
2560
2561
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """
    Args:
        txt: an IP address to chunk up for sorting purposes

    Returns:
        A tuple of the integer components of the address, arranged
        such that the sorting of IP addresses using a normal
        comparator will do something sane and desirable.  If txt is
        not accepted by :meth:`is_ip_v4`, a warning is logged and None
        is returned.

    See also :meth:`is_ip_v4`.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if not is_ip_v4(txt):
        # Report through the module logger instead of writing to
        # stdout from library code.
        logger.warning("not IP: %s", txt)
        return None
    return tuple(int(x) for x in txt.split('.'))
2585
2586
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """
    Args:
        volume: the string to chunk up for sorting purposes

    Returns:
        A tuple holding the non-empty '/'-separated components of
        volume, in order.  Sorting paths with this key places a
        directory before the things beneath it.

    See also :mod:`pyutils.files.file_utils`.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    components = volume.split('/')
    # Leading, trailing and doubled slashes produce empty components;
    # drop them.
    return tuple(part for part in components if part)
2607
2608
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """
    Run one replace operation per character of replace_set, one after
    another, each on the result of the previous one.

    Args:
        in_str: the string in which to replace characters
        replace_set: the set of target characters to replace
        replacement: the text to substitute for any member of
            replace_set

    See also :meth:`replace_nth`.

    Returns:
        The string with replacements executed.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
2631
2632
def replace_nth(in_str: str, source: str, target: str, nth: int) -> str:
    """
    Replaces the nth occurrence of a substring within a string.

    Args:
        in_str: the string in which to run the replacement
        source: the substring to replace.  It is matched literally;
            regular expression metacharacters have no special meaning.
        target: the replacement text
        nth: which occurrence of source to replace?  (1 is the first.)

    Returns:
        in_str with the nth non-overlapping occurrence of source
        replaced by target.

    Raises:
        IndexError: if in_str contains fewer than nth occurrences of
            source.

    See also :meth:`replace_all`.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'

    >>> replace_nth('a.b.c', '.', '-', 2)
    'a.b-c'
    """
    # Escape source so that the search agrees with the literal text
    # that gets replaced.
    where = [m.start() for m in re.finditer(re.escape(source), in_str)][nth - 1]
    return in_str[:where] + target + in_str[where + len(source) :]
2653
2654
if __name__ == '__main__':
    # When this file is executed directly (rather than imported), run the
    # doctest examples embedded in this module's docstrings.
    import doctest

    doctest.testmod()