src/pyutils/string_utils.py

   1 #!/usr/bin/env python3
   2 # -*- coding: utf-8 -*-
   3
   4 """The MIT License (MIT)
   5
   6 Copyright (c) 2016-2020 Davide Zanotti
   7
   8 Modifications Copyright (c) 2021-2022 Scott Gasch
   9
  10 Permission is hereby granted, free of charge, to any person obtaining a copy
  11 of this software and associated documentation files (the "Software"), to deal
  12 in the Software without restriction, including without limitation the rights
  13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 copies of the Software, and to permit persons to whom the Software is
  15 furnished to do so, subject to the following conditions:
  16
  17 The above copyright notice and this permission notice shall be included in all
  18 copies or substantial portions of the Software.
  19
  20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  26 SOFTWARE.
  27
This module is based on:
  29 https://github.com/daveoncode/python-string-utils.  See `NOTICE
  30 <https://wannabe.guru.org/gitweb/?p=pyutils.git;a=blob_plain;f=NOTICE;hb=HEAD>`__
  31 in the root of this module for a detailed enumeration of what work is
  32 Davide's and what work was added by Scott.
  33
  34 """
  35
  36 import base64
  37 import contextlib  # type: ignore
  38 import datetime
  39 import io
  40 import json
  41 import logging
  42 import numbers
  43 import random
  44 import re
  45 import string
  46 import unicodedata
  47 import warnings
  48 from itertools import zip_longest
  49 from typing import (
  50     Any,
  51     Callable,
  52     Dict,
  53     Iterable,
  54     List,
  55     Literal,
  56     Optional,
  57     Sequence,
  58     Tuple,
  59 )
  60 from uuid import uuid4
  61
  62 from pyutils import list_utils
  63
  64 logger = logging.getLogger(__name__)
  65
  66 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  67
  68 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  69
  70 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  71
  72 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  73
# Pieces of a URL pattern, one capture group per component.  The pattern is
# written in lowercase and compiled with re.IGNORECASE below.
# NOTE(review): in the query-string class, "%-=" forms a character range
# (from "%" to "="), which is presumably what lets "&" through -- confirm
# that this is intended before touching it.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Anchored form: the whole string must be a URL.
URL_RE = re.compile(rf"^{URLS_RAW_STRING}$", re.IGNORECASE)

# Unanchored form wrapped in one extra capture group, for locating URLs
# inside larger text.
URLS_RE = re.compile(rf"({URLS_RAW_STRING})", re.IGNORECASE)

# Matches a run of "@" that is followed later by a double quote, or a
# backslash-escaped "@".  is_email substitutes these matches before
# re-validating an address that contains more than one "@".
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# Local part, "@", then domain.
# NOTE(review): EMAIL_RE / EMAILS_RE are compiled without re.IGNORECASE and
# the domain classes are lowercase-only, so uppercase domains do not match;
# confirm whether that is deliberate.
EMAILS_RAW_STRING = (
    r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
)

# Anchored form: the whole string must be an email address.
EMAIL_RE = re.compile(rf"^{EMAILS_RAW_STRING}$")

# Unanchored form in a capture group, for locating addresses inside text.
EMAILS_RE = re.compile(rf"({EMAILS_RAW_STRING})")

# Letters (and trailing digits) only, containing at least one lower/upper
# case transition.
CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

# Finds the character(s) immediately preceding an uppercase letter, i.e. the
# word boundaries inside a camel case identifier.
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Snake case using "_" as the separator; leading separators are allowed.
SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

# Same shape as SNAKE_CASE_TEST_RE but with "-" as the separator.
# NOTE(review): unlike the underscore version this pattern has no leading
# "^"; that makes no difference when it is used with .match() but would
# with .search().
SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

# A separator followed by the lowercase letter or digit that starts the
# next word (group 1 = separator, group 2 = that character).
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")

# Card-number patterns keyed by card type name; is_credit_card looks
# entries up by these keys.  These check only prefix and length.
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}

# Text that opens with "[" or "{" and closes with "]" or "}", ignoring
# surrounding whitespace.  Used as a cheap pre-check by is_json before the
# string is actually parsed.
JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)

# Canonical 8-4-4-4-12 UUID text with mandatory hyphens.
# NOTE(review): the class [a-f\d] with re.IGNORECASE is hex digits plus any
# Unicode decimal digit that \d accepts.
UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# Same as UUID_RE but every hyphen is optional, so the bare 32 character
# form is accepted too.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# "Shallow" because only the dotted-quad shape is checked here; the pattern
# itself does not limit each field to 0-255.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

# Unanchored dotted-quad pattern for finding an address inside text.
ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# Eight colon-separated fields of zero to four characters each.
# NOTE(review): the class [a-z\d] admits letters beyond "f", so this is
# looser than hexadecimal -- confirm whether that is intended.
IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

# Unanchored variant of IP_V6_RE.
ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)

# Six two-digit hex fields separated by ":" or "-".
MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)

# Unanchored variant of MAC_ADDRESS_RE.
ANYWHERE_MAC_ADDRESS_RE = re.compile(
    r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)

# One "word": a run of letters/digits (underscore excluded) together with
# any non-word characters around it.
WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)

# An opening tag optionally followed by content up to a closing tag, or an
# HTML comment, or a doctype declaration.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Individual tags only (opening, closing, comment or doctype), without the
# content between them.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Any single whitespace character.
SPACES_RE = re.compile(r"\s")

# Runs of characters that are not word characters, or runs of underscores.
NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Leading whitespace other than carriage return / line feed.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")

# ESC "[" followed by any non-letters and terminated by a single letter;
# strip_escape_sequences removes every match of this pattern.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1B\[[^A-Za-z]*[A-Za-z]")
 169
 170 NUM_SUFFIXES = {
 171     "Pb": (1024**5),
 172     "P": (1024**5),
 173     "Tb": (1024**4),
 174     "T": (1024**4),
 175     "Gb": (1024**3),
 176     "G": (1024**3),
 177     "Mb": (1024**2),
 178     "M": (1024**2),
 179     "Kb": (1024**1),
 180     "K": (1024**1),
 181 }
 182
 183 UNIT_WORDS = [
 184     "zero",
 185     "one",
 186     "two",
 187     "three",
 188     "four",
 189     "five",
 190     "six",
 191     "seven",
 192     "eight",
 193     "nine",
 194     "ten",
 195     "eleven",
 196     "twelve",
 197     "thirteen",
 198     "fourteen",
 199     "fifteen",
 200     "sixteen",
 201     "seventeen",
 202     "eighteen",
 203     "nineteen",
 204 ]
 205
 206 TENS_WORDS = [
 207     "",
 208     "",
 209     "twenty",
 210     "thirty",
 211     "forty",
 212     "fifty",
 213     "sixty",
 214     "seventy",
 215     "eighty",
 216     "ninety",
 217 ]
 218
 219 MAGNITUDE_SCALES = [
 220     "hundred",
 221     "thousand",
 222     "million",
 223     "billion",
 224     "trillion",
 225     "quadrillion",
 226 ]
 227
 228 NUM_WORDS = {}
 229 NUM_WORDS["and"] = (1, 0)
 230 for i, word in enumerate(UNIT_WORDS):
 231     NUM_WORDS[word] = (1, i)
 232 for i, word in enumerate(TENS_WORDS):
 233     NUM_WORDS[word] = (1, i * 10)
 234 for i, word in enumerate(MAGNITUDE_SCALES):
 235     if i == 0:
 236         NUM_WORDS[word] = (100, 0)
 237     else:
 238         NUM_WORDS[word] = (10 ** (i * 3), 0)
 239 NUM_WORDS['score'] = (20, 0)
 240
 241
 242 def is_none_or_empty(in_str: Optional[str]) -> bool:
 243     """
 244     Args:
 245         in_str: the string to test
 246
 247     Returns:
 248         True if the input string is either None or an empty string,
 249         False otherwise.
 250
 251     See also :meth:`is_string` and :meth:`is_empty_string`.
 252
 253     >>> is_none_or_empty("")
 254     True
 255     >>> is_none_or_empty(None)
 256     True
 257     >>> is_none_or_empty("   \t   ")
 258     True
 259     >>> is_none_or_empty('Test')
 260     False
 261     """
 262     return in_str is None or len(in_str.strip()) == 0
 263
 264
 265 def is_string(in_str: Any) -> bool:
 266     """
 267     Args:
 268         in_str: the object to test
 269
 270     Returns:
 271         True if the object is a string and False otherwise.
 272
 273     See also :meth:`is_empty_string`, :meth:`is_none_or_empty`.
 274
 275     >>> is_string('test')
 276     True
 277     >>> is_string(123)
 278     False
 279     >>> is_string(100.3)
 280     False
 281     >>> is_string([1, 2, 3])
 282     False
 283     """
 284     return isinstance(in_str, str)
 285
 286
 287 def is_empty_string(in_str: Any) -> bool:
 288     """
 289     Args:
 290         in_str: the string to test
 291
 292     Returns:
 293         True if the string is empty and False otherwise.
 294
 295     See also :meth:`is_none_or_empty`, :meth:`is_full_string`.
 296     """
 297     return is_empty(in_str)
 298
 299
 300 def is_empty(in_str: Any) -> bool:
 301     """
 302     Args:
 303         in_str: the string to test
 304
 305     Returns:
 306         True if the string is empty and false otherwise.
 307
 308     See also :meth:`is_none_or_empty`, :meth:`is_full_string`.
 309
 310     >>> is_empty('')
 311     True
 312     >>> is_empty('    \t\t    ')
 313     True
 314     >>> is_empty('test')
 315     False
 316     >>> is_empty(100.88)
 317     False
 318     >>> is_empty([1, 2, 3])
 319     False
 320     """
 321     return is_string(in_str) and in_str.strip() == ""
 322
 323
 324 def is_full_string(in_str: Any) -> bool:
 325     """
 326     Args:
 327         in_str: the object to test
 328
 329     Returns:
 330         True if the object is a string and is not empty ('') and
 331         is not only composed of whitespace.
 332
 333     See also :meth:`is_string`, :meth:`is_empty_string`, :meth:`is_none_or_empty`.
 334
 335     >>> is_full_string('test!')
 336     True
 337     >>> is_full_string('')
 338     False
 339     >>> is_full_string('      ')
 340     False
 341     >>> is_full_string(100.999)
 342     False
 343     >>> is_full_string({"a": 1, "b": 2})
 344     False
 345     """
 346     return is_string(in_str) and in_str.strip() != ""
 347
 348
 349 def is_number(in_str: str) -> bool:
 350     """
 351     Args:
 352         in_str: the string to test
 353
 354     Returns:
 355         True if the string contains a valid numberic value and
 356         False otherwise.
 357
 358     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 359     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 360     etc...
 361
 362     >>> is_number(100.5)
 363     Traceback (most recent call last):
 364     ...
 365     ValueError: 100.5
 366     >>> is_number("100.5")
 367     True
 368     >>> is_number("test")
 369     False
 370     >>> is_number("99")
 371     True
 372     >>> is_number([1, 2, 3])
 373     Traceback (most recent call last):
 374     ...
 375     ValueError: [1, 2, 3]
 376     """
 377     if not is_string(in_str):
 378         raise ValueError(in_str)
 379     return NUMBER_RE.match(in_str) is not None
 380
 381
 382 def is_integer_number(in_str: str) -> bool:
 383     """
 384     Args:
 385         in_str: the string to test
 386
 387     Returns:
 388         True if the string contains a valid (signed or unsigned,
 389         decimal, hex, or octal, regular or scientific) integral
 390         expression and False otherwise.
 391
 392     See also :meth:`is_number`, :meth:`is_decimal_number`,
 393     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 394     etc...
 395
 396     >>> is_integer_number('42')
 397     True
 398     >>> is_integer_number('42.0')
 399     False
 400     """
 401     return (
 402         (is_number(in_str) and "." not in in_str)
 403         or is_hexidecimal_integer_number(in_str)
 404         or is_octal_integer_number(in_str)
 405         or is_binary_integer_number(in_str)
 406     )
 407
 408
 409 def is_hexidecimal_integer_number(in_str: str) -> bool:
 410     """
 411     Args:
 412         in_str: the string to test
 413
 414     Returns:
 415         True if the string is a hex integer number and False otherwise.
 416
 417     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 418     :meth:`is_octal_integer_number`, :meth:`is_binary_integer_number`, etc...
 419
 420     >>> is_hexidecimal_integer_number('0x12345')
 421     True
 422     >>> is_hexidecimal_integer_number('0x1A3E')
 423     True
 424     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 425     False
 426     >>> is_hexidecimal_integer_number('-0xff')
 427     True
 428     >>> is_hexidecimal_integer_number('test')
 429     False
 430     >>> is_hexidecimal_integer_number(12345)  # Not a string
 431     Traceback (most recent call last):
 432     ...
 433     ValueError: 12345
 434     >>> is_hexidecimal_integer_number(101.4)
 435     Traceback (most recent call last):
 436     ...
 437     ValueError: 101.4
 438     >>> is_hexidecimal_integer_number(0x1A3E)
 439     Traceback (most recent call last):
 440     ...
 441     ValueError: 6718
 442     """
 443     if not is_string(in_str):
 444         raise ValueError(in_str)
 445     return HEX_NUMBER_RE.match(in_str) is not None
 446
 447
 448 def is_octal_integer_number(in_str: str) -> bool:
 449     """
 450     Args:
 451         in_str: the string to test
 452
 453     Returns:
 454         True if the string is a valid octal integral number and False otherwise.
 455
 456     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 457     :meth:`is_hexidecimal_integer_number`, :meth:`is_binary_integer_number`,
 458     etc...
 459
 460     >>> is_octal_integer_number('0o777')
 461     True
 462     >>> is_octal_integer_number('-0O115')
 463     True
 464     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 465     False
 466     >>> is_octal_integer_number('7777')  # Needs 0o
 467     False
 468     >>> is_octal_integer_number('test')
 469     False
 470     """
 471     if not is_string(in_str):
 472         raise ValueError(in_str)
 473     return OCT_NUMBER_RE.match(in_str) is not None
 474
 475
 476 def is_binary_integer_number(in_str: str) -> bool:
 477     """
 478     Args:
 479         in_str: the string to test
 480
 481     Returns:
 482         True if the string contains a binary integral number and False otherwise.
 483
 484     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 485     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 486     etc...
 487
 488     >>> is_binary_integer_number('0b10111')
 489     True
 490     >>> is_binary_integer_number('-0b111')
 491     True
 492     >>> is_binary_integer_number('0B10101')
 493     True
 494     >>> is_binary_integer_number('0b10102')
 495     False
 496     >>> is_binary_integer_number('0xFFF')
 497     False
 498     >>> is_binary_integer_number('test')
 499     False
 500     """
 501     if not is_string(in_str):
 502         raise ValueError(in_str)
 503     return BIN_NUMBER_RE.match(in_str) is not None
 504
 505
 506 def to_int(in_str: str) -> int:
 507     """
 508     Args:
 509         in_str: the string to convert
 510
 511     Returns:
 512         The integral value of the string or raises on error.
 513
 514     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 515     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 516     :meth:`is_binary_integer_number`, etc...
 517
 518     >>> to_int('1234')
 519     1234
 520     >>> to_int('0x1234')
 521     4660
 522     >>> to_int('0b01101')
 523     13
 524     >>> to_int('0o777')
 525     511
 526     >>> to_int('test')
 527     Traceback (most recent call last):
 528     ...
 529     ValueError: invalid literal for int() with base 10: 'test'
 530     """
 531     if not is_string(in_str):
 532         raise ValueError(in_str)
 533     if is_binary_integer_number(in_str):
 534         return int(in_str, 2)
 535     if is_octal_integer_number(in_str):
 536         return int(in_str, 8)
 537     if is_hexidecimal_integer_number(in_str):
 538         return int(in_str, 16)
 539     return int(in_str)
 540
 541
 542 def number_string_to_integer(in_str: str) -> int:
 543     """Convert a string containing a written-out number into an int.
 544
 545     Args:
 546         in_str: the string containing the long-hand written out integer number
 547             in English.  See examples below.
 548
 549     Returns:
 550         The integer whose value was parsed from in_str.
 551
 552     See also :meth:`integer_to_number_string`.
 553
 554     .. warning::
 555         This code only handles integers; it will not work with decimals / floats.
 556
 557     >>> number_string_to_integer("one hundred fifty two")
 558     152
 559
 560     >>> number_string_to_integer("ten billion two hundred million fifty four thousand three")
 561     10200054003
 562
 563     >>> number_string_to_integer("four-score and 7")
 564     87
 565
 566     >>> number_string_to_integer("fifty xyzzy three")
 567     Traceback (most recent call last):
 568     ...
 569     ValueError: Unknown word: xyzzy
 570     """
 571     if isinstance(in_str, int):
 572         return int(in_str)
 573
 574     current = result = 0
 575     in_str = in_str.replace('-', ' ')
 576     for w in in_str.split():
 577         if w not in NUM_WORDS:
 578             if is_integer_number(w):
 579                 current += int(w)
 580                 continue
 581             else:
 582                 raise ValueError("Unknown word: " + w)
 583         scale, increment = NUM_WORDS[w]
 584         current = current * scale + increment
 585         if scale > 100:
 586             result += current
 587             current = 0
 588     return result + current
 589
 590
 591 def integer_to_number_string(num: int) -> str:
 592     """
 593     Opposite of :meth:`number_string_to_integer`; converts a number to a written out
 594     longhand format in English.
 595
 596     Args:
 597         num: the integer number to convert
 598
 599     Returns:
 600         The long-hand written out English form of the number.  See examples below.
 601
 602     See also :meth:`number_string_to_integer`.
 603
 604     .. warning::
 605         This method does not handle decimals or floats, only ints.
 606
 607     >>> integer_to_number_string(9)
 608     'nine'
 609
 610     >>> integer_to_number_string(42)
 611     'forty two'
 612
 613     >>> integer_to_number_string(123219982)
 614     'one hundred twenty three million two hundred nineteen thousand nine hundred eighty two'
 615     """
 616
 617     if num < 20:
 618         return UNIT_WORDS[num]
 619     if num < 100:
 620         ret = TENS_WORDS[num // 10]
 621         leftover = num % 10
 622         if leftover != 0:
 623             ret += ' ' + UNIT_WORDS[leftover]
 624         return ret
 625
 626     # If num > 100 go find the highest chunk and convert that, then recursively
 627     # convert the rest.  NUM_WORDS contains items like 'thousand' -> (1000, 0).
 628     # The second item in the tuple is an increment that can be ignored; the first
 629     # is the numeric "scale" of the entry.  So find the greatest entry in NUM_WORDS
 630     # still less than num.  For 123,456 it would be thousand.  Then pull out the
 631     # 123, convert it, and append "thousand".  Then do the rest.
 632     scales = {}
 633     for name, val in NUM_WORDS.items():
 634         if val[0] <= num:
 635             scales[name] = val[0]
 636     scale = max(scales.items(), key=lambda _: _[1])
 637
 638     # scale[1] = numeric magnitude (e.g. 1000)
 639     # scale[0] = name (e.g. "thousand")
 640     ret = integer_to_number_string(num // scale[1]) + ' ' + scale[0]
 641     leftover = num % scale[1]
 642     if leftover != 0:
 643         ret += ' ' + integer_to_number_string(leftover)
 644     return ret
 645
 646
 647 def is_decimal_number(in_str: str) -> bool:
 648     """
 649     Args:
 650         in_str: the string to check
 651
 652     Returns:
 653         True if the given string represents a decimal or False
 654         otherwise.  A decimal may be signed or unsigned or use
 655         a "scientific notation".
 656
 657     See also :meth:`is_integer_number`.
 658
 659     .. note::
 660         We do not consider integers without a decimal point
 661         to be decimals; they return False (see example).
 662
 663     >>> is_decimal_number('42.0')
 664     True
 665     >>> is_decimal_number('42')
 666     False
 667     """
 668     return is_number(in_str) and "." in in_str
 669
 670
 671 def strip_escape_sequences(in_str: str) -> str:
 672     """
 673     Args:
 674         in_str: the string to strip of escape sequences.
 675
 676     Returns:
 677         in_str with escape sequences removed.
 678
 679     See also: :mod:`pyutils.ansi`.
 680
 681     .. note::
 682         What is considered to be an "escape sequence" is defined
 683         by a regular expression.  While this gets common ones,
 684         there may exist valid sequences that it doesn't match.
 685
 686     >>> strip_escape_sequences('\x1B[12;11;22mthis is a test!')
 687     'this is a test!'
 688     """
 689     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 690     return in_str
 691
 692
 693 def add_thousands_separator(
 694     in_str: str, *, separator_char: str = ',', places: int = 3
 695 ) -> str:
 696     """
 697     Args:
 698         in_str: string or number to which to add thousands separator(s)
 699         separator_char: the separator character to add (defaults to comma)
 700         places: add a separator every N places (defaults to three)
 701
 702     Returns:
 703         A numeric string with thousands separators added appropriately.
 704
 705     >>> add_thousands_separator('12345678')
 706     '12,345,678'
 707     >>> add_thousands_separator(12345678)
 708     '12,345,678'
 709     >>> add_thousands_separator(12345678.99)
 710     '12,345,678.99'
 711     >>> add_thousands_separator('test')
 712     Traceback (most recent call last):
 713     ...
 714     ValueError: test
 715
 716     """
 717     if isinstance(in_str, numbers.Number):
 718         in_str = f'{in_str}'
 719     if is_number(in_str):
 720         return _add_thousands_separator(
 721             in_str, separator_char=separator_char, places=places
 722         )
 723     raise ValueError(in_str)
 724
 725
 726 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 727     """Internal helper"""
 728     decimal_part = ""
 729     if '.' in in_str:
 730         (in_str, decimal_part) = in_str.split('.')
 731     tmp = [iter(in_str[::-1])] * places
 732     ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 733     if len(decimal_part) > 0:
 734         ret += '.'
 735         ret += decimal_part
 736     return ret
 737
 738
 739 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 740     """
 741     Args:
 742         in_str: the string to test
 743         allowed_schemes: an optional list of allowed schemes (e.g.
 744             ['http', 'https', 'ftp'].  If passed, only URLs that
 745             begin with the one of the schemes passed will be considered
 746             to be valid.  Otherwise, any scheme:// will be considered
 747             valid.
 748
 749     Returns:
 750         True if in_str contains a valid URL and False otherwise.
 751
 752     >>> is_url('http://www.mysite.com')
 753     True
 754     >>> is_url('https://mysite.com')
 755     True
 756     >>> is_url('.mysite.com')
 757     False
 758     >>> is_url('scheme://username:[email protected]:8042/folder/subfolder/file.extension?param=value&param2=value2#hash')
 759     True
 760     """
 761     if not is_full_string(in_str):
 762         return False
 763
 764     valid = URL_RE.match(in_str) is not None
 765
 766     if allowed_schemes:
 767         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 768     return valid
 769
 770
 771 def is_email(in_str: Any) -> bool:
 772     """
 773     Args:
 774         in_str: the email address to check
 775
 776     Returns: True if the in_str contains a valid email (as defined by
 777         https://tools.ietf.org/html/rfc3696#section-3) or False
 778         otherwise.
 779
 780     >>> is_email('[email protected]')
 781     True
 782     >>> is_email('@gmail.com')
 783     False
 784     """
 785     if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
 786         return False
 787
 788     try:
 789         # we expect 2 tokens, one before "@" and one after, otherwise
 790         # we have an exception and the email is not valid.
 791         head, tail = in_str.split("@")
 792
 793         # head's size must be <= 64, tail <= 255, head must not start
 794         # with a dot or contain multiple consecutive dots.
 795         if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
 796             return False
 797
 798         # removes escaped spaces, so that later on the test regex will
 799         # accept the string.
 800         head = head.replace("\\ ", "")
 801         if head.startswith('"') and head.endswith('"'):
 802             head = head.replace(" ", "")[1:-1]
 803         return EMAIL_RE.match(head + "@" + tail) is not None
 804
 805     except ValueError:
 806         # borderline case in which we have multiple "@" signs but the
 807         # head part is correctly escaped.
 808         if ESCAPED_AT_SIGN.search(in_str) is not None:
 809             # replace "@" with "a" in the head
 810             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 811         return False
 812
 813
 814 def suffix_string_to_number(in_str: str) -> Optional[int]:
 815     """Takes a string like "33Gb" and converts it into a number (of bytes)
 816     like 34603008.
 817
 818     Args:
 819         in_str: the string with a suffix to be interpreted and removed.
 820
 821     Returns:
 822         An integer number of bytes or None to indicate an error.
 823
 824     See also :meth:`number_to_suffix_string`.
 825
 826     >>> suffix_string_to_number('1Mb')
 827     1048576
 828     >>> suffix_string_to_number('13.1Gb')
 829     14066017894
 830     """
 831
 832     def suffix_capitalize(s: str) -> str:
 833         if len(s) == 1:
 834             return s.upper()
 835         elif len(s) == 2:
 836             return f"{s[0].upper()}{s[1].lower()}"
 837         return suffix_capitalize(s[0:1])
 838
 839     if is_string(in_str):
 840         if is_integer_number(in_str):
 841             return to_int(in_str)
 842         suffixes = [in_str[-2:], in_str[-1:]]
 843         rest = [in_str[:-2], in_str[:-1]]
 844         for x in range(len(suffixes)):
 845             s = suffixes[x]
 846             s = suffix_capitalize(s)
 847             multiplier = NUM_SUFFIXES.get(s, None)
 848             if multiplier is not None:
 849                 r = rest[x]
 850                 if is_integer_number(r):
 851                     return to_int(r) * multiplier
 852                 if is_decimal_number(r):
 853                     return int(float(r) * multiplier)
 854     return None
 855
 856
 857 def number_to_suffix_string(num: int) -> Optional[str]:
 858     """Take a number (of bytes) and returns a string like "43.8Gb".
 859
 860     Args:
 861         num: an integer number of bytes
 862
 863     Returns:
 864         A string with a suffix representing num bytes concisely or
 865         None to indicate an error.
 866
 867     See also: :meth:`suffix_string_to_number`.
 868
 869     >>> number_to_suffix_string(14066017894)
 870     '13.1Gb'
 871     >>> number_to_suffix_string(1024 * 1024)
 872     '1.0Mb'
 873     """
 874     d = 0.0
 875     suffix = None
 876     for (sfx, size) in NUM_SUFFIXES.items():
 877         if num >= size:
 878             d = num / size
 879             suffix = sfx
 880             break
 881     if suffix is not None:
 882         return f"{d:.1f}{suffix}"
 883     else:
 884         return f'{num:d}'
 885
 886
 887 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 888     """
 889     Args:
 890         in_str: a string to check
 891         card_type: if provided, contains the card type to validate
 892             with.  Otherwise, all known credit card number types will
 893             be accepted.
 894
 895             Supported card types are the following:
 896
 897             * VISA
 898             * MASTERCARD
 899             * AMERICAN_EXPRESS
 900             * DINERS_CLUB
 901             * DISCOVER
 902             * JCB
 903
 904     Returns:
 905         True if in_str is a valid credit card number.
 906
 907     .. warning::
 908         This code is not verifying the authenticity of the credit card (i.e.
 909         not checking whether it's a real card that can be charged); rather
 910         it's only checking that the number follows the "rules" for numbering
 911         established by credit card issuers.
 912
 913     """
 914     if not is_full_string(in_str):
 915         return False
 916
 917     if card_type is not None:
 918         if card_type not in CREDIT_CARDS:
 919             raise KeyError(
 920                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 921             )
 922         return CREDIT_CARDS[card_type].match(in_str) is not None
 923     for c in CREDIT_CARDS:
 924         if CREDIT_CARDS[c].match(in_str) is not None:
 925             return True
 926     return False
 927
 928
 929 def is_camel_case(in_str: Any) -> bool:
 930     """
 931     Args:
 932         in_str: the string to test
 933
 934     Returns:
 935         True if the string is formatted as camel case and False otherwise.
 936         A string is considered camel case when:
 937
 938         * it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 939         * it contains both lowercase and uppercase letters
 940         * it does not start with a number
 941
 942     See also :meth:`is_snake_case`, :meth:`is_slug`, and :meth:`camel_case_to_snake_case`.
 943     """
 944     return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 945
 946
 947 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 948     """
 949     Args:
 950         in_str: the string to test
 951         separator: the snake case separator character to use
 952
 953     Returns: True if the string is snake case and False otherwise.  A
 954         string is considered snake case when:
 955
 956         * it's composed only by lowercase/uppercase letters and digits
 957         * it contains at least one underscore (or provided separator)
 958         * it does not start with a number
 959
 960     See also :meth:`is_camel_case`, :meth:`is_slug`, and :meth:`snake_case_to_camel_case`.
 961
 962     >>> is_snake_case('this_is_a_test')
 963     True
 964     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 965     True
 966     >>> is_snake_case('this-is-a-test')
 967     False
 968     >>> is_snake_case('this-is-a-test', separator='-')
 969     True
 970     """
 971     if is_full_string(in_str):
 972         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 973         re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 974         r = re_map.get(
 975             separator,
 976             re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
 977         )
 978         return r.match(in_str) is not None
 979     return False
 980
 981
 982 def is_json(in_str: Any) -> bool:
 983     """
 984     Args:
 985         in_str: the string to test
 986
 987     Returns:
 988         True if the in_str contains valid JSON and False otherwise.
 989
 990     >>> is_json('{"name": "Peter"}')
 991     True
 992     >>> is_json('[1, 2, 3]')
 993     True
 994     >>> is_json('{nope}')
 995     False
 996     """
 997     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 998         try:
 999             return isinstance(json.loads(in_str), (dict, list))
1000         except (TypeError, ValueError, OverflowError):
1001             pass
1002     return False
1003
1004
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Args:
        in_str: the string (or UUID object) to test
        allow_hex: should the dash-less hexadecimal rendering (as in
            the last example below) be accepted?

    Returns:
        True if the in_str contains a valid UUID and False otherwise.

    See also :meth:`generate_uuid`.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # Cast to str so that a UUID instance can be passed in directly.
    candidate = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(candidate) is not None
1028
1029
def is_ip_v4(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str contains a valid IPv4 address and False otherwise.

    See also :meth:`extract_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('nope')
    False
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str):
        return False
    if SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # The regex only checks the overall shape; each dotted field must
    # additionally fall within 0..255.
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
1056
1057
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string to extract an IPv4 address from.

    Returns:
        The first IPv4 address found in in_str, or None if there is
        none or in_str is not a non-empty string.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V4_RE.search(in_str)
    return found.group(0) if found is not None else None
1080
1081
def is_ip_v6(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IPv6 address and False otherwise.

    See also :meth:`is_ip_v4`, :meth:`extract_ip_v4`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
1099
1100
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract an IPv6 address.

    Returns:
        The first IPv6 address found in in_str, or None if there is
        none or in_str is not a non-empty string.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v4`,
    and :meth:`is_ip`.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V6_RE.search(in_str)
    return found.group(0) if found is not None else None
1123
1124
def is_ip(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IP address (either IPv4 or
        IPv6) and False otherwise.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3')
    False
    >>> is_ip('1.2.3.999')
    False
    """
    # IPv6 is tried first; IPv4 is only consulted when that fails.
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
1147
1148
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract an IP address.

    Returns:
        The first IPv4 address found in in_str; failing that, the
        first IPv6 address; or None when neither is present.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4 = extract_ip_v4(in_str)
    return v4 if v4 is not None else extract_ip_v6(in_str)
1171
1172
def is_mac_address(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str is a valid MAC address and False otherwise.

    See also :meth:`extract_mac_address`, :meth:`is_ip`, etc...

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
1193
1194
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract a MAC address.
        separator: the string to place between the octets of the
            returned address; every ":" or "-" in the matched text is
            replaced by it.

    Returns:
        The first MAC address found in in_str (with its octets joined
        by separator) or None to indicate no match or an error.

    See also :meth:`is_mac_address`, :meth:`is_ip`, and :meth:`extract_ip`.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F', separator='-')
    '34-29-8F-12-0D-2F'
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None

    # Strings are immutable: the previous code called mac.replace(...)
    # and threw the result away, so separator was silently ignored.
    # Splitting on either delimiter and re-joining swaps them all in a
    # single pass, which also stays correct when separator itself
    # contains ":" or "-".
    return separator.join(re.split(r"[:-]", m.group(0)))
1222
1223
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Args:
        in_str: string to test
        separator: the slug character to use

    Returns:
        True if in_str is a slug string and False otherwise.  A slug
        is made of runs of lowercase letters and/or digits joined by
        one or more separators; it must begin and end with a letter
        or digit.  With an empty separator only letters and digits
        are accepted.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`slugify`.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False

    # The former pattern, ^([a-z\d]+SEP*?)*[a-z\d]$, nests one unbounded
    # quantifier inside another; on a long run of letters followed by an
    # invalid character the regex engine backtracks exponentially.  The
    # form below accepts the same strings for a single-character
    # separator but gives the engine only one way to split the input
    # (provided the separator is not itself a lowercase letter or digit).
    alnum = r"[a-z\d]+"
    if separator:
        sep = re.escape(separator)
        rex = rf"^{alnum}(?:(?:{sep})+{alnum})*$"
    else:
        # An empty separator used to produce an invalid pattern
        # (re.error); treat it as "no separator allowed".
        rex = rf"^{alnum}$"
    return re.match(rex, in_str) is not None
1244
1245
def contains_html(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check for tags in

    Returns:
        True if the given string contains HTML/XML tags and False
        otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`strip_html`.

    .. warning::
        By design, this function matches ANY type of tag, so don't expect
        to use it as an HTML validator.  It's a quick sanity check at
        best.  See something like BeautifulSoup for a more full-featured
        HTML parser.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False

    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
1272
1273
def words_count(in_str: str) -> int:
    """
    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    Raises:
        ValueError: if in_str is not a string.

    .. note::
        This method is "smart" in that it only considers sequences
        of one or more letters and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if is_string(in_str):
        matches = WORDS_COUNT_RE.findall(in_str)
        return len(matches)
    raise ValueError(in_str)
1298
1299
def word_count(in_str: str) -> int:
    """Alias of :meth:`words_count`.

    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    .. note::
        This method is "smart" in that it only considers sequences
        of one or more letters and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> word_count('hello world')
    2
    >>> word_count('one,two,three.stop')
    4
    """
    total = words_count(in_str)
    return total
1322
1323
def generate_uuid(omit_dashes: bool = False) -> str:
    """
    Args:
        omit_dashes: should we omit the dashes in the generated UUID?

    Returns:
        A freshly generated UUID string (using `uuid.uuid4()`): the
        bare hex digits when omit_dashes is set, the dashed form
        otherwise.

    See also :meth:`is_uuid`, :meth:`generate_random_alphanumeric_string`.

    generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
    generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
    """
    fresh = uuid4()
    return fresh.hex if omit_dashes else str(fresh)
1342
1343
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Args:
        size: number of characters to generate

    Returns:
        A string of the specified size containing random characters
        (uppercase/lowercase ascii letters and digits).

    Raises:
        ValueError: if size is less than one.

    See also :meth:`asciify`, :meth:`generate_uuid`.

    >>> random.seed(22)
    >>> generate_random_alphanumeric_string(9)
    '96ipbNClS'
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    # One random.choice() draw per output character, in order.
    return "".join(random.choice(alphabet) for _ in range(size))
1364
1365
def reverse(in_str: str) -> str:
    """
    Args:
        in_str: the string to reverse

    Returns:
        The reversed (character by character) string.

    Raises:
        ValueError: if in_str is not a string.

    >>> reverse('test')
    'tset'
    """
    if is_string(in_str):
        return "".join(reversed(in_str))
    raise ValueError(in_str)
1380
1381
def camel_case_to_snake_case(in_str: str, *, separator: str = "_"):
    """
    Args:
        in_str: the camel case string to convert
        separator: the snake case separator character to use

    Returns:
        A lowercase snake case string equivalent to the camel case
        input, or the original string unchanged if it is not valid
        camel case.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str

    def _append_separator(found):
        # Keep what the pattern captured and follow it with the separator.
        return found.group(1) + separator

    separated = CAMEL_CASE_REPLACE_RE.sub(_append_separator, in_str)
    return separated.lower()
1405
1406
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Args:
        in_str: the snake case string to convert
        upper_case_first: should we capitalize the first letter?
        separator: the separator character to use

    Returns:
        A camel case string that is equivalent to the snake case string
        provided, or the original string unchanged if it is not valid
        snake case.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str

    # Title-case every non-blank piece between separators.
    words = []
    for piece in in_str.split(separator):
        if is_full_string(piece):
            words.append(piece.title())

    if not upper_case_first:
        words[0] = words[0].lower()
    return "".join(words)
1436
1437
def to_char_list(in_str: str) -> List[str]:
    """
    Args:
        in_str: the string to split into a char list

    Returns:
        A list of strings of length one each, or an empty list when
        in_str is not a string.

    See also :meth:`from_char_list`.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    return list(in_str) if is_string(in_str) else []
1454
1455
def from_char_list(in_list: List[str]) -> str:
    """
    Args:
        in_list: A list of characters (or longer strings) to glue
            together.

    Returns:
        The concatenation of the items of in_list, in order.

    See also :meth:`to_char_list`.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
1471
1472
def shuffle(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing the same chars as the given one but in
        a randomized order.  Note that in rare cases this could result
        in the same original string as no check is done.  Returns
        None when in_str is not a string.

    >>> random.seed(22)
    >>> shuffle('awesome')
    'meosaew'
    """
    if not is_string(in_str):
        return None
    letters = list(in_str)
    random.shuffle(letters)  # in-place shuffle of the character list
    return "".join(letters)
1493
1494
def scramble(in_str: str) -> Optional[str]:
    """Alias of :meth:`shuffle`.

    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing the same chars as the given one but in
        a randomized order.  Note that in rare cases this could result
        in the same original string as no check is done.  Returns
        None to indicate error conditions.

    See also :mod:`pyutils.unscrambler`.

    >>> random.seed(22)
    >>> scramble('awesome')
    'meosaew'
    """
    scrambled = shuffle(in_str)
    return scrambled
1513
1514
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Args:
        in_str: the string to strip tags from
        keep_tag_content: should we keep the inner contents of tags?

    Returns:
        A string with all HTML tags removed (optionally with tag contents
        preserved).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`contains_html`.

    .. note::
        This method uses simple regular expressions to strip tags and is
        not a full fledged HTML parser by any means.  Consider using
        something like BeautifulSoup if your needs are more than this
        simple code can fulfill.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if keep_tag_content:
        pattern = HTML_TAG_ONLY_RE
    else:
        pattern = HTML_RE
    return pattern.sub("", in_str)
1542
1543
def asciify(in_str: str) -> str:
    """
    Args:
        in_str: the string to asciify.

    Returns:
        An output string roughly equivalent to the original string
        whose content is ascii-only.  This is accomplished
        by translating all non-ascii chars into their closest possible
        ASCII representation (eg: ó -> o, Ë -> E, ç -> c...).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`to_ascii`, :meth:`generate_random_alphanumeric_string`.

    .. warning::
        Some chars may be lost if impossible to translate.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Three steps, chained: NFKD-normalize the text, force it into
    # ascii while ignoring errors (characters that cannot be
    # represented are dropped), then turn the resulting bytes back
    # into a str.
    return (
        unicodedata.normalize("NFKD", in_str)
        .encode("ascii", "ignore")
        .decode("utf-8")
    )
1576
1577
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Args:
        in_str: the string to slugify
        separator: the character to use during slugification (default
            is a dash)

    Returns:
        The converted string.  The returned string has the following properties:

        * it has no spaces
        * all letters are in lower case
        * all punctuation signs and non alphanumeric chars are removed
        * words are divided using provided separator
        * all chars are encoded as ascii (by using :meth:`asciify`)
        * is safe for URL

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_slug` and :meth:`asciify`.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Lowercase, then blank out everything that is not a letter or a
    # number and trim the ends.
    lowered = in_str.lower()
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", lowered).strip()

    # Spaces become the join sign...
    joined = SPACES_RE.sub(separator, spaced)

    # ...and repeated join signs are collapsed.
    deduped = re.sub(re.escape(separator) + r"+", separator, joined)
    return asciify(deduped)
1614
1615
def to_bool(in_str: str) -> bool:
    """
    Args:
        in_str: the string to convert to boolean

    Returns:
        A boolean equivalent of the original string based on its contents.
        All conversion is case insensitive.  A positive boolean (True) is
        returned if the string value is any of the following:

        * "true"
        * "t"
        * "1"
        * "yes"
        * "y"
        * "on"

        Otherwise False is returned.

    Raises:
        ValueError: if in_str is not a string.

    See also :mod:`pyutils.argparse_utils`.

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    truthy = {"true", "t", "1", "yes", "y", "on"}
    return in_str.lower() in truthy
1658
1659
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Args:
        in_str: the string to convert into a date

    Returns:
        The datetime.date the string contained or None if the parser
        rejects it.  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`extract_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> to_date('9/11/2001')
    datetime.date(2001, 9, 11)
    >>> to_date('xyzzy')
    """
    import pyutils.datetimes.dateparse_utils as du

    result: Optional[datetime.date]
    try:
        parser = du.DateParser()  # type: ignore
        parser.parse(in_str)
        result = parser.get_date()
    except du.ParseException:  # type: ignore
        result = None
    return result
1686
1687
def extract_date(in_str: Any) -> Optional[datetime.datetime]:
    """Finds and extracts a date from the string, if possible.

    The input is split on whitespace and runs of consecutive words are
    handed to the date parser, longest runs first (5 words, then 4, 3
    and 2); the first run the parser accepts wins.

    Args:
        in_str: the string to extract a date from

    Returns:
        a datetime if date was found, otherwise None

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> extract_date("filename.txt    dec 13, 2022")
    datetime.datetime(2022, 12, 13, 0, 0)

    >>> extract_date("Dear Santa, please get me a pony.")

    """
    import itertools

    import pyutils.datetimes.dateparse_utils as du

    # A single parser instance is reused for every candidate expression.
    d = du.DateParser()  # type: ignore
    # NOTE(review): in_str is annotated Any but .split() is called with
    # no type check, so a non-string argument raises AttributeError
    # rather than returning None.
    chunks = in_str.split()
    # presumably list_utils.ngrams(chunks, n) yields each window of n
    # consecutive chunks -- confirm against list_utils.
    # NOTE(review): window sizes stop at 2, so a date that is a single
    # whitespace-delimited token is never offered to the parser on its
    # own -- confirm this is intended.
    for ngram in itertools.chain(
        list_utils.ngrams(chunks, 5),
        list_utils.ngrams(chunks, 4),
        list_utils.ngrams(chunks, 3),
        list_utils.ngrams(chunks, 2),
    ):
        try:
            expr = " ".join(ngram)
            logger.debug("Trying %s", expr)
            if d.parse(expr):
                return d.get_datetime()
        except du.ParseException:  # type: ignore
            # This candidate is not a date; move on to the next one.
            pass
    return None
1726
1727
def is_valid_date(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check

    Returns:
        True if the string represents a valid date that we can recognize
        and False otherwise (a warning is logged in that case).  This
        parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> is_valid_date('1/2/2022')
    True
    >>> is_valid_date('christmas')
    True
    >>> is_valid_date('next wednesday')
    True
    >>> is_valid_date('xyzzy')
    False
    """
    import pyutils.datetimes.dateparse_utils as dp

    try:
        parser = dp.DateParser()  # type: ignore
        # Only success or failure matters; the parsed value is unused.
        parser.parse(in_str)
    except dp.ParseException:  # type: ignore
        logger.warning(f'Unable to parse date {in_str}.')
        return False
    return True
1760
1761
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Args:
        in_str: string to parse into a datetime

    Returns:
        A python datetime parsed from in_str, or None when parsing
        fails (a warning is logged) or the parser's result is not a
        datetime.  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`valid_datetime`.

    >>> to_datetime('7/20/1969 02:56 GMT')
    datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
    """
    import pyutils.datetimes.dateparse_utils as dp

    result: Optional[datetime.datetime] = None
    try:
        parser = dp.DateParser()  # type: ignore
        parsed = parser.parse(in_str)
        if isinstance(parsed, datetime.datetime):
            result = parsed
    except Exception:
        # Best effort by design: any parser failure means "no datetime".
        logger.warning(f'Unable to parse datetime {in_str}.')
    return result
1789
1790
def valid_datetime(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check

    Returns:
        True if :meth:`to_datetime` can turn in_str into a datetime and
        False otherwise (a warning is logged in that case).
        This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    >>> valid_datetime('next wednesday at noon')
    True
    >>> valid_datetime('3 weeks ago at midnight')
    True
    >>> valid_datetime('next easter at 5:00 am')
    True
    >>> valid_datetime('sometime soon')
    False
    """
    if to_datetime(in_str) is not None:
        return True
    logger.warning(f'Unable to parse datetime {in_str}.')
    return False
1816
1817
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Args:
        in_str: the string to squeeze
        character_to_squeeze: the character (or longer substring) to
            remove runs of more than one in a row (default = space)

    Returns: A "squeezed string" where every run of one or more
        consecutive character_to_squeeze is collapsed into a single
        occurrence.  An empty character_to_squeeze leaves in_str
        unchanged.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    """
    if not character_to_squeeze:
        # Nothing to collapse.
        return in_str
    run = re.compile(r'(?:' + re.escape(character_to_squeeze) + r')+')
    # Use a callable replacement: a plain replacement string is treated
    # by re.sub as a template, so a backslash in character_to_squeeze
    # used to raise re.error or be misread as an escape / group
    # reference.
    return run.sub(lambda _match: character_to_squeeze, in_str)
1840
1841
def dedent(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: the string to dedent

    Returns:
        A string with tab indentation removed from each line, or None
        when in_str is not a string.

    See also :meth:`indent`.

    >>> dedent('\t\ttest\\n\t\ting')
    'test\\ning'
    """
    if not is_string(in_str):
        return None
    # Strip the margin from each line independently, then reassemble.
    return '\n'.join(MARGIN_RE.sub('', row) for row in in_str.split('\n'))
1860
1861
def indent(in_str: str, amount: int) -> str:
    """
    Args:
        in_str: the string to indent
        amount: count of spaces to indent each line by

    Returns:
        An indented string created by prepending amount spaces to
        every line of in_str.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`dedent`.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    pad = " " * amount
    return '\n'.join(pad + row for row in in_str.split('\n'))
1881
1882
def _sprintf(*args, **kwargs) -> str:
    """Internal helper: render args the way print() would, but return
    the text instead of writing it.

    Args:
        *args: the values to render; non-strings go through str().
        **kwargs: only "sep" (default " ") and "end" (default newline)
            are accepted; each must be None or a string.

    Returns:
        The rendered arguments joined by sep and followed by end.

    Raises:
        TypeError: on a non-string sep/end or any other keyword.
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # Anything left over is a keyword we do not understand.
    if kwargs:
        raise TypeError("invalid keyword arguments to sprint()")

    sep = " " if sep is None else sep
    end = "\n" if end is None else end
    pieces = [arg if isinstance(arg, str) else str(arg) for arg in args]
    return sep.join(pieces) + end
1913
1914
def strip_ansi_sequences(in_str: str) -> str:
    """
    Args:
        in_str: the string to strip

    Returns:
        in_str with recognized ANSI escape sequences removed.

    See also :mod:`pyutils.ansi`.

    .. warning::
        This method works by using a regular expression that
        recognizes CSI sequences (ESC, "[", parameters, and one final
        letter) such as colors, cursor movement and screen clearing.
        Other kinds of escape sequence are left untouched; caveat
        emptor.

    >>> import ansi as a
    >>> s = a.fg('blue') + 'blue!' + a.reset()
    >>> len(s)   # '\x1b[38;5;21mblue!\x1b[m'
    18
    >>> len(strip_ansi_sequences(s))
    5
    >>> strip_ansi_sequences(s)
    'blue!'

    """
    # The final letter may be upper case (e.g. "J" erase display, "H"
    # cursor home, "A" cursor up) and the parameters may begin with "?"
    # (private modes such as show/hide cursor); the earlier pattern
    # only accepted a lower case final letter and missed all of those.
    return re.sub(r'\x1b\[[\d+;?]*[A-Za-z]', '', in_str)
1941
1942
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that captures outputs to stdout to a buffer
    without printing them.  Entering the context yields a zero-argument
    callable that returns everything captured so far.

    >>> with SprintfStdout() as buf:
    ...     print("test")
    ...     print("1, 2, 3")
    ...
    >>> print(buf(), end='')
    test
    1, 2, 3
    """

    def __init__(self) -> None:
        # The redirect helper; created anew each time the context is entered.
        self.recorder: contextlib.redirect_stdout
        # In-memory buffer that receives everything written to stdout.
        self.destination = io.StringIO()

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *args) -> Literal[False]:
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Never suppress an exception raised inside the with-block.
        return False
1970
1971
def capitalize_first_letter(in_str: str) -> str:
    """
    Args:
        in_str: the string to capitalize

    Returns:
        in_str with the first character capitalized; the rest of the
        string is left exactly as it was.  An empty string is returned
        unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    # Slicing (rather than indexing with [0]) keeps the empty string
    # from raising IndexError.
    return in_str[:1].upper() + in_str[1:]
1986
1987
def it_they(n: int) -> str:
    """
    Args:
        n: how many of them are there?

    Returns:
        'it' if n is one or 'they' otherwise.

    See also :meth:`is_are`, :meth:`pluralize`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> it_they(1)
    'it'
    >>> it_they(100)
    'they'
    """
    return "it" if n == 1 else "they"
2013
2014
def is_are(n: int) -> str:
    """
    Args:
        n: how many of them are there?

    Returns:
        'is' when n is exactly one, 'are' for every other value.

    See also :meth:`it_they`, :meth:`pluralize`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
2041
2042
def pluralize(n: int) -> str:
    """
    Args:
        n: how many of them are there?

    Returns:
        '' when n is exactly one, 's' for every other value (including
        zero: "0 files").

    See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.
    """
    return "" if n == 1 else "s"
2072
2073
def make_contractions(txt: str) -> str:
    """Glue adjacent words in txt together to form (English)
    contractions.

    Args:
        txt: the input text to be contractionized.

    Returns:
        The input text with every recognized word pair replaced by its
        contraction; everything else is left as it was.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`pluralize`,
    :meth:`thify`.

    .. note::
        Contractions are applied in a fixed, implementation-defined
        order: the irregular forms (can't, shan't, won't) first, then
        verb + "not" pairs, then subject + verb pairs.  Matching is
        case-insensitive.

    >>> make_contractions('It is nice today.')
    "It's nice today."

    >>> make_contractions('I can    not even...')
    "I can't even..."

    >>> make_contractions('She could not see!')
    "She couldn't see!"

    >>> make_contractions('But she will not go.')
    "But she won't go."

    >>> make_contractions('Verily, I shall not.')
    "Verily, I shan't."

    >>> make_contractions('No you cannot.')
    "No you can't."

    >>> make_contractions('I said you can not go.')
    "I said you can't go."
    """

    # Irregular forms whose spelling changes when contracted; these are
    # (pattern, replacement) pairs applied before anything else.
    irregular_forms = (
        (r'\b(can)\s*no(t)\b', r"\1'\2"),
        (r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3"),
        (r'\b(w)ill\s*(n)(o)(t)\b', r"\1\3\2'\4"),
    )
    for regex, substitution in irregular_forms:
        txt = re.sub(regex, substitution, txt, flags=re.IGNORECASE)

    # Verbs that absorb a following "not" (e.g. "could not" -> "couldn't").
    negatable_verbs = (
        'are',
        'could',
        'did',
        'has',
        'have',
        'is',
        'must',
        'should',
        'was',
        'were',
        'would',
    )
    for verb in negatable_verbs:
        txt = re.sub(
            fr'\b({verb})\s+(n)o(t)\b', r"\1\2'\3", txt, flags=re.IGNORECASE
        )

    # Subjects / question words that absorb the tail of a following verb
    # (e.g. "they are" -> "they're").  The group in each tail marks the
    # letters that survive after the apostrophe.
    subjects = (
        'I',
        'you',
        'he',
        'she',
        'it',
        'we',
        'they',
        'how',
        'why',
        'when',
        'where',
        'who',
        'there',
    )
    verb_tails = ('woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)')

    # there're / where're are valid English but sound weird; never form them.
    no_are_subjects = {'there', 'where'}

    for subject in subjects:
        for tail in verb_tails:
            if tail == 'a(re)' and subject in no_are_subjects:
                continue
            txt = re.sub(
                fr'\b({subject})\s+{tail}\b', r"\1'\2", txt, flags=re.IGNORECASE
            )

    return txt
2180
2181
def thify(n: int) -> str:
    """
    Args:
        n: the number whose ordinal suffix is wanted

    Returns:
        The proper English ordinal suffix for n: 'st', 'nd', 'rd' or
        'th'.  Numbers ending in 11, 12 or 13 take 'th' ("11th",
        "112th") even though their last digit is 1, 2 or 3.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`.

    Suggested usage::

        attempt_count = 0
        while True:
            attempt_count += 1
            if try_the_thing():
                break
            print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'
    >>> thify(21)
    'st'
    """
    digits = str(n)
    assert is_integer_number(digits)

    # The "teens" are irregular: 11th, 12th, 13th (and 111th, 212th, ...).
    if digits[-2:] in ('11', '12', '13'):
        return "th"

    # Otherwise the suffix depends only on the final digit.
    suffix_by_last_digit = {'1': "st", '2': "nd", '3': "rd"}
    return suffix_by_last_digit.get(digits[-1:], "th")
2219
2220
def ngrams(txt: str, n: int):
    """
    Args:
        txt: the string to create ngrams using
        n: how many words per ngram created?

    Returns:
        A generator that yields each ngram as a single string whose
        words are separated by one space.  The input is split on
        whitespace before ngrams are formed.

    See also :meth:`ngrams_presplit`, :meth:`bigrams`, :meth:`trigrams`.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']
    """
    tokens = txt.split()
    for gram in ngrams_presplit(tokens, n):
        # Join the words of this ngram with single spaces; strip() keeps
        # the result free of leading/trailing whitespace.
        yield ' '.join(str(word) for word in gram).strip()
2241
2242
def ngrams_presplit(words: Sequence[str], n: int):
    """
    Same as :meth:`ngrams` but operates on text that the caller has
    already split into words.

    Args:
        words: the pre-split words to build ngrams from
        n: how many words per ngram created?

    Returns:
        Whatever ``list_utils.ngrams`` produces for (words, n); this
        function simply delegates to it.

    See also :meth:`ngrams`, :meth:`bigrams`, :meth:`trigrams`.
    """
    grams = list_utils.ngrams(words, n)
    return grams
2250
2251
def bigrams(txt: str):
    """Generates the two-word ngrams of the given string by calling
    :meth:`ngrams` with n=2.

    See also :meth:`ngrams`, :meth:`trigrams`.

    >>> [x for x in bigrams('this is a test')]
    ['this is', 'is a', 'a test']
    """
    words_per_gram = 2
    return ngrams(txt, words_per_gram)
2261
2262
def trigrams(txt: str):
    """Generates the three-word ngrams of the given string by calling
    :meth:`ngrams` with n=3.

    See also :meth:`ngrams`, :meth:`bigrams`.
    """
    words_per_gram = 3
    return ngrams(txt, words_per_gram)
2269
2270
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim: str = ''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.

    Args:
        input_lines: A sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: an iterable collection of numeric sequences that
            indicate one or more column numbers to copy to form the Nth
            position in the output list.  See example below.
        delim: for column_specs that indicate we should copy more than
            one column from the input into this position, use delim to
            separate source data.  Defaults to ''.

    Returns:
        A list of strings created by following the instructions set forth
        in column_specs.  Column data is copied verbatim: delimiter
        characters that are part of a column's own text are preserved.

    Raises:
        IndexError: a column number in column_specs is out of range for
            input_lines.

    See also :meth:`shuffle_columns_into_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim='!',
    ... )
    ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']
    """
    # Each spec lists the input columns that are glued together (with
    # delim between them) to form one output entry.  join() places the
    # delimiter only *between* columns, so nothing has to be stripped
    # afterwards and data that begins or ends with delim stays intact.
    return [delim.join(input_lines[n] for n in spec) for spec in column_specs]
2312
2313
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim: str = '',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: a sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: instructions for what dictionary keys to apply
            to individual or compound input column data.  See example
            below.
        delim: when forming compound output data by gluing more than
            one input column together, use this character to separate
            the source data.  Defaults to ''.

    Returns:
        A dict formed by applying the column_specs instructions.  Column
        data is copied verbatim: delimiter characters that are part of
        a column's own text are preserved.

    Raises:
        IndexError: a column number in column_specs is out of range for
            input_lines.

    See also :meth:`shuffle_columns_into_list`, :meth:`interpolate_using_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim='!',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}
    """
    out = {}

    # Each spec is ("key", [col1, col2...]): the listed columns are glued
    # together with delim and stored under key.  join() places the
    # delimiter only *between* columns, so data that itself begins or
    # ends with delim is not damaged.
    for spec in column_specs:
        out[spec[0]] = delim.join(input_lines[n] for n in spec[1])
    return out
2356
2357
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """
    Fill the ``{name}`` placeholders of a template string with values
    looked up by name in a dict.

    Args:
        txt: the mad libs template
        values: what you and your kids chose for each category.

    Returns:
        The result of ``txt.format(**values)`` passed through the
        module's ``_sprintf`` helper with ``end=''``.

    See also :meth:`shuffle_columns_into_list`, :meth:`shuffle_columns_into_dict`.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    interpolated = txt.format(**values)
    return _sprintf(interpolated, end='')
2373
2374
def to_ascii(txt: str):
    """
    Args:
        txt: the input data to encode; a bytes object is also accepted
            and is returned as-is.

    Returns:
        txt encoded as an ASCII byte string.

    Raises:
        TypeError: txt is neither a str nor a bytes object.
        UnicodeEncodeError: txt is a str containing non-ASCII characters.

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`to_bytes`,
    :meth:`generate_random_alphanumeric_string`, :meth:`asciify`.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(txt, bytes):
        return txt
    if isinstance(txt, str):
        return txt.encode('ascii')
    # TypeError derives from Exception, so callers that caught the bare
    # Exception raised here previously continue to work.
    raise TypeError('to_ascii works with strings and bytes')
2397
2398
def to_base64(
    txt: str, *, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> bytes:
    """
    Args:
        txt: the input data to encode
        encoding: the encoding used to turn txt into bytes before it is
            base64 encoded
        errors: how to handle encoding errors

    Returns:
        The base64 encoding of txt as produced by
        ``base64.encodebytes``; note that the output ends with a
        newline (see the example below).

    See also :meth:`is_base64`, :meth:`to_ascii`, :meth:`to_bitstring`,
    :meth:`from_base64`.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
2419
2420
def is_base64(txt: str) -> bool:
    """
    Args:
        txt: the string (or bytes) to check

    Returns:
        True if txt looks like base64 encoded data and False otherwise.
        The check is lenient: after surrounding whitespace and line
        breaks are ignored, every character must belong to the standard
        base64 alphabet (A-Z, a-z, 0-9, '+', '/'), optionally followed
        by at most two '=' padding characters.  The length of the data
        is not validated.  Text containing non-ASCII characters is
        never base64.

    See also :meth:`to_base64`, :meth:`from_base64`.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # Trailing '=' padding is ok.
    True

    """
    stripped = txt.strip()
    if isinstance(stripped, str):
        try:
            raw = stripped.encode('ascii')
        except UnicodeEncodeError:
            # Non-ASCII characters can never be part of base64 text.
            return False
    else:
        raw = bytes(stripped)

    # Encoders such as base64.encodebytes break their output into lines;
    # line breaks carry no data, so ignore them.
    raw = raw.replace(b'\n', b'').replace(b'\r', b'')

    # The encoded data may be closed by one or two '=' padding characters.
    body = raw.rstrip(b'=')
    if len(raw) - len(body) > 2:
        return False

    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = frozenset(a.encode('ascii'))
    return all(byte in alphabet for byte in body)
2449
2450
def from_base64(
    b64: bytes, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """
    Args:
        b64: bytestring of base64 encoded data to decode / convert.
        encoding: the encoding used to turn the decoded bytes back into
            a string
        errors: how to handle decoding errors

    Returns:
        The decoded form of b64 as a normal python string, i.e. the
        inverse of :meth:`to_base64`.

    See also :meth:`to_base64`, :meth:`is_base64`.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    decoded = base64.decodebytes(b64)
    return decoded.decode(encoding, errors)
2470
2471
def chunk(txt: str, chunk_size: int):
    """
    Args:
        txt: a string to be chunked into evenly spaced pieces.
        chunk_size: the size of each chunk to make

    Yields:
        Consecutive pieces of txt, each chunk_size characters long.  If
        the length of txt is not a multiple of chunk_size a warning is
        logged and issued via :mod:`warnings` (once iteration begins)
        and the final piece is shorter than the others.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    if len(txt) % chunk_size != 0:
        msg = (
            f"String to chunk's length ({len(txt)}) is not an even "
            f"multiple of chunk_size ({chunk_size})"
        )
        logger.warning(msg)
        # stacklevel=2 attributes the warning to the caller's frame.
        warnings.warn(msg, stacklevel=2)
    for start in range(0, len(txt), chunk_size):
        yield txt[start : start + chunk_size]
2490
2491
def to_bitstring(txt: str, *, delimiter: str = '') -> str:
    """
    Args:
        txt: the string to convert into a bitstring
        delimiter: character to insert between adjacent bytes.  Note that
            only bitstrings with delimiter='' are interpretable by
            :meth:`from_bitstring`.

    Returns:
        txt converted to ASCII and rendered as eight binary digits per
        byte, with delimiter between adjacent bytes.  Every input byte
        is represented, including leading NUL bytes.  For backward
        compatibility an empty input yields a single all-zero byte
        ('00000000').

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    >>> to_bitstring('\\x00a')
    '0000000001100001'
    """
    etxt = to_ascii(txt)
    if not etxt:
        # Historical behavior: empty input is rendered as one zero byte.
        return '00000000'
    # Format byte by byte rather than via one big integer: converting to
    # an int first silently discards leading zero bytes.
    return delimiter.join(f'{byte:08b}' for byte in etxt)
2519
2520
def is_bitstring(txt: str) -> bool:
    """
    Args:
        txt: the string to check

    Returns:
        True if txt is a recognized bitstring and False otherwise.  The
        check prefixes txt with '0b' and asks
        :meth:`is_binary_integer_number` whether the result is a binary
        integer literal.  Bitstrings created with a non-empty delimiter
        are not recognized.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`to_bitstring`,
    :meth:`chunk`.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    candidate = f'0b{txt}'
    return is_binary_integer_number(candidate)
2541
2542
def from_bitstring(
    bits: str, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """
    Args:
        bits: the bitstring to convert back into a python string
        encoding: the encoding to use during conversion
        errors: how to handle encoding errors

    Returns:
        The regular python string represented by bits.  Leading
        all-zero bytes in bits are preserved as NUL characters.  Note
        that this code does not work with to_bitstring when delimiter
        is non-empty.

    Raises:
        ValueError: bits is not a valid base-2 number (this includes
            the empty string).

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    >>> from_bitstring('0000000001100001')
    '\\x00a'
    """
    n = int(bits, 2)

    # int() tolerates surrounding whitespace, an optional 0b prefix and
    # '_' separators; drop those so only real binary digits are counted.
    digits = bits.strip()
    if digits[:2].lower() == '0b':
        digits = digits[2:]
    digits = digits.replace('_', '')

    # Size the buffer from the number of digits as well as from the
    # value: the value alone says nothing about leading all-zero bytes,
    # which would otherwise be silently dropped.
    num_bytes = max((n.bit_length() + 7) // 8, len(digits) // 8)
    return n.to_bytes(num_bytes, 'big').decode(encoding, errors) or '\0'
2564
2565
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """
    Args:
        txt: an IP address to chunk up for sorting purposes

    Returns:
        A tuple of IP components arranged such that the sorting of
        IP addresses using a normal comparator will do something sane
        and desirable, or None (after logging a warning) when
        :meth:`is_ip_v4` rejects txt.

    See also :meth:`is_ip_v4`.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if not is_ip_v4(txt):
        # Report through the module logger rather than printing to
        # stdout: library code should not write to the caller's console.
        logger.warning('not IP: %s', txt)
        return None
    return tuple(int(octet) for octet in txt.split('.'))
2589
2590
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """
    Args:
        volume: the string to chunk up for sorting purposes

    Returns:
        A tuple holding the non-empty '/'-separated components of
        volume, in order.  Sorting paths by this key places a
        directory ahead of anything located beneath it.

    See also :mod:`pyutils.files.file_utils`.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    # Leading, trailing and doubled slashes produce empty components;
    # filter(None, ...) discards them.
    components = volume.split('/')
    return tuple(filter(None, components))
2611
2612
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """
    Execute several replace operations in a row.

    Args:
        in_str: the string in which to replace characters
        replace_set: the set of target characters to replace
        replacement: the text substituted for any member of replace_set

    See also :meth:`replace_nth`.

    Returns:
        The string with replacements executed.  Replacements happen one
        target character at a time, in the order the characters appear
        in replace_set, each operating on the result of the previous
        one.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    result = in_str
    for target in replace_set:
        if target in result:
            result = result.replace(target, replacement)
    return result
2635
2636
def replace_nth(in_str: str, source: str, target: str, nth: int) -> str:
    """
    Replaces the nth occurrence of a substring within a string.

    Args:
        in_str: the string in which to run the replacement
        source: the substring to replace; it is matched literally, so
            characters that are special in regular expressions have no
            special meaning here
        target: the replacement text
        nth: which occurrence of source to replace?  (1 is the first)

    Returns:
        in_str with the nth occurrence of source replaced by target.

    Raises:
        IndexError: in_str contains fewer than nth occurrences of source.

    See also :meth:`replace_all`.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'

    >>> replace_nth('a.b.c', '.', '-', 2)
    'a.b-c'
    """
    # Escape source so that the search treats it as literal text, the
    # same way the str.replace() call below does.
    starts = [m.start() for m in re.finditer(re.escape(source), in_str)]
    where = starts[nth - 1]
    before = in_str[:where]
    after = in_str[where:]
    # after begins with the chosen occurrence; replace just that one.
    after = after.replace(source, target, 1)
    return before + after
2657
2658
if __name__ == '__main__':
    # When run as a script, execute the doctests embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()