src/pyutils/string_utils.py

   1 #!/usr/bin/env python3
   2 # -*- coding: utf-8 -*-
   3
   4 """The MIT License (MIT)
   5
   6 Copyright (c) 2016-2020 Davide Zanotti
   7
   8 Modifications Copyright (c) 2021-2022 Scott Gasch
   9
  10 Permission is hereby granted, free of charge, to any person obtaining a copy
  11 of this software and associated documentation files (the "Software"), to deal
  12 in the Software without restriction, including without limitation the rights
  13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 copies of the Software, and to permit persons to whom the Software is
  15 furnished to do so, subject to the following conditions:
  16
  17 The above copyright notice and this permission notice shall be included in all
  18 copies or substantial portions of the Software.
  19
  20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  26 SOFTWARE.
  27
  28 This class is based on:
  29 https://github.com/daveoncode/python-string-utils.  See `NOTICE
  30 <https://wannabe.guru.org/gitweb/?p=pyutils.git;a=blob_plain;f=NOTICE;hb=HEAD>`__
  31 in the root of this module for a detailed enumeration of what work is
  32 Davide's and what work was added by Scott.
  33
  34 """
  35
  36 import base64
  37 import contextlib  # type: ignore
  38 import datetime
  39 import io
  40 import json
  41 import logging
  42 import numbers
  43 import random
  44 import re
  45 import string
  46 import unicodedata
  47 import warnings
  48 from itertools import zip_longest
  49 from typing import (
  50     Any,
  51     Callable,
  52     Dict,
  53     Iterable,
  54     List,
  55     Literal,
  56     Optional,
  57     Sequence,
  58     Tuple,
  59 )
  60 from uuid import uuid4
  61
  62 from pyutils import list_utils
  63
  64 logger = logging.getLogger(__name__)
  65
  66 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  67
  68 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  69
  70 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  71
  72 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  73
# Pattern fragments for one URL.  The adjacent raw strings are concatenated
# into a single pattern; each fragment is its own capturing group.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    # NOTE(review): "%-=" below is a character *range* ("%" through "="),
    # which is what lets "&" and digits through -- confirm this is intended.
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Anchored at both ends: the whole string must be a single URL.
URL_RE = re.compile(rf"^{URLS_RAW_STRING}$", re.IGNORECASE)

# Unanchored: finds URLs embedded in a larger piece of text.
URLS_RE = re.compile(rf"({URLS_RAW_STRING})", re.IGNORECASE)

# Matches a run of "@" that is followed later by a double quote, or a
# backslash-escaped "@".
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# Local part, "@", then a domain ending in a 2-4 letter TLD.
# NOTE(review): the domain classes are lowercase-only and the patterns below
# are compiled without re.IGNORECASE, so uppercase domains will not match.
EMAILS_RAW_STRING = (
    r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
)

# Anchored: the whole string must be one email address.
EMAIL_RE = re.compile(rf"^{EMAILS_RAW_STRING}$")

# Unanchored: finds email addresses embedded in text.
EMAILS_RE = re.compile(rf"({EMAILS_RAW_STRING})")

# Letters (and, after the first case change, digits) containing at least one
# lower->upper or upper->lower transition.
CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

# A lowercase letter or a run of uppercase letters that is immediately
# followed by an uppercase letter.
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Letters, optional digits and at least one underscore; may begin with
# underscores but not with a digit.  Case-insensitive.
SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

# Same shape as SNAKE_CASE_TEST_RE with "-" as the separator.
# NOTE(review): unlike the underscore variant this has no leading "^", so it
# is only start-anchored when used via .match().
SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

# A separator (group 1) followed by one lowercase letter or digit (group 2).
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")

# Number-format patterns keyed by card type name.  These check only the
# leading digits and the overall length of the number.
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}

# Cheap structural pre-check: the text (ignoring surrounding whitespace) must
# open with "[" or "{" and close with "]" or "}".
JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)

# Canonical 8-4-4-4-12 dash-separated form.
# NOTE(review): the [a-f\d] classes are compiled with re.IGNORECASE, so
# upper-case hex digits are accepted too.
UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# Same as UUID_RE but every dash is optional, so a bare 32-hex-digit string
# also matches.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# "Shallow" because it only checks the dotted-quad shape (1-3 digits per
# field); it does not check that each field is within 0-255.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

# Unanchored variant of the dotted-quad shape, for searching within text.
ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# Eight colon-separated fields of 0-4 characters each.
# NOTE(review): the field class is [a-z\d], i.e. any letter rather than just
# the hex digits a-f -- confirm whether that looseness is intended.
IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)

# Six two-hex-digit fields separated by ":" or "-".
MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)

ANYWHERE_MAC_ADDRESS_RE = re.compile(
    r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)

# One "word": a run of letters/digits (no underscore) plus any non-word
# characters on either side of it.
WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)

# An opening tag optionally followed (non-greedily) by content and a closing
# tag, or a comment, or a doctype declaration.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Tags only (opening, closing, comments, doctype); content between tags is
# not part of the match.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Any single whitespace character.
SPACES_RE = re.compile(r"\s")

# A run of non-word characters, or a run of underscores.
NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Leading whitespace other than carriage return / line feed.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")

# ESC, "[", any number of non-letters, then one terminating letter.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1B\[[^A-Za-z]*[A-Za-z]")
 169
 170 NUM_SUFFIXES = {
 171     "Pb": (1024**5),
 172     "P": (1024**5),
 173     "Tb": (1024**4),
 174     "T": (1024**4),
 175     "Gb": (1024**3),
 176     "G": (1024**3),
 177     "Mb": (1024**2),
 178     "M": (1024**2),
 179     "Kb": (1024**1),
 180     "K": (1024**1),
 181 }
 182
 183 UNIT_WORDS = [
 184     "zero",
 185     "one",
 186     "two",
 187     "three",
 188     "four",
 189     "five",
 190     "six",
 191     "seven",
 192     "eight",
 193     "nine",
 194     "ten",
 195     "eleven",
 196     "twelve",
 197     "thirteen",
 198     "fourteen",
 199     "fifteen",
 200     "sixteen",
 201     "seventeen",
 202     "eighteen",
 203     "nineteen",
 204 ]
 205
 206 TENS_WORDS = [
 207     "",
 208     "",
 209     "twenty",
 210     "thirty",
 211     "forty",
 212     "fifty",
 213     "sixty",
 214     "seventy",
 215     "eighty",
 216     "ninety",
 217 ]
 218
 219 MAGNITUDE_SCALES = [
 220     "hundred",
 221     "thousand",
 222     "million",
 223     "billion",
 224     "trillion",
 225     "quadrillion",
 226 ]
 227
 228 NUM_WORDS = {}
 229 NUM_WORDS["and"] = (1, 0)
 230 for i, word in enumerate(UNIT_WORDS):
 231     NUM_WORDS[word] = (1, i)
 232 for i, word in enumerate(TENS_WORDS):
 233     NUM_WORDS[word] = (1, i * 10)
 234 for i, word in enumerate(MAGNITUDE_SCALES):
 235     if i == 0:
 236         NUM_WORDS[word] = (100, 0)
 237     else:
 238         NUM_WORDS[word] = (10 ** (i * 3), 0)
 239 NUM_WORDS['score'] = (20, 0)
 240
 241
 242 def is_none_or_empty(in_str: Optional[str]) -> bool:
 243     """
 244     Args:
 245         in_str: the string to test
 246
 247     Returns:
 248         True if the input string is either None or an empty string,
 249         False otherwise.
 250
 251     See also :meth:`is_string` and :meth:`is_empty_string`.
 252
 253     >>> is_none_or_empty("")
 254     True
 255     >>> is_none_or_empty(None)
 256     True
 257     >>> is_none_or_empty("   \t   ")
 258     True
 259     >>> is_none_or_empty('Test')
 260     False
 261     """
 262     return in_str is None or len(in_str.strip()) == 0
 263
 264
 265 def is_string(in_str: Any) -> bool:
 266     """
 267     Args:
 268         in_str: the object to test
 269
 270     Returns:
 271         True if the object is a string and False otherwise.
 272
 273     See also :meth:`is_empty_string`, :meth:`is_none_or_empty`.
 274
 275     >>> is_string('test')
 276     True
 277     >>> is_string(123)
 278     False
 279     >>> is_string(100.3)
 280     False
 281     >>> is_string([1, 2, 3])
 282     False
 283     """
 284     return isinstance(in_str, str)
 285
 286
 287 def is_empty_string(in_str: Any) -> bool:
 288     """
 289     Args:
 290         in_str: the string to test
 291
 292     Returns:
 293         True if the string is empty and False otherwise.
 294
 295     See also :meth:`is_none_or_empty`, :meth:`is_full_string`.
 296     """
 297     return is_empty(in_str)
 298
 299
 300 def is_empty(in_str: Any) -> bool:
 301     """
 302     Args:
 303         in_str: the string to test
 304
 305     Returns:
 306         True if the string is empty and false otherwise.
 307
 308     See also :meth:`is_none_or_empty`, :meth:`is_full_string`.
 309
 310     >>> is_empty('')
 311     True
 312     >>> is_empty('    \t\t    ')
 313     True
 314     >>> is_empty('test')
 315     False
 316     >>> is_empty(100.88)
 317     False
 318     >>> is_empty([1, 2, 3])
 319     False
 320     """
 321     return is_string(in_str) and in_str.strip() == ""
 322
 323
 324 def is_full_string(in_str: Any) -> bool:
 325     """
 326     Args:
 327         in_str: the object to test
 328
 329     Returns:
 330         True if the object is a string and is not empty ('') and
 331         is not only composed of whitespace.
 332
 333     See also :meth:`is_string`, :meth:`is_empty_string`, :meth:`is_none_or_empty`.
 334
 335     >>> is_full_string('test!')
 336     True
 337     >>> is_full_string('')
 338     False
 339     >>> is_full_string('      ')
 340     False
 341     >>> is_full_string(100.999)
 342     False
 343     >>> is_full_string({"a": 1, "b": 2})
 344     False
 345     """
 346     return is_string(in_str) and in_str.strip() != ""
 347
 348
 349 def is_number(in_str: str) -> bool:
 350     """
 351     Args:
 352         in_str: the string to test
 353
 354     Returns:
 355         True if the string contains a valid numberic value and
 356         False otherwise.
 357
 358     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 359     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 360     etc...
 361
 362     >>> is_number(100.5)
 363     Traceback (most recent call last):
 364     ...
 365     ValueError: 100.5
 366     >>> is_number("100.5")
 367     True
 368     >>> is_number("test")
 369     False
 370     >>> is_number("99")
 371     True
 372     >>> is_number([1, 2, 3])
 373     Traceback (most recent call last):
 374     ...
 375     ValueError: [1, 2, 3]
 376     """
 377     if not is_string(in_str):
 378         raise ValueError(in_str)
 379     return NUMBER_RE.match(in_str) is not None
 380
 381
 382 def is_integer_number(in_str: str) -> bool:
 383     """
 384     Args:
 385         in_str: the string to test
 386
 387     Returns:
 388         True if the string contains a valid (signed or unsigned,
 389         decimal, hex, or octal, regular or scientific) integral
 390         expression and False otherwise.
 391
 392     See also :meth:`is_number`, :meth:`is_decimal_number`,
 393     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 394     etc...
 395
 396     >>> is_integer_number('42')
 397     True
 398     >>> is_integer_number('42.0')
 399     False
 400     """
 401     return (
 402         (is_number(in_str) and "." not in in_str)
 403         or is_hexidecimal_integer_number(in_str)
 404         or is_octal_integer_number(in_str)
 405         or is_binary_integer_number(in_str)
 406     )
 407
 408
 409 def is_hexidecimal_integer_number(in_str: str) -> bool:
 410     """
 411     Args:
 412         in_str: the string to test
 413
 414     Returns:
 415         True if the string is a hex integer number and False otherwise.
 416
 417     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 418     :meth:`is_octal_integer_number`, :meth:`is_binary_integer_number`, etc...
 419
 420     >>> is_hexidecimal_integer_number('0x12345')
 421     True
 422     >>> is_hexidecimal_integer_number('0x1A3E')
 423     True
 424     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 425     False
 426     >>> is_hexidecimal_integer_number('-0xff')
 427     True
 428     >>> is_hexidecimal_integer_number('test')
 429     False
 430     >>> is_hexidecimal_integer_number(12345)  # Not a string
 431     Traceback (most recent call last):
 432     ...
 433     ValueError: 12345
 434     >>> is_hexidecimal_integer_number(101.4)
 435     Traceback (most recent call last):
 436     ...
 437     ValueError: 101.4
 438     >>> is_hexidecimal_integer_number(0x1A3E)
 439     Traceback (most recent call last):
 440     ...
 441     ValueError: 6718
 442     """
 443     if not is_string(in_str):
 444         raise ValueError(in_str)
 445     return HEX_NUMBER_RE.match(in_str) is not None
 446
 447
 448 def is_octal_integer_number(in_str: str) -> bool:
 449     """
 450     Args:
 451         in_str: the string to test
 452
 453     Returns:
 454         True if the string is a valid octal integral number and False otherwise.
 455
 456     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 457     :meth:`is_hexidecimal_integer_number`, :meth:`is_binary_integer_number`,
 458     etc...
 459
 460     >>> is_octal_integer_number('0o777')
 461     True
 462     >>> is_octal_integer_number('-0O115')
 463     True
 464     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 465     False
 466     >>> is_octal_integer_number('7777')  # Needs 0o
 467     False
 468     >>> is_octal_integer_number('test')
 469     False
 470     """
 471     if not is_string(in_str):
 472         raise ValueError(in_str)
 473     return OCT_NUMBER_RE.match(in_str) is not None
 474
 475
 476 def is_binary_integer_number(in_str: str) -> bool:
 477     """
 478     Args:
 479         in_str: the string to test
 480
 481     Returns:
 482         True if the string contains a binary integral number and False otherwise.
 483
 484     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 485     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 486     etc...
 487
 488     >>> is_binary_integer_number('0b10111')
 489     True
 490     >>> is_binary_integer_number('-0b111')
 491     True
 492     >>> is_binary_integer_number('0B10101')
 493     True
 494     >>> is_binary_integer_number('0b10102')
 495     False
 496     >>> is_binary_integer_number('0xFFF')
 497     False
 498     >>> is_binary_integer_number('test')
 499     False
 500     """
 501     if not is_string(in_str):
 502         raise ValueError(in_str)
 503     return BIN_NUMBER_RE.match(in_str) is not None
 504
 505
 506 def to_int(in_str: str) -> int:
 507     """
 508     Args:
 509         in_str: the string to convert
 510
 511     Returns:
 512         The integral value of the string or raises on error.
 513
 514     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 515     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 516     :meth:`is_binary_integer_number`, etc...
 517
 518     >>> to_int('1234')
 519     1234
 520     >>> to_int('0x1234')
 521     4660
 522     >>> to_int('0b01101')
 523     13
 524     >>> to_int('0o777')
 525     511
 526     >>> to_int('test')
 527     Traceback (most recent call last):
 528     ...
 529     ValueError: invalid literal for int() with base 10: 'test'
 530     """
 531     if not is_string(in_str):
 532         raise ValueError(in_str)
 533     if is_binary_integer_number(in_str):
 534         return int(in_str, 2)
 535     if is_octal_integer_number(in_str):
 536         return int(in_str, 8)
 537     if is_hexidecimal_integer_number(in_str):
 538         return int(in_str, 16)
 539     return int(in_str)
 540
 541
 542 def number_string_to_integer(in_str: str) -> int:
 543     """Convert a string containing a written-out number into an int.
 544
 545     Args:
 546         in_str: the string containing the long-hand written out integer number
 547             in English.  See examples below.
 548
 549     Returns:
 550         The integer whose value was parsed from in_str.
 551
 552     See also :meth:`integer_to_number_string`.
 553
 554     .. warning::
 555         This code only handles integers; it will not work with decimals / floats.
 556
 557     >>> number_string_to_integer("one hundred fifty two")
 558     152
 559
 560     >>> number_string_to_integer("ten billion two hundred million fifty four thousand three")
 561     10200054003
 562
 563     >>> number_string_to_integer("four-score and 7")
 564     87
 565
 566     >>> number_string_to_integer("fifty xyzzy three")
 567     Traceback (most recent call last):
 568     ...
 569     ValueError: Unknown word: xyzzy
 570     """
 571     if isinstance(in_str, int):
 572         return int(in_str)
 573
 574     current = result = 0
 575     in_str = in_str.replace('-', ' ')
 576     for w in in_str.split():
 577         if w not in NUM_WORDS:
 578             if is_integer_number(w):
 579                 current += int(w)
 580                 continue
 581             else:
 582                 raise ValueError("Unknown word: " + w)
 583         scale, increment = NUM_WORDS[w]
 584         current = current * scale + increment
 585         if scale > 100:
 586             result += current
 587             current = 0
 588     return result + current
 589
 590
 591 def integer_to_number_string(num: int) -> str:
 592     """
 593     Opposite of :meth:`number_string_to_integer`; converts a number to a written out
 594     longhand format in English.
 595
 596     Args:
 597         num: the integer number to convert
 598
 599     Returns:
 600         The long-hand written out English form of the number.  See examples below.
 601
 602     See also :meth:`number_string_to_integer`.
 603
 604     .. warning::
 605         This method does not handle decimals or floats, only ints.
 606
 607     >>> integer_to_number_string(9)
 608     'nine'
 609
 610     >>> integer_to_number_string(42)
 611     'forty two'
 612
 613     >>> integer_to_number_string(123219982)
 614     'one hundred twenty three million two hundred nineteen thousand nine hundred eighty two'
 615     """
 616
 617     if num < 20:
 618         return UNIT_WORDS[num]
 619     if num < 100:
 620         ret = TENS_WORDS[num // 10]
 621         leftover = num % 10
 622         if leftover != 0:
 623             ret += ' ' + UNIT_WORDS[leftover]
 624         return ret
 625
 626     # If num > 100 go find the highest chunk and convert that, then recursively
 627     # convert the rest.  NUM_WORDS contains items like 'thousand' -> (1000, 0).
 628     # The second item in the tuple is an increment that can be ignored; the first
 629     # is the numeric "scale" of the entry.  So find the greatest entry in NUM_WORDS
 630     # still less than num.  For 123,456 it would be thousand.  Then pull out the
 631     # 123, convert it, and append "thousand".  Then do the rest.
 632     scales = {}
 633     for name, val in NUM_WORDS.items():
 634         if val[0] <= num:
 635             scales[name] = val[0]
 636     scale = max(scales.items(), key=lambda _: _[1])
 637
 638     # scale[1] = numeric magnitude (e.g. 1000)
 639     # scale[0] = name (e.g. "thousand")
 640     ret = integer_to_number_string(num // scale[1]) + ' ' + scale[0]
 641     leftover = num % scale[1]
 642     if leftover != 0:
 643         ret += ' ' + integer_to_number_string(leftover)
 644     return ret
 645
 646
 647 def is_decimal_number(in_str: str) -> bool:
 648     """
 649     Args:
 650         in_str: the string to check
 651
 652     Returns:
 653         True if the given string represents a decimal or False
 654         otherwise.  A decimal may be signed or unsigned or use
 655         a "scientific notation".
 656
 657     See also :meth:`is_integer_number`.
 658
 659     .. note::
 660         We do not consider integers without a decimal point
 661         to be decimals; they return False (see example).
 662
 663     >>> is_decimal_number('42.0')
 664     True
 665     >>> is_decimal_number('42')
 666     False
 667     """
 668     return is_number(in_str) and "." in in_str
 669
 670
 671 def strip_escape_sequences(in_str: str) -> str:
 672     """
 673     Args:
 674         in_str: the string to strip of escape sequences.
 675
 676     Returns:
 677         in_str with escape sequences removed.
 678
 679     See also: :mod:`pyutils.ansi`.
 680
 681     .. note::
 682         What is considered to be an "escape sequence" is defined
 683         by a regular expression.  While this gets common ones,
 684         there may exist valid sequences that it doesn't match.
 685
 686     >>> strip_escape_sequences('\x1B[12;11;22mthis is a test!')
 687     'this is a test!'
 688     """
 689     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 690     return in_str
 691
 692
 693 def add_thousands_separator(
 694     in_str: str, *, separator_char: str = ',', places: int = 3
 695 ) -> str:
 696     """
 697     Args:
 698         in_str: string or number to which to add thousands separator(s)
 699         separator_char: the separator character to add (defaults to comma)
 700         places: add a separator every N places (defaults to three)
 701
 702     Returns:
 703         A numeric string with thousands separators added appropriately.
 704
 705     >>> add_thousands_separator('12345678')
 706     '12,345,678'
 707     >>> add_thousands_separator(12345678)
 708     '12,345,678'
 709     >>> add_thousands_separator(12345678.99)
 710     '12,345,678.99'
 711     >>> add_thousands_separator('test')
 712     Traceback (most recent call last):
 713     ...
 714     ValueError: test
 715
 716     """
 717     if isinstance(in_str, numbers.Number):
 718         in_str = f'{in_str}'
 719     if is_number(in_str):
 720         return _add_thousands_separator(
 721             in_str, separator_char=separator_char, places=places
 722         )
 723     raise ValueError(in_str)
 724
 725
 726 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 727     """Internal helper"""
 728     decimal_part = ""
 729     if '.' in in_str:
 730         (in_str, decimal_part) = in_str.split('.')
 731     tmp = [iter(in_str[::-1])] * places
 732     ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 733     if len(decimal_part) > 0:
 734         ret += '.'
 735         ret += decimal_part
 736     return ret
 737
 738
 739 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 740     """
 741     Args:
 742         in_str: the string to test
 743         allowed_schemes: an optional list of allowed schemes (e.g.
 744             ['http', 'https', 'ftp'].  If passed, only URLs that
 745             begin with the one of the schemes passed will be considered
 746             to be valid.  Otherwise, any scheme:// will be considered
 747             valid.
 748
 749     Returns:
 750         True if in_str contains a valid URL and False otherwise.
 751
 752     >>> is_url('http://www.mysite.com')
 753     True
 754     >>> is_url('https://mysite.com')
 755     True
 756     >>> is_url('.mysite.com')
 757     False
 758     >>> is_url('scheme://username:[email protected]:8042/folder/subfolder/file.extension?param=value&param2=value2#hash')
 759     True
 760     """
 761     if not is_full_string(in_str):
 762         return False
 763
 764     valid = URL_RE.match(in_str) is not None
 765
 766     if allowed_schemes:
 767         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 768     return valid
 769
 770
 771 def is_email(in_str: Any) -> bool:
 772     """
 773     Args:
 774         in_str: the email address to check
 775
 776     Returns: True if the in_str contains a valid email (as defined by
 777         https://tools.ietf.org/html/rfc3696#section-3) or False
 778         otherwise.
 779
 780     >>> is_email('[email protected]')
 781     True
 782     >>> is_email('@gmail.com')
 783     False
 784     """
 785     if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
 786         return False
 787
 788     try:
 789         # we expect 2 tokens, one before "@" and one after, otherwise
 790         # we have an exception and the email is not valid.
 791         head, tail = in_str.split("@")
 792
 793         # head's size must be <= 64, tail <= 255, head must not start
 794         # with a dot or contain multiple consecutive dots.
 795         if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
 796             return False
 797
 798         # removes escaped spaces, so that later on the test regex will
 799         # accept the string.
 800         head = head.replace("\\ ", "")
 801         if head.startswith('"') and head.endswith('"'):
 802             head = head.replace(" ", "")[1:-1]
 803         return EMAIL_RE.match(head + "@" + tail) is not None
 804
 805     except ValueError:
 806         # borderline case in which we have multiple "@" signs but the
 807         # head part is correctly escaped.
 808         if ESCAPED_AT_SIGN.search(in_str) is not None:
 809             # replace "@" with "a" in the head
 810             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 811         return False
 812
 813
 814 def suffix_string_to_number(in_str: str) -> Optional[int]:
 815     """Takes a string like "33Gb" and converts it into a number (of bytes)
 816     like 34603008.
 817
 818     Args:
 819         in_str: the string with a suffix to be interpreted and removed.
 820
 821     Returns:
 822         An integer number of bytes or None to indicate an error.
 823
 824     See also :meth:`number_to_suffix_string`.
 825
 826     >>> suffix_string_to_number('1Mb')
 827     1048576
 828     >>> suffix_string_to_number('13.1Gb')
 829     14066017894
 830     """
 831
 832     def suffix_capitalize(s: str) -> str:
 833         if len(s) == 1:
 834             return s.upper()
 835         elif len(s) == 2:
 836             return f"{s[0].upper()}{s[1].lower()}"
 837         return suffix_capitalize(s[0:1])
 838
 839     if is_string(in_str):
 840         if is_integer_number(in_str):
 841             return to_int(in_str)
 842         suffixes = [in_str[-2:], in_str[-1:]]
 843         rest = [in_str[:-2], in_str[:-1]]
 844         for x in range(len(suffixes)):
 845             s = suffixes[x]
 846             s = suffix_capitalize(s)
 847             multiplier = NUM_SUFFIXES.get(s, None)
 848             if multiplier is not None:
 849                 r = rest[x]
 850                 if is_integer_number(r):
 851                     return to_int(r) * multiplier
 852                 if is_decimal_number(r):
 853                     return int(float(r) * multiplier)
 854     return None
 855
 856
 857 def number_to_suffix_string(num: int) -> Optional[str]:
 858     """Take a number (of bytes) and returns a string like "43.8Gb".
 859
 860     Args:
 861         num: an integer number of bytes
 862
 863     Returns:
 864         A string with a suffix representing num bytes concisely or
 865         None to indicate an error.
 866
 867     See also: :meth:`suffix_string_to_number`.
 868
 869     >>> number_to_suffix_string(14066017894)
 870     '13.1Gb'
 871     >>> number_to_suffix_string(1024 * 1024)
 872     '1.0Mb'
 873     """
 874     d = 0.0
 875     suffix = None
 876     for (sfx, size) in NUM_SUFFIXES.items():
 877         if num >= size:
 878             d = num / size
 879             suffix = sfx
 880             break
 881     if suffix is not None:
 882         return f"{d:.1f}{suffix}"
 883     else:
 884         return f'{num:d}'
 885
 886
 887 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 888     """
 889     Args:
 890         in_str: a string to check
 891         card_type: if provided, contains the card type to validate
 892             with.  Otherwise, all known credit card number types will
 893             be accepted.
 894
 895             Supported card types are the following:
 896
 897             * VISA
 898             * MASTERCARD
 899             * AMERICAN_EXPRESS
 900             * DINERS_CLUB
 901             * DISCOVER
 902             * JCB
 903
 904     Returns:
 905         True if in_str is a valid credit card number.
 906
 907     .. warning::
 908         This code is not verifying the authenticity of the credit card (i.e.
 909         not checking whether it's a real card that can be charged); rather
 910         it's only checking that the number follows the "rules" for numbering
 911         established by credit card issuers.
 912
 913     """
 914     if not is_full_string(in_str):
 915         return False
 916
 917     if card_type is not None:
 918         if card_type not in CREDIT_CARDS:
 919             raise KeyError(
 920                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 921             )
 922         return CREDIT_CARDS[card_type].match(in_str) is not None
 923     for c in CREDIT_CARDS:
 924         if CREDIT_CARDS[c].match(in_str) is not None:
 925             return True
 926     return False
 927
 928
 929 def is_camel_case(in_str: Any) -> bool:
 930     """
 931     Args:
 932         in_str: the string to test
 933
 934     Returns:
 935         True if the string is formatted as camel case and False otherwise.
 936         A string is considered camel case when:
 937
 938         * it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 939         * it contains both lowercase and uppercase letters
 940         * it does not start with a number
 941
 942     See also :meth:`is_snake_case`, :meth:`is_slug`, and :meth:`camel_case_to_snake_case`.
 943     """
 944     return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 945
 946
 947 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 948     """
 949     Args:
 950         in_str: the string to test
 951         separator: the snake case separator character to use
 952
 953     Returns: True if the string is snake case and False otherwise.  A
 954         string is considered snake case when:
 955
 956         * it's composed only by lowercase/uppercase letters and digits
 957         * it contains at least one underscore (or provided separator)
 958         * it does not start with a number
 959
 960     See also :meth:`is_camel_case`, :meth:`is_slug`, and :meth:`snake_case_to_camel_case`.
 961
 962     >>> is_snake_case('this_is_a_test')
 963     True
 964     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 965     True
 966     >>> is_snake_case('this-is-a-test')
 967     False
 968     >>> is_snake_case('this-is-a-test', separator='-')
 969     True
 970     """
 971     if is_full_string(in_str):
 972         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 973         re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 974         r = re_map.get(
 975             separator,
 976             re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
 977         )
 978         return r.match(in_str) is not None
 979     return False
 980
 981
 982 def is_json(in_str: Any) -> bool:
 983     """
 984     Args:
 985         in_str: the string to test
 986
 987     Returns:
 988         True if the in_str contains valid JSON and False otherwise.
 989
 990     >>> is_json('{"name": "Peter"}')
 991     True
 992     >>> is_json('[1, 2, 3]')
 993     True
 994     >>> is_json('{nope}')
 995     False
 996     """
 997     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 998         try:
 999             return isinstance(json.loads(in_str), (dict, list))
1000         except (TypeError, ValueError, OverflowError):
1001             pass
1002     return False
1003
1004
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """Check whether the argument is formatted like a UUID.

    Args:
        in_str: the string to test
        allow_hex: should we allow hexidecimal digits in valid uuids?

    Returns:
        True if the in_str contains a valid UUID and False otherwise.

    See also :meth:`generate_uuid`.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # Cast to str so that a UUID object itself is accepted as input.
    candidate = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(candidate) is not None
1028
1029
def is_ip_v4(in_str: Any) -> bool:
    """Check whether a string is a valid dotted IPv4 address.

    Args:
        in_str: the string to test

    Returns:
        True if in_str contains a valid IPv4 address and False otherwise.

    See also :meth:`extract_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('nope')
    False
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str):
        return False
    if SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # The regex only validates the overall shape; each dot-separated
    # entry must additionally be in the valid range (0 to 255).
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
1056
1057
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """Find the first IPv4 address embedded in a string.

    Args:
        in_str: the string to extract an IPv4 address from.

    Returns:
        The first extracted IPv4 address from in_str or None if
        none were found or an error occurred.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if is_full_string(in_str):
        found = ANYWHERE_IP_V4_RE.search(in_str)
        if found is not None:
            return found.group(0)
    return None
1080
1081
def is_ip_v6(in_str: Any) -> bool:
    """Check whether a string is a valid IPv6 address.

    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IPv6 address and False otherwise.

    See also :meth:`is_ip_v4`, :meth:`extract_ip_v4`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
1099
1100
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """Find the first IPv6 address embedded in a string.

    Args:
        in_str: the string from which to extract an IPv6 address.

    Returns:
        The first IPv6 address found in in_str or None if no address
        was found or an error occurred.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v4`,
    and :meth:`is_ip`.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if is_full_string(in_str):
        found = ANYWHERE_IP_V6_RE.search(in_str)
        return found.group(0) if found is not None else None
    return None
1123
1124
def is_ip(in_str: Any) -> bool:
    """Check whether a string is a valid IP address of either family.

    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IP address (either IPv4 or
        IPv6).

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3')
    False
    >>> is_ip('1.2.3.999')
    False
    """
    # IPv6 is tried first; the IPv4 check only runs when that fails.
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
1147
1148
def extract_ip(in_str: Any) -> Optional[str]:
    """Find the first IP address (of either family) embedded in a string.

    Args:
        in_str: the string from which to extract an IP address.

    Returns:
        The first IP address (IPv4 or IPv6) found in in_str or
        None to indicate none found or an error condition.  An IPv4
        match is preferred; IPv6 is only searched for when no IPv4
        address is present.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4_address = extract_ip_v4(in_str)
    if v4_address is not None:
        return v4_address
    return extract_ip_v6(in_str)
1171
1172
def is_mac_address(in_str: Any) -> bool:
    """Check whether a string is a valid MAC address.

    Args:
        in_str: the string to test

    Returns:
        True if in_str is a valid MAC address False otherwise.

    See also :meth:`extract_mac_address`, :meth:`is_ip`, etc...

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
1193
1194
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """Find the first MAC address embedded in a string.

    Args:
        in_str: the string from which to extract a MAC address.
        separator: the MAC address hex byte separator to use in the
            returned address; any ':' or '-' in the matched address
            is rewritten to this separator.

    Returns:
        The first MAC address found in in_str or None to indicate no
        match or an error.

    See also :meth:`is_mac_address`, :meth:`is_ip`, and :meth:`extract_ip`.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F', separator='-')
    '34-29-8F-12-0D-2F'
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None

    # Strings are immutable: the previous code called mac.replace(...) and
    # threw the result away, so the requested separator was never applied.
    # translate() rewrites both accepted separators in a single pass, which
    # also stays correct if the new separator itself contains ':' or '-'.
    table = str.maketrans({":": separator, "-": separator})
    return m.group(0).translate(table)
1223
1224
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """Check whether a string is a slug.

    A slug starts and ends with a lowercase letter or digit and consists
    only of lowercase letters, digits and the separator.

    Args:
        in_str: string to test
        separator: the slug character to use.  A multi-character
            separator is treated as one indivisible unit.

    Returns:
        True if in_str is a slug string and False otherwise.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`slugify`.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False

    # The previous pattern, ^([a-z\d]+SEP*?)*[a-z\d]$, nested an unbounded
    # repeat inside another one.  On a long alphanumeric run followed by an
    # invalid character it backtracked exponentially and could hang.  This
    # form accepts the same strings for single-character separators but
    # forces every repetition to consume at least one separator, so the
    # match stays linear.
    sep = re.escape(separator)
    rex = r"^[a-z\d]+(?:(?:" + sep + r")+[a-z\d]+)*$"
    return re.match(rex, in_str) is not None
1245
1246
def contains_html(in_str: str) -> bool:
    """Check whether a string contains anything that looks like a tag.

    Args:
        in_str: the string to check for tags in

    Returns:
        True if the given string contains HTML/XML tags and False
        otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`strip_html`.

    .. warning::
        By design, this function matches ANY type of tag, so don't expect
        to use it as an HTML validator.  It's a quick sanity check at
        best.  See something like BeautifulSoup for a more full-featured
        HTML parser.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False

    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
1273
1274
def words_count(in_str: str) -> int:
    """Count the words in a string.

    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    Raises:
        ValueError: if in_str is not a string.

    .. note::
        This method is "smart" in that it does consider only sequences
        of one or more letter and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    words = WORDS_COUNT_RE.findall(in_str)
    return len(words)
1299
1300
def word_count(in_str: str) -> int:
    """Count the words in a string; an alias of :meth:`words_count`.

    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    .. note::
        This method is "smart" in that it does consider only sequences
        of one or more letter and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> word_count('hello world')
    2
    >>> word_count('one,two,three.stop')
    4
    """
    # Pure delegation: all validation and counting lives in words_count.
    count = words_count(in_str)
    return count
1323
1324
def generate_uuid(omit_dashes: bool = False) -> str:
    """Generate a random (version 4) UUID string.

    Args:
        omit_dashes: should we omit the dashes in the generated UUID?

    Returns:
        A generated UUID string (using `uuid.uuid4()`) with or without
        dashes per the omit_dashes arg.

    See also :meth:`is_uuid`, :meth:`generate_random_alphanumeric_string`.

    generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
    generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
    """
    fresh = uuid4()
    # .hex is the 32 digit form without dashes; str() is the dashed form.
    return fresh.hex if omit_dashes else str(fresh)
1343
1344
def generate_random_alphanumeric_string(size: int) -> str:
    """Generate a random string of ASCII letters and digits.

    Args:
        size: number of characters to generate

    Returns:
        A string of the specified size containing random characters
        (uppercase/lowercase ascii letters and digits).

    Raises:
        ValueError: if size is less than one.

    See also :meth:`asciify`, :meth:`generate_uuid`.

    >>> random.seed(22)
    >>> generate_random_alphanumeric_string(9)
    '96ipbNClS'
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    # One random.choice call per character, so output is reproducible
    # under random.seed (see the doctest above).
    picked = []
    for _ in range(size):
        picked.append(random.choice(alphabet))
    return from_char_list(picked)
1365
1366
def reverse(in_str: str) -> str:
    """Reverse a string.

    Args:
        in_str: the string to reverse

    Returns:
        The reversed (character by character) string.

    Raises:
        ValueError: if in_str is not a string.

    >>> reverse('test')
    'tset'
    """
    if is_string(in_str):
        return "".join(reversed(in_str))
    raise ValueError(in_str)
1381
1382
def camel_case_to_snake_case(in_str: str, *, separator: str = "_"):
    """Convert a camel case string into snake case.

    Args:
        in_str: the camel case string to convert
        separator: the snake case separator character to use

    Returns:
        A snake case string equivalent to the camel case input or the
        original string if it is not a valid camel case string or some
        other error occurs.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    if is_camel_case(in_str):

        def _with_separator(match):
            # Keep the matched text and append the separator after it.
            return match.group(1) + separator

        separated = CAMEL_CASE_REPLACE_RE.sub(_with_separator, in_str)
        return separated.lower()

    # Not camel case: hand the input back untouched.
    return in_str
1406
1407
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """Convert a snake case string into camel case.

    Args:
        in_str: the snake case string to convert
        upper_case_first: should we capitalize the first letter?
        separator: the separator character to use

    Returns:
        A camel case string that is equivalent to the snake case string
        provided or the original string back again if it is not valid
        snake case or another error occurs.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str

    # Title-case each non-blank piece; pieces rejected by is_full_string
    # (e.g. those produced by leading or doubled separators) are dropped.
    words = []
    for piece in in_str.split(separator):
        if is_full_string(piece):
            words.append(piece.title())

    if not upper_case_first:
        words[0] = words[0].lower()
    return from_char_list(words)
1437
1438
def to_char_list(in_str: str) -> List[str]:
    """Split a string into a list of its characters.

    Args:
        in_str: the string to split into a char list

    Returns:
        A list of strings of length one each.  An empty list is
        returned when in_str is not a string.

    See also :meth:`from_char_list`.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    return list(in_str) if is_string(in_str) else []
1455
1456
def from_char_list(in_list: List[str]) -> str:
    """Concatenate a list of characters (or strings) into one string.

    Args:
        in_list: A list of characters to convert into a string.

    Returns:
        The string resulting from gluing the characters in in_list
        together.

    See also :meth:`to_char_list`.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
1472
1473
def shuffle(in_str: str) -> Optional[str]:
    """Randomly reorder the characters of a string.

    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing same chars of the given one but in
        a randomized order.  Note that in rare cases this could result
        in the same original string as no check is done.  Returns
        None to indicate error conditions.

    >>> random.seed(22)
    >>> shuffle('awesome')
    'meosaew'
    """
    if is_string(in_str):
        letters = to_char_list(in_str)
        # random.shuffle works in place on the character list.
        random.shuffle(letters)
        return from_char_list(letters)
    return None
1494
1495
def scramble(in_str: str) -> Optional[str]:
    """Randomly reorder the characters of a string; an alias of :meth:`shuffle`.

    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing same chars of the given one but in
        a randomized order.  Note that in rare cases this could result
        in the same original string as no check is done.  Returns
        None to indicate error conditions.

    See also :mod:`pyutils.unscrambler`.

    >>> random.seed(22)
    >>> scramble('awesome')
    'meosaew'
    """
    # Pure delegation: shuffle does the validation and the work.
    scrambled = shuffle(in_str)
    return scrambled
1514
1515
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """Remove HTML tags from a string.

    Args:
        in_str: the string to strip tags from
        keep_tag_content: should we keep the inner contents of tags?

    Returns:
        A string with all HTML tags removed (optionally with tag contents
        preserved).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`contains_html`.

    .. note::
        This method uses simple regular expressions to strip tags and is
        not a full fledged HTML parser by any means.  Consider using
        something like BeautifulSoup if your needs are more than this
        simple code can fulfill.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    if keep_tag_content:
        # Remove only the tags themselves, leaving the text between them.
        pattern = HTML_TAG_ONLY_RE
    else:
        pattern = HTML_RE
    return pattern.sub("", in_str)
1543
1544
def asciify(in_str: str) -> str:
    """Translate a string into its closest ASCII-only representation.

    Args:
        in_str: the string to asciify.

    Returns:
        An output string roughly equivalent to the original string
        where all content is ascii-only.  This is accomplished
        by translating all non-ascii chars into their closest possible
        ASCII representation (eg: ó -> o, Ë -> E, ç -> c...).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`to_ascii`, :meth:`generate_random_alphanumeric_string`.

    .. warning::
        Some chars may be lost if impossible to translate.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Three steps, chained:
    #   1. NFKD normalization decomposes accented characters into a base
    #      character plus combining marks;
    #   2. encoding to ascii with "ignore" drops everything that is not
    #      representable (including those combining marks);
    #   3. the remaining bytes are decoded back into a str.
    return (
        unicodedata.normalize("NFKD", in_str)
        .encode("ascii", "ignore")
        .decode("utf-8")
    )
1577
1578
def slugify(in_str: str, *, separator: str = "-") -> str:
    """Convert a string into a URL-friendly slug.

    Args:
        in_str: the string to slugify
        separator: the character to use during slugification (default
            is a dash)

    Returns:
        The converted string.  The returned string has the following properties:

        * it has no spaces
        * all letters are in lower case
        * all punctuation signs and non alphanumeric chars are removed
        * words are divided using provided separator
        * all chars are encoded as ascii (by using :meth:`asciify`)
        * is safe for URL

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_slug` and :meth:`asciify`.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Step 1: lowercase, then blank out everything that is not a letter
    # or a number and trim the ends.
    lowered = in_str.lower()
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", lowered).strip()

    # Step 2: turn whitespace into the join sign.
    joined = SPACES_RE.sub(separator, spaced)

    # Step 3: collapse runs of the join sign into a single one.
    duplicate_joins = re.escape(separator) + r"+"
    normalized = re.sub(duplicate_joins, separator, joined)

    # Step 4: reduce what is left to plain ASCII.
    return asciify(normalized)
1615
1616
def to_bool(in_str: str) -> bool:
    """Interpret a string as a boolean value.

    Args:
        in_str: the string to convert to boolean

    Returns:
        A boolean equivalent of the original string based on its contents.
        All conversion is case insensitive.  A positive boolean (True) is
        returned if the string value is any of the following:

        * "true"
        * "t"
        * "1"
        * "yes"
        * "y"
        * "on"

        Otherwise False is returned.

    Raises:
        ValueError: if in_str is not a string.

    See also :mod:`pyutils.argparse_utils`.

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    affirmative = {"true", "1", "yes", "y", "t", "on"}
    return in_str.lower() in affirmative
1659
1660
def to_date(in_str: str) -> Optional[datetime.date]:
    """Parse a string into a date.

    Args:
        in_str: the string to convert into a date

    Returns:
        The datetime.date the string contained or None to indicate
        an error.  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`extract_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> to_date('9/11/2001')
    datetime.date(2001, 9, 11)
    >>> to_date('xyzzy')
    """
    import pyutils.datetimes.dateparse_utils as du

    try:
        parser = du.DateParser()  # type: ignore
        parser.parse(in_str)
        return parser.get_date()
    except du.ParseException:  # type: ignore
        # An unparseable string is reported as None, not as an exception.
        return None
1687
1688
def extract_date(in_str: Any) -> Optional[datetime.datetime]:
    """Finds and extracts a date from the string, if possible.

    The string is split on whitespace and every run of five, four,
    three and then two consecutive words is offered to the date parser,
    in that order; the first run that parses wins.

    Args:
        in_str: the string to extract a date from

    Returns:
        a datetime if date was found, otherwise None

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> extract_date("filename.txt    dec 13, 2022")
    datetime.datetime(2022, 12, 13, 0, 0)

    >>> extract_date("Dear Santa, please get me a pony.")

    """
    import pyutils.datetimes.dateparse_utils as du

    parser = du.DateParser()  # type: ignore
    words = in_str.split()

    # Longest candidate phrases first: all 5-grams, then 4-, 3- and 2-grams.
    ngram_sources = [list_utils.ngrams(words, width) for width in (5, 4, 3, 2)]
    for source in ngram_sources:
        for ngram in source:
            try:
                candidate = " ".join(ngram)
                logger.debug("Trying %s", candidate)
                if parser.parse(candidate):
                    return parser.get_datetime()
            except du.ParseException:  # type: ignore
                # This phrase is not a date; move on to the next one.
                continue
    return None
1727
1728
def is_valid_date(in_str: str) -> bool:
    """Check whether a string can be parsed as a date.

    Args:
        in_str: the string to check

    Returns:
        True if the string represents a valid date that we can recognize
        and False otherwise.  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> is_valid_date('1/2/2022')
    True
    >>> is_valid_date('christmas')
    True
    >>> is_valid_date('next wednesday')
    True
    >>> is_valid_date('xyzzy')
    False
    """
    import pyutils.datetimes.dateparse_utils as dp

    try:
        parser = dp.DateParser()  # type: ignore
        parser.parse(in_str)
    except dp.ParseException:  # type: ignore
        return False
    # Parsing finished without a ParseException, so the date is valid.
    return True
1760
1761
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """Parse a string into a datetime.

    Args:
        in_str: string to parse into a datetime

    Returns:
        A python datetime parsed from in_str or None to indicate
        an error.  Any exception raised while parsing is swallowed and
        reported as None.  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`valid_datetime`.

    >>> to_datetime('7/20/1969 02:56 GMT')
    datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
    """
    import pyutils.datetimes.dateparse_utils as dp

    try:
        parser = dp.DateParser()  # type: ignore
        parsed = parser.parse(in_str)
    except Exception:
        return None
    # Only a genuine datetime result counts as success.
    return parsed if isinstance(parsed, datetime.datetime) else None
1788
1789
def valid_datetime(in_str: str) -> bool:
    """Check whether a string can be parsed as a datetime.

    Args:
        in_str: the string to check

    Returns:
        True if in_str contains a valid datetime and False otherwise.
        This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    >>> valid_datetime('next wednesday at noon')
    True
    >>> valid_datetime('3 weeks ago at midnight')
    True
    >>> valid_datetime('next easter at 5:00 am')
    True
    >>> valid_datetime('sometime soon')
    False
    """
    # to_datetime signals failure with None.
    return to_datetime(in_str) is not None
1813
1814
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """Collapse runs of a repeated character (or substring) into one.

    Args:
        in_str: the string to squeeze
        character_to_squeeze: the character to remove runs of
            more than one in a row (default = space).  It is always
            treated as literal text, both when searching and when
            replacing.

    Returns:
        A "squeezed string" where runs of more than one
        character_to_squeeze are collapsed into one.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    >>> squeeze('a//b///c', character_to_squeeze='/')
    'a/b/c'

    """
    # The replacement is supplied through a callable rather than as a
    # string.  re.sub treats a *string* replacement as a template and
    # processes backslash escapes in it, so squeezing a backslash used to
    # raise re.error and text such as r'\n' was rewritten to a newline.
    return re.sub(
        r'(' + re.escape(character_to_squeeze) + r')+',
        lambda _match: character_to_squeeze,
        in_str,
    )
1837
1838
def dedent(in_str: str) -> Optional[str]:
    """Remove the leading margin from every line of a string.

    Args:
        in_str: the string to dedent

    Returns:
        A string with tab indentation removed or None on error.

    See also :meth:`indent`.

    >>> dedent('\t\ttest\\n\t\ting')
    'test\\ning'
    """
    if not is_string(in_str):
        return None
    # Strip whatever MARGIN_RE matches from each line, then put the
    # lines back together with the same newline separator.
    return '\n'.join(MARGIN_RE.sub('', row) for row in in_str.split('\n'))
1857
1858
def indent(in_str: str, amount: int) -> str:
    """Indent every line of a string by a number of spaces.

    Args:
        in_str: the string to indent
        amount: count of spaces to indent each line by

    Returns:
        An indented string created by prepending amount spaces.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`dedent`.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    margin = " " * amount
    # Every line, including empty ones, receives the margin.
    return '\n'.join(margin + row for row in in_str.split('\n'))
1878
1879
1880 def _sprintf(*args, **kwargs) -> str:
1881     """Internal helper."""
1882     ret = ""
1883
1884     sep = kwargs.pop("sep", None)
1885     if sep is not None:
1886         if not isinstance(sep, str):
1887             raise TypeError("sep must be None or a string")
1888
1889     end = kwargs.pop("end", None)
1890     if end is not None:
1891         if not isinstance(end, str):
1892             raise TypeError("end must be None or a string")
1893
1894     if kwargs:
1895         raise TypeError("invalid keyword arguments to sprint()")
1896
1897     if sep is None:
1898         sep = " "
1899     if end is None:
1900         end = "\n"
1901     for n, arg in enumerate(args):
1902         if n:
1903             ret += sep
1904         if isinstance(arg, str):
1905             ret += arg
1906         else:
1907             ret += str(arg)
1908     ret += end
1909     return ret
1910
1911
def strip_ansi_sequences(in_str: str) -> str:
    """Remove ANSI escape sequences from a string.

    Args:
        in_str: the string to strip

    Returns:
        in_str with recognized ANSI escape sequences removed.  A
        recognized sequence is ESC, '[', any run of digits, ';' and '+',
        and one final ASCII letter of either case.

    See also :mod:`pyutils.ansi`.

    .. warning::
        This method works by using a regular expression.
        It works for all ANSI escape sequences I've tested with but
        may miss some; caveat emptor.

    >>> import ansi as a
    >>> s = a.fg('blue') + 'blue!' + a.reset()
    >>> len(s)   # '\x1b[38;5;21mblue!\x1b[m'
    18
    >>> len(strip_ansi_sequences(s))
    5
    >>> strip_ansi_sequences(s)
    'blue!'
    >>> strip_ansi_sequences('\\x1b[2J\\x1b[Hcleared')
    'cleared'

    """
    # The final byte may be an uppercase letter too: cursor movement and
    # erase sequences (e.g. ESC[2J, ESC[H, ESC[K, ESC[1A) end in one and
    # were previously left in the output because only [a-z] was accepted.
    return re.sub(r'\x1b\[[\d+;]*[a-zA-Z]', '', in_str)
1938
1939
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that captures outputs to stdout to a buffer
    without printing them.  Entering the context yields a callable
    that returns everything captured so far.

    >>> with SprintfStdout() as buf:
    ...     print("test")
    ...     print("1, 2, 3")
    ...
    >>> print(buf(), end='')
    test
    1, 2, 3
    """

    def __init__(self) -> None:
        # In-memory buffer that receives everything written to stdout.
        self.destination = io.StringIO()
        # The active stdout redirection; assigned in __enter__.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        """Start redirecting stdout and return a reader for the buffer."""
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()

        def read_captured() -> str:
            return self.destination.getvalue()

        return read_captured

    def __exit__(self, *args) -> Literal[False]:
        """Stop redirecting stdout; exceptions are never suppressed."""
        self.recorder.__exit__(*args)
        # Rewind so the buffer can also be read as a stream from the start.
        self.destination.seek(0)
        return False
1967
1968
def capitalize_first_letter(in_str: str) -> str:
    """Upper-case the first character of a string, leaving the rest as is.

    Args:
        in_str: the string to capitalize

    Returns:
        in_str with the first character capitalized.  An empty string
        is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    # Slicing instead of indexing: in_str[0] raised IndexError on an
    # empty string, whereas in_str[:1] is simply '' in that case.
    return in_str[:1].upper() + in_str[1:]
1983
1984
def it_they(n: int) -> str:
    """Pick the pronoun that agrees with a count.

    Args:
        n: how many of them are there?

    Returns:
        'it' if n is one or 'they' otherwise.

    See also :meth:`is_are`, :meth:`pluralize`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> it_they(1)
    'it'
    >>> it_they(100)
    'they'
    """
    return "it" if n == 1 else "they"
2010
2011
def is_are(n: int) -> str:
    """
    Choose the verb form that agrees with a count.

    Args:
        n: how many of them are there?

    Returns:
        'is' when n equals one, 'are' for every other value.

    See also :meth:`it_they`, :meth:`pluralize`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
2038
2039
def pluralize(n: int) -> str:
    """
    Choose the plural suffix that agrees with a count.

    Args:
        n: how many of them are there?

    Returns:
        '' when n equals one, 's' for every other value (so a count
        of zero also yields 's', as in "0 files").

    See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.
    """
    return "" if n == 1 else "s"
2069
2070
def make_contractions(txt: str) -> str:
    """Glue adjacent words in txt together to form (English)
    contractions.

    Args:
        txt: the input text to be contractionized.

    Returns:
        The input text with every recognized word pair replaced by
        its contraction; everything else is left as it was.  Matching
        is case-insensitive and the original letter case is kept.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`pluralize`,
    :meth:`thify`.

    .. note::
        Rules are applied one after another in a fixed order:
        first the irregular forms (can't, shan't, won't), then
        "<verb> not", then "<pronoun/question word> <verb>".  An
        earlier rule can therefore prevent a later one from matching.

    >>> make_contractions('It is nice today.')
    "It's nice today."

    >>> make_contractions('I can    not even...')
    "I can't even..."

    >>> make_contractions('She could not see!')
    "She couldn't see!"

    >>> make_contractions('But she will not go.')
    "But she won't go."

    >>> make_contractions('Verily, I shall not.')
    "Verily, I shan't."

    >>> make_contractions('No you cannot.')
    "No you can't."

    >>> make_contractions('I said you can not go.')
    "I said you can't go."
    """

    # Irregular negations whose spelling changes: can't, shan't, won't.
    irregular_rules = (
        (r'\b(can)\s*no(t)\b', r"\1'\2"),
        (r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3"),
        (r'\b(w)ill\s*(n)(o)(t)\b', r"\1\3\2'\4"),
    )
    for pattern, replacement in irregular_rules:
        txt = re.sub(pattern, replacement, txt, flags=re.IGNORECASE)

    # Verbs that contract with a following "not" (e.g. "is not" -> "isn't").
    negatable_verbs = (
        'are',
        'could',
        'did',
        'has',
        'have',
        'is',
        'must',
        'should',
        'was',
        'were',
        'would',
    )
    for verb in negatable_verbs:
        txt = re.sub(
            fr'\b({verb})\s+(n)o(t)\b', r"\1\2'\3", txt, flags=re.IGNORECASE
        )

    # Words that contract with a following auxiliary verb; the
    # parenthesized part of each tail is what survives after the
    # apostrophe (e.g. "she will" -> "she'll").
    leading_words = (
        "I",
        "you",
        "he",
        "she",
        "it",
        "we",
        "they",
        "how",
        "why",
        "when",
        "where",
        "who",
        "there",
    )
    verb_tails = ('woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)')
    for lead in leading_words:
        for tail in verb_tails:
            # Skip there're/where're: valid English, but they sound weird.
            if tail == 'a(re)' and lead in ('there', 'where'):
                continue
            txt = re.sub(
                fr'\b({lead})\s+{tail}\b', r"\1'\2", txt, flags=re.IGNORECASE
            )

    return txt
2177
2178
def thify(n: int) -> str:
    """
    Args:
        n: the number that needs an ordinal suffix

    Returns:
        The proper English ordinal suffix for a number: 'st', 'nd',
        'rd' or 'th'.  Numbers ending in 11, 12 or 13 take 'th'
        (11th, 112th), not the suffix their last digit would suggest.

    Raises:
        AssertionError: if ``str(n)`` is not recognized as an integer
            by :meth:`is_integer_number`.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`.

    Suggested usage::

        attempt_count = 0
        while True:
            attempt_count += 1
            if try_the_thing():
                break
            print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'
    >>> thify(22)
    'nd'
    """
    digits = str(n)
    assert is_integer_number(digits)

    # The "teens" are irregular: 11th, 12th, 13th (and 111th, 212th...).
    if digits[-2:] in ('11', '12', '13'):
        return "th"

    # Otherwise only the final digit matters.
    return {"1": "st", "2": "nd", "3": "rd"}.get(digits[-1:], "th")
2216
2217
def ngrams(txt: str, n: int):
    """
    Args:
        txt: the string to create ngrams using
        n: how many words per ngram created?

    Returns:
        A generator that yields each ngram as a single string whose
        words are separated by one space.

    See also :meth:`ngrams_presplit`, :meth:`bigrams`, :meth:`trigrams`.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']
    """
    # Tokenize on whitespace, then let ngrams_presplit do the windowing.
    for window in ngrams_presplit(txt.split(), n):
        yield ' '.join(map(str, window)).strip()
2238
2239
def ngrams_presplit(words: Sequence[str], n: int):
    """
    Same as :meth:`ngrams` but with the string pre-split.

    Args:
        words: the words to build ngrams from (e.g. the result of
            ``str.split()``)
        n: how many words per ngram created?

    Returns:
        Whatever ``list_utils.ngrams(words, n)`` produces; this
        function only forwards its arguments.

    See also :meth:`ngrams`, :meth:`bigrams`, :meth:`trigrams`.
    """
    # Pure delegation: the windowing logic lives in list_utils.
    return list_utils.ngrams(words, n)
2247
2248
def bigrams(txt: str):
    """Generates the bigrams (n=2) of the given string.

    Args:
        txt: the string to create bigrams using

    Returns:
        The generator returned by ``ngrams(txt, 2)``; see the example
        below for the strings it yields.

    See also :meth:`ngrams`, :meth:`trigrams`.

    >>> [x for x in bigrams('this is a test')]
    ['this is', 'is a', 'a test']
    """
    return ngrams(txt, 2)
2258
2259
def trigrams(txt: str):
    """Generates the trigrams (n=3) of the given string.

    Args:
        txt: the string to create trigrams using

    Returns:
        The generator returned by ``ngrams(txt, 3)``.

    See also :meth:`ngrams`, :meth:`bigrams`.
    """
    return ngrams(txt, 3)
2266
2267
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim: str = ''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.

    Args:
        input_lines: A sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: an iterable collection of numeric sequences that
            indicate one or more column numbers to copy to form the Nth
            position in the output list.  See example below.
        delim: for column_specs that indicate we should copy more than
            one column from the input into this position, use delim to
            separate source data.  Defaults to ''.

    Returns:
        A list of strings created by following the instructions set
        forth in column_specs.  Column contents are copied verbatim:
        delim is only ever inserted *between* columns, so a column
        that itself starts or ends with delim characters is preserved.

    Raises:
        IndexError: if a column number is out of range for input_lines.

    See also :meth:`shuffle_columns_into_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim='!',
    ... )
    ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']

    >>> shuffle_columns_into_list(['wow!', 'ok'], [[0], [1, 0]], delim='!')
    ['wow!', 'ok!wow!']
    """
    # Each spec lists the input columns glued together (in order) to
    # form one output entry.  join() puts delim only between columns,
    # unlike prefix-then-strip, which also removed delimiter
    # characters belonging to the data itself.
    return [delim.join(input_lines[n] for n in spec) for spec in column_specs]
2309
2310
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim: str = '',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: a sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: instructions for what dictionary keys to apply
            to individual or compound input column data.  Each entry
            is a (key, column numbers) pair.  See example below.
        delim: when forming compound output data by gluing more than
            one input column together, use this string to separate
            the source data.  Defaults to ''.

    Returns:
        A dict formed by applying the column_specs instructions.
        Column contents are copied verbatim: delim is only ever
        inserted *between* columns, so a column that itself starts or
        ends with delim characters is preserved.  If a key appears
        more than once, the last spec wins.

    Raises:
        IndexError: if a column number is out of range for input_lines.

    See also :meth:`shuffle_columns_into_list`, :meth:`interpolate_using_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim='!',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}
    """
    # spec[0] is the output key, spec[1] the input columns to glue
    # together.  join() puts delim only between columns, unlike
    # prefix-then-strip, which also removed delimiter characters
    # belonging to the data itself.
    return {
        spec[0]: delim.join(input_lines[n] for n in spec[1]) for spec in column_specs
    }
2353
2354
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """
    Fill the ``{name}`` placeholders of a template string with data
    from a dict.

    Args:
        txt: the mad libs template
        values: what you and your kids chose for each category,
            keyed by placeholder name.

    Returns:
        The template with its placeholders substituted.

    See also :meth:`shuffle_columns_into_list`, :meth:`shuffle_columns_into_dict`.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    filled_in = txt.format(**values)
    return _sprintf(filled_in, end='')
2370
2371
def to_ascii(txt: str) -> bytes:
    """
    Args:
        txt: the input data to encode.  A bytes object is also
            accepted and is returned unchanged.

    Returns:
        txt encoded as an ASCII byte string.

    Raises:
        TypeError: if txt is neither a string nor a bytes object.
        UnicodeEncodeError: if txt is a string containing characters
            outside of the ASCII range.

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`to_bytes`,
    :meth:`generate_random_alphanumeric_string`, :meth:`asciify`.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(txt, str):
        return txt.encode('ascii')
    if isinstance(txt, bytes):
        return txt
    # TypeError is a subclass of Exception, so callers that caught the
    # previous bare Exception keep working.
    raise TypeError('to_ascii works with strings and bytes')
2394
2395
def to_base64(
    txt: str, *, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> bytes:
    """
    Args:
        txt: the input data to encode
        encoding: the text encoding used to turn txt into bytes
            before it is base64 encoded
        errors: how to handle encoding errors

    Returns:
        The base64 encoding of txt as a byte string, produced by
        :func:`base64.encodebytes`.

    See also :meth:`is_base64`, :meth:`to_ascii`, :meth:`to_bitstring`,
    :meth:`from_base64`.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    raw_bytes = txt.encode(encoding, errors)
    return base64.encodebytes(raw_bytes)
2416
2417
def is_base64(txt: str) -> bool:
    """
    Args:
        txt: the string (or bytes) to check

    Returns:
        True if txt consists only of characters from Python's standard
        base64 alphabet (A-Z, a-z, 0-9, '+' and '/'), optionally
        followed by up to two '=' padding characters.  Surrounding
        whitespace and embedded line breaks are ignored.  Note that
        this is a character-level check; it does not verify that the
        length is a multiple of four.

    See also :meth:`to_base64`, :meth:`from_base64`.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # So is trailing padding.
    True

    """
    stripped = txt.strip()
    if isinstance(stripped, str):
        try:
            raw = stripped.encode('ascii')
        except UnicodeEncodeError:
            # Non-ASCII characters can never be part of base64 text.
            return False
    else:
        raw = bytes(stripped)

    # base64.encodebytes (used by to_base64) breaks long output into
    # lines, so line breaks inside the data are legitimate.
    raw = raw.replace(b'\r', b'').replace(b'\n', b'')

    # '=' is only valid as trailing padding, and at most twice.
    body = raw.rstrip(b'=')
    if len(raw) - len(body) > 2:
        return False

    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = frozenset(a.encode('ascii'))
    return all(byte in alphabet for byte in body)
2446
2447
def from_base64(
    b64: bytes, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """
    Args:
        b64: bytestring of base64-encoded data to decode / convert.
        encoding: the text encoding used to turn the decoded bytes
            back into a string
        errors: how to handle decoding errors

    Returns:
        The decoded form of b64 as a normal python string.  This is
        the inverse of :meth:`to_base64`.

    See also :meth:`to_base64`, :meth:`is_base64`.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    raw_bytes = base64.decodebytes(b64)
    return raw_bytes.decode(encoding, errors)
2467
2468
def chunk(txt: str, chunk_size: int):
    """
    Args:
        txt: a string to be chunked into evenly spaced pieces.
        chunk_size: the size of each chunk to make

    Returns:
        A generator yielding consecutive slices of txt, each
        chunk_size characters long.  If the length of txt is not an
        even multiple of chunk_size, a warning is logged and issued
        via :mod:`warnings`, and the final chunk is shorter.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    if len(txt) % chunk_size != 0:
        msg = (
            f"String to chunk's length ({len(txt)}) is not an even "
            f"multiple of chunk_size ({chunk_size})"
        )
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for x in range(0, len(txt), chunk_size):
        yield txt[x : x + chunk_size]
2487
2488
def to_bitstring(txt: str, *, delimiter: str = '') -> str:
    """
    Args:
        txt: the string to convert into a bitstring
        delimiter: character to insert between adjacent bytes.  Note that
            only bitstrings with delimiter='' are interpretable by
            :meth:`from_bitstring`.

    Returns:
        txt converted to ascii and rendered as eight binary digits per
        byte, most significant bit first.  Every input byte, including
        leading NUL bytes, contributes exactly eight digits.  For
        backwards compatibility an empty input yields '00000000'.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    >>> to_bitstring('\\x00a')
    '0000000001100001'
    """
    etxt = to_ascii(txt)
    # Render byte by byte so that leading zero bytes are not lost;
    # going through one big integer dropped them because an integer
    # has no notion of leading zeros.  The b'\x00' fallback preserves
    # the historical '00000000' result for empty input.
    return delimiter.join(f'{byte:08b}' for byte in (etxt or b'\x00'))
2516
2517
def is_bitstring(txt: str) -> bool:
    """
    Args:
        txt: the string to check

    Returns:
        True if txt is a recognized bitstring and False otherwise.
        The check is performed by prefixing txt with '0b' and asking
        :meth:`is_binary_integer_number` whether the result is valid.
        Note that if delimiter is non empty this code will not
        recognize the bitstring.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`to_bitstring`,
    :meth:`chunk`.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    return is_binary_integer_number(f'0b{txt}')
2538
2539
def from_bitstring(
    bits: str, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """
    Args:
        bits: the bitstring to convert back into a python string
        encoding: the text encoding used to decode the resulting bytes
        errors: how to handle decoding errors

    Returns:
        The regular python string represented by bits.  Note that this
        code does not work with to_bitstring when delimiter is non-empty.
        A bitstring whose value is zero decodes to a single NUL
        character.

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'
    """
    value = int(bits, 2)
    # The byte count comes from the integer's magnitude, so leading
    # all-zero bytes in bits do not survive the conversion.
    byte_count = (value.bit_length() + 7) // 8
    decoded = value.to_bytes(byte_count, 'big').decode(encoding, errors)
    # A value of zero produces no bytes at all; report it as NUL.
    return decoded if decoded else '\0'
2561
2562
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """
    Args:
        txt: an IP address to chunk up for sorting purposes

    Returns:
        A tuple of the address's numeric components, arranged such
        that sorting IP addresses using a normal comparator will do
        something sane and desirable.  If txt is not a valid IPv4
        address (per :meth:`is_ip_v4`) a warning is logged and None is
        returned; note that None cannot be ordered against tuples, so
        invalid entries should be filtered out before sorting.

    See also :meth:`is_ip_v4`.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if not is_ip_v4(txt):
        # Report through the module logger rather than writing to
        # stdout, which a library function should leave to its caller.
        logger.warning('not IP: %s', txt)
        return None
    return tuple(int(octet) for octet in txt.split('.'))
2586
2587
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """
    Args:
        volume: the '/'-separated path to chunk up for sorting purposes

    Returns:
        A tuple of the non-empty components of volume.  Sorting paths
        by this key places every directory ahead of the things
        beneath it.

    See also :mod:`pyutils.files.file_utils`.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    # Leading, trailing and doubled slashes produce empty components;
    # those are dropped.
    components = volume.split('/')
    return tuple(part for part in components if part)
2608
2609
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """
    Replace every occurrence of each character in a set with the same
    replacement text.

    Args:
        in_str: the string in which to replace characters
        replace_set: the set of target characters to replace
        replacement: the text to replace any member of replace_set
            with

    See also :meth:`replace_nth`.

    Returns:
        The string with replacements executed.  The targets are
        processed one at a time, in the order they appear in
        replace_set, each pass operating on the result of the
        previous one.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
2632
2633
def replace_nth(in_str: str, source: str, target: str, nth: int) -> str:
    """
    Replaces the nth occurrence of a substring within a string.

    Args:
        in_str: the string in which to run the replacement
        source: the literal substring to replace; regular expression
            metacharacters in it have no special meaning
        target: the replacement text
        nth: which occurrence of source to replace?  Counting starts
            at 1.

    Returns:
        in_str with the nth (non-overlapping) occurrence of source
        replaced by target.

    Raises:
        IndexError: if in_str contains fewer than nth occurrences of
            source.

    See also :meth:`replace_all`.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'

    >>> replace_nth('a.b.c', '.', '-', 2)
    'a.b-c'
    """
    # Escape source so that it is located literally, exactly the way
    # str.replace() below interprets it.
    occurrences = [m.start() for m in re.finditer(re.escape(source), in_str)]
    where = occurrences[nth - 1]
    return in_str[:where] + in_str[where:].replace(source, target, 1)
2654
2655
if __name__ == '__main__':
    # When this file is executed directly (rather than imported), run
    # the doctest examples embedded in this module's docstrings.
    import doctest

    doctest.testmod()