src/pyutils/string_utils.py

   1 #!/usr/bin/env python3
   2 # -*- coding: utf-8 -*-
   3
   4 """The MIT License (MIT)
   5
   6 Copyright (c) 2016-2020 Davide Zanotti
   7
   8 Modifications Copyright (c) 2021-2022 Scott Gasch
   9
  10 Permission is hereby granted, free of charge, to any person obtaining a copy
  11 of this software and associated documentation files (the "Software"), to deal
  12 in the Software without restriction, including without limitation the rights
  13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 copies of the Software, and to permit persons to whom the Software is
  15 furnished to do so, subject to the following conditions:
  16
  17 The above copyright notice and this permission notice shall be included in all
  18 copies or substantial portions of the Software.
  19
  20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  26 SOFTWARE.
  27
  28 This class is based on:
  29 https://github.com/daveoncode/python-string-utils.  See `NOTICE
  30 <https://wannabe.guru.org/gitweb/?p=pyutils.git;a=blob_plain;f=NOTICE;hb=HEAD>`__
  31 in the root of this module for a detailed enumeration of what work is
  32 Davide's and what work was added by Scott.
  33
  34 """
  35
  36 import base64
  37 import contextlib  # type: ignore
  38 import datetime
  39 import io
  40 import json
  41 import logging
  42 import numbers
  43 import random
  44 import re
  45 import string
  46 import unicodedata
  47 import warnings
  48 from itertools import zip_longest
  49 from typing import (
  50     Any,
  51     Callable,
  52     Dict,
  53     Iterable,
  54     List,
  55     Literal,
  56     Optional,
  57     Sequence,
  58     Tuple,
  59 )
  60 from uuid import uuid4
  61
  62 from pyutils import list_utils
  63
  64 logger = logging.getLogger(__name__)
  65
  66 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  67
  68 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  69
  70 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  71
  72 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  73
# Building block for the URL patterns below.  Each adjacent raw string is
# one capture group; Python concatenates them into a single pattern.
# NOTE: in the query-string class, "%-=" is a character *range* (from "%"
# through "="), which is what lets "&", "." and similar characters through.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Anchored form: the whole string must be a URL (case-insensitive).
URL_RE = re.compile(rf"^{URLS_RAW_STRING}$", re.IGNORECASE)

# Unanchored form wrapped in one extra group, for finding URLs inside text.
URLS_RE = re.compile(rf"({URLS_RAW_STRING})", re.IGNORECASE)

# Matches a run of "@" that is followed, later in the string, by a double
# quote, or a backslash-escaped "@".  is_email() substitutes these matches
# when an address contains more than one "@".
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# Building block for the e-mail patterns below: local part, "@", then a
# domain ending in a 2-4 letter suffix.  The domain classes are lowercase
# only and the patterns below are compiled without re.IGNORECASE.
EMAILS_RAW_STRING = (
    r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
)

# Anchored form: the whole string must be an e-mail address.
EMAIL_RE = re.compile(rf"^{EMAILS_RAW_STRING}$")

# Unanchored form wrapped in a group, for finding addresses inside text.
EMAILS_RE = re.compile(rf"({EMAILS_RAW_STRING})")
  99
# Whole-string camel case test: ASCII letters/digits only, must contain a
# lower->upper or upper->lower transition and must not start with a digit.
CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

# Matches a lowercase letter or an uppercase run that is immediately
# followed by an uppercase letter, i.e. a camel case word boundary.
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Whole-string snake case test using "_" as the separator.
SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

# Same test with "-" as the separator.  Unlike SNAKE_CASE_TEST_RE there is
# no leading "^"; is_snake_case() applies it with .match(), which anchors
# at the start of the string anyway.
SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

# A separator followed by a lowercase letter or digit (group 1 is the
# separator, group 2 the character after it).
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")

# Card type name -> pattern describing the leading digits and total length
# of that issuer's numbers.  These are shape checks only; no checksum is
# computed by the patterns.
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),  # 4..., 13 or 16 digits
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),  # 51-55..., 16 digits
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),  # 34/37..., 15 digits
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),  # 300-305/36/38..., 14 digits
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),  # 6011/65..., 16 digits
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),  # 2131/1800 (15) or 35 (16 digits)
}
 124
# Cheap pre-check for JSON: the text, ignoring surrounding whitespace, must
# start with "[" or "{" and end with "]" or "}".
JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)

# Canonical dashed UUID layout (8-4-4-4-12 hex digits), case-insensitive.
UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# Same layout, but every dash is optional so the bare 32-digit hex form
# is accepted too.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# "Shallow" because only the dotted-quad shape is checked (1-3 digits per
# part); the pattern itself does not range-check the octets.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

# Unanchored variant for locating a dotted quad inside a larger string.
ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# Eight colon-separated groups of 0-4 characters.  NOTE: the class is
# [a-z\d], so letters beyond "f" also pass this shape check.
IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)

# Six pairs of hex digits separated by ":" or "-".
MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)

ANYWHERE_MAC_ADDRESS_RE = re.compile(
    r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)

# One "word": a run of letters/digits (underscore excluded) together with
# any non-word characters surrounding it.
WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)

# An opening tag optionally followed by content up to a closing tag, or an
# HTML comment, or a doctype declaration.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Tags only (opening, closing, comment, doctype) without enclosed content.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Any single whitespace character.
SPACES_RE = re.compile(r"\s")

# Runs of characters that are not word characters, or runs of underscores.
NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Leading whitespace other than CR / LF at the start of the string (no
# re.MULTILINE flag, so "^" matches only at the very beginning).
MARGIN_RE = re.compile(r"^[^\S\r\n]+")

# ESC "[" followed by any non-letters and terminated by a single letter,
# e.g. the color sequence "\x1B[12;11;22m".
ESCAPE_SEQUENCE_RE = re.compile(r"\x1B\[[^A-Za-z]*[A-Za-z]")
 169
# Size suffix -> multiplier, in powers of 1024.  Entries are listed from
# largest to smallest and number_to_suffix_string() takes the first entry
# that fits, so the insertion order (and the two-letter form preceding the
# one-letter form) is significant.
NUM_SUFFIXES = {
    "Pb": (1024**5),
    "P": (1024**5),
    "Tb": (1024**4),
    "T": (1024**4),
    "Gb": (1024**3),
    "G": (1024**3),
    "Mb": (1024**2),
    "M": (1024**2),
    "Kb": (1024**1),
    "K": (1024**1),
}

# English words for 0..19; the list index is the numeric value.
UNIT_WORDS = [
    "zero",
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "ten",
    "eleven",
    "twelve",
    "thirteen",
    "fourteen",
    "fifteen",
    "sixteen",
    "seventeen",
    "eighteen",
    "nineteen",
]

# English words for the multiples of ten; index * 10 is the numeric value.
# Indices 0 and 1 are empty placeholders so that "twenty" lands on index 2.
TENS_WORDS = [
    "",
    "",
    "twenty",
    "thirty",
    "forty",
    "fifty",
    "sixty",
    "seventy",
    "eighty",
    "ninety",
]

# Magnitude words: index 0 is 100, every later index i is 10 ** (i * 3).
MAGNITUDE_SCALES = [
    "hundred",
    "thousand",
    "million",
    "billion",
    "trillion",
    "quadrillion",
]

# word -> (scale, increment).  number_string_to_integer() folds each word
# into a running value as: current = current * scale + increment.
NUM_WORDS = {}
# "and" is a no-op filler word (multiply by one, add zero).
NUM_WORDS["and"] = (1, 0)
for i, word in enumerate(UNIT_WORDS):
    NUM_WORDS[word] = (1, i)
# NOTE: the two "" placeholders in TENS_WORDS are inserted here as well,
# leaving an empty-string key mapped to (1, 10).
for i, word in enumerate(TENS_WORDS):
    NUM_WORDS[word] = (1, i * 10)
for i, word in enumerate(MAGNITUDE_SCALES):
    if i == 0:
        NUM_WORDS[word] = (100, 0)
    else:
        NUM_WORDS[word] = (10 ** (i * 3), 0)
# As in "four-score": multiplies the running value by twenty.
NUM_WORDS['score'] = (20, 0)
 240
 241
 242 def is_none_or_empty(in_str: Optional[str]) -> bool:
 243     """
 244     Args:
 245         in_str: the string to test
 246
 247     Returns:
 248         True if the input string is either None or an empty string,
 249         False otherwise.
 250
 251     See also :meth:`is_string` and :meth:`is_empty_string`.
 252
 253     >>> is_none_or_empty("")
 254     True
 255     >>> is_none_or_empty(None)
 256     True
 257     >>> is_none_or_empty("   \t   ")
 258     True
 259     >>> is_none_or_empty('Test')
 260     False
 261     """
 262     return in_str is None or len(in_str.strip()) == 0
 263
 264
 265 def is_string(in_str: Any) -> bool:
 266     """
 267     Args:
 268         in_str: the object to test
 269
 270     Returns:
 271         True if the object is a string and False otherwise.
 272
 273     See also :meth:`is_empty_string`, :meth:`is_none_or_empty`.
 274
 275     >>> is_string('test')
 276     True
 277     >>> is_string(123)
 278     False
 279     >>> is_string(100.3)
 280     False
 281     >>> is_string([1, 2, 3])
 282     False
 283     """
 284     return isinstance(in_str, str)
 285
 286
 287 def is_empty_string(in_str: Any) -> bool:
 288     """
 289     Args:
 290         in_str: the string to test
 291
 292     Returns:
 293         True if the string is empty and False otherwise.
 294
 295     See also :meth:`is_none_or_empty`, :meth:`is_full_string`.
 296     """
 297     return is_empty(in_str)
 298
 299
 300 def is_empty(in_str: Any) -> bool:
 301     """
 302     Args:
 303         in_str: the string to test
 304
 305     Returns:
 306         True if the string is empty and false otherwise.
 307
 308     See also :meth:`is_none_or_empty`, :meth:`is_full_string`.
 309
 310     >>> is_empty('')
 311     True
 312     >>> is_empty('    \t\t    ')
 313     True
 314     >>> is_empty('test')
 315     False
 316     >>> is_empty(100.88)
 317     False
 318     >>> is_empty([1, 2, 3])
 319     False
 320     """
 321     return is_string(in_str) and in_str.strip() == ""
 322
 323
 324 def is_full_string(in_str: Any) -> bool:
 325     """
 326     Args:
 327         in_str: the object to test
 328
 329     Returns:
 330         True if the object is a string and is not empty ('') and
 331         is not only composed of whitespace.
 332
 333     See also :meth:`is_string`, :meth:`is_empty_string`, :meth:`is_none_or_empty`.
 334
 335     >>> is_full_string('test!')
 336     True
 337     >>> is_full_string('')
 338     False
 339     >>> is_full_string('      ')
 340     False
 341     >>> is_full_string(100.999)
 342     False
 343     >>> is_full_string({"a": 1, "b": 2})
 344     False
 345     """
 346     return is_string(in_str) and in_str.strip() != ""
 347
 348
 349 def is_number(in_str: str) -> bool:
 350     """
 351     Args:
 352         in_str: the string to test
 353
 354     Returns:
 355         True if the string contains a valid numberic value and
 356         False otherwise.
 357
 358     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 359     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 360     etc...
 361
 362     >>> is_number(100.5)
 363     Traceback (most recent call last):
 364     ...
 365     ValueError: 100.5
 366     >>> is_number("100.5")
 367     True
 368     >>> is_number("test")
 369     False
 370     >>> is_number("99")
 371     True
 372     >>> is_number([1, 2, 3])
 373     Traceback (most recent call last):
 374     ...
 375     ValueError: [1, 2, 3]
 376     """
 377     if not is_string(in_str):
 378         raise ValueError(in_str)
 379     return NUMBER_RE.match(in_str) is not None
 380
 381
 382 def is_integer_number(in_str: str) -> bool:
 383     """
 384     Args:
 385         in_str: the string to test
 386
 387     Returns:
 388         True if the string contains a valid (signed or unsigned,
 389         decimal, hex, or octal, regular or scientific) integral
 390         expression and False otherwise.
 391
 392     See also :meth:`is_number`, :meth:`is_decimal_number`,
 393     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 394     etc...
 395
 396     >>> is_integer_number('42')
 397     True
 398     >>> is_integer_number('42.0')
 399     False
 400     """
 401     return (
 402         (is_number(in_str) and "." not in in_str)
 403         or is_hexidecimal_integer_number(in_str)
 404         or is_octal_integer_number(in_str)
 405         or is_binary_integer_number(in_str)
 406     )
 407
 408
 409 def is_hexidecimal_integer_number(in_str: str) -> bool:
 410     """
 411     Args:
 412         in_str: the string to test
 413
 414     Returns:
 415         True if the string is a hex integer number and False otherwise.
 416
 417     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 418     :meth:`is_octal_integer_number`, :meth:`is_binary_integer_number`, etc...
 419
 420     >>> is_hexidecimal_integer_number('0x12345')
 421     True
 422     >>> is_hexidecimal_integer_number('0x1A3E')
 423     True
 424     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 425     False
 426     >>> is_hexidecimal_integer_number('-0xff')
 427     True
 428     >>> is_hexidecimal_integer_number('test')
 429     False
 430     >>> is_hexidecimal_integer_number(12345)  # Not a string
 431     Traceback (most recent call last):
 432     ...
 433     ValueError: 12345
 434     >>> is_hexidecimal_integer_number(101.4)
 435     Traceback (most recent call last):
 436     ...
 437     ValueError: 101.4
 438     >>> is_hexidecimal_integer_number(0x1A3E)
 439     Traceback (most recent call last):
 440     ...
 441     ValueError: 6718
 442     """
 443     if not is_string(in_str):
 444         raise ValueError(in_str)
 445     return HEX_NUMBER_RE.match(in_str) is not None
 446
 447
 448 def is_octal_integer_number(in_str: str) -> bool:
 449     """
 450     Args:
 451         in_str: the string to test
 452
 453     Returns:
 454         True if the string is a valid octal integral number and False otherwise.
 455
 456     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 457     :meth:`is_hexidecimal_integer_number`, :meth:`is_binary_integer_number`,
 458     etc...
 459
 460     >>> is_octal_integer_number('0o777')
 461     True
 462     >>> is_octal_integer_number('-0O115')
 463     True
 464     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 465     False
 466     >>> is_octal_integer_number('7777')  # Needs 0o
 467     False
 468     >>> is_octal_integer_number('test')
 469     False
 470     """
 471     if not is_string(in_str):
 472         raise ValueError(in_str)
 473     return OCT_NUMBER_RE.match(in_str) is not None
 474
 475
 476 def is_binary_integer_number(in_str: str) -> bool:
 477     """
 478     Args:
 479         in_str: the string to test
 480
 481     Returns:
 482         True if the string contains a binary integral number and False otherwise.
 483
 484     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 485     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 486     etc...
 487
 488     >>> is_binary_integer_number('0b10111')
 489     True
 490     >>> is_binary_integer_number('-0b111')
 491     True
 492     >>> is_binary_integer_number('0B10101')
 493     True
 494     >>> is_binary_integer_number('0b10102')
 495     False
 496     >>> is_binary_integer_number('0xFFF')
 497     False
 498     >>> is_binary_integer_number('test')
 499     False
 500     """
 501     if not is_string(in_str):
 502         raise ValueError(in_str)
 503     return BIN_NUMBER_RE.match(in_str) is not None
 504
 505
 506 def to_int(in_str: str) -> int:
 507     """
 508     Args:
 509         in_str: the string to convert
 510
 511     Returns:
 512         The integral value of the string or raises on error.
 513
 514     See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
 515     :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
 516     :meth:`is_binary_integer_number`, etc...
 517
 518     >>> to_int('1234')
 519     1234
 520     >>> to_int('0x1234')
 521     4660
 522     >>> to_int('0b01101')
 523     13
 524     >>> to_int('0o777')
 525     511
 526     >>> to_int('test')
 527     Traceback (most recent call last):
 528     ...
 529     ValueError: invalid literal for int() with base 10: 'test'
 530     """
 531     if not is_string(in_str):
 532         raise ValueError(in_str)
 533     if is_binary_integer_number(in_str):
 534         return int(in_str, 2)
 535     if is_octal_integer_number(in_str):
 536         return int(in_str, 8)
 537     if is_hexidecimal_integer_number(in_str):
 538         return int(in_str, 16)
 539     return int(in_str)
 540
 541
 542 def number_string_to_integer(in_str: str) -> int:
 543     """Convert a string containing a written-out number into an int.
 544
 545     Args:
 546         in_str: the string containing the long-hand written out integer number
 547             in English.  See examples below.
 548
 549     Returns:
 550         The integer whose value was parsed from in_str.
 551
 552     See also :meth:`integer_to_number_string`.
 553
 554     .. warning::
 555         This code only handles integers; it will not work with decimals / floats.
 556
 557     >>> number_string_to_integer("one hundred fifty two")
 558     152
 559
 560     >>> number_string_to_integer("ten billion two hundred million fifty four thousand three")
 561     10200054003
 562
 563     >>> number_string_to_integer("four-score and 7")
 564     87
 565
 566     >>> number_string_to_integer("fifty xyzzy three")
 567     Traceback (most recent call last):
 568     ...
 569     ValueError: Unknown word: xyzzy
 570     """
 571     if isinstance(in_str, int):
 572         return int(in_str)
 573
 574     current = result = 0
 575     in_str = in_str.replace('-', ' ')
 576     for w in in_str.split():
 577         if w not in NUM_WORDS:
 578             if is_integer_number(w):
 579                 current += int(w)
 580                 continue
 581             else:
 582                 raise ValueError("Unknown word: " + w)
 583         scale, increment = NUM_WORDS[w]
 584         current = current * scale + increment
 585         if scale > 100:
 586             result += current
 587             current = 0
 588     return result + current
 589
 590
 591 def integer_to_number_string(num: int) -> str:
 592     """
 593     Opposite of :meth:`number_string_to_integer`; converts a number to a written out
 594     longhand format in English.
 595
 596     Args:
 597         num: the integer number to convert
 598
 599     Returns:
 600         The long-hand written out English form of the number.  See examples below.
 601
 602     See also :meth:`number_string_to_integer`.
 603
 604     .. warning::
 605         This method does not handle decimals or floats, only ints.
 606
 607     >>> integer_to_number_string(9)
 608     'nine'
 609
 610     >>> integer_to_number_string(42)
 611     'forty two'
 612
 613     >>> integer_to_number_string(123219982)
 614     'one hundred twenty three million two hundred nineteen thousand nine hundred eighty two'
 615     """
 616
 617     if num < 20:
 618         return UNIT_WORDS[num]
 619     if num < 100:
 620         ret = TENS_WORDS[num // 10]
 621         leftover = num % 10
 622         if leftover != 0:
 623             ret += ' ' + UNIT_WORDS[leftover]
 624         return ret
 625
 626     # If num > 100 go find the highest chunk and convert that, then recursively
 627     # convert the rest.  NUM_WORDS contains items like 'thousand' -> (1000, 0).
 628     # The second item in the tuple is an increment that can be ignored; the first
 629     # is the numeric "scale" of the entry.  So find the greatest entry in NUM_WORDS
 630     # still less than num.  For 123,456 it would be thousand.  Then pull out the
 631     # 123, convert it, and append "thousand".  Then do the rest.
 632     scales = {}
 633     for name, val in NUM_WORDS.items():
 634         if val[0] <= num:
 635             scales[name] = val[0]
 636     scale = max(scales.items(), key=lambda _: _[1])
 637
 638     # scale[1] = numeric magnitude (e.g. 1000)
 639     # scale[0] = name (e.g. "thousand")
 640     ret = integer_to_number_string(num // scale[1]) + ' ' + scale[0]
 641     leftover = num % scale[1]
 642     if leftover != 0:
 643         ret += ' ' + integer_to_number_string(leftover)
 644     return ret
 645
 646
 647 def is_decimal_number(in_str: str) -> bool:
 648     """
 649     Args:
 650         in_str: the string to check
 651
 652     Returns:
 653         True if the given string represents a decimal or False
 654         otherwise.  A decimal may be signed or unsigned or use
 655         a "scientific notation".
 656
 657     See also :meth:`is_integer_number`.
 658
 659     .. note::
 660         We do not consider integers without a decimal point
 661         to be decimals; they return False (see example).
 662
 663     >>> is_decimal_number('42.0')
 664     True
 665     >>> is_decimal_number('42')
 666     False
 667     """
 668     return is_number(in_str) and "." in in_str
 669
 670
 671 def strip_escape_sequences(in_str: str) -> str:
 672     """
 673     Args:
 674         in_str: the string to strip of escape sequences.
 675
 676     Returns:
 677         in_str with escape sequences removed.
 678
 679     See also: :mod:`pyutils.ansi`.
 680
 681     .. note::
 682         What is considered to be an "escape sequence" is defined
 683         by a regular expression.  While this gets common ones,
 684         there may exist valid sequences that it doesn't match.
 685
 686     >>> strip_escape_sequences('\x1B[12;11;22mthis is a test!')
 687     'this is a test!'
 688     """
 689     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 690     return in_str
 691
 692
 693 def add_thousands_separator(
 694     in_str: str, *, separator_char: str = ',', places: int = 3
 695 ) -> str:
 696     """
 697     Args:
 698         in_str: string or number to which to add thousands separator(s)
 699         separator_char: the separator character to add (defaults to comma)
 700         places: add a separator every N places (defaults to three)
 701
 702     Returns:
 703         A numeric string with thousands separators added appropriately.
 704
 705     >>> add_thousands_separator('12345678')
 706     '12,345,678'
 707     >>> add_thousands_separator(12345678)
 708     '12,345,678'
 709     >>> add_thousands_separator(12345678.99)
 710     '12,345,678.99'
 711     >>> add_thousands_separator('test')
 712     Traceback (most recent call last):
 713     ...
 714     ValueError: test
 715
 716     """
 717     if isinstance(in_str, numbers.Number):
 718         in_str = f'{in_str}'
 719     if is_number(in_str):
 720         return _add_thousands_separator(
 721             in_str, separator_char=separator_char, places=places
 722         )
 723     raise ValueError(in_str)
 724
 725
 726 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 727     """Internal helper"""
 728     decimal_part = ""
 729     if '.' in in_str:
 730         (in_str, decimal_part) = in_str.split('.')
 731     tmp = [iter(in_str[::-1])] * places
 732     ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 733     if len(decimal_part) > 0:
 734         ret += '.'
 735         ret += decimal_part
 736     return ret
 737
 738
 739 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 740     """
 741     Args:
 742         in_str: the string to test
 743         allowed_schemes: an optional list of allowed schemes (e.g.
 744             ['http', 'https', 'ftp'].  If passed, only URLs that
 745             begin with the one of the schemes passed will be considered
 746             to be valid.  Otherwise, any scheme:// will be considered
 747             valid.
 748
 749     Returns:
 750         True if in_str contains a valid URL and False otherwise.
 751
 752     >>> is_url('http://www.mysite.com')
 753     True
 754     >>> is_url('https://mysite.com')
 755     True
 756     >>> is_url('.mysite.com')
 757     False
 758     >>> is_url('scheme://username:[email protected]:8042/folder/subfolder/file.extension?param=value&param2=value2#hash')
 759     True
 760     """
 761     if not is_full_string(in_str):
 762         return False
 763
 764     valid = URL_RE.match(in_str) is not None
 765
 766     if allowed_schemes:
 767         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 768     return valid
 769
 770
 771 def is_email(in_str: Any) -> bool:
 772     """
 773     Args:
 774         in_str: the email address to check
 775
 776     Returns: True if the in_str contains a valid email (as defined by
 777         https://tools.ietf.org/html/rfc3696#section-3) or False
 778         otherwise.
 779
 780     >>> is_email('[email protected]')
 781     True
 782     >>> is_email('@gmail.com')
 783     False
 784     """
 785     if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
 786         return False
 787
 788     try:
 789         # we expect 2 tokens, one before "@" and one after, otherwise
 790         # we have an exception and the email is not valid.
 791         head, tail = in_str.split("@")
 792
 793         # head's size must be <= 64, tail <= 255, head must not start
 794         # with a dot or contain multiple consecutive dots.
 795         if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
 796             return False
 797
 798         # removes escaped spaces, so that later on the test regex will
 799         # accept the string.
 800         head = head.replace("\\ ", "")
 801         if head.startswith('"') and head.endswith('"'):
 802             head = head.replace(" ", "")[1:-1]
 803         return EMAIL_RE.match(head + "@" + tail) is not None
 804
 805     except ValueError:
 806         # borderline case in which we have multiple "@" signs but the
 807         # head part is correctly escaped.
 808         if ESCAPED_AT_SIGN.search(in_str) is not None:
 809             # replace "@" with "a" in the head
 810             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 811         return False
 812
 813
 814 def suffix_string_to_number(in_str: str) -> Optional[int]:
 815     """Takes a string like "33Gb" and converts it into a number (of bytes)
 816     like 34603008.
 817
 818     Args:
 819         in_str: the string with a suffix to be interpreted and removed.
 820
 821     Returns:
 822         An integer number of bytes or None to indicate an error.
 823
 824     See also :meth:`number_to_suffix_string`.
 825
 826     >>> suffix_string_to_number('1Mb')
 827     1048576
 828     >>> suffix_string_to_number('13.1Gb')
 829     14066017894
 830     >>> suffix_string_to_number('12345')
 831     12345
 832     >>> x = suffix_string_to_number('a lot')
 833     >>> x is None
 834     True
 835     """
 836
 837     def suffix_capitalize(s: str) -> str:
 838         if len(s) == 1:
 839             return s.upper()
 840         elif len(s) == 2:
 841             return f"{s[0].upper()}{s[1].lower()}"
 842         return suffix_capitalize(s[0:1])
 843
 844     if is_string(in_str):
 845         if is_integer_number(in_str):
 846             return to_int(in_str)
 847         suffixes = [in_str[-2:], in_str[-1:]]
 848         rest = [in_str[:-2], in_str[:-1]]
 849         for x in range(len(suffixes)):
 850             s = suffixes[x]
 851             s = suffix_capitalize(s)
 852             multiplier = NUM_SUFFIXES.get(s, None)
 853             if multiplier is not None:
 854                 r = rest[x]
 855                 if is_integer_number(r):
 856                     return to_int(r) * multiplier
 857                 if is_decimal_number(r):
 858                     return int(float(r) * multiplier)
 859     return None
 860
 861
 862 def number_to_suffix_string(num: int) -> Optional[str]:
 863     """Take a number (of bytes) and returns a string like "43.8Gb".
 864
 865     Args:
 866         num: an integer number of bytes
 867
 868     Returns:
 869         A string with a suffix representing num bytes concisely or
 870         None to indicate an error.
 871
 872     See also: :meth:`suffix_string_to_number`.
 873
 874     >>> number_to_suffix_string(14066017894)
 875     '13.1Gb'
 876     >>> number_to_suffix_string(1024 * 1024)
 877     '1.0Mb'
 878     """
 879     d = 0.0
 880     suffix = None
 881     for (sfx, size) in NUM_SUFFIXES.items():
 882         if num >= size:
 883             d = num / size
 884             suffix = sfx
 885             break
 886     if suffix is not None:
 887         return f"{d:.1f}{suffix}"
 888     else:
 889         return f'{num:d}'
 890
 891
 892 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 893     """
 894     Args:
 895         in_str: a string to check
 896         card_type: if provided, contains the card type to validate
 897             with.  Otherwise, all known credit card number types will
 898             be accepted.
 899
 900             Supported card types are the following:
 901
 902             * VISA
 903             * MASTERCARD
 904             * AMERICAN_EXPRESS
 905             * DINERS_CLUB
 906             * DISCOVER
 907             * JCB
 908
 909     Returns:
 910         True if in_str is a valid credit card number.
 911
 912     .. warning::
 913         This code is not verifying the authenticity of the credit card (i.e.
 914         not checking whether it's a real card that can be charged); rather
 915         it's only checking that the number follows the "rules" for numbering
 916         established by credit card issuers.
 917
 918     """
 919     if not is_full_string(in_str):
 920         return False
 921
 922     if card_type is not None:
 923         if card_type not in CREDIT_CARDS:
 924             raise KeyError(
 925                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 926             )
 927         return CREDIT_CARDS[card_type].match(in_str) is not None
 928     for c in CREDIT_CARDS:
 929         if CREDIT_CARDS[c].match(in_str) is not None:
 930             return True
 931     return False
 932
 933
 934 def is_camel_case(in_str: Any) -> bool:
 935     """
 936     Args:
 937         in_str: the string to test
 938
 939     Returns:
 940         True if the string is formatted as camel case and False otherwise.
 941         A string is considered camel case when:
 942
 943         * it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 944         * it contains both lowercase and uppercase letters
 945         * it does not start with a number
 946
 947     See also :meth:`is_snake_case`, :meth:`is_slug`, and :meth:`camel_case_to_snake_case`.
 948     """
 949     return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 950
 951
 952 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 953     """
 954     Args:
 955         in_str: the string to test
 956         separator: the snake case separator character to use
 957
 958     Returns: True if the string is snake case and False otherwise.  A
 959         string is considered snake case when:
 960
 961         * it's composed only by lowercase/uppercase letters and digits
 962         * it contains at least one underscore (or provided separator)
 963         * it does not start with a number
 964
 965     See also :meth:`is_camel_case`, :meth:`is_slug`, and :meth:`snake_case_to_camel_case`.
 966
 967     >>> is_snake_case('this_is_a_test')
 968     True
 969     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 970     True
 971     >>> is_snake_case('this-is-a-test')
 972     False
 973     >>> is_snake_case('this-is-a-test', separator='-')
 974     True
 975     """
 976     if is_full_string(in_str):
 977         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 978         re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 979         r = re_map.get(
 980             separator,
 981             re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
 982         )
 983         return r.match(in_str) is not None
 984     return False
 985
 986
 987 def is_json(in_str: Any) -> bool:
 988     """
 989     Args:
 990         in_str: the string to test
 991
 992     Returns:
 993         True if the in_str contains valid JSON and False otherwise.
 994
 995     >>> is_json('{"name": "Peter"}')
 996     True
 997     >>> is_json('[1, 2, 3]')
 998     True
 999     >>> is_json('{nope}')
1000     False
1001     """
1002     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
1003         try:
1004             return isinstance(json.loads(in_str), (dict, list))
1005         except (TypeError, ValueError, OverflowError):
1006             pass
1007     return False
1008
1009
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Args:
        in_str: the value to test; it is converted with ``str()`` first so
            that UUID objects may be passed directly
        allow_hex: when True the value is checked against
            ``UUID_HEX_OK_RE`` instead of ``UUID_RE`` (see the dash-less
            example below)

    Returns:
        True if in_str is a valid UUID and False otherwise.

    See also :meth:`generate_uuid`.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    candidate = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(candidate) is not None
1033
1034
def is_ip_v4(in_str: Any) -> bool:
    """
    Args:
        in_str: the value to examine

    Returns:
        True if in_str is a valid IPv4 address and False otherwise.

    See also :meth:`extract_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('nope')
    False
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str):
        return False
    if SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # The shallow regex only checks the overall shape; every dotted
    # component must additionally fall within 0..255.
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
1061
1062
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the text to search for an IPv4 address.

    Returns:
        The first substring of in_str matched by ``ANYWHERE_IP_V4_RE``,
        or None when in_str fails ``is_full_string`` or nothing matches.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if is_full_string(in_str):
        found = ANYWHERE_IP_V4_RE.search(in_str)
        if found is not None:
            return found.group(0)
    return None
1085
1086
def is_ip_v6(in_str: Any) -> bool:
    """
    Args:
        in_str: the value to examine.

    Returns:
        True if in_str is a valid IPv6 address (per ``IP_V6_RE``) and
        False otherwise.

    See also :meth:`is_ip_v4`, :meth:`extract_ip_v4`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
1104
1105
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the text to search for an IPv6 address.

    Returns:
        The first substring of in_str matched by ``ANYWHERE_IP_V6_RE``,
        or None when in_str fails ``is_full_string`` or nothing matches.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v4`,
    and :meth:`is_ip`.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if is_full_string(in_str):
        found = ANYWHERE_IP_V6_RE.search(in_str)
        if found is not None:
            return found.group(0)
    return None
1128
1129
def is_ip(in_str: Any) -> bool:
    """
    Args:
        in_str: the value to examine.

    Returns:
        True if in_str is a valid IP address of either family.  The
        IPv6 check runs first, then the IPv4 check.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3')
    False
    >>> is_ip('1.2.3.999')
    False
    """
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
1152
1153
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the text to search for an IP address.

    Returns:
        The result of :meth:`extract_ip_v4` if it finds an address;
        otherwise whatever :meth:`extract_ip_v6` returns (which is None
        when no address of either family is present).

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4 = extract_ip_v4(in_str)
    if v4 is not None:
        return v4
    return extract_ip_v6(in_str)
1176
1177
def is_mac_address(in_str: Any) -> bool:
    """
    Args:
        in_str: the value to examine

    Returns:
        True if in_str is a valid MAC address (per ``MAC_ADDRESS_RE``)
        and False otherwise.

    See also :meth:`extract_mac_address`, :meth:`is_ip`, etc...

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
1198
1199
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract a MAC address.
        separator: the MAC address hex byte separator to use in the
            returned value; any ``:`` or ``-`` in the extracted address
            is rewritten to this string.

    Returns:
        The first MAC address found in in_str (normalized to use
        separator between bytes) or None to indicate no match or an
        error.

    See also :meth:`is_mac_address`, :meth:`is_ip`, and :meth:`extract_ip`.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F', separator='-')
    '34-29-8F-12-0D-2F'
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None

    # str objects are immutable: str.replace()/strip() return NEW strings,
    # so the result must be kept.  translate() rewrites both separator
    # styles in a single pass, which also stays correct if the requested
    # separator itself contains ':' or '-'.
    return m.group(0).translate(str.maketrans({":": separator, "-": separator}))
1228
1229
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Args:
        in_str: string to test
        separator: the slug separator to use (treated as one literal
            unit, even if it is several characters long)

    Returns:
        True if in_str is a slug string and False otherwise.  A slug
        starts and ends with a lowercase letter or digit and otherwise
        contains only lowercase letters, digits and the separator.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`slugify`.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    >>> is_slug('trailing-dash-')
    False
    """
    if not is_full_string(in_str):
        return False
    if separator:
        # One or more alphanumeric runs joined by runs of the separator.
        # Runs of [a-z\d] and of the separator cannot overlap, so this
        # matches in linear time; the previous nested-quantifier pattern
        # "([a-z\d]+sep*?)*" backtracked exponentially on long
        # non-matching input.
        rex = r"^[a-z\d]+(?:(?:" + re.escape(separator) + r")+[a-z\d]+)*$"
    else:
        # No separator at all: a slug is just one alphanumeric run.
        rex = r"^[a-z\d]+$"
    return re.match(rex, in_str) is not None
1250
1251
def contains_html(in_str: str) -> bool:
    """
    Args:
        in_str: the text to scan for tags

    Returns:
        True if ``HTML_RE`` finds an HTML/XML tag anywhere in the given
        string and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`strip_html`.

    .. warning::
        By design, this function matches ANY type of tag, so don't expect
        to use it as an HTML validator.  It's a quick sanity check at
        best.  See something like BeautifulSoup for a more full-featured
        HTML parser.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False

    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
1278
1279
def words_count(in_str: str) -> int:
    """
    Args:
        in_str: the text whose words should be counted

    Returns:
        The number of matches ``WORDS_COUNT_RE`` finds in in_str.

    Raises:
        ValueError: if in_str is not a string.

    .. note::
        This method is "smart" in that it only considers sequences
        of one or more letters and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if is_string(in_str):
        matches = WORDS_COUNT_RE.findall(in_str)
        return len(matches)
    raise ValueError(in_str)
1304
1305
def word_count(in_str: str) -> int:
    """
    Alias for :meth:`words_count`; the argument is forwarded unchanged.

    Args:
        in_str: the text whose words should be counted

    Returns:
        The number of words contained in the given string.

    .. note::
        This method is "smart" in that it only considers sequences
        of one or more letters and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> word_count('hello world')
    2
    >>> word_count('one,two,three.stop')
    4
    """
    total = words_count(in_str)
    return total
1328
1329
def generate_uuid(omit_dashes: bool = False) -> str:
    """
    Args:
        omit_dashes: when True return the UUID's ``hex`` form (no
            dashes); otherwise return its ``str()`` form.

    Returns:
        A freshly generated UUID string (produced by ``uuid4()``), with
        or without dashes per the omit_dashes arg.

    See also :meth:`is_uuid`, :meth:`generate_random_alphanumeric_string`.

    generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
    generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
    """
    fresh = uuid4()
    return fresh.hex if omit_dashes else str(fresh)
1348
1349
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Args:
        size: how many characters the result should contain

    Returns:
        A string of exactly size characters, each drawn independently
        with ``random.choice`` from the ASCII letters (both cases) and
        the decimal digits.

    Raises:
        ValueError: if size is less than one.

    See also :meth:`asciify`, :meth:`generate_uuid`.

    >>> random.seed(22)
    >>> generate_random_alphanumeric_string(9)
    '96ipbNClS'
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    # One random.choice() call per character, in order, so that a seeded
    # generator keeps producing the same output.
    return "".join([random.choice(alphabet) for _ in range(size)])
1370
1371
def reverse(in_str: str) -> str:
    """
    Args:
        in_str: the text to reverse

    Returns:
        The characters of in_str in the opposite order.

    Raises:
        ValueError: if in_str is not a string.

    >>> reverse('test')
    'tset'
    """
    if is_string(in_str):
        return "".join(reversed(in_str))
    raise ValueError(in_str)
1386
1387
def camel_case_to_snake_case(in_str: str, *, separator: str = "_") -> str:
    """
    Args:
        in_str: the camel case string to convert
        separator: the snake case separator character to use

    Returns:
        A snake case string equivalent to the camel case input, or the
        original string unchanged if :meth:`is_camel_case` rejects it.
        Note that the whole result is lowercased, including any letters
        in separator.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str

    def _append_separator(match) -> str:
        # Keep the matched text and add the separator after it.
        return match.group(1) + separator

    return CAMEL_CASE_REPLACE_RE.sub(_append_separator, in_str).lower()
1411
1412
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Args:
        in_str: the snake case string to convert
        upper_case_first: when False the first token of the result is
            lowercased
        separator: the separator the input uses between tokens

    Returns:
        A camel case string built by title-casing every non-blank token
        of in_str and gluing the tokens together, or in_str unchanged
        if :meth:`is_snake_case` rejects it for the given separator.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str

    words = []
    for piece in in_str.split(separator):
        # Leading/trailing/doubled separators yield blank pieces; skip them.
        if is_full_string(piece):
            words.append(piece.title())
    if not upper_case_first:
        words[0] = words[0].lower()
    return from_char_list(words)
1442
1443
def to_char_list(in_str: str) -> List[str]:
    """
    Args:
        in_str: the text to break apart

    Returns:
        A list holding each character of in_str as its own
        one-character string, or an empty list if in_str is not a
        string.

    See also :meth:`from_char_list`.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    return list(in_str) if is_string(in_str) else []
1460
1461
def from_char_list(in_list: List[str]) -> str:
    """
    Args:
        in_list: the strings (typically single characters) to glue
            together.

    Returns:
        The concatenation, in order and with nothing in between, of
        every item in in_list.

    See also :meth:`to_char_list`.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
1477
1478
def shuffle(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: the text whose characters should be shuffled

    Returns:
        A new string holding the same characters as in_str in an order
        chosen by ``random.shuffle``.  No attempt is made to guarantee
        the result differs from the input.  Returns None if in_str is
        not a string.

    >>> random.seed(22)
    >>> shuffle('awesome')
    'meosaew'
    """
    if not is_string(in_str):
        return None
    letters = list(in_str)
    random.shuffle(letters)
    return "".join(letters)
1499
1500
def scramble(in_str: str) -> Optional[str]:
    """
    Alias for :meth:`shuffle`; the argument is forwarded unchanged.

    Args:
        in_str: the text whose characters should be shuffled

    Returns:
        A new string containing the same characters as in_str in a
        randomized order (which, rarely, may equal the original since
        no check is done), or None to indicate error conditions.

    See also :mod:`pyutils.unscrambler`.

    >>> random.seed(22)
    >>> scramble('awesome')
    'meosaew'
    """
    shuffled = shuffle(in_str)
    return shuffled
1519
1520
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Args:
        in_str: the text to remove tags from
        keep_tag_content: when True only the tags themselves are removed
            (``HTML_TAG_ONLY_RE``); when False the text enclosed by the
            tags goes too (``HTML_RE``).

    Returns:
        in_str with every match of the selected pattern deleted.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`contains_html`.

    .. note::
        This method uses simple regular expressions to strip tags and is
        not a full fledged HTML parser by any means.  Consider using
        something like BeautifulSoup if your needs are more than this
        simple code can fulfill.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if keep_tag_content:
        return HTML_TAG_ONLY_RE.sub("", in_str)
    return HTML_RE.sub("", in_str)
1548
1549
def asciify(in_str: str) -> str:
    """
    Args:
        in_str: the text to reduce to ASCII.

    Returns:
        A roughly equivalent string containing only ASCII characters.
        Non-ASCII characters are translated to their closest ASCII
        representation where one exists (eg: ó -> o, Ë -> E, ç -> c...).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`to_ascii`, :meth:`generate_random_alphanumeric_string`.

    .. warning::
        Some chars may be lost if impossible to translate.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # 1. NFKD normalization splits accented characters into a base
    #    character plus combining marks.
    # 2. Encoding to ASCII with errors ignored silently drops whatever
    #    cannot be represented (the combining marks, among others).
    # 3. The surviving bytes are decoded back into a str.
    decomposed = unicodedata.normalize("NFKD", in_str)
    return decomposed.encode("ascii", "ignore").decode("utf-8")
1582
1583
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Args:
        in_str: the string to slugify
        separator: the string placed between words (default is a dash);
            it is inserted literally

    Returns:
        The converted string.  The returned string has the following properties:

        * it has no spaces
        * all letters are in lower case
        * all punctuation signs and non alphanumeric chars are removed
        * words are divided using provided separator
        * every word is encoded as ascii (by using :meth:`asciify`)
        * is safe for URL

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_slug` and :meth:`asciify`.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    >>> slugify('Hello 日本 World')
    'hello-world'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces,
    # then break the text into its whitespace-delimited words
    words = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).split()

    # asciify BEFORE joining: a word made only of untranslatable chars
    # becomes empty and must be dropped, otherwise it would leave a
    # doubled, leading or trailing separator behind.
    ascii_words = [asciify(word) for word in words]

    # str.join inserts the separator verbatim, so separators that contain
    # regex metacharacters or backslashes, or that are several characters
    # long, need no special treatment.
    return separator.join(word for word in ascii_words if word)
1620
1621
def to_bool(in_str: str) -> bool:
    """
    Args:
        in_str: the text to interpret as a boolean

    Returns:
        True if the lowercased string is exactly one of the following
        and False for any other string:

        * "true"
        * "t"
        * "1"
        * "yes"
        * "y"
        * "on"

    Raises:
        ValueError: if in_str is not a string.

    See also :mod:`pyutils.argparse_utils`.

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    affirmative = {"true", "1", "yes", "y", "t", "on"}
    return in_str.lower() in affirmative
1664
1665
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Args:
        in_str: the text to parse as a date

    Returns:
        The datetime.date parsed from in_str, or None if the parser
        raises ``ParseException``.  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`extract_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> to_date('9/11/2001')
    datetime.date(2001, 9, 11)
    >>> to_date('xyzzy')
    """
    import pyutils.datetimes.dateparse_utils as du

    try:
        parser = du.DateParser()  # type: ignore
        parser.parse(in_str)
        result = parser.get_date()
    except du.ParseException:  # type: ignore
        result = None
    return result
1692
1693
def extract_date(in_str: Any) -> Optional[datetime.datetime]:
    """Finds and extracts a date from the string, if possible.

    The input is split on whitespace and every run of 5, then 4, then 3,
    then 2 consecutive tokens is handed to the date parser; the first
    run that parses wins.  Single tokens are never tried on their own.

    Args:
        in_str: the string to extract a date from

    Returns:
        a datetime if date was found, otherwise None (including when
        in_str is not a string)

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> extract_date("filename.txt    dec 13, 2022")
    datetime.datetime(2022, 12, 13, 0, 0)

    >>> extract_date("Dear Santa, please get me a pony.")

    >>> extract_date(None)

    """
    import pyutils.datetimes.dateparse_utils as du

    # in_str is typed Any: reject non-strings up front instead of letting
    # .split() raise AttributeError, matching the other extract_* helpers
    # which return None for unusable input.
    if not is_string(in_str):
        return None

    d = du.DateParser()  # type: ignore
    chunks = in_str.split()
    # Longest windows first so that the most specific expression wins.
    for size in (5, 4, 3, 2):
        for ngram in list_utils.ngrams(chunks, size):
            try:
                expr = " ".join(ngram)
                logger.debug("Trying %s", expr)
                if d.parse(expr):
                    return d.get_datetime()
            except du.ParseException:  # type: ignore
                pass
    return None
1732
1733
def is_valid_date(in_str: str) -> bool:
    """
    Args:
        in_str: the text to check

    Returns:
        True if the date parser accepts in_str without raising
        ``ParseException`` and False otherwise.  This parser is
        relatively clever; see :class:`datetimes.dateparse_utils` docs
        for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> is_valid_date('1/2/2022')
    True
    >>> is_valid_date('christmas')
    True
    >>> is_valid_date('next wednesday')
    True
    >>> is_valid_date('xyzzy')
    False
    """
    import pyutils.datetimes.dateparse_utils as dp

    try:
        # Only whether parsing succeeds matters; the parsed value is unused.
        dp.DateParser().parse(in_str)  # type: ignore
    except dp.ParseException:  # type: ignore
        return False
    return True
1765
1766
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Args:
        in_str: the text to parse as a datetime

    Returns:
        The python datetime parsed from in_str, or None if parsing
        raises any exception or yields something that is not a
        ``datetime.datetime``.  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`valid_datetime`.

    >>> to_datetime('7/20/1969 02:56 GMT')
    datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
    """
    import pyutils.datetimes.dateparse_utils as dp

    try:
        parsed = dp.DateParser().parse(in_str)  # type: ignore
    except Exception:
        # Best effort by design: every failure is reported as None.
        return None
    return parsed if isinstance(parsed, datetime.datetime) else None
1793
1794
def valid_datetime(in_str: str) -> bool:
    """
    Args:
        in_str: the text to check

    Returns:
        True if :meth:`to_datetime` can turn in_str into a datetime and
        False otherwise.  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    >>> valid_datetime('next wednesday at noon')
    True
    >>> valid_datetime('3 weeks ago at midnight')
    True
    >>> valid_datetime('next easter at 5:00 am')
    True
    >>> valid_datetime('sometime soon')
    False
    """
    return to_datetime(in_str) is not None
1818
1819
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Args:
        in_str: the string to squeeze
        character_to_squeeze: the character (or multi-character
            sequence) to remove runs of more than one in a row
            (default = space)

    Returns: A "squeezed string" where every run of one or more
        consecutive occurrences of character_to_squeeze is replaced by
        a single, literal occurrence.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    """
    run = r'(?:' + re.escape(character_to_squeeze) + r')+'
    # A callable replacement is inserted verbatim.  Passing the string
    # itself would have re.sub interpret it as a template, which raises
    # on a lone backslash and turns a literal "\n" into a newline.
    return re.sub(run, lambda _match: character_to_squeeze, in_str)
1842
1843
def dedent(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: the text to dedent

    Returns:
        in_str with the leading margin (whatever ``MARGIN_RE`` matches)
        removed from each of its newline-separated lines, or None if
        in_str is not a string.

    See also :meth:`indent`.

    >>> dedent('\t\ttest\\n\t\ting')
    'test\\ning'
    """
    if not is_string(in_str):
        return None
    newline = "\n"
    trimmed = [MARGIN_RE.sub("", row) for row in in_str.split(newline)]
    return newline.join(trimmed)
1862
1863
def indent(in_str: str, amount: int) -> str:
    """
    Args:
        in_str: the text to indent
        amount: how many spaces to put in front of each line

    Returns:
        in_str with amount spaces prepended to every newline-separated
        line (empty lines included).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`dedent`.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    padding = " " * amount
    newline = "\n"
    return newline.join(padding + row for row in in_str.split(newline))
1883
1884
1885 def _sprintf(*args, **kwargs) -> str:
1886     """Internal helper."""
1887     ret = ""
1888
1889     sep = kwargs.pop("sep", None)
1890     if sep is not None:
1891         if not isinstance(sep, str):
1892             raise TypeError("sep must be None or a string")
1893
1894     end = kwargs.pop("end", None)
1895     if end is not None:
1896         if not isinstance(end, str):
1897             raise TypeError("end must be None or a string")
1898
1899     if kwargs:
1900         raise TypeError("invalid keyword arguments to sprint()")
1901
1902     if sep is None:
1903         sep = " "
1904     if end is None:
1905         end = "\n"
1906     for n, arg in enumerate(args):
1907         if n:
1908             ret += sep
1909         if isinstance(arg, str):
1910             ret += arg
1911         else:
1912             ret += str(arg)
1913     ret += end
1914     return ret
1915
1916
def strip_ansi_sequences(in_str: str) -> str:
    """
    Args:
        in_str: the string to strip

    Returns:
        in_str with recognized ANSI escape sequences removed.

    See also :mod:`pyutils.ansi`.

    .. warning::
        This method works by using a regular expression that
        recognizes CSI sequences ("ESC [" followed by parameter bytes,
        intermediate bytes and one final byte), e.g. colors, cursor
        movement and screen/line clearing.  Other kinds of escape
        sequences are left untouched; caveat emptor.

    >>> import ansi as a
    >>> s = a.fg('blue') + 'blue!' + a.reset()
    >>> len(s)   # '\x1b[38;5;21mblue!\x1b[m'
    18
    >>> len(strip_ansi_sequences(s))
    5
    >>> strip_ansi_sequences(s)
    'blue!'

    """
    # CSI layout: ESC '[', then parameter bytes 0x30-0x3F ("0-?"),
    # then intermediate bytes 0x20-0x2F (" -/"), then exactly one final
    # byte 0x40-0x7E ("@-~").  Accepting the whole final-byte range also
    # covers sequences that end in an uppercase letter such as ESC[2J.
    return re.sub(r'\x1b\[[0-?]*[ -/]*[@-~]', '', in_str)
1943
1944
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that diverts everything written to stdout inside
    its ``with`` block into an in-memory buffer instead of printing it.
    The value bound by ``as`` is a zero-argument callable that returns
    the text captured so far.

    >>> with SprintfStdout() as buf:
    ...     print("test")
    ...     print("1, 2, 3")
    ...
    >>> print(buf(), end='')
    test
    1, 2, 3
    """

    def __init__(self) -> None:
        # In-memory sink that receives the redirected output.
        self.destination = io.StringIO()
        # The active redirect; assigned when the context is entered.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()

        def read_captured() -> str:
            return self.destination.getvalue()

        return read_captured

    def __exit__(self, *args) -> Literal[False]:
        # Undo the redirect, then rewind the buffer to its beginning.
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # False: exceptions raised inside the with-block keep propagating.
        return False
1972
1973
def capitalize_first_letter(in_str: str) -> str:
    """
    Args:
        in_str: the string to capitalize

    Returns:
        in_str with the first character capitalized; the rest of the
        string is left untouched.  An empty string is returned as-is.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    # Slicing (rather than indexing with [0]) yields '' for an empty
    # string instead of raising IndexError.
    return in_str[:1].upper() + in_str[1:]
1988
1989
def it_they(n: int) -> str:
    """
    Args:
        n: the number of things being talked about

    Returns:
        'it' if n equals one, 'they' for every other value.

    See also :meth:`is_are`, :meth:`pluralize`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> it_they(1)
    'it'
    >>> it_they(100)
    'they'
    """
    return "it" if n == 1 else "they"
2015
2016
def is_are(n: int) -> str:
    """
    Args:
        n: how many of them are there?

    Returns:
        'is' when n is exactly one, 'are' for every other value.

    See also :meth:`it_they`, :meth:`pluralize`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
2043
2044
def pluralize(n: int) -> str:
    """
    Args:
        n: how many of them are there?

    Returns:
        '' when n is exactly one and 's' for every other value
        (including zero, as in "0 files").

    See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.
    """
    return "" if n == 1 else "s"
2074
2075
def make_contractions(txt: str) -> str:
    """Glue adjacent words in txt together to form (English)
    contractions.

    Args:
        txt: the input text to be contractionized.

    Returns:
        The input text with every recognized word pair replaced by
        its contraction; everything else is left as it was.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`pluralize`,
    :meth:`thify`.

    .. note::
        Substitutions run in a fixed order: first the irregular
        negations (can't, shan't, won't), then verb + "not", then
        pronoun / question word + auxiliary verb.  Matching is
        case-insensitive.

    >>> make_contractions('It is nice today.')
    "It's nice today."

    >>> make_contractions('I can    not even...')
    "I can't even..."

    >>> make_contractions('She could not see!')
    "She couldn't see!"

    >>> make_contractions('But she will not go.')
    "But she won't go."

    >>> make_contractions('Verily, I shall not.')
    "Verily, I shan't."

    >>> make_contractions('No you cannot.')
    "No you can't."

    >>> make_contractions('I said you can not go.')
    "I said you can't go."
    """
    ignore_case = re.IGNORECASE

    # Irregular negations are rewritten up front: can't, shan't, won't.
    irregular = (
        (r'\b(can)\s*no(t)\b', r"\1'\2"),
        (r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3"),
        (r'\b(w)ill\s*(n)(o)(t)\b', r"\1\3\2'\4"),
    )
    for regex, substitution in irregular:
        txt = re.sub(regex, substitution, txt, count=0, flags=ignore_case)

    negation = '(n)o(t)'

    # Each rule pairs a group of leading words with the trailing word
    # patterns they may absorb.  The parenthesized letters in a trailing
    # pattern are the ones that survive after the apostrophe.
    rules = (
        (
            (
                'are',
                'could',
                'did',
                'has',
                'have',
                'is',
                'must',
                'should',
                'was',
                'were',
                'would',
            ),
            (negation,),
        ),
        (
            (
                "I",
                "you",
                "he",
                "she",
                "it",
                "we",
                "they",
                "how",
                "why",
                "when",
                "where",
                "who",
                "there",
            ),
            ('woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'),
        ),
    )

    # there're / where're are valid English but sound weird; skip them.
    never_with_are = {'there', 'where'}

    for leaders, followers in rules:
        for leader in leaders:
            for follower in followers:
                if follower == 'a(re)' and leader in never_with_are:
                    continue

                # "not" keeps its 'n' before the apostrophe (isn't); the
                # other patterns keep only their tail after it (it's).
                if follower == negation:
                    substitution = r"\1\2'\3"
                else:
                    substitution = r"\1'\2"
                txt = re.sub(
                    fr'\b({leader})\s+{follower}\b',
                    substitution,
                    txt,
                    count=0,
                    flags=ignore_case,
                )

    return txt
2182
2183
def thify(n: int) -> str:
    """
    Args:
        n: the number that needs an ordinal suffix

    Returns:
        The proper English ordinal suffix for n: 'st', 'nd', 'rd'
        or 'th'.  Numbers ending in 11, 12 or 13 take 'th'.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`.

    Suggested usage::

        attempt_count = 0
        while True:
            attempt_count += 1
            if try_the_thing():
                break
            print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'
    >>> thify(21)
    'st'
    """
    digits = str(n)
    assert is_integer_number(digits)

    # The teens are irregular: 11th, 12th, 13th (and 111th, 212th, ...)
    # rather than 11st, 12nd, 13rd.
    if digits[-2:] in ("11", "12", "13"):
        return "th"

    # Otherwise only the final digit matters.
    return {"1": "st", "2": "nd", "3": "rd"}.get(digits[-1:], "th")
2221
2222
def ngrams(txt: str, n: int):
    """
    Args:
        txt: the string to create ngrams using
        n: how many words per ngram created?

    Returns:
        A generator that yields each ngram of the whitespace-separated
        words in txt as a single space-joined string.

    See also :meth:`ngrams_presplit`, :meth:`bigrams`, :meth:`trigrams`.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']
    """
    for gram in ngrams_presplit(txt.split(), n):
        # The words come from str.split() and so carry no whitespace
        # of their own; joining them with single spaces is all it takes.
        yield ' '.join(gram)
2243
2244
def ngrams_presplit(words: Sequence[str], n: int):
    """
    Same as :meth:`ngrams` but with the string pre-split.

    Args:
        words: the already separated words to build ngrams from
        n: how many words per ngram created?

    Returns:
        Whatever ``list_utils.ngrams(words, n)`` produces; this
        function delegates to it unchanged.

    See also :meth:`ngrams`, :meth:`bigrams`, :meth:`trigrams`.
    """
    return list_utils.ngrams(words, n)
2252
2253
def bigrams(txt: str):
    """Generates the bigrams (n=2) of the given string.

    Args:
        txt: the string to create bigrams using

    Returns:
        The result of ``ngrams(txt, 2)``: a generator of
        space-separated two word strings.

    See also :meth:`ngrams`, :meth:`trigrams`.

    >>> [x for x in bigrams('this is a test')]
    ['this is', 'is a', 'a test']
    """
    return ngrams(txt, 2)
2263
2264
def trigrams(txt: str):
    """Generates the trigrams (n=3) of the given string.

    Args:
        txt: the string to create trigrams using

    Returns:
        The result of ``ngrams(txt, 3)``: a generator of
        space-separated three word strings.

    See also :meth:`ngrams`, :meth:`bigrams`.
    """
    return ngrams(txt, 3)
2271
2272
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim: str = ''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.

    Args:
        input_lines: A sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: an iterable collection of numeric sequences that
            indicate one or more column numbers to copy to form the Nth
            position in the output list.  See example below.
        delim: for column_specs that indicate we should copy more than
            one column from the input into this position, use delim to
            separate source data.  Defaults to ''.

    Returns:
        A list of strings created by following the instructions set
        forth in column_specs.  Column text is copied verbatim: delim
        is only ever inserted between columns, and delimiter characters
        that are part of the column data itself are preserved.

    Raises:
        IndexError: if a column number is outside of input_lines.

    See also :meth:`shuffle_columns_into_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim='!',
    ... )
    ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']

    >>> shuffle_columns_into_list(['-x-', 'y'], [[0, 1]], delim='-')
    ['-x--y']
    """
    # Each spec lists the input columns that make up one output entry.
    # str.join places delim strictly between columns, so there is no
    # leading delimiter to strip afterwards; stripping would also eat
    # delimiter characters belonging to the data.
    return [delim.join(input_lines[n] for n in spec) for spec in column_specs]
2314
2315
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim: str = '',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: a sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: instructions for what dictionary keys to apply
            to individual or compound input column data.  See example
            below.
        delim: when forming compound output data by gluing more than
            one input column together, use this character to separate
            the source data.  Defaults to ''.

    Returns:
        A dict formed by applying the column_specs instructions.
        Column text is copied verbatim: delim is only ever inserted
        between columns, and delimiter characters that are part of the
        column data itself are preserved.  If a key appears more than
        once, the last spec wins.

    Raises:
        IndexError: if a column number is outside of input_lines.

    See also :meth:`shuffle_columns_into_list`, :meth:`interpolate_using_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim='!',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}

    >>> shuffle_columns_into_dict(['-x-', 'y'], [('k', [0, 1])], delim='-')
    {'k': '-x--y'}
    """
    # Each spec is ("key", [col1, col2...]).  str.join places delim
    # strictly between columns, so there is no leading delimiter to
    # strip afterwards; stripping would also eat delimiter characters
    # belonging to the data.
    return {
        spec[0]: delim.join(input_lines[n] for n in spec[1])
        for spec in column_specs
    }
2358
2359
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """
    Fill in the ``{name}`` placeholders of a template string with
    data from a dict.

    Args:
        txt: the mad libs template
        values: the text to substitute for each named placeholder.

    Returns:
        The template with its placeholders replaced via
        :meth:`str.format`.

    See also :meth:`shuffle_columns_into_list`, :meth:`shuffle_columns_into_dict`.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    filled_in = txt.format(**values)
    return _sprintf(filled_in, end='')
2375
2376
def to_ascii(txt: str) -> bytes:
    """
    Args:
        txt: the input data to encode.  A bytes object is also
            accepted and is returned unchanged.

    Returns:
        txt encoded as an ASCII byte string.

    Raises:
        TypeError: if txt is neither a str nor a bytes object.
        UnicodeEncodeError: if txt is a str containing non-ASCII
            characters.

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`to_bytes`,
    :meth:`generate_random_alphanumeric_string`, :meth:`asciify`.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(txt, bytes):
        return txt
    if isinstance(txt, str):
        return txt.encode('ascii')
    # TypeError is the conventional signal for a wrong argument type and,
    # being an Exception subclass, is still caught by existing handlers.
    raise TypeError('to_ascii works with strings and bytes')
2399
2400
def to_base64(
    txt: str, *, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> bytes:
    """
    Args:
        txt: the input data to encode
        encoding: the encoding to use during conversion
        errors: how to handle encoding errors

    Returns:
        txt converted to bytes using encoding / errors and then
        base64 encoded with :func:`base64.encodebytes` (note the
        trailing newline in the example below).

    See also :meth:`is_base64`, :meth:`to_ascii`, :meth:`to_bitstring`,
    :meth:`from_base64`.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
2421
2422
def is_base64(txt: str) -> bool:
    """
    Args:
        txt: the string (or bytes) to check

    Returns:
        True if txt consists only of characters from Python's
        standard base64 alphabet (A-Z, a-z, 0-9, '+' and '/'),
        optionally followed by up to two '=' padding characters.
        Surrounding whitespace and the line breaks that
        :meth:`to_base64` inserts into long output are ignored.

    See also :meth:`to_base64`, :meth:`from_base64`.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # So is padding.
    True

    >>> is_base64('aGVsbG8===')    # But never more than two.
    False

    """
    alphabet = set((string.ascii_letters + string.digits + '+/').encode('ascii'))
    data = to_ascii(txt.strip())

    # base64.encodebytes() emits a newline after every 76 characters of
    # output; those line breaks are formatting, not payload.
    data = data.replace(b'\n', b'').replace(b'\r', b'')

    # '=' is only legal as trailing padding and at most twice.
    payload = data.rstrip(b'=')
    if len(data) - len(payload) > 2:
        return False
    return all(byte in alphabet for byte in payload)
2451
2452
def from_base64(
    b64: bytes, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """
    Args:
        b64: bytestring of base64 encoded data to decode / convert.
        encoding: the encoding to use during conversion
        errors: how to handle encoding errors

    Returns:
        The data in b64 decoded with :func:`base64.decodebytes` and
        then converted to a normal python string using encoding /
        errors.

    See also :meth:`to_base64`, :meth:`is_base64`.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    raw = base64.decodebytes(b64)
    return raw.decode(encoding, errors)
2472
2473
def chunk(txt: str, chunk_size: int):
    """
    Args:
        txt: a string to be chunked into evenly spaced pieces.
        chunk_size: the size of each chunk to make

    Returns:
        A generator yielding consecutive chunk_size long slices of
        txt.  If the length of txt is not a multiple of chunk_size
        a warning is logged and raised via :mod:`warnings`, and the
        final slice is shorter than the others.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    total = len(txt)
    if total % chunk_size:
        msg = f'String to chunk\'s length ({len(txt)} is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    yield from (
        txt[start : start + chunk_size] for start in range(0, total, chunk_size)
    )
2492
2493
def to_bitstring(txt: str, *, delimiter: str = '') -> str:
    """
    Args:
        txt: the string (or bytes) to convert into a bitstring
        delimiter: character to insert between adjacent bytes.  Note that
            only bitstrings with delimiter='' are interpretable by
            :meth:`from_bitstring`.

    Returns:
        txt converted to ASCII bytes with every byte rendered as
        exactly eight binary digits, including leading NUL bytes.
        Empty input yields '00000000'.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    >>> to_bitstring(b'\\x00a')    # leading NUL bytes are kept
    '0000000001100001'
    """
    etxt = to_ascii(txt)
    if not etxt:
        # Kept for backward compatibility: empty input has always been
        # rendered as a single zero byte.
        return '00000000'
    # Formatting byte by byte (instead of going through one big integer)
    # keeps leading zero bytes, which an integer cannot represent.
    return delimiter.join(f'{byte:08b}' for byte in etxt)
2521
2522
def is_bitstring(txt: str) -> bool:
    """
    Args:
        txt: the string to check

    Returns:
        True if txt is a recognized bitstring and False otherwise.
        The check is made by prefixing txt with '0b' and asking
        ``is_binary_integer_number`` about the result.  Note that
        if delimiter is non empty this code will not recognize the
        bitstring.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`to_bitstring`,
    :meth:`chunk`.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    return is_binary_integer_number(f'0b{txt}')
2543
2544
def from_bitstring(
    bits: str, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """
    Args:
        bits: the bitstring to convert back into a python string
        encoding: the encoding to use during conversion
        errors: how to handle encoding errors

    Returns:
        The regular python string represented by bits.  Leading
        all-zero bytes in bits are decoded as NUL characters rather
        than being dropped.  Note that this code does not work with
        to_bitstring when delimiter is non-empty.

    Raises:
        ValueError: if bits is not a valid base 2 number.

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    >>> len(from_bitstring('0000000001100001'))
    2
    """
    n = int(bits, 2)
    # An integer forgets its leading zeros, so size the byte buffer from
    # the width of the input as well as from the value itself.
    num_bytes = max((n.bit_length() + 7) // 8, len(bits) // 8)
    # A short all-zero input (e.g. '0') still decodes to a single NUL.
    return n.to_bytes(num_bytes, 'big').decode(encoding, errors) or '\0'
2566
2567
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """
    Args:
        txt: an IP address to chunk up for sorting purposes

    Returns:
        A tuple holding the dot-separated components of txt as
        integers, so that sorting IP addresses by this key orders
        them numerically rather than lexically.  If txt is not
        accepted by :meth:`is_ip_v4`, a "not IP" message is printed
        to stdout and None is returned.

    See also :meth:`is_ip_v4`.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if is_ip_v4(txt):
        return tuple(int(octet) for octet in txt.split('.'))
    print(f"not IP: {txt}")
    return None
2591
2592
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """
    Args:
        volume: the string to chunk up for sorting purposes

    Returns:
        A tuple of the non-empty '/'-separated components of volume.
        Sorting paths by this key places every ancestor directory
        ahead of its descendants.

    See also :mod:`pyutils.files.file_utils`.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    components = volume.split('/')
    # filter(None, ...) drops the empty strings produced by leading,
    # trailing or doubled slashes.
    return tuple(filter(None, components))
2613
2614
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """
    Execute several replace operations in a row.

    Args:
        in_str: the string in which to replace characters
        replace_set: the set of target characters to replace
        replacement: the text to substitute for any member of
            replace_set

    See also :meth:`replace_nth`.

    Returns:
        The string with replacements executed.  The characters of
        replace_set are processed one after another, each pass
        operating on the output of the previous one.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
2637
2638
def replace_nth(in_str: str, source: str, target: str, nth: int) -> str:
    """
    Replaces the nth occurrence of a substring within a string.

    Args:
        in_str: the string in which to run the replacement
        source: the substring to replace; it is matched literally,
            not as a regular expression
        target: the replacement text
        nth: which occurrence of source to replace?  Counting
            starts at 1.

    Returns:
        in_str with the nth (non-overlapping) occurrence of source
        replaced by target.

    Raises:
        IndexError: if in_str contains fewer than nth occurrences
            of source.

    See also :meth:`replace_all`.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'

    >>> replace_nth('a.b.c', '.', '-', 2)
    'a.b-c'
    """
    # Escape source so that characters such as '.', '+' or '(' are
    # located literally, consistent with the literal replacement below.
    starts = [m.start() for m in re.finditer(re.escape(source), in_str)]
    where = starts[nth - 1]
    return in_str[:where] + target + in_str[where + len(source) :]
2659
2660
# When this module is executed as a script (rather than imported), run
# the doctest examples embedded in its docstrings.
if __name__ == '__main__':
    import doctest

    doctest.testmod()