src/pyutils/string_utils.py

   1 #!/usr/bin/env python3
   2 # -*- coding: utf-8 -*-
   3
   4 """The MIT License (MIT)
   5
   6 Copyright (c) 2016-2020 Davide Zanotti
   7
   8 Modifications Copyright (c) 2021-2022 Scott Gasch
   9
  10 Permission is hereby granted, free of charge, to any person obtaining a copy
  11 of this software and associated documentation files (the "Software"), to deal
  12 in the Software without restriction, including without limitation the rights
  13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 copies of the Software, and to permit persons to whom the Software is
  15 furnished to do so, subject to the following conditions:
  16
  17 The above copyright notice and this permission notice shall be included in all
  18 copies or substantial portions of the Software.
  19
  20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  26 SOFTWARE.
  27
  28 This class is based on: https://github.com/daveoncode/python-string-utils.
  29 See NOTICE in the root of this module for a detailed enumeration of what
  30 work is Davide's and what work was added by Scott.
  31 """
  32
  33 import base64
  34 import contextlib  # type: ignore
  35 import datetime
  36 import io
  37 import json
  38 import logging
  39 import numbers
  40 import random
  41 import re
  42 import string
  43 import unicodedata
  44 import warnings
  45 from itertools import zip_longest
  46 from typing import (
  47     Any,
  48     Callable,
  49     Dict,
  50     Iterable,
  51     List,
  52     Literal,
  53     Optional,
  54     Sequence,
  55     Tuple,
  56 )
  57 from uuid import uuid4
  58
  59 from pyutils import list_utils
  60
  61 logger = logging.getLogger(__name__)
  62
  63 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  64
  65 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  66
  67 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  68
  69 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  70
  71 URLS_RAW_STRING = (
  72     r"([a-z-]+://)"  # scheme
  73     r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
  74     r"(www\.)?"  # www.
  75     r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
  76     r"(:\d{2,})?"  # port number
  77     r"(/[a-z\d_%+-]*)*"  # folders
  78     r"(\.[a-z\d_%+-]+)*"  # file extension
  79     r"(\?[a-z\d_+%-=]*)?"  # query string
  80     r"(#\S*)?"  # hash
  81 )
  82
  83 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
  84
  85 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
  86
  87 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
  88
  89 EMAILS_RAW_STRING = (
  90     r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
  91 )
  92
  93 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
  94
  95 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  96
  97 CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")
  98
  99 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
 100
 101 SNAKE_CASE_TEST_RE = re.compile(
 102     r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
 103 )
 104
 105 SNAKE_CASE_TEST_DASH_RE = re.compile(
 106     r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
 107 )
 108
 109 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
 110
 111 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
 112
 113 CREDIT_CARDS = {
 114     "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
 115     "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
 116     "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
 117     "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
 118     "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
 119     "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
 120 }
 121
 122 JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)
 123
 124 UUID_RE = re.compile(
 125     r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
 126 )
 127
 128 UUID_HEX_OK_RE = re.compile(
 129     r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
 130     re.IGNORECASE,
 131 )
 132
 133 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
 134
 135 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
 136
 137 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
 138
 139 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
 140
 141 MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
 142
 143 ANYWHERE_MAC_ADDRESS_RE = re.compile(
 144     r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
 145 )
 146
 147 WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)
 148
 149 HTML_RE = re.compile(
 150     r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
 151     re.IGNORECASE | re.MULTILINE | re.DOTALL,
 152 )
 153
 154 HTML_TAG_ONLY_RE = re.compile(
 155     r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
 156     re.IGNORECASE | re.MULTILINE | re.DOTALL,
 157 )
 158
 159 SPACES_RE = re.compile(r"\s")
 160
 161 NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)
 162
 163 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 164
 165 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 166
 167 NUM_SUFFIXES = {
 168     "Pb": (1024**5),
 169     "P": (1024**5),
 170     "Tb": (1024**4),
 171     "T": (1024**4),
 172     "Gb": (1024**3),
 173     "G": (1024**3),
 174     "Mb": (1024**2),
 175     "M": (1024**2),
 176     "Kb": (1024**1),
 177     "K": (1024**1),
 178 }
 179
 180 UNIT_WORDS = [
 181     "zero",
 182     "one",
 183     "two",
 184     "three",
 185     "four",
 186     "five",
 187     "six",
 188     "seven",
 189     "eight",
 190     "nine",
 191     "ten",
 192     "eleven",
 193     "twelve",
 194     "thirteen",
 195     "fourteen",
 196     "fifteen",
 197     "sixteen",
 198     "seventeen",
 199     "eighteen",
 200     "nineteen",
 201 ]
 202
 203 TENS_WORDS = [
 204     "",
 205     "",
 206     "twenty",
 207     "thirty",
 208     "forty",
 209     "fifty",
 210     "sixty",
 211     "seventy",
 212     "eighty",
 213     "ninety",
 214 ]
 215
 216 scales = ["hundred", "thousand", "million", "billion", "trillion", "quadrillion"]
 217
 218 NUM_WORDS = {}
 219 NUM_WORDS["and"] = (1, 0)
 220 for i, word in enumerate(UNIT_WORDS):
 221     NUM_WORDS[word] = (1, i)
 222 for i, word in enumerate(TENS_WORDS):
 223     NUM_WORDS[word] = (1, i * 10)
 224 for i, word in enumerate(scales):
 225     if i == 0:
 226         NUM_WORDS[word] = (100, 0)
 227     else:
 228         NUM_WORDS[word] = (10 ** (i * 3), 0)
 229 NUM_WORDS['score'] = (20, 0)
 230
 231
 232 def is_none_or_empty(in_str: Optional[str]) -> bool:
 233     """
 234     Args:
 235         in_str: the string to test
 236
 237     Returns:
 238         True if the input string is either None or an empty string,
 239         False otherwise.
 240
 241     >>> is_none_or_empty("")
 242     True
 243     >>> is_none_or_empty(None)
 244     True
 245     >>> is_none_or_empty("   \t   ")
 246     True
 247     >>> is_none_or_empty('Test')
 248     False
 249     """
 250     return in_str is None or len(in_str.strip()) == 0
 251
 252
 253 def is_string(obj: Any) -> bool:
 254     """
 255     Args:
 256         in_str: the object to test
 257
 258     Returns:
 259         True if the object is a string and False otherwise.
 260
 261     >>> is_string('test')
 262     True
 263     >>> is_string(123)
 264     False
 265     >>> is_string(100.3)
 266     False
 267     >>> is_string([1, 2, 3])
 268     False
 269     """
 270     return isinstance(obj, str)
 271
 272
 273 def is_empty_string(in_str: Any) -> bool:
 274     """
 275     Args:
 276         in_str: the string to test
 277
 278     Returns:
 279         True if the string is empty and False otherwise.
 280     """
 281     return is_empty(in_str)
 282
 283
 284 def is_empty(in_str: Any) -> bool:
 285     """
 286     Args:
 287         in_str: the string to test
 288
 289     Returns:
 290         True if the string is empty and false otherwise.
 291
 292     >>> is_empty('')
 293     True
 294     >>> is_empty('    \t\t    ')
 295     True
 296     >>> is_empty('test')
 297     False
 298     >>> is_empty(100.88)
 299     False
 300     >>> is_empty([1, 2, 3])
 301     False
 302     """
 303     return is_string(in_str) and in_str.strip() == ""
 304
 305
 306 def is_full_string(in_str: Any) -> bool:
 307     """
 308     Args:
 309         in_str: the object to test
 310
 311     Returns:
 312         True if the object is a string and is not empty ('') and
 313         is not only composed of whitespace.
 314
 315     >>> is_full_string('test!')
 316     True
 317     >>> is_full_string('')
 318     False
 319     >>> is_full_string('      ')
 320     False
 321     >>> is_full_string(100.999)
 322     False
 323     >>> is_full_string({"a": 1, "b": 2})
 324     False
 325     """
 326     return is_string(in_str) and in_str.strip() != ""
 327
 328
 329 def is_number(in_str: str) -> bool:
 330     """
 331     Args:
 332         in_str: the string to test
 333
 334     Returns:
 335         True if the string contains a valid numberic value and
 336         False otherwise.
 337
 338     >>> is_number(100.5)
 339     Traceback (most recent call last):
 340     ...
 341     ValueError: 100.5
 342     >>> is_number("100.5")
 343     True
 344     >>> is_number("test")
 345     False
 346     >>> is_number("99")
 347     True
 348     >>> is_number([1, 2, 3])
 349     Traceback (most recent call last):
 350     ...
 351     ValueError: [1, 2, 3]
 352     """
 353     if not is_string(in_str):
 354         raise ValueError(in_str)
 355     return NUMBER_RE.match(in_str) is not None
 356
 357
 358 def is_integer_number(in_str: str) -> bool:
 359     """
 360     Args:
 361         in_str: the string to test
 362
 363     Returns:
 364         True if the string contains a valid (signed or unsigned,
 365         decimal, hex, or octal, regular or scientific) integral
 366         expression and False otherwise.
 367
 368     >>> is_integer_number('42')
 369     True
 370     >>> is_integer_number('42.0')
 371     False
 372     """
 373     return (
 374         (is_number(in_str) and "." not in in_str)
 375         or is_hexidecimal_integer_number(in_str)
 376         or is_octal_integer_number(in_str)
 377         or is_binary_integer_number(in_str)
 378     )
 379
 380
 381 def is_hexidecimal_integer_number(in_str: str) -> bool:
 382     """
 383     Args:
 384         in_str: the string to test
 385
 386     Returns:
 387         True if the string is a hex integer number and False otherwise.
 388
 389     >>> is_hexidecimal_integer_number('0x12345')
 390     True
 391     >>> is_hexidecimal_integer_number('0x1A3E')
 392     True
 393     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 394     False
 395     >>> is_hexidecimal_integer_number('-0xff')
 396     True
 397     >>> is_hexidecimal_integer_number('test')
 398     False
 399     >>> is_hexidecimal_integer_number(12345)  # Not a string
 400     Traceback (most recent call last):
 401     ...
 402     ValueError: 12345
 403     >>> is_hexidecimal_integer_number(101.4)
 404     Traceback (most recent call last):
 405     ...
 406     ValueError: 101.4
 407     >>> is_hexidecimal_integer_number(0x1A3E)
 408     Traceback (most recent call last):
 409     ...
 410     ValueError: 6718
 411     """
 412     if not is_string(in_str):
 413         raise ValueError(in_str)
 414     return HEX_NUMBER_RE.match(in_str) is not None
 415
 416
 417 def is_octal_integer_number(in_str: str) -> bool:
 418     """
 419     Args:
 420         in_str: the string to test
 421
 422     Returns:
 423         True if the string is a valid octal integral number and False otherwise.
 424
 425     >>> is_octal_integer_number('0o777')
 426     True
 427     >>> is_octal_integer_number('-0O115')
 428     True
 429     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 430     False
 431     >>> is_octal_integer_number('7777')  # Needs 0o
 432     False
 433     >>> is_octal_integer_number('test')
 434     False
 435     """
 436     if not is_string(in_str):
 437         raise ValueError(in_str)
 438     return OCT_NUMBER_RE.match(in_str) is not None
 439
 440
 441 def is_binary_integer_number(in_str: str) -> bool:
 442     """
 443     Args:
 444         in_str: the string to test
 445
 446     Returns:
 447         True if the string contains a binary integral number and False otherwise.
 448
 449     >>> is_binary_integer_number('0b10111')
 450     True
 451     >>> is_binary_integer_number('-0b111')
 452     True
 453     >>> is_binary_integer_number('0B10101')
 454     True
 455     >>> is_binary_integer_number('0b10102')
 456     False
 457     >>> is_binary_integer_number('0xFFF')
 458     False
 459     >>> is_binary_integer_number('test')
 460     False
 461     """
 462     if not is_string(in_str):
 463         raise ValueError(in_str)
 464     return BIN_NUMBER_RE.match(in_str) is not None
 465
 466
 467 def to_int(in_str: str) -> int:
 468     """
 469     Args:
 470         in_str: the string to convert
 471
 472     Returns:
 473         The integral value of the string or raises on error.
 474
 475     >>> to_int('1234')
 476     1234
 477     >>> to_int('test')
 478     Traceback (most recent call last):
 479     ...
 480     ValueError: invalid literal for int() with base 10: 'test'
 481     """
 482     if not is_string(in_str):
 483         raise ValueError(in_str)
 484     if is_binary_integer_number(in_str):
 485         return int(in_str, 2)
 486     if is_octal_integer_number(in_str):
 487         return int(in_str, 8)
 488     if is_hexidecimal_integer_number(in_str):
 489         return int(in_str, 16)
 490     return int(in_str)
 491
 492
 493 def number_string_to_integer(in_str: str) -> int:
 494     """Convert a string containing a written-out number into an int.
 495
 496     >>> number_string_to_integer("one hundred fifty two")
 497     152
 498
 499     >>> number_string_to_integer("ten billion two hundred million fifty four thousand three")
 500     10200054003
 501
 502     >>> number_string_to_integer("four-score and 7")
 503     87
 504
 505     >>> number_string_to_integer("fifty xyzzy three")
 506     Traceback (most recent call last):
 507     ...
 508     ValueError: Unknown word: xyzzy
 509     """
 510     if type(in_str) == int:
 511         return in_str
 512
 513     current = result = 0
 514     in_str = in_str.replace('-', ' ')
 515     for word in in_str.split():
 516         if word not in NUM_WORDS:
 517             if is_integer_number(word):
 518                 current += int(word)
 519                 continue
 520             else:
 521                 raise ValueError("Unknown word: " + word)
 522         scale, increment = NUM_WORDS[word]
 523         current = current * scale + increment
 524         if scale > 100:
 525             result += current
 526             current = 0
 527     return result + current
 528
 529
 530 def integer_to_number_string(num: int) -> str:
 531     """
 532     Opposite of number_string_to_integer; convert a number to a written out
 533     longhand format.
 534
 535     >>> integer_to_number_string(9)
 536     'nine'
 537
 538     >>> integer_to_number_string(42)
 539     'forty two'
 540
 541     >>> integer_to_number_string(123219982)
 542     'one hundred twenty three million two hundred nineteen thousand nine hundred eighty two'
 543
 544     """
 545
 546     if num < 20:
 547         return UNIT_WORDS[num]
 548     if num < 100:
 549         ret = TENS_WORDS[num // 10]
 550         leftover = num % 10
 551         if leftover != 0:
 552             ret += ' ' + UNIT_WORDS[leftover]
 553         return ret
 554
 555     # If num > 100 go find the highest chunk and convert that, then recursively
 556     # convert the rest.  NUM_WORDS contains items like 'thousand' -> (1000, 0).
 557     # The second item in the tuple is an increment that can be ignored; the first
 558     # is the numeric "scale" of the entry.  So find the greatest entry in NUM_WORDS
 559     # still less than num.  For 123,456 it would be thousand.  Then pull out the
 560     # 123, convert it, and append "thousand".  Then do the rest.
 561     scales = {}
 562     for name, val in NUM_WORDS.items():
 563         if val[0] <= num:
 564             scales[name] = val[0]
 565     scale = max(scales.items(), key=lambda _: _[1])
 566
 567     # scale[1] = numeric magnitude (e.g. 1000)
 568     # scale[0] = name (e.g. "thousand")
 569     ret = integer_to_number_string(num // scale[1]) + ' ' + scale[0]
 570     leftover = num % scale[1]
 571     if leftover != 0:
 572         ret += ' ' + integer_to_number_string(leftover)
 573     return ret
 574
 575
 576 def is_decimal_number(in_str: str) -> bool:
 577     """
 578     Args:
 579         in_str: the string to check
 580
 581     Returns:
 582         True if the given string represents a decimal or False
 583         otherwise.  A decimal may be signed or unsigned or use
 584         a "scientific notation".
 585
 586     .. note::
 587         We do not consider integers without a decimal point
 588         to be decimals; they return False (see example).
 589
 590     >>> is_decimal_number('42.0')
 591     True
 592     >>> is_decimal_number('42')
 593     False
 594     """
 595     return is_number(in_str) and "." in in_str
 596
 597
 598 def strip_escape_sequences(in_str: str) -> str:
 599     """
 600     Args:
 601         in_str: the string to strip of escape sequences.
 602
 603     Returns:
 604         in_str with escape sequences removed.
 605
 606     .. note::
 607         What is considered to be an "escape sequence" is defined
 608         by a regular expression.  While this gets common ones,
 609         there may exist valid sequences that it doesn't match.
 610
 611     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 612     'this is a test!'
 613     """
 614     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 615     return in_str
 616
 617
 618 def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 619     """
 620     Args:
 621         in_str: string or number to which to add thousands separator(s)
 622         separator_char: the separator character to add (defaults to comma)
 623         places: add a separator every N places (defaults to three)
 624
 625     Returns:
 626         A numeric string with thousands separators added appropriately.
 627
 628     >>> add_thousands_separator('12345678')
 629     '12,345,678'
 630     >>> add_thousands_separator(12345678)
 631     '12,345,678'
 632     >>> add_thousands_separator(12345678.99)
 633     '12,345,678.99'
 634     >>> add_thousands_separator('test')
 635     Traceback (most recent call last):
 636     ...
 637     ValueError: test
 638
 639     """
 640     if isinstance(in_str, numbers.Number):
 641         in_str = f'{in_str}'
 642     if is_number(in_str):
 643         return _add_thousands_separator(
 644             in_str, separator_char=separator_char, places=places
 645         )
 646     raise ValueError(in_str)
 647
 648
 649 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 650     decimal_part = ""
 651     if '.' in in_str:
 652         (in_str, decimal_part) = in_str.split('.')
 653     tmp = [iter(in_str[::-1])] * places
 654     ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 655     if len(decimal_part) > 0:
 656         ret += '.'
 657         ret += decimal_part
 658     return ret
 659
 660
 661 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 662     """
 663     Args:
 664         in_str: the string to test
 665         allowed_schemes: an optional list of allowed schemes (e.g.
 666             ['http', 'https', 'ftp'].  If passed, only URLs that
 667             begin with the one of the schemes passed will be considered
 668             to be valid.  Otherwise, any scheme:// will be considered
 669             valid.
 670
 671     Returns:
 672         True if in_str contains a valid URL and False otherwise.
 673
 674     >>> is_url('http://www.mysite.com')
 675     True
 676     >>> is_url('https://mysite.com')
 677     True
 678     >>> is_url('.mysite.com')
 679     False
 680     >>> is_url('scheme://username:[email protected]:8042/folder/subfolder/file.extension?param=value&param2=value2#hash')
 681     True
 682     """
 683     if not is_full_string(in_str):
 684         return False
 685
 686     valid = URL_RE.match(in_str) is not None
 687
 688     if allowed_schemes:
 689         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 690     return valid
 691
 692
 693 def is_email(in_str: Any) -> bool:
 694     """
 695     Args:
 696         in_str: the email address to check
 697
 698     Returns: True if the in_str contains a valid email (as defined by
 699         https://tools.ietf.org/html/rfc3696#section-3) or False
 700         otherwise.
 701
 702     >>> is_email('[email protected]')
 703     True
 704     >>> is_email('@gmail.com')
 705     False
 706     """
 707     if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
 708         return False
 709
 710     try:
 711         # we expect 2 tokens, one before "@" and one after, otherwise
 712         # we have an exception and the email is not valid.
 713         head, tail = in_str.split("@")
 714
 715         # head's size must be <= 64, tail <= 255, head must not start
 716         # with a dot or contain multiple consecutive dots.
 717         if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
 718             return False
 719
 720         # removes escaped spaces, so that later on the test regex will
 721         # accept the string.
 722         head = head.replace("\\ ", "")
 723         if head.startswith('"') and head.endswith('"'):
 724             head = head.replace(" ", "")[1:-1]
 725         return EMAIL_RE.match(head + "@" + tail) is not None
 726
 727     except ValueError:
 728         # borderline case in which we have multiple "@" signs but the
 729         # head part is correctly escaped.
 730         if ESCAPED_AT_SIGN.search(in_str) is not None:
 731             # replace "@" with "a" in the head
 732             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 733         return False
 734
 735
 736 def suffix_string_to_number(in_str: str) -> Optional[int]:
 737     """Takes a string like "33Gb" and converts it into a number (of bytes)
 738     like 34603008.
 739
 740     Args:
 741         in_str: the string with a suffix to be interpreted and removed.
 742
 743     Returns:
 744         An integer number of bytes or None to indicate an error.
 745
 746     >>> suffix_string_to_number('1Mb')
 747     1048576
 748     >>> suffix_string_to_number('13.1Gb')
 749     14066017894
 750     """
 751
 752     def suffix_capitalize(s: str) -> str:
 753         if len(s) == 1:
 754             return s.upper()
 755         elif len(s) == 2:
 756             return f"{s[0].upper()}{s[1].lower()}"
 757         return suffix_capitalize(s[0:1])
 758
 759     if is_string(in_str):
 760         if is_integer_number(in_str):
 761             return to_int(in_str)
 762         suffixes = [in_str[-2:], in_str[-1:]]
 763         rest = [in_str[:-2], in_str[:-1]]
 764         for x in range(len(suffixes)):
 765             s = suffixes[x]
 766             s = suffix_capitalize(s)
 767             multiplier = NUM_SUFFIXES.get(s, None)
 768             if multiplier is not None:
 769                 r = rest[x]
 770                 if is_integer_number(r):
 771                     return to_int(r) * multiplier
 772                 if is_decimal_number(r):
 773                     return int(float(r) * multiplier)
 774     return None
 775
 776
 777 def number_to_suffix_string(num: int) -> Optional[str]:
 778     """Take a number (of bytes) and returns a string like "43.8Gb".
 779
 780     Args:
 781         num: an integer number of bytes
 782
 783     Returns:
 784         A string with a suffix representing num bytes concisely or
 785         None to indicate an error.
 786
 787     >>> number_to_suffix_string(14066017894)
 788     '13.1Gb'
 789     >>> number_to_suffix_string(1024 * 1024)
 790     '1.0Mb'
 791     """
 792     d = 0.0
 793     suffix = None
 794     for (sfx, size) in NUM_SUFFIXES.items():
 795         if num >= size:
 796             d = num / size
 797             suffix = sfx
 798             break
 799     if suffix is not None:
 800         return f"{d:.1f}{suffix}"
 801     else:
 802         return f'{num:d}'
 803
 804
 805 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 806     """
 807     Args:
 808         in_str: a string to check
 809         card_type: if provided, contains the card type to validate
 810             with.  Otherwise, all known credit card number types will
 811             be accepted.
 812
 813             Supported card types are the following:
 814
 815             * VISA
 816             * MASTERCARD
 817             * AMERICAN_EXPRESS
 818             * DINERS_CLUB
 819             * DISCOVER
 820             * JCB
 821
 822     Returns:
 823         True if in_str is a valid credit card number.
 824     """
 825     if not is_full_string(in_str):
 826         return False
 827
 828     if card_type is not None:
 829         if card_type not in CREDIT_CARDS:
 830             raise KeyError(
 831                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 832             )
 833         return CREDIT_CARDS[card_type].match(in_str) is not None
 834     for c in CREDIT_CARDS:
 835         if CREDIT_CARDS[c].match(in_str) is not None:
 836             return True
 837     return False
 838
 839
 840 def is_camel_case(in_str: Any) -> bool:
 841     """
 842     Args:
 843         in_str: the string to test
 844
 845     Returns:
 846         True if the string is formatted as camel case and False otherwise.
 847         A string is considered camel case when:
 848
 849         * it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 850         * it contains both lowercase and uppercase letters
 851         * it does not start with a number
 852     """
 853     return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 854
 855
 856 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 857     """
 858     Args:
 859         in_str: the string to test
 860
 861     Returns: True if the string is snake case and False otherwise.  A
 862         string is considered snake case when:
 863
 864         * it's composed only by lowercase/uppercase letters and digits
 865         * it contains at least one underscore (or provided separator)
 866         * it does not start with a number
 867
 868     >>> is_snake_case('this_is_a_test')
 869     True
 870     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 871     True
 872     >>> is_snake_case('this-is-a-test')
 873     False
 874     >>> is_snake_case('this-is-a-test', separator='-')
 875     True
 876     """
 877     if is_full_string(in_str):
 878         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 879         re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 880         r = re_map.get(
 881             separator,
 882             re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
 883         )
 884         return r.match(in_str) is not None
 885     return False
 886
 887
 888 def is_json(in_str: Any) -> bool:
 889     """
 890     Args:
 891         in_str: the string to test
 892
 893     Returns:
 894         True if the in_str contains valid JSON and False otherwise.
 895
 896     >>> is_json('{"name": "Peter"}')
 897     True
 898     >>> is_json('[1, 2, 3]')
 899     True
 900     >>> is_json('{nope}')
 901     False
 902     """
 903     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 904         try:
 905             return isinstance(json.loads(in_str), (dict, list))
 906         except (TypeError, ValueError, OverflowError):
 907             pass
 908     return False
 909
 910
 911 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 912     """
 913     Args:
 914         in_str: the string to test
 915
 916     Returns:
 917         True if the in_str contains a valid UUID and False otherwise.
 918
 919     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 920     True
 921     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 922     False
 923     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 924     True
 925     """
 926     # string casting is used to allow UUID itself as input data type
 927     s = str(in_str)
 928     if allow_hex:
 929         return UUID_HEX_OK_RE.match(s) is not None
 930     return UUID_RE.match(s) is not None
 931
 932
 933 def is_ip_v4(in_str: Any) -> bool:
 934     """
 935     Args:
 936         in_str: the string to test
 937
 938     Returns:
 939         True if in_str contains a valid IPv4 address and False otherwise.
 940
 941     >>> is_ip_v4('255.200.100.75')
 942     True
 943     >>> is_ip_v4('nope')
 944     False
 945     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 946     False
 947     """
 948     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 949         return False
 950
 951     # checks that each entry in the ip is in the valid range (0 to 255)
 952     for token in in_str.split("."):
 953         if not 0 <= int(token) <= 255:
 954             return False
 955     return True
 956
 957
 958 def extract_ip_v4(in_str: Any) -> Optional[str]:
 959     """
 960     Args:
 961         in_str: the string to extract an IPv4 address from.
 962
 963     Returns:
 964         The first extracted IPv4 address from in_str or None if
 965         none were found or an error occurred.
 966
 967     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 968     '127.0.0.1'
 969     >>> extract_ip_v4('Your mom dresses you funny.')
 970     """
 971     if not is_full_string(in_str):
 972         return None
 973     m = ANYWHERE_IP_V4_RE.search(in_str)
 974     if m is not None:
 975         return m.group(0)
 976     return None
 977
 978
 979 def is_ip_v6(in_str: Any) -> bool:
 980     """
 981     Args:
 982         in_str: the string to test.
 983
 984     Returns:
 985         True if in_str contains a valid IPv6 address and False otherwise.
 986
 987     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 988     True
 989     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 990     False
 991     """
 992     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 993
 994
 995 def extract_ip_v6(in_str: Any) -> Optional[str]:
 996     """
 997     Args:
 998         in_str: the string from which to extract an IPv6 address.
 999
1000     Returns:
1001         The first IPv6 address found in in_str or None if no address
1002         was found or an error occurred.
1003
1004     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
1005     '2001:db8:85a3:0000:0000:8a2e:370:7334'
1006     >>> extract_ip_v6("(and she's ugly too, btw)")
1007     """
1008     if not is_full_string(in_str):
1009         return None
1010     m = ANYWHERE_IP_V6_RE.search(in_str)
1011     if m is not None:
1012         return m.group(0)
1013     return None
1014
1015
1016 def is_ip(in_str: Any) -> bool:
1017     """
1018     Args:
1019         in_str: the string to test.
1020
1021     Returns:
1022         True if in_str contains a valid IP address (either IPv4 or
1023         IPv6).
1024
1025     >>> is_ip('255.200.100.75')
1026     True
1027     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
1028     True
1029     >>> is_ip('1.2.3')
1030     False
1031     >>> is_ip('1.2.3.999')
1032     False
1033     """
1034     return is_ip_v6(in_str) or is_ip_v4(in_str)
1035
1036
1037 def extract_ip(in_str: Any) -> Optional[str]:
1038     """
1039     Args:
1040         in_str: the string from which to extract in IP address.
1041
1042     Returns:
1043         The first IP address (IPv4 or IPv6) found in in_str or
1044         None to indicate none found or an error condition.
1045
1046     >>> extract_ip('Attacker: 255.200.100.75')
1047     '255.200.100.75'
1048     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
1049     '2001:db8:85a3:0000:0000:8a2e:370:7334'
1050     >>> extract_ip('1.2.3')
1051     """
1052     ip = extract_ip_v4(in_str)
1053     if ip is None:
1054         ip = extract_ip_v6(in_str)
1055     return ip
1056
1057
1058 def is_mac_address(in_str: Any) -> bool:
1059     """
1060     Args:
1061         in_str: the string to test
1062
1063     Returns:
1064         True if in_str is a valid MAC address False otherwise.
1065
1066     >>> is_mac_address("34:29:8F:12:0D:2F")
1067     True
1068     >>> is_mac_address('34:29:8f:12:0d:2f')
1069     True
1070     >>> is_mac_address('34-29-8F-12-0D-2F')
1071     True
1072     >>> is_mac_address("test")
1073     False
1074     """
1075     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
1076
1077
1078 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
1079     """
1080     Args:
1081         in_str: the string from which to extract a MAC address.
1082
1083     Returns:
1084         The first MAC address found in in_str or None to indicate no
1085         match or an error.
1086
1087     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
1088     '34:29:8F:12:0D:2F'
1089
1090     >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
1091     'd8:5d:e2:34:54:86'
1092     """
1093     if not is_full_string(in_str):
1094         return None
1095     in_str.strip()
1096     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
1097     if m is not None:
1098         mac = m.group(0)
1099         mac.replace(":", separator)
1100         mac.replace("-", separator)
1101         return mac
1102     return None
1103
1104
1105 def is_slug(in_str: Any, separator: str = "-") -> bool:
1106     """
1107     Args:
1108         in_str: string to test
1109
1110     Returns:
1111         True if in_str is a slug string and False otherwise.
1112
1113     >>> is_slug('my-blog-post-title')
1114     True
1115     >>> is_slug('My blog post title')
1116     False
1117     """
1118     if not is_full_string(in_str):
1119         return False
1120     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
1121     return re.match(rex, in_str) is not None
1122
1123
1124 def contains_html(in_str: str) -> bool:
1125     """
1126     Args:
1127         in_str: the string to check for tags in
1128
1129     Returns:
1130         True if the given string contains HTML/XML tags and False
1131         otherwise.
1132
1133     .. warning::
1134         By design, this function matches ANY type of tag, so don't expect
1135         to use it as an HTML validator.  It's a quick sanity check at
1136         best.  See something like BeautifulSoup for a more full-featuered
1137         HTML parser.
1138
1139     >>> contains_html('my string is <strong>bold</strong>')
1140     True
1141     >>> contains_html('my string is not bold')
1142     False
1143
1144     """
1145     if not is_string(in_str):
1146         raise ValueError(in_str)
1147     return HTML_RE.search(in_str) is not None
1148
1149
1150 def words_count(in_str: str) -> int:
1151     """
1152     Args:
1153         in_str: the string to count words in
1154
1155     Returns:
1156         The number of words contained in the given string.
1157
1158     .. note::
1159
1160         This method is "smart" in that it does consider only sequences
1161         of one or more letter and/or numbers to be "words".  Thus a
1162         string like this: "! @ # % ... []" will return zero.  Moreover
1163         it is aware of punctuation, so the count for a string like
1164         "one,two,three.stop" will be 4 not 1 (even if there are no spaces
1165         in the string).
1166
1167     >>> words_count('hello world')
1168     2
1169     >>> words_count('one,two,three.stop')
1170     4
1171     """
1172     if not is_string(in_str):
1173         raise ValueError(in_str)
1174     return len(WORDS_COUNT_RE.findall(in_str))
1175
1176
1177 def word_count(in_str: str) -> int:
1178     """
1179     Args:
1180         in_str: the string to count words in
1181
1182     Returns:
1183         The number of words contained in the given string.
1184
1185     .. note::
1186
1187         This method is "smart" in that it does consider only sequences
1188         of one or more letter and/or numbers to be "words".  Thus a
1189         string like this: "! @ # % ... []" will return zero.  Moreover
1190         it is aware of punctuation, so the count for a string like
1191         "one,two,three.stop" will be 4 not 1 (even if there are no spaces
1192         in the string).
1193
1194     >>> word_count('hello world')
1195     2
1196     >>> word_count('one,two,three.stop')
1197     4
1198     """
1199     return words_count(in_str)
1200
1201
1202 def generate_uuid(omit_dashes: bool = False) -> str:
1203     """
1204     Args:
1205         omit_dashes: should we omit the dashes in the generated UUID?
1206
1207     Returns:
1208         A generated UUID string (using `uuid.uuid4()`) with or without
1209         dashes per the omit_dashes arg.
1210
1211     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
1212     generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
1213     """
1214     uid = uuid4()
1215     if omit_dashes:
1216         return uid.hex
1217     return str(uid)
1218
1219
1220 def generate_random_alphanumeric_string(size: int) -> str:
1221     """
1222     Args:
1223         size: number of characters to generate
1224
1225     Returns:
1226         A string of the specified size containing random characters
1227         (uppercase/lowercase ascii letters and digits).
1228
1229     >>> random.seed(22)
1230     >>> generate_random_alphanumeric_string(9)
1231     '96ipbNClS'
1232     """
1233     if size < 1:
1234         raise ValueError("size must be >= 1")
1235     chars = string.ascii_letters + string.digits
1236     buffer = [random.choice(chars) for _ in range(size)]
1237     return from_char_list(buffer)
1238
1239
1240 def reverse(in_str: str) -> str:
1241     """
1242     Args:
1243         in_str: the string to reverse
1244
1245     Returns:
1246         The reversed (chracter by character) string.
1247
1248     >>> reverse('test')
1249     'tset'
1250     """
1251     if not is_string(in_str):
1252         raise ValueError(in_str)
1253     return in_str[::-1]
1254
1255
1256 def camel_case_to_snake_case(in_str, *, separator="_"):
1257     """
1258     Args:
1259         in_str: the camel case string to convert
1260
1261     Returns:
1262         A snake case string equivalent to the camel case input or the
1263         original string if it is not a valid camel case string or some
1264         other error occurs.
1265
1266     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
1267     'mac_address_extractor_factory'
1268     >>> camel_case_to_snake_case('Luke Skywalker')
1269     'Luke Skywalker'
1270     """
1271     if not is_string(in_str):
1272         raise ValueError(in_str)
1273     if not is_camel_case(in_str):
1274         return in_str
1275     return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower()
1276
1277
1278 def snake_case_to_camel_case(
1279     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
1280 ) -> str:
1281     """
1282     Args:
1283         in_str: the snake case string to convert
1284
1285     Returns:
1286         A camel case string that is equivalent to the snake case string
1287         provided or the original string back again if it is not valid
1288         snake case or another error occurs.
1289
1290     >>> snake_case_to_camel_case('this_is_a_test')
1291     'ThisIsATest'
1292     >>> snake_case_to_camel_case('Han Solo')
1293     'Han Solo'
1294     """
1295     if not is_string(in_str):
1296         raise ValueError(in_str)
1297     if not is_snake_case(in_str, separator=separator):
1298         return in_str
1299     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
1300     if not upper_case_first:
1301         tokens[0] = tokens[0].lower()
1302     return from_char_list(tokens)
1303
1304
1305 def to_char_list(in_str: str) -> List[str]:
1306     """
1307     Args:
1308         in_str: the string to split into a char list
1309
1310     Returns:
1311         A list of strings of length one each.
1312
1313     >>> to_char_list('test')
1314     ['t', 'e', 's', 't']
1315     """
1316     if not is_string(in_str):
1317         return []
1318     return list(in_str)
1319
1320
1321 def from_char_list(in_list: List[str]) -> str:
1322     """
1323     Args:
1324         in_list: A list of characters to convert into a string.
1325
1326     Returns:
1327         The string resulting from gluing the characters in in_list
1328         together.
1329
1330     >>> from_char_list(['t', 'e', 's', 't'])
1331     'test'
1332     """
1333     return "".join(in_list)
1334
1335
1336 def shuffle(in_str: str) -> Optional[str]:
1337     """
1338     Args:
1339         in_str: a string to shuffle randomly by character
1340
1341     Returns:
1342         A new string containing same chars of the given one but in
1343         a randomized order.  Note that in rare cases this could result
1344         in the same original string as no check is done.  Returns
1345         None to indicate error conditions.
1346
1347     >>> random.seed(22)
1348     >>> shuffle('awesome')
1349     'meosaew'
1350     """
1351     if not is_string(in_str):
1352         return None
1353     chars = to_char_list(in_str)
1354     random.shuffle(chars)
1355     return from_char_list(chars)
1356
1357
1358 def scramble(in_str: str) -> Optional[str]:
1359     """
1360     Args:
1361         in_str: a string to shuffle randomly by character
1362
1363     Returns:
1364         A new string containing same chars of the given one but in
1365         a randomized order.  Note that in rare cases this could result
1366         in the same original string as no check is done.  Returns
1367         None to indicate error conditions.
1368
1369     >>> random.seed(22)
1370     >>> scramble('awesome')
1371     'meosaew'
1372     """
1373     return shuffle(in_str)
1374
1375
1376 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
1377     """
1378     Args:
1379         in_str: the string to strip tags from
1380         keep_tag_content: should we keep the inner contents of tags?
1381
1382     Returns:
1383         A string with all HTML tags removed (optionally with tag contents
1384         preserved).
1385
1386     .. note::
1387         This method uses simple regular expressions to strip tags and is
1388         not a full fledged HTML parser by any means.  Consider using
1389         something like BeautifulSoup if your needs are more than this
1390         simple code can fulfill.
1391
1392     >>> strip_html('test: <a href="foo/bar">click here</a>')
1393     'test: '
1394     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
1395     'test: click here'
1396     """
1397     if not is_string(in_str):
1398         raise ValueError(in_str)
1399     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
1400     return r.sub("", in_str)
1401
1402
1403 def asciify(in_str: str) -> str:
1404     """
1405     Args:
1406         in_str: the string to asciify.
1407
1408     Returns:
1409         An output string roughly equivalent to the original string
1410         where all content to are ascii-only.  This is accomplished
1411         by translating all non-ascii chars into their closest possible
1412         ASCII representation (eg: ó -> o, Ë -> E, ç -> c...).
1413
1414     .. warning::
1415         Some chars may be lost if impossible to translate.
1416
1417     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
1418     'eeuuooaaeynAAACIINOE'
1419     """
1420     if not is_string(in_str):
1421         raise ValueError(in_str)
1422
1423     # "NFKD" is the algorithm which is able to successfully translate
1424     # the most of non-ascii chars.
1425     normalized = unicodedata.normalize("NFKD", in_str)
1426
1427     # encode string forcing ascii and ignore any errors
1428     # (unrepresentable chars will be stripped out)
1429     ascii_bytes = normalized.encode("ascii", "ignore")
1430
1431     # turns encoded bytes into an utf-8 string
1432     return ascii_bytes.decode("utf-8")
1433
1434
1435 def slugify(in_str: str, *, separator: str = "-") -> str:
1436     """
1437     Args:
1438         in_str: the string to slugify
1439         separator: the character to use during sligification (default
1440             is a dash)
1441
1442     Returns:
1443         The converted string.  The returned string has the following properties:
1444
1445         * it has no spaces
1446         * all letters are in lower case
1447         * all punctuation signs and non alphanumeric chars are removed
1448         * words are divided using provided separator
1449         * all chars are encoded as ascii (by using :meth:`asciify`)
1450         * is safe for URL
1451
1452     >>> slugify('Top 10 Reasons To Love Dogs!!!')
1453     'top-10-reasons-to-love-dogs'
1454     >>> slugify('Mönstér Mägnët')
1455     'monster-magnet'
1456     """
1457     if not is_string(in_str):
1458         raise ValueError(in_str)
1459
1460     # replace any character that is NOT letter or number with spaces
1461     out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()
1462
1463     # replace spaces with join sign
1464     out = SPACES_RE.sub(separator, out)
1465
1466     # normalize joins (remove duplicates)
1467     out = re.sub(re.escape(separator) + r"+", separator, out)
1468     return asciify(out)
1469
1470
1471 def to_bool(in_str: str) -> bool:
1472     """
1473     Args:
1474         in_str: the string to convert to boolean
1475
1476     Returns:
1477         A boolean equivalent of the original string based on its contents.
1478         All conversion is case insensitive.  A positive boolean (True) is
1479         returned if the string value is any of the following:
1480
1481         * "true"
1482         * "t"
1483         * "1"
1484         * "yes"
1485         * "y"
1486         * "on"
1487
1488         Otherwise False is returned.
1489
1490     >>> to_bool('True')
1491     True
1492
1493     >>> to_bool('1')
1494     True
1495
1496     >>> to_bool('yes')
1497     True
1498
1499     >>> to_bool('no')
1500     False
1501
1502     >>> to_bool('huh?')
1503     False
1504
1505     >>> to_bool('on')
1506     True
1507     """
1508     if not is_string(in_str):
1509         raise ValueError(in_str)
1510     return in_str.lower() in ("true", "1", "yes", "y", "t", "on")
1511
1512
1513 def to_date(in_str: str) -> Optional[datetime.date]:
1514     """
1515     Args:
1516         in_str: the string to convert into a date
1517
1518     Returns:
1519         The datetime.date the string contained or None to indicate
1520         an error.  This parser is relatively clever; see
1521         :class:`datetimez.dateparse_utils` docs for details.
1522
1523     >>> to_date('9/11/2001')
1524     datetime.date(2001, 9, 11)
1525     >>> to_date('xyzzy')
1526     """
1527     import pyutils.datetimez.dateparse_utils as du
1528
1529     try:
1530         d = du.DateParser()  # type: ignore
1531         d.parse(in_str)
1532         return d.get_date()
1533     except du.ParseException:  # type: ignore
1534         msg = f'Unable to parse date {in_str}.'
1535         logger.warning(msg)
1536     return None
1537
1538
1539 def extract_date(in_str: Any) -> Optional[datetime.datetime]:
1540     """Finds and extracts a date from the string, if possible.
1541
1542     Args:
1543         in_str: the string to extract a date from
1544
1545     Returns:
1546         a datetime if date was found, otherwise None
1547
1548     >>> extract_date("filename.txt    dec 13, 2022")
1549     datetime.datetime(2022, 12, 13, 0, 0)
1550
1551     >>> extract_date("Dear Santa, please get me a pony.")
1552
1553     """
1554     import itertools
1555
1556     import pyutils.datetimez.dateparse_utils as du
1557
1558     d = du.DateParser()  # type: ignore
1559     chunks = in_str.split()
1560     for ngram in itertools.chain(
1561         list_utils.ngrams(chunks, 5),
1562         list_utils.ngrams(chunks, 4),
1563         list_utils.ngrams(chunks, 3),
1564         list_utils.ngrams(chunks, 2),
1565     ):
1566         try:
1567             expr = " ".join(ngram)
1568             logger.debug(f"Trying {expr}")
1569             if d.parse(expr):
1570                 return d.get_datetime()
1571         except du.ParseException:  # type: ignore
1572             pass
1573     return None
1574
1575
1576 def is_valid_date(in_str: str) -> bool:
1577     """
1578     Args:
1579         in_str: the string to check
1580
1581     Returns:
1582         True if the string represents a valid date that we can recognize
1583         and False otherwise.  This parser is relatively clever; see
1584         :class:`datetimez.dateparse_utils` docs for details.
1585
1586     >>> is_valid_date('1/2/2022')
1587     True
1588     >>> is_valid_date('christmas')
1589     True
1590     >>> is_valid_date('next wednesday')
1591     True
1592     >>> is_valid_date('xyzzy')
1593     False
1594     """
1595     import pyutils.datetimez.dateparse_utils as dp
1596
1597     try:
1598         d = dp.DateParser()  # type: ignore
1599         _ = d.parse(in_str)
1600         return True
1601     except dp.ParseException:  # type: ignore
1602         msg = f'Unable to parse date {in_str}.'
1603         logger.warning(msg)
1604     return False
1605
1606
1607 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1608     """
1609     Args:
1610         in_str: string to parse into a datetime
1611
1612     Returns:
1613         A python datetime parsed from in_str or None to indicate
1614         an error.  This parser is relatively clever; see
1615         :class:`datetimez.dateparse_utils` docs for details.
1616
1617     >>> to_datetime('7/20/1969 02:56 GMT')
1618     datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
1619     """
1620     import pyutils.datetimez.dateparse_utils as dp
1621
1622     try:
1623         d = dp.DateParser()  # type: ignore
1624         dt = d.parse(in_str)
1625         if isinstance(dt, datetime.datetime):
1626             return dt
1627     except Exception:
1628         msg = f'Unable to parse datetime {in_str}.'
1629         logger.warning(msg)
1630     return None
1631
1632
1633 def valid_datetime(in_str: str) -> bool:
1634     """
1635     Args:
1636         in_str: the string to check
1637
1638     Returns:
1639         True if in_str contains a valid datetime and False otherwise.
1640         This parser is relatively clever; see
1641         :class:`datetimez.dateparse_utils` docs for details.
1642
1643     >>> valid_datetime('next wednesday at noon')
1644     True
1645     >>> valid_datetime('3 weeks ago at midnight')
1646     True
1647     >>> valid_datetime('next easter at 5:00 am')
1648     True
1649     >>> valid_datetime('sometime soon')
1650     False
1651     """
1652     _ = to_datetime(in_str)
1653     if _ is not None:
1654         return True
1655     msg = f'Unable to parse datetime {in_str}.'
1656     logger.warning(msg)
1657     return False
1658
1659
1660 def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
1661     """
1662     Args:
1663         in_str: the string to squeeze
1664         character_to_squeeze: the character to remove runs of
1665             more than one in a row (default = space)
1666
1667     Returns: A "squeezed string" where runs of more than one
1668         character_to_squeeze into one.
1669
1670     >>> squeeze(' this        is       a    test    ')
1671     ' this is a test '
1672
1673     >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
1674     'one|!|two|!|three'
1675
1676     """
1677     return re.sub(
1678         r'(' + re.escape(character_to_squeeze) + r')+',
1679         character_to_squeeze,
1680         in_str,
1681     )
1682
1683
1684 def dedent(in_str: str) -> Optional[str]:
1685     """
1686     Args:
1687         in_str: the string to dedent
1688
1689     Returns:
1690         A string with tab indentation removed or None on error.
1691
1692     .. note::
1693
1694         Inspired by analogous Scala function.
1695
1696     >>> dedent('\t\ttest\\n\t\ting')
1697     'test\\ning'
1698     """
1699     if not is_string(in_str):
1700         return None
1701     line_separator = '\n'
1702     lines = [MARGIN_RE.sub('', line) for line in in_str.split(line_separator)]
1703     return line_separator.join(lines)
1704
1705
1706 def indent(in_str: str, amount: int) -> str:
1707     """
1708     Args:
1709         in_str: the string to indent
1710         amount: count of spaces to indent each line by
1711
1712     Returns:
1713         An indented string created by prepending amount spaces.
1714
1715     >>> indent('This is a test', 4)
1716     '    This is a test'
1717     """
1718     if not is_string(in_str):
1719         raise ValueError(in_str)
1720     line_separator = '\n'
1721     lines = [" " * amount + line for line in in_str.split(line_separator)]
1722     return line_separator.join(lines)
1723
1724
1725 def sprintf(*args, **kwargs) -> str:
1726     """
1727     Args:
1728         This function uses the same syntax as the builtin print
1729         function.
1730
1731     Returns:
1732         An interpolated string capturing print output, like man(3)
1733         `sprintf`.
1734     """
1735     ret = ""
1736
1737     sep = kwargs.pop("sep", None)
1738     if sep is not None:
1739         if not isinstance(sep, str):
1740             raise TypeError("sep must be None or a string")
1741
1742     end = kwargs.pop("end", None)
1743     if end is not None:
1744         if not isinstance(end, str):
1745             raise TypeError("end must be None or a string")
1746
1747     if kwargs:
1748         raise TypeError("invalid keyword arguments to sprint()")
1749
1750     if sep is None:
1751         sep = " "
1752     if end is None:
1753         end = "\n"
1754     for i, arg in enumerate(args):
1755         if i:
1756             ret += sep
1757         if isinstance(arg, str):
1758             ret += arg
1759         else:
1760             ret += str(arg)
1761     ret += end
1762     return ret
1763
1764
1765 def strip_ansi_sequences(in_str: str) -> str:
1766     """
1767     Args:
1768         in_str: the string to strip
1769
1770     Returns:
1771         in_str with recognized ANSI escape sequences removed.
1772
1773     .. warning::
1774         This method works by using a regular expression.
1775         It works for all ANSI escape sequences I've tested with but
1776         may miss some; caveat emptor.
1777
1778     >>> import ansi as a
1779     >>> s = a.fg('blue') + 'blue!' + a.reset()
1780     >>> len(s)   # '\x1b[38;5;21mblue!\x1b[m'
1781     18
1782     >>> len(strip_ansi_sequences(s))
1783     5
1784     >>> strip_ansi_sequences(s)
1785     'blue!'
1786
1787     """
1788     return re.sub(r'\x1b\[[\d+;]*[a-z]', '', in_str)
1789
1790
1791 class SprintfStdout(contextlib.AbstractContextManager):
1792     """
1793     A context manager that captures outputs to stdout to a buffer
1794     without printing them.
1795
1796     >>> with SprintfStdout() as buf:
1797     ...     print("test")
1798     ...     print("1, 2, 3")
1799     ...
1800     >>> print(buf(), end='')
1801     test
1802     1, 2, 3
1803
1804     """
1805
1806     def __init__(self) -> None:
1807         self.destination = io.StringIO()
1808         self.recorder: contextlib.redirect_stdout
1809
1810     def __enter__(self) -> Callable[[], str]:
1811         self.recorder = contextlib.redirect_stdout(self.destination)
1812         self.recorder.__enter__()
1813         return lambda: self.destination.getvalue()
1814
1815     def __exit__(self, *args) -> Literal[False]:
1816         self.recorder.__exit__(*args)
1817         self.destination.seek(0)
1818         return False
1819
1820
1821 def capitalize_first_letter(in_str: str) -> str:
1822     """
1823     Args:
1824         in_str: the string to capitalize
1825
1826     Returns:
1827         in_str with the first character capitalized.
1828
1829     >>> capitalize_first_letter('test')
1830     'Test'
1831     >>> capitalize_first_letter("ALREADY!")
1832     'ALREADY!'
1833
1834     """
1835     return in_str[0].upper() + in_str[1:]
1836
1837
1838 def it_they(n: int) -> str:
1839     """
1840     Args:
1841         n: how many of them are there?
1842
1843     Returns:
1844         'it' if n is one or 'they' otherwize.
1845
1846     Suggested usage::
1847
1848         n = num_files_saved_to_tmp()
1849         print(f'Saved file{pluralize(n)} successfully.')
1850         print(f'{it_they(n)} {is_are(n)} located in /tmp.')
1851
1852     >>> it_they(1)
1853     'it'
1854     >>> it_they(100)
1855     'they'
1856     """
1857     if n == 1:
1858         return "it"
1859     return "they"
1860
1861
1862 def is_are(n: int) -> str:
1863     """
1864     Args:
1865         n: how many of them are there?
1866
1867     Returns:
1868         'is' if n is one or 'are' otherwize.
1869
1870     Suggested usage::
1871
1872         n = num_files_saved_to_tmp()
1873         print(f'Saved file{pluralize(n)} successfully.')
1874         print(f'{it_they(n)} {is_are(n)} located in /tmp.')
1875
1876     >>> is_are(1)
1877     'is'
1878     >>> is_are(2)
1879     'are'
1880
1881     """
1882     if n == 1:
1883         return "is"
1884     return "are"
1885
1886
1887 def pluralize(n: int) -> str:
1888     """
1889     Args:
1890         n: how many of them are there?
1891
1892     Returns:
1893         's' if n is greater than one otherwize ''.
1894
1895     Suggested usage::
1896
1897         n = num_files_saved_to_tmp()
1898         print(f'Saved file{pluralize(n)} successfully.')
1899         print(f'{it_they(n)} {is_are(n)} located in /tmp.')
1900
1901     >>> pluralize(15)
1902     's'
1903     >>> count = 1
1904     >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1905     There is 1 file.
1906     >>> count = 4
1907     >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1908     There are 4 files.
1909     """
1910     if n == 1:
1911         return ""
1912     return "s"
1913
1914
1915 def make_contractions(txt: str) -> str:
1916     """This code glues words in txt together to form (English)
1917     contractions.
1918
1919     Args:
1920         txt: the input text to be contractionized.
1921
1922     Returns:
1923         Output text identical to original input except for any
1924         recognized contractions are formed.
1925
1926     .. note::
1927         The order in which we create contractions is defined by the
1928         implementation and what I thought made more sense when writing
1929         this code.
1930
1931     >>> make_contractions('It is nice today.')
1932     "It's nice today."
1933
1934     >>> make_contractions('I can    not even...')
1935     "I can't even..."
1936
1937     >>> make_contractions('She could not see!')
1938     "She couldn't see!"
1939
1940     >>> make_contractions('But she will not go.')
1941     "But she won't go."
1942
1943     >>> make_contractions('Verily, I shall not.')
1944     "Verily, I shan't."
1945
1946     >>> make_contractions('No you cannot.')
1947     "No you can't."
1948
1949     >>> make_contractions('I said you can not go.')
1950     "I said you can't go."
1951     """
1952
1953     first_second = [
1954         (
1955             [
1956                 'are',
1957                 'could',
1958                 'did',
1959                 'has',
1960                 'have',
1961                 'is',
1962                 'must',
1963                 'should',
1964                 'was',
1965                 'were',
1966                 'would',
1967             ],
1968             ['(n)o(t)'],
1969         ),
1970         (
1971             [
1972                 "I",
1973                 "you",
1974                 "he",
1975                 "she",
1976                 "it",
1977                 "we",
1978                 "they",
1979                 "how",
1980                 "why",
1981                 "when",
1982                 "where",
1983                 "who",
1984                 "there",
1985             ],
1986             ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
1987         ),
1988     ]
1989
1990     # Special cases: can't, shan't and won't.
1991     txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
1992     txt = re.sub(
1993         r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE
1994     )
1995     txt = re.sub(
1996         r'\b(w)ill\s*(n)(o)(t)\b',
1997         r"\1\3\2'\4",
1998         txt,
1999         count=0,
2000         flags=re.IGNORECASE,
2001     )
2002
2003     for first_list, second_list in first_second:
2004         for first in first_list:
2005             for second in second_list:
2006                 # Disallow there're/where're.  They're valid English
2007                 # but sound weird.
2008                 if (first in ('there', 'where')) and second == 'a(re)':
2009                     continue
2010
2011                 pattern = fr'\b({first})\s+{second}\b'
2012                 if second == '(n)o(t)':
2013                     replacement = r"\1\2'\3"
2014                 else:
2015                     replacement = r"\1'\2"
2016                 txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)
2017
2018     return txt
2019
2020
2021 def thify(n: int) -> str:
2022     """
2023     Args:
2024         n: how many of them are there?
2025
2026     Returns:
2027         The proper cardinal suffix for a number.
2028
2029     Suggested usage::
2030
2031         attempt_count = 0
2032         while True:
2033             attempt_count += 1
2034             if try_the_thing():
2035                 break
2036             print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')
2037
2038     >>> thify(1)
2039     'st'
2040     >>> thify(33)
2041     'rd'
2042     >>> thify(16)
2043     'th'
2044     """
2045     digit = str(n)
2046     assert is_integer_number(digit)
2047     digit = digit[-1:]
2048     if digit == "1":
2049         return "st"
2050     elif digit == "2":
2051         return "nd"
2052     elif digit == "3":
2053         return "rd"
2054     else:
2055         return "th"
2056
2057
2058 def ngrams(txt: str, n: int):
2059     """
2060     Args:
2061         txt: the string to create ngrams using
2062         n: how many words per ngram created?
2063
2064     Returns:
2065         Generates the ngrams from the input string.
2066
2067     >>> [x for x in ngrams('This is a test', 2)]
2068     ['This is', 'is a', 'a test']
2069     """
2070     words = txt.split()
2071     for ngram in ngrams_presplit(words, n):
2072         ret = ''
2073         for word in ngram:
2074             ret += f'{word} '
2075         yield ret.strip()
2076
2077
2078 def ngrams_presplit(words: Sequence[str], n: int):
2079     """
2080     Same as :meth:`ngrams` but with the string pre-split.
2081     """
2082     return list_utils.ngrams(words, n)
2083
2084
2085 def bigrams(txt: str):
2086     """Generates the bigrams (n=2) of the given string.
2087
2088     >>> [x for x in bigrams('this is a test')]
2089     ['this is', 'is a', 'a test']
2090     """
2091     return ngrams(txt, 2)
2092
2093
2094 def trigrams(txt: str):
2095     """Generates the trigrams (n=3) of the given string."""
2096     return ngrams(txt, 3)
2097
2098
2099 def shuffle_columns_into_list(
2100     input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
2101 ) -> Iterable[str]:
2102     """Helper to shuffle / parse columnar data and return the results as a
2103     list.
2104
2105     Args:
2106         input_lines: A sequence of strings that represents text that
2107             has been broken into columns by the caller
2108         column_specs: an iterable collection of numeric sequences that
2109             indicate one or more column numbers to copy to form the Nth
2110             position in the output list.  See example below.
2111         delim: for column_specs that indicate we should copy more than
2112             one column from the input into this position, use delim to
2113             separate source data.  Defaults to ''.
2114
2115     Returns:
2116         A list of string created by following the instructions set forth
2117         in column_specs.
2118
2119     >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
2120     >>> shuffle_columns_into_list(
2121     ...     cols,
2122     ...     [ [8], [2, 3], [5, 6, 7] ],
2123     ...     delim='!',
2124     ... )
2125     ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']
2126     """
2127     out = []
2128
2129     # Column specs map input lines' columns into outputs.
2130     # [col1, col2...]
2131     for spec in column_specs:
2132         hunk = ''
2133         for n in spec:
2134             hunk = hunk + delim + input_lines[n]
2135         hunk = hunk.strip(delim)
2136         out.append(hunk)
2137     return out
2138
2139
2140 def shuffle_columns_into_dict(
2141     input_lines: Sequence[str],
2142     column_specs: Iterable[Tuple[str, Iterable[int]]],
2143     delim='',
2144 ) -> Dict[str, str]:
2145     """Helper to shuffle / parse columnar data and return the results
2146     as a dict.
2147
2148     Args:
2149         input_lines: a sequence of strings that represents text that
2150             has been broken into columns by the caller
2151         column_specs: instructions for what dictionary keys to apply
2152             to individual or compound input column data.  See example
2153             below.
2154         delim: when forming compound output data by gluing more than
2155             one input column together, use this character to separate
2156             the source data.  Defaults to ''.
2157
2158     Returns:
2159         A dict formed by applying the column_specs instructions.
2160
2161     >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
2162     >>> shuffle_columns_into_dict(
2163     ...     cols,
2164     ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
2165     ...     delim='!',
2166     ... )
2167     {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}
2168     """
2169     out = {}
2170
2171     # Column specs map input lines' columns into outputs.
2172     # "key", [col1, col2...]
2173     for spec in column_specs:
2174         hunk = ''
2175         for n in spec[1]:
2176             hunk = hunk + delim + input_lines[n]
2177         hunk = hunk.strip(delim)
2178         out[spec[0]] = hunk
2179     return out
2180
2181
2182 def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
2183     """
2184     Interpolate a string with data from a dict.
2185
2186     Args:
2187         txt: the mad libs template
2188         values: what you and your kids chose for each category.
2189
2190     >>> interpolate_using_dict('This is a {adjective} {noun}.',
2191     ...                        {'adjective': 'good', 'noun': 'example'})
2192     'This is a good example.'
2193     """
2194     return sprintf(txt.format(**values), end='')
2195
2196
2197 def to_ascii(txt: str):
2198     """
2199     Args:
2200         txt: the input data to encode
2201
2202     Returns:
2203         txt encoded as an ASCII byte string.
2204
2205     >>> to_ascii('test')
2206     b'test'
2207
2208     >>> to_ascii(b'1, 2, 3')
2209     b'1, 2, 3'
2210     """
2211     if isinstance(txt, str):
2212         return txt.encode('ascii')
2213     if isinstance(txt, bytes):
2214         return txt
2215     raise Exception('to_ascii works with strings and bytes')
2216
2217
2218 def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
2219     """
2220     Args:
2221         txt: the input data to encode
2222
2223     Returns:
2224         txt encoded with a 64-chracter alphabet.  Similar to and compatible
2225         with uuencode/uudecode.
2226
2227     >>> to_base64('hello?')
2228     b'aGVsbG8/\\n'
2229     """
2230     return base64.encodebytes(txt.encode(encoding, errors))
2231
2232
2233 def is_base64(txt: str) -> bool:
2234     """
2235     Args:
2236         txt: the string to check
2237
2238     Returns:
2239         True if txt is a valid base64 encoded string.  This assumes
2240         txt was encoded with Python's standard base64 alphabet which
2241         is the same as what uuencode/uudecode uses).
2242
2243     >>> is_base64('test')    # all letters in the b64 alphabet
2244     True
2245
2246     >>> is_base64('another test, how do you like this one?')
2247     False
2248
2249     >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
2250     True
2251
2252     """
2253     a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
2254     alphabet = set(a.encode('ascii'))
2255     for char in to_ascii(txt.strip()):
2256         if char not in alphabet:
2257             return False
2258     return True
2259
2260
2261 def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
2262     """
2263     Args:
2264         b64: bytestring of 64-bit encoded data to decode / convert.
2265
2266     Returns:
2267         The decoded form of b64 as a normal python string.  Similar to
2268         and compatible with uuencode / uudecode.
2269
2270     >>> from_base64(b'aGVsbG8/\\n')
2271     'hello?'
2272     """
2273     return base64.decodebytes(b64).decode(encoding, errors)
2274
2275
2276 def chunk(txt: str, chunk_size: int):
2277     """
2278     Args:
2279         txt: a string to be chunked into evenly spaced pieces.
2280         chunk_size: the size of each chunk to make
2281
2282     Returns:
2283         The original string chunked into evenly spaced pieces.
2284
2285     >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
2286     '01001101 11000101 10101010 10101010 10011111 10101000'
2287     """
2288     if len(txt) % chunk_size != 0:
2289         msg = f'String to chunk\'s length ({len(txt)} is not an even multiple of chunk_size ({chunk_size})'
2290         logger.warning(msg)
2291         warnings.warn(msg, stacklevel=2)
2292     for x in range(0, len(txt), chunk_size):
2293         yield txt[x : x + chunk_size]
2294
2295
2296 def to_bitstring(txt: str, *, delimiter='') -> str:
2297     """
2298     Args:
2299         txt: the string to convert into a bitstring
2300         delimiter: character to insert between adjacent bytes.  Note that
2301             only bitstrings with delimiter='' are interpretable by
2302             :meth:`from_bitstring`.
2303
2304     Returns:
2305         txt converted to ascii/binary and then chopped into bytes.
2306
2307     >>> to_bitstring('hello?')
2308     '011010000110010101101100011011000110111100111111'
2309
2310     >>> to_bitstring('test', delimiter=' ')
2311     '01110100 01100101 01110011 01110100'
2312
2313     >>> to_bitstring(b'test')
2314     '01110100011001010111001101110100'
2315     """
2316     etxt = to_ascii(txt)
2317     bits = bin(int.from_bytes(etxt, 'big'))
2318     bits = bits[2:]
2319     return delimiter.join(chunk(bits.zfill(8 * ((len(bits) + 7) // 8)), 8))
2320
2321
2322 def is_bitstring(txt: str) -> bool:
2323     """
2324     Args:
2325         txt: the string to check
2326
2327     Returns:
2328         True if txt is a recognized bitstring and False otherwise.
2329         Note that if delimiter is non empty this code will not
2330         recognize the bitstring.
2331
2332     >>> is_bitstring('011010000110010101101100011011000110111100111111')
2333     True
2334
2335     >>> is_bitstring('1234')
2336     False
2337     """
2338     return is_binary_integer_number(f'0b{txt}')
2339
2340
2341 def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
2342     """
2343     Args:
2344         bits: the bitstring to convert back into a python string
2345         encoding: the encoding to use
2346
2347     Returns:
2348         The regular python string represented by bits.  Note that this
2349         code does not work with to_bitstring when delimiter is non-empty.
2350
2351     >>> from_bitstring('011010000110010101101100011011000110111100111111')
2352     'hello?'
2353     """
2354     n = int(bits, 2)
2355     return n.to_bytes((n.bit_length() + 7) // 8, 'big').decode(encoding, errors) or '\0'
2356
2357
2358 def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
2359     """
2360     Args:
2361         txt: an IP address to chunk up for sorting purposes
2362
2363     Returns:
2364         A tuple of IP components arranged such that the sorting of
2365         IP addresses using a normal comparator will do something sane
2366         and desireable.
2367
2368     >>> ip_v4_sort_key('10.0.0.18')
2369     (10, 0, 0, 18)
2370
2371     >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
2372     >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
2373     ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
2374     """
2375     if not is_ip_v4(txt):
2376         print(f"not IP: {txt}")
2377         return None
2378     return tuple(int(x) for x in txt.split('.'))
2379
2380
2381 def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
2382     """
2383     Args:
2384         volume: the string to chunk up for sorting purposes
2385
2386     Returns:
2387         A tuple of volume's components such that the sorting of
2388         volumes using a normal comparator will do something sane
2389         and desireable.
2390
2391     >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
2392     ('usr', 'local', 'bin')
2393
2394     >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
2395     >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
2396     ['/usr', '/usr/local', '/usr/local/bin']
2397     """
2398     return tuple(x for x in volume.split('/') if len(x) > 0)
2399
2400
2401 def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
2402     """
2403     Execute several replace operations in a row.
2404
2405     Args:
2406         in_str: the string in which to replace characters
2407         replace_set: the set of target characters to replace
2408         replacement: the character to replace any member of replace_set
2409             with
2410
2411     Returns:
2412         The string with replacements executed.
2413
2414     >>> s = 'this_is a-test!'
2415     >>> replace_all(s, ' _-!', '')
2416     'thisisatest'
2417     """
2418     for char in replace_set:
2419         in_str = in_str.replace(char, replacement)
2420     return in_str
2421
2422
2423 def replace_nth(in_str: str, source: str, target: str, nth: int):
2424     """
2425     Replaces the nth occurrance of a substring within a string.
2426
2427     Args:
2428         in_str: the string in which to run the replacement
2429         source: the substring to replace
2430         target: the replacement text
2431         nth: which occurrance of source to replace?
2432
2433     >>> replace_nth('this is a test', ' ', '-', 3)
2434     'this is a-test'
2435     """
2436     where = [m.start() for m in re.finditer(source, in_str)][nth - 1]
2437     before = in_str[:where]
2438     after = in_str[where:]
2439     after = after.replace(source, target, 1)
2440     return before + after
2441
2442
2443 if __name__ == '__main__':
2444     import doctest
2445
2446     doctest.testmod()