src/pyutils/string_utils.py

   1 #!/usr/bin/env python3
   2 # -*- coding: utf-8 -*-
   3
   4 """The MIT License (MIT)
   5
   6 Copyright (c) 2016-2020 Davide Zanotti
   7 Modifications Copyright (c) 2021-2022 Scott Gasch
   8
   9 Permission is hereby granted, free of charge, to any person obtaining a copy
  10 of this software and associated documentation files (the "Software"), to deal
  11 in the Software without restriction, including without limitation the rights
  12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  13 copies of the Software, and to permit persons to whom the Software is
  14 furnished to do so, subject to the following conditions:
  15
  16 The above copyright notice and this permission notice shall be included in all
  17 copies or substantial portions of the Software.
  18
  19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  25 SOFTWARE.
  26
  27 This class is based on: https://github.com/daveoncode/python-string-utils.
  28 See NOTICE in the root of this module for a detailed enumeration of what
  29 work is Davide's and what work was added by Scott.
  30 """
  31
  32 import base64
  33 import contextlib  # type: ignore
  34 import datetime
  35 import io
  36 import json
  37 import logging
  38 import numbers
  39 import random
  40 import re
  41 import string
  42 import unicodedata
  43 import warnings
  44 from itertools import zip_longest
  45 from typing import (
  46     Any,
  47     Callable,
  48     Dict,
  49     Iterable,
  50     List,
  51     Literal,
  52     Optional,
  53     Sequence,
  54     Tuple,
  55 )
  56 from uuid import uuid4
  57
  58 from pyutils import list_utils
  59
  60 logger = logging.getLogger(__name__)
  61
  62 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  63
  64 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  65
  66 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  67
  68 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  69
  70 URLS_RAW_STRING = (
  71     r"([a-z-]+://)"  # scheme
  72     r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
  73     r"(www\.)?"  # www.
  74     r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
  75     r"(:\d{2,})?"  # port number
  76     r"(/[a-z\d_%+-]*)*"  # folders
  77     r"(\.[a-z\d_%+-]+)*"  # file extension
  78     r"(\?[a-z\d_+%-=]*)?"  # query string
  79     r"(#\S*)?"  # hash
  80 )
  81
  82 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
  83
  84 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
  85
  86 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
  87
  88 EMAILS_RAW_STRING = (
  89     r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
  90 )
  91
  92 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
  93
  94 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  95
  96 CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")
  97
  98 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
  99
 100 SNAKE_CASE_TEST_RE = re.compile(
 101     r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
 102 )
 103
 104 SNAKE_CASE_TEST_DASH_RE = re.compile(
 105     r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
 106 )
 107
 108 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
 109
 110 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
 111
 112 CREDIT_CARDS = {
 113     "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
 114     "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
 115     "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
 116     "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
 117     "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
 118     "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
 119 }
 120
 121 JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)
 122
 123 UUID_RE = re.compile(
 124     r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
 125 )
 126
 127 UUID_HEX_OK_RE = re.compile(
 128     r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
 129     re.IGNORECASE,
 130 )
 131
 132 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
 133
 134 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
 135
 136 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
 137
 138 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
 139
 140 MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
 141
 142 ANYWHERE_MAC_ADDRESS_RE = re.compile(
 143     r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
 144 )
 145
 146 WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)
 147
 148 HTML_RE = re.compile(
 149     r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
 150     re.IGNORECASE | re.MULTILINE | re.DOTALL,
 151 )
 152
 153 HTML_TAG_ONLY_RE = re.compile(
 154     r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
 155     re.IGNORECASE | re.MULTILINE | re.DOTALL,
 156 )
 157
 158 SPACES_RE = re.compile(r"\s")
 159
 160 NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)
 161
 162 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 163
 164 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 165
 166 NUM_SUFFIXES = {
 167     "Pb": (1024**5),
 168     "P": (1024**5),
 169     "Tb": (1024**4),
 170     "T": (1024**4),
 171     "Gb": (1024**3),
 172     "G": (1024**3),
 173     "Mb": (1024**2),
 174     "M": (1024**2),
 175     "Kb": (1024**1),
 176     "K": (1024**1),
 177 }
 178
 179 units = [
 180     "zero",
 181     "one",
 182     "two",
 183     "three",
 184     "four",
 185     "five",
 186     "six",
 187     "seven",
 188     "eight",
 189     "nine",
 190     "ten",
 191     "eleven",
 192     "twelve",
 193     "thirteen",
 194     "fourteen",
 195     "fifteen",
 196     "sixteen",
 197     "seventeen",
 198     "eighteen",
 199     "nineteen",
 200 ]
 201
 202 tens = [
 203     "",
 204     "",
 205     "twenty",
 206     "thirty",
 207     "forty",
 208     "fifty",
 209     "sixty",
 210     "seventy",
 211     "eighty",
 212     "ninety",
 213 ]
 214
 215 scales = ["hundred", "thousand", "million", "billion", "trillion"]
 216
 217 NUM_WORDS = {}
 218 NUM_WORDS["and"] = (1, 0)
 219 for i, word in enumerate(units):
 220     NUM_WORDS[word] = (1, i)
 221 for i, word in enumerate(tens):
 222     NUM_WORDS[word] = (1, i * 10)
 223 for i, word in enumerate(scales):
 224     NUM_WORDS[word] = (10 ** (i * 3 or 2), 0)
 225 NUM_WORDS['score'] = (20, 0)
 226
 227
 228 def is_none_or_empty(in_str: Optional[str]) -> bool:
 229     """
 230     Args:
 231         in_str: the string to test
 232
 233     Returns:
 234         True if the input string is either None or an empty string,
 235         False otherwise.
 236
 237     >>> is_none_or_empty("")
 238     True
 239     >>> is_none_or_empty(None)
 240     True
 241     >>> is_none_or_empty("   \t   ")
 242     True
 243     >>> is_none_or_empty('Test')
 244     False
 245     """
 246     return in_str is None or len(in_str.strip()) == 0
 247
 248
 249 def is_string(obj: Any) -> bool:
 250     """
 251     Args:
 252         in_str: the object to test
 253
 254     Returns:
 255         True if the object is a string and False otherwise.
 256
 257     >>> is_string('test')
 258     True
 259     >>> is_string(123)
 260     False
 261     >>> is_string(100.3)
 262     False
 263     >>> is_string([1, 2, 3])
 264     False
 265     """
 266     return isinstance(obj, str)
 267
 268
 269 def is_empty_string(in_str: Any) -> bool:
 270     """
 271     Args:
 272         in_str: the string to test
 273
 274     Returns:
 275         True if the string is empty and False otherwise.
 276     """
 277     return is_empty(in_str)
 278
 279
 280 def is_empty(in_str: Any) -> bool:
 281     """
 282     Args:
 283         in_str: the string to test
 284
 285     Returns:
 286         True if the string is empty and false otherwise.
 287
 288     >>> is_empty('')
 289     True
 290     >>> is_empty('    \t\t    ')
 291     True
 292     >>> is_empty('test')
 293     False
 294     >>> is_empty(100.88)
 295     False
 296     >>> is_empty([1, 2, 3])
 297     False
 298     """
 299     return is_string(in_str) and in_str.strip() == ""
 300
 301
 302 def is_full_string(in_str: Any) -> bool:
 303     """
 304     Args:
 305         in_str: the object to test
 306
 307     Returns:
 308         True if the object is a string and is not empty ('') and
 309         is not only composed of whitespace.
 310
 311     >>> is_full_string('test!')
 312     True
 313     >>> is_full_string('')
 314     False
 315     >>> is_full_string('      ')
 316     False
 317     >>> is_full_string(100.999)
 318     False
 319     >>> is_full_string({"a": 1, "b": 2})
 320     False
 321     """
 322     return is_string(in_str) and in_str.strip() != ""
 323
 324
 325 def is_number(in_str: str) -> bool:
 326     """
 327     Args:
 328         in_str: the string to test
 329
 330     Returns:
 331         True if the string contains a valid numberic value and
 332         False otherwise.
 333
 334     >>> is_number(100.5)
 335     Traceback (most recent call last):
 336     ...
 337     ValueError: 100.5
 338     >>> is_number("100.5")
 339     True
 340     >>> is_number("test")
 341     False
 342     >>> is_number("99")
 343     True
 344     >>> is_number([1, 2, 3])
 345     Traceback (most recent call last):
 346     ...
 347     ValueError: [1, 2, 3]
 348     """
 349     if not is_string(in_str):
 350         raise ValueError(in_str)
 351     return NUMBER_RE.match(in_str) is not None
 352
 353
 354 def is_integer_number(in_str: str) -> bool:
 355     """
 356     Args:
 357         in_str: the string to test
 358
 359     Returns:
 360         True if the string contains a valid (signed or unsigned,
 361         decimal, hex, or octal, regular or scientific) integral
 362         expression and False otherwise.
 363
 364     >>> is_integer_number('42')
 365     True
 366     >>> is_integer_number('42.0')
 367     False
 368     """
 369     return (
 370         (is_number(in_str) and "." not in in_str)
 371         or is_hexidecimal_integer_number(in_str)
 372         or is_octal_integer_number(in_str)
 373         or is_binary_integer_number(in_str)
 374     )
 375
 376
 377 def is_hexidecimal_integer_number(in_str: str) -> bool:
 378     """
 379     Args:
 380         in_str: the string to test
 381
 382     Returns:
 383         True if the string is a hex integer number and False otherwise.
 384
 385     >>> is_hexidecimal_integer_number('0x12345')
 386     True
 387     >>> is_hexidecimal_integer_number('0x1A3E')
 388     True
 389     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 390     False
 391     >>> is_hexidecimal_integer_number('-0xff')
 392     True
 393     >>> is_hexidecimal_integer_number('test')
 394     False
 395     >>> is_hexidecimal_integer_number(12345)  # Not a string
 396     Traceback (most recent call last):
 397     ...
 398     ValueError: 12345
 399     >>> is_hexidecimal_integer_number(101.4)
 400     Traceback (most recent call last):
 401     ...
 402     ValueError: 101.4
 403     >>> is_hexidecimal_integer_number(0x1A3E)
 404     Traceback (most recent call last):
 405     ...
 406     ValueError: 6718
 407     """
 408     if not is_string(in_str):
 409         raise ValueError(in_str)
 410     return HEX_NUMBER_RE.match(in_str) is not None
 411
 412
 413 def is_octal_integer_number(in_str: str) -> bool:
 414     """
 415     Args:
 416         in_str: the string to test
 417
 418     Returns:
 419         True if the string is a valid octal integral number and False otherwise.
 420
 421     >>> is_octal_integer_number('0o777')
 422     True
 423     >>> is_octal_integer_number('-0O115')
 424     True
 425     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 426     False
 427     >>> is_octal_integer_number('7777')  # Needs 0o
 428     False
 429     >>> is_octal_integer_number('test')
 430     False
 431     """
 432     if not is_string(in_str):
 433         raise ValueError(in_str)
 434     return OCT_NUMBER_RE.match(in_str) is not None
 435
 436
 437 def is_binary_integer_number(in_str: str) -> bool:
 438     """
 439     Args:
 440         in_str: the string to test
 441
 442     Returns:
 443         True if the string contains a binary integral number and False otherwise.
 444
 445     >>> is_binary_integer_number('0b10111')
 446     True
 447     >>> is_binary_integer_number('-0b111')
 448     True
 449     >>> is_binary_integer_number('0B10101')
 450     True
 451     >>> is_binary_integer_number('0b10102')
 452     False
 453     >>> is_binary_integer_number('0xFFF')
 454     False
 455     >>> is_binary_integer_number('test')
 456     False
 457     """
 458     if not is_string(in_str):
 459         raise ValueError(in_str)
 460     return BIN_NUMBER_RE.match(in_str) is not None
 461
 462
 463 def to_int(in_str: str) -> int:
 464     """
 465     Args:
 466         in_str: the string to convert
 467
 468     Returns:
 469         The integral value of the string or raises on error.
 470
 471     >>> to_int('1234')
 472     1234
 473     >>> to_int('test')
 474     Traceback (most recent call last):
 475     ...
 476     ValueError: invalid literal for int() with base 10: 'test'
 477     """
 478     if not is_string(in_str):
 479         raise ValueError(in_str)
 480     if is_binary_integer_number(in_str):
 481         return int(in_str, 2)
 482     if is_octal_integer_number(in_str):
 483         return int(in_str, 8)
 484     if is_hexidecimal_integer_number(in_str):
 485         return int(in_str, 16)
 486     return int(in_str)
 487
 488
 489 def number_string_to_integer(in_str: str) -> int:
 490     """Convert a string containing a written-out number into an int.
 491
 492     >>> number_string_to_integer("one hundred fifty two")
 493     152
 494
 495     >>> number_string_to_integer("ten billion two hundred million fifty four thousand three")
 496     10200054003
 497
 498     >>> number_string_to_integer("four-score and 7")
 499     87
 500
 501     >>> number_string_to_integer("fifty xyzzy three")
 502     Traceback (most recent call last):
 503     ...
 504     ValueError: Unknown word: xyzzy
 505     """
 506     if type(in_str) == int:
 507         return in_str
 508
 509     current = result = 0
 510     in_str = in_str.replace('-', ' ')
 511     for word in in_str.split():
 512         if word not in NUM_WORDS:
 513             if is_integer_number(word):
 514                 current += int(word)
 515                 continue
 516             else:
 517                 raise ValueError("Unknown word: " + word)
 518         scale, increment = NUM_WORDS[word]
 519         current = current * scale + increment
 520         if scale > 100:
 521             result += current
 522             current = 0
 523     return result + current
 524
 525
 526 def is_decimal_number(in_str: str) -> bool:
 527     """
 528     Args:
 529         in_str: the string to check
 530
 531     Returns:
 532         True if the given string represents a decimal or False
 533         otherwise.  A decimal may be signed or unsigned or use
 534         a "scientific notation".
 535
 536     .. note::
 537         We do not consider integers without a decimal point
 538         to be decimals; they return False (see example).
 539
 540     >>> is_decimal_number('42.0')
 541     True
 542     >>> is_decimal_number('42')
 543     False
 544     """
 545     return is_number(in_str) and "." in in_str
 546
 547
 548 def strip_escape_sequences(in_str: str) -> str:
 549     """
 550     Args:
 551         in_str: the string to strip of escape sequences.
 552
 553     Returns:
 554         in_str with escape sequences removed.
 555
 556     .. note::
 557         What is considered to be an "escape sequence" is defined
 558         by a regular expression.  While this gets common ones,
 559         there may exist valid sequences that it doesn't match.
 560
 561     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 562     'this is a test!'
 563     """
 564     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 565     return in_str
 566
 567
 568 def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 569     """
 570     Args:
 571         in_str: string or number to which to add thousands separator(s)
 572         separator_char: the separator character to add (defaults to comma)
 573         places: add a separator every N places (defaults to three)
 574
 575     Returns:
 576         A numeric string with thousands separators added appropriately.
 577
 578     >>> add_thousands_separator('12345678')
 579     '12,345,678'
 580     >>> add_thousands_separator(12345678)
 581     '12,345,678'
 582     >>> add_thousands_separator(12345678.99)
 583     '12,345,678.99'
 584     >>> add_thousands_separator('test')
 585     Traceback (most recent call last):
 586     ...
 587     ValueError: test
 588
 589     """
 590     if isinstance(in_str, numbers.Number):
 591         in_str = f'{in_str}'
 592     if is_number(in_str):
 593         return _add_thousands_separator(
 594             in_str, separator_char=separator_char, places=places
 595         )
 596     raise ValueError(in_str)
 597
 598
 599 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 600     decimal_part = ""
 601     if '.' in in_str:
 602         (in_str, decimal_part) = in_str.split('.')
 603     tmp = [iter(in_str[::-1])] * places
 604     ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 605     if len(decimal_part) > 0:
 606         ret += '.'
 607         ret += decimal_part
 608     return ret
 609
 610
 611 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 612     """
 613     Args:
 614         in_str: the string to test
 615         allowed_schemes: an optional list of allowed schemes (e.g.
 616             ['http', 'https', 'ftp'].  If passed, only URLs that
 617             begin with the one of the schemes passed will be considered
 618             to be valid.  Otherwise, any scheme:// will be considered
 619             valid.
 620
 621     Returns:
 622         True if in_str contains a valid URL and False otherwise.
 623
 624     >>> is_url('http://www.mysite.com')
 625     True
 626     >>> is_url('https://mysite.com')
 627     True
 628     >>> is_url('.mysite.com')
 629     False
 630     >>> is_url('scheme://username:[email protected]:8042/folder/subfolder/file.extension?param=value&param2=value2#hash')
 631     True
 632     """
 633     if not is_full_string(in_str):
 634         return False
 635
 636     valid = URL_RE.match(in_str) is not None
 637
 638     if allowed_schemes:
 639         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 640     return valid
 641
 642
 643 def is_email(in_str: Any) -> bool:
 644     """
 645     Args:
 646         in_str: the email address to check
 647
 648     Returns: True if the in_str contains a valid email (as defined by
 649         https://tools.ietf.org/html/rfc3696#section-3) or False
 650         otherwise.
 651
 652     >>> is_email('[email protected]')
 653     True
 654     >>> is_email('@gmail.com')
 655     False
 656     """
 657     if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
 658         return False
 659
 660     try:
 661         # we expect 2 tokens, one before "@" and one after, otherwise
 662         # we have an exception and the email is not valid.
 663         head, tail = in_str.split("@")
 664
 665         # head's size must be <= 64, tail <= 255, head must not start
 666         # with a dot or contain multiple consecutive dots.
 667         if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
 668             return False
 669
 670         # removes escaped spaces, so that later on the test regex will
 671         # accept the string.
 672         head = head.replace("\\ ", "")
 673         if head.startswith('"') and head.endswith('"'):
 674             head = head.replace(" ", "")[1:-1]
 675         return EMAIL_RE.match(head + "@" + tail) is not None
 676
 677     except ValueError:
 678         # borderline case in which we have multiple "@" signs but the
 679         # head part is correctly escaped.
 680         if ESCAPED_AT_SIGN.search(in_str) is not None:
 681             # replace "@" with "a" in the head
 682             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 683         return False
 684
 685
 686 def suffix_string_to_number(in_str: str) -> Optional[int]:
 687     """Takes a string like "33Gb" and converts it into a number (of bytes)
 688     like 34603008.
 689
 690     Args:
 691         in_str: the string with a suffix to be interpreted and removed.
 692
 693     Returns:
 694         An integer number of bytes or None to indicate an error.
 695
 696     >>> suffix_string_to_number('1Mb')
 697     1048576
 698     >>> suffix_string_to_number('13.1Gb')
 699     14066017894
 700     """
 701
 702     def suffix_capitalize(s: str) -> str:
 703         if len(s) == 1:
 704             return s.upper()
 705         elif len(s) == 2:
 706             return f"{s[0].upper()}{s[1].lower()}"
 707         return suffix_capitalize(s[0:1])
 708
 709     if is_string(in_str):
 710         if is_integer_number(in_str):
 711             return to_int(in_str)
 712         suffixes = [in_str[-2:], in_str[-1:]]
 713         rest = [in_str[:-2], in_str[:-1]]
 714         for x in range(len(suffixes)):
 715             s = suffixes[x]
 716             s = suffix_capitalize(s)
 717             multiplier = NUM_SUFFIXES.get(s, None)
 718             if multiplier is not None:
 719                 r = rest[x]
 720                 if is_integer_number(r):
 721                     return to_int(r) * multiplier
 722                 if is_decimal_number(r):
 723                     return int(float(r) * multiplier)
 724     return None
 725
 726
 727 def number_to_suffix_string(num: int) -> Optional[str]:
 728     """Take a number (of bytes) and returns a string like "43.8Gb".
 729
 730     Args:
 731         num: an integer number of bytes
 732
 733     Returns:
 734         A string with a suffix representing num bytes concisely or
 735         None to indicate an error.
 736
 737     >>> number_to_suffix_string(14066017894)
 738     '13.1Gb'
 739     >>> number_to_suffix_string(1024 * 1024)
 740     '1.0Mb'
 741     """
 742     d = 0.0
 743     suffix = None
 744     for (sfx, size) in NUM_SUFFIXES.items():
 745         if num >= size:
 746             d = num / size
 747             suffix = sfx
 748             break
 749     if suffix is not None:
 750         return f"{d:.1f}{suffix}"
 751     else:
 752         return f'{num:d}'
 753
 754
 755 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 756     """
 757     Args:
 758         in_str: a string to check
 759         card_type: if provided, contains the card type to validate
 760             with.  Otherwise, all known credit card number types will
 761             be accepted.
 762
 763             Supported card types are the following:
 764
 765             * VISA
 766             * MASTERCARD
 767             * AMERICAN_EXPRESS
 768             * DINERS_CLUB
 769             * DISCOVER
 770             * JCB
 771
 772     Returns:
 773         True if in_str is a valid credit card number.
 774     """
 775     if not is_full_string(in_str):
 776         return False
 777
 778     if card_type is not None:
 779         if card_type not in CREDIT_CARDS:
 780             raise KeyError(
 781                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 782             )
 783         return CREDIT_CARDS[card_type].match(in_str) is not None
 784     for c in CREDIT_CARDS:
 785         if CREDIT_CARDS[c].match(in_str) is not None:
 786             return True
 787     return False
 788
 789
 790 def is_camel_case(in_str: Any) -> bool:
 791     """
 792     Args:
 793         in_str: the string to test
 794
 795     Returns:
 796         True if the string is formatted as camel case and False otherwise.
 797         A string is considered camel case when:
 798
 799         * it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 800         * it contains both lowercase and uppercase letters
 801         * it does not start with a number
 802     """
 803     return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 804
 805
 806 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 807     """
 808     Args:
 809         in_str: the string to test
 810
 811     Returns: True if the string is snake case and False otherwise.  A
 812         string is considered snake case when:
 813
 814         * it's composed only by lowercase/uppercase letters and digits
 815         * it contains at least one underscore (or provided separator)
 816         * it does not start with a number
 817
 818     >>> is_snake_case('this_is_a_test')
 819     True
 820     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 821     True
 822     >>> is_snake_case('this-is-a-test')
 823     False
 824     >>> is_snake_case('this-is-a-test', separator='-')
 825     True
 826     """
 827     if is_full_string(in_str):
 828         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 829         re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 830         r = re_map.get(
 831             separator,
 832             re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
 833         )
 834         return r.match(in_str) is not None
 835     return False
 836
 837
 838 def is_json(in_str: Any) -> bool:
 839     """
 840     Args:
 841         in_str: the string to test
 842
 843     Returns:
 844         True if the in_str contains valid JSON and False otherwise.
 845
 846     >>> is_json('{"name": "Peter"}')
 847     True
 848     >>> is_json('[1, 2, 3]')
 849     True
 850     >>> is_json('{nope}')
 851     False
 852     """
 853     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 854         try:
 855             return isinstance(json.loads(in_str), (dict, list))
 856         except (TypeError, ValueError, OverflowError):
 857             pass
 858     return False
 859
 860
 861 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 862     """
 863     Args:
 864         in_str: the string to test
 865
 866     Returns:
 867         True if the in_str contains a valid UUID and False otherwise.
 868
 869     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 870     True
 871     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 872     False
 873     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 874     True
 875     """
 876     # string casting is used to allow UUID itself as input data type
 877     s = str(in_str)
 878     if allow_hex:
 879         return UUID_HEX_OK_RE.match(s) is not None
 880     return UUID_RE.match(s) is not None
 881
 882
 883 def is_ip_v4(in_str: Any) -> bool:
 884     """
 885     Args:
 886         in_str: the string to test
 887
 888     Returns:
 889         True if in_str contains a valid IPv4 address and False otherwise.
 890
 891     >>> is_ip_v4('255.200.100.75')
 892     True
 893     >>> is_ip_v4('nope')
 894     False
 895     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 896     False
 897     """
 898     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 899         return False
 900
 901     # checks that each entry in the ip is in the valid range (0 to 255)
 902     for token in in_str.split("."):
 903         if not 0 <= int(token) <= 255:
 904             return False
 905     return True
 906
 907
 908 def extract_ip_v4(in_str: Any) -> Optional[str]:
 909     """
 910     Args:
 911         in_str: the string to extract an IPv4 address from.
 912
 913     Returns:
 914         The first extracted IPv4 address from in_str or None if
 915         none were found or an error occurred.
 916
 917     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 918     '127.0.0.1'
 919     >>> extract_ip_v4('Your mom dresses you funny.')
 920     """
 921     if not is_full_string(in_str):
 922         return None
 923     m = ANYWHERE_IP_V4_RE.search(in_str)
 924     if m is not None:
 925         return m.group(0)
 926     return None
 927
 928
 929 def is_ip_v6(in_str: Any) -> bool:
 930     """
 931     Args:
 932         in_str: the string to test.
 933
 934     Returns:
 935         True if in_str contains a valid IPv6 address and False otherwise.
 936
 937     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 938     True
 939     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 940     False
 941     """
 942     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 943
 944
 945 def extract_ip_v6(in_str: Any) -> Optional[str]:
 946     """
 947     Args:
 948         in_str: the string from which to extract an IPv6 address.
 949
 950     Returns:
 951         The first IPv6 address found in in_str or None if no address
 952         was found or an error occurred.
 953
 954     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 955     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 956     >>> extract_ip_v6("(and she's ugly too, btw)")
 957     """
 958     if not is_full_string(in_str):
 959         return None
 960     m = ANYWHERE_IP_V6_RE.search(in_str)
 961     if m is not None:
 962         return m.group(0)
 963     return None
 964
 965
 966 def is_ip(in_str: Any) -> bool:
 967     """
 968     Args:
 969         in_str: the string to test.
 970
 971     Returns:
 972         True if in_str contains a valid IP address (either IPv4 or
 973         IPv6).
 974
 975     >>> is_ip('255.200.100.75')
 976     True
 977     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
 978     True
 979     >>> is_ip('1.2.3')
 980     False
 981     >>> is_ip('1.2.3.999')
 982     False
 983     """
 984     return is_ip_v6(in_str) or is_ip_v4(in_str)
 985
 986
 987 def extract_ip(in_str: Any) -> Optional[str]:
 988     """
 989     Args:
 990         in_str: the string from which to extract in IP address.
 991
 992     Returns:
 993         The first IP address (IPv4 or IPv6) found in in_str or
 994         None to indicate none found or an error condition.
 995
 996     >>> extract_ip('Attacker: 255.200.100.75')
 997     '255.200.100.75'
 998     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 999     '2001:db8:85a3:0000:0000:8a2e:370:7334'
1000     >>> extract_ip('1.2.3')
1001     """
1002     ip = extract_ip_v4(in_str)
1003     if ip is None:
1004         ip = extract_ip_v6(in_str)
1005     return ip
1006
1007
1008 def is_mac_address(in_str: Any) -> bool:
1009     """
1010     Args:
1011         in_str: the string to test
1012
1013     Returns:
1014         True if in_str is a valid MAC address False otherwise.
1015
1016     >>> is_mac_address("34:29:8F:12:0D:2F")
1017     True
1018     >>> is_mac_address('34:29:8f:12:0d:2f')
1019     True
1020     >>> is_mac_address('34-29-8F-12-0D-2F')
1021     True
1022     >>> is_mac_address("test")
1023     False
1024     """
1025     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
1026
1027
1028 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
1029     """
1030     Args:
1031         in_str: the string from which to extract a MAC address.
1032
1033     Returns:
1034         The first MAC address found in in_str or None to indicate no
1035         match or an error.
1036
1037     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
1038     '34:29:8F:12:0D:2F'
1039
1040     >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
1041     'd8:5d:e2:34:54:86'
1042     """
1043     if not is_full_string(in_str):
1044         return None
1045     in_str.strip()
1046     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
1047     if m is not None:
1048         mac = m.group(0)
1049         mac.replace(":", separator)
1050         mac.replace("-", separator)
1051         return mac
1052     return None
1053
1054
1055 def is_slug(in_str: Any, separator: str = "-") -> bool:
1056     """
1057     Args:
1058         in_str: string to test
1059
1060     Returns:
1061         True if in_str is a slug string and False otherwise.
1062
1063     >>> is_slug('my-blog-post-title')
1064     True
1065     >>> is_slug('My blog post title')
1066     False
1067     """
1068     if not is_full_string(in_str):
1069         return False
1070     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
1071     return re.match(rex, in_str) is not None
1072
1073
1074 def contains_html(in_str: str) -> bool:
1075     """
1076     Args:
1077         in_str: the string to check for tags in
1078
1079     Returns:
1080         True if the given string contains HTML/XML tags and False
1081         otherwise.
1082
1083     .. warning::
1084         By design, this function matches ANY type of tag, so don't expect
1085         to use it as an HTML validator.  It's a quick sanity check at
1086         best.  See something like BeautifulSoup for a more full-featuered
1087         HTML parser.
1088
1089     >>> contains_html('my string is <strong>bold</strong>')
1090     True
1091     >>> contains_html('my string is not bold')
1092     False
1093
1094     """
1095     if not is_string(in_str):
1096         raise ValueError(in_str)
1097     return HTML_RE.search(in_str) is not None
1098
1099
1100 def words_count(in_str: str) -> int:
1101     """
1102     Args:
1103         in_str: the string to count words in
1104
1105     Returns:
1106         The number of words contained in the given string.
1107
1108     .. note::
1109
1110         This method is "smart" in that it does consider only sequences
1111         of one or more letter and/or numbers to be "words".  Thus a
1112         string like this: "! @ # % ... []" will return zero.  Moreover
1113         it is aware of punctuation, so the count for a string like
1114         "one,two,three.stop" will be 4 not 1 (even if there are no spaces
1115         in the string).
1116
1117     >>> words_count('hello world')
1118     2
1119     >>> words_count('one,two,three.stop')
1120     4
1121     """
1122     if not is_string(in_str):
1123         raise ValueError(in_str)
1124     return len(WORDS_COUNT_RE.findall(in_str))
1125
1126
1127 def word_count(in_str: str) -> int:
1128     """
1129     Args:
1130         in_str: the string to count words in
1131
1132     Returns:
1133         The number of words contained in the given string.
1134
1135     .. note::
1136
1137         This method is "smart" in that it does consider only sequences
1138         of one or more letter and/or numbers to be "words".  Thus a
1139         string like this: "! @ # % ... []" will return zero.  Moreover
1140         it is aware of punctuation, so the count for a string like
1141         "one,two,three.stop" will be 4 not 1 (even if there are no spaces
1142         in the string).
1143
1144     >>> word_count('hello world')
1145     2
1146     >>> word_count('one,two,three.stop')
1147     4
1148     """
1149     return words_count(in_str)
1150
1151
1152 def generate_uuid(omit_dashes: bool = False) -> str:
1153     """
1154     Args:
1155         omit_dashes: should we omit the dashes in the generated UUID?
1156
1157     Returns:
1158         A generated UUID string (using `uuid.uuid4()`) with or without
1159         dashes per the omit_dashes arg.
1160
1161     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
1162     generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
1163     """
1164     uid = uuid4()
1165     if omit_dashes:
1166         return uid.hex
1167     return str(uid)
1168
1169
1170 def generate_random_alphanumeric_string(size: int) -> str:
1171     """
1172     Args:
1173         size: number of characters to generate
1174
1175     Returns:
1176         A string of the specified size containing random characters
1177         (uppercase/lowercase ascii letters and digits).
1178
1179     >>> random.seed(22)
1180     >>> generate_random_alphanumeric_string(9)
1181     '96ipbNClS'
1182     """
1183     if size < 1:
1184         raise ValueError("size must be >= 1")
1185     chars = string.ascii_letters + string.digits
1186     buffer = [random.choice(chars) for _ in range(size)]
1187     return from_char_list(buffer)
1188
1189
1190 def reverse(in_str: str) -> str:
1191     """
1192     Args:
1193         in_str: the string to reverse
1194
1195     Returns:
1196         The reversed (chracter by character) string.
1197
1198     >>> reverse('test')
1199     'tset'
1200     """
1201     if not is_string(in_str):
1202         raise ValueError(in_str)
1203     return in_str[::-1]
1204
1205
1206 def camel_case_to_snake_case(in_str, *, separator="_"):
1207     """
1208     Args:
1209         in_str: the camel case string to convert
1210
1211     Returns:
1212         A snake case string equivalent to the camel case input or the
1213         original string if it is not a valid camel case string or some
1214         other error occurs.
1215
1216     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
1217     'mac_address_extractor_factory'
1218     >>> camel_case_to_snake_case('Luke Skywalker')
1219     'Luke Skywalker'
1220     """
1221     if not is_string(in_str):
1222         raise ValueError(in_str)
1223     if not is_camel_case(in_str):
1224         return in_str
1225     return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower()
1226
1227
1228 def snake_case_to_camel_case(
1229     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
1230 ) -> str:
1231     """
1232     Args:
1233         in_str: the snake case string to convert
1234
1235     Returns:
1236         A camel case string that is equivalent to the snake case string
1237         provided or the original string back again if it is not valid
1238         snake case or another error occurs.
1239
1240     >>> snake_case_to_camel_case('this_is_a_test')
1241     'ThisIsATest'
1242     >>> snake_case_to_camel_case('Han Solo')
1243     'Han Solo'
1244     """
1245     if not is_string(in_str):
1246         raise ValueError(in_str)
1247     if not is_snake_case(in_str, separator=separator):
1248         return in_str
1249     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
1250     if not upper_case_first:
1251         tokens[0] = tokens[0].lower()
1252     return from_char_list(tokens)
1253
1254
1255 def to_char_list(in_str: str) -> List[str]:
1256     """
1257     Args:
1258         in_str: the string to split into a char list
1259
1260     Returns:
1261         A list of strings of length one each.
1262
1263     >>> to_char_list('test')
1264     ['t', 'e', 's', 't']
1265     """
1266     if not is_string(in_str):
1267         return []
1268     return list(in_str)
1269
1270
1271 def from_char_list(in_list: List[str]) -> str:
1272     """
1273     Args:
1274         in_list: A list of characters to convert into a string.
1275
1276     Returns:
1277         The string resulting from gluing the characters in in_list
1278         together.
1279
1280     >>> from_char_list(['t', 'e', 's', 't'])
1281     'test'
1282     """
1283     return "".join(in_list)
1284
1285
1286 def shuffle(in_str: str) -> Optional[str]:
1287     """
1288     Args:
1289         in_str: a string to shuffle randomly by character
1290
1291     Returns:
1292         A new string containing same chars of the given one but in
1293         a randomized order.  Note that in rare cases this could result
1294         in the same original string as no check is done.  Returns
1295         None to indicate error conditions.
1296
1297     >>> random.seed(22)
1298     >>> shuffle('awesome')
1299     'meosaew'
1300     """
1301     if not is_string(in_str):
1302         return None
1303     chars = to_char_list(in_str)
1304     random.shuffle(chars)
1305     return from_char_list(chars)
1306
1307
1308 def scramble(in_str: str) -> Optional[str]:
1309     """
1310     Args:
1311         in_str: a string to shuffle randomly by character
1312
1313     Returns:
1314         A new string containing same chars of the given one but in
1315         a randomized order.  Note that in rare cases this could result
1316         in the same original string as no check is done.  Returns
1317         None to indicate error conditions.
1318
1319     >>> random.seed(22)
1320     >>> scramble('awesome')
1321     'meosaew'
1322     """
1323     return shuffle(in_str)
1324
1325
1326 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
1327     """
1328     Args:
1329         in_str: the string to strip tags from
1330         keep_tag_content: should we keep the inner contents of tags?
1331
1332     Returns:
1333         A string with all HTML tags removed (optionally with tag contents
1334         preserved).
1335
1336     .. note::
1337         This method uses simple regular expressions to strip tags and is
1338         not a full fledged HTML parser by any means.  Consider using
1339         something like BeautifulSoup if your needs are more than this
1340         simple code can fulfill.
1341
1342     >>> strip_html('test: <a href="foo/bar">click here</a>')
1343     'test: '
1344     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
1345     'test: click here'
1346     """
1347     if not is_string(in_str):
1348         raise ValueError(in_str)
1349     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
1350     return r.sub("", in_str)
1351
1352
1353 def asciify(in_str: str) -> str:
1354     """
1355     Args:
1356         in_str: the string to asciify.
1357
1358     Returns:
1359         An output string roughly equivalent to the original string
1360         where all content to are ascii-only.  This is accomplished
1361         by translating all non-ascii chars into their closest possible
1362         ASCII representation (eg: ó -> o, Ë -> E, ç -> c...).
1363
1364     .. warning::
1365         Some chars may be lost if impossible to translate.
1366
1367     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
1368     'eeuuooaaeynAAACIINOE'
1369     """
1370     if not is_string(in_str):
1371         raise ValueError(in_str)
1372
1373     # "NFKD" is the algorithm which is able to successfully translate
1374     # the most of non-ascii chars.
1375     normalized = unicodedata.normalize("NFKD", in_str)
1376
1377     # encode string forcing ascii and ignore any errors
1378     # (unrepresentable chars will be stripped out)
1379     ascii_bytes = normalized.encode("ascii", "ignore")
1380
1381     # turns encoded bytes into an utf-8 string
1382     return ascii_bytes.decode("utf-8")
1383
1384
1385 def slugify(in_str: str, *, separator: str = "-") -> str:
1386     """
1387     Args:
1388         in_str: the string to slugify
1389         separator: the character to use during sligification (default
1390             is a dash)
1391
1392     Returns:
1393         The converted string.  The returned string has the following properties:
1394
1395         * it has no spaces
1396         * all letters are in lower case
1397         * all punctuation signs and non alphanumeric chars are removed
1398         * words are divided using provided separator
1399         * all chars are encoded as ascii (by using :meth:`asciify`)
1400         * is safe for URL
1401
1402     >>> slugify('Top 10 Reasons To Love Dogs!!!')
1403     'top-10-reasons-to-love-dogs'
1404     >>> slugify('Mönstér Mägnët')
1405     'monster-magnet'
1406     """
1407     if not is_string(in_str):
1408         raise ValueError(in_str)
1409
1410     # replace any character that is NOT letter or number with spaces
1411     out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()
1412
1413     # replace spaces with join sign
1414     out = SPACES_RE.sub(separator, out)
1415
1416     # normalize joins (remove duplicates)
1417     out = re.sub(re.escape(separator) + r"+", separator, out)
1418     return asciify(out)
1419
1420
1421 def to_bool(in_str: str) -> bool:
1422     """
1423     Args:
1424         in_str: the string to convert to boolean
1425
1426     Returns:
1427         A boolean equivalent of the original string based on its contents.
1428         All conversion is case insensitive.  A positive boolean (True) is
1429         returned if the string value is any of the following:
1430
1431         * "true"
1432         * "t"
1433         * "1"
1434         * "yes"
1435         * "y"
1436         * "on"
1437
1438         Otherwise False is returned.
1439
1440     >>> to_bool('True')
1441     True
1442
1443     >>> to_bool('1')
1444     True
1445
1446     >>> to_bool('yes')
1447     True
1448
1449     >>> to_bool('no')
1450     False
1451
1452     >>> to_bool('huh?')
1453     False
1454
1455     >>> to_bool('on')
1456     True
1457     """
1458     if not is_string(in_str):
1459         raise ValueError(in_str)
1460     return in_str.lower() in ("true", "1", "yes", "y", "t", "on")
1461
1462
1463 def to_date(in_str: str) -> Optional[datetime.date]:
1464     """
1465     Args:
1466         in_str: the string to convert into a date
1467
1468     Returns:
1469         The datetime.date the string contained or None to indicate
1470         an error.  This parser is relatively clever; see
1471         :class:`datetimez.dateparse_utils` docs for details.
1472
1473     >>> to_date('9/11/2001')
1474     datetime.date(2001, 9, 11)
1475     >>> to_date('xyzzy')
1476     """
1477     import pyutils.datetimez.dateparse_utils as du
1478
1479     try:
1480         d = du.DateParser()  # type: ignore
1481         d.parse(in_str)
1482         return d.get_date()
1483     except du.ParseException:  # type: ignore
1484         msg = f'Unable to parse date {in_str}.'
1485         logger.warning(msg)
1486     return None
1487
1488
1489 def extract_date(in_str: Any) -> Optional[datetime.datetime]:
1490     """Finds and extracts a date from the string, if possible.
1491
1492     Args:
1493         in_str: the string to extract a date from
1494
1495     Returns:
1496         a datetime if date was found, otherwise None
1497
1498     >>> extract_date("filename.txt    dec 13, 2022")
1499     datetime.datetime(2022, 12, 13, 0, 0)
1500
1501     >>> extract_date("Dear Santa, please get me a pony.")
1502
1503     """
1504     import itertools
1505
1506     import pyutils.datetimez.dateparse_utils as du
1507
1508     d = du.DateParser()  # type: ignore
1509     chunks = in_str.split()
1510     for ngram in itertools.chain(
1511         list_utils.ngrams(chunks, 5),
1512         list_utils.ngrams(chunks, 4),
1513         list_utils.ngrams(chunks, 3),
1514         list_utils.ngrams(chunks, 2),
1515     ):
1516         try:
1517             expr = " ".join(ngram)
1518             logger.debug(f"Trying {expr}")
1519             if d.parse(expr):
1520                 return d.get_datetime()
1521         except du.ParseException:  # type: ignore
1522             pass
1523     return None
1524
1525
1526 def is_valid_date(in_str: str) -> bool:
1527     """
1528     Args:
1529         in_str: the string to check
1530
1531     Returns:
1532         True if the string represents a valid date that we can recognize
1533         and False otherwise.  This parser is relatively clever; see
1534         :class:`datetimez.dateparse_utils` docs for details.
1535
1536     >>> is_valid_date('1/2/2022')
1537     True
1538     >>> is_valid_date('christmas')
1539     True
1540     >>> is_valid_date('next wednesday')
1541     True
1542     >>> is_valid_date('xyzzy')
1543     False
1544     """
1545     import pyutils.datetimez.dateparse_utils as dp
1546
1547     try:
1548         d = dp.DateParser()  # type: ignore
1549         _ = d.parse(in_str)
1550         return True
1551     except dp.ParseException:  # type: ignore
1552         msg = f'Unable to parse date {in_str}.'
1553         logger.warning(msg)
1554     return False
1555
1556
1557 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1558     """
1559     Args:
1560         in_str: string to parse into a datetime
1561
1562     Returns:
1563         A python datetime parsed from in_str or None to indicate
1564         an error.  This parser is relatively clever; see
1565         :class:`datetimez.dateparse_utils` docs for details.
1566
1567     >>> to_datetime('7/20/1969 02:56 GMT')
1568     datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
1569     """
1570     import pyutils.datetimez.dateparse_utils as dp
1571
1572     try:
1573         d = dp.DateParser()  # type: ignore
1574         dt = d.parse(in_str)
1575         if isinstance(dt, datetime.datetime):
1576             return dt
1577     except Exception:
1578         msg = f'Unable to parse datetime {in_str}.'
1579         logger.warning(msg)
1580     return None
1581
1582
1583 def valid_datetime(in_str: str) -> bool:
1584     """
1585     Args:
1586         in_str: the string to check
1587
1588     Returns:
1589         True if in_str contains a valid datetime and False otherwise.
1590         This parser is relatively clever; see
1591         :class:`datetimez.dateparse_utils` docs for details.
1592
1593     >>> valid_datetime('next wednesday at noon')
1594     True
1595     >>> valid_datetime('3 weeks ago at midnight')
1596     True
1597     >>> valid_datetime('next easter at 5:00 am')
1598     True
1599     >>> valid_datetime('sometime soon')
1600     False
1601     """
1602     _ = to_datetime(in_str)
1603     if _ is not None:
1604         return True
1605     msg = f'Unable to parse datetime {in_str}.'
1606     logger.warning(msg)
1607     return False
1608
1609
1610 def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
1611     """
1612     Args:
1613         in_str: the string to squeeze
1614         character_to_squeeze: the character to remove runs of
1615             more than one in a row (default = space)
1616
1617     Returns: A "squeezed string" where runs of more than one
1618         character_to_squeeze into one.
1619
1620     >>> squeeze(' this        is       a    test    ')
1621     ' this is a test '
1622
1623     >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
1624     'one|!|two|!|three'
1625
1626     """
1627     return re.sub(
1628         r'(' + re.escape(character_to_squeeze) + r')+',
1629         character_to_squeeze,
1630         in_str,
1631     )
1632
1633
1634 def dedent(in_str: str) -> Optional[str]:
1635     """
1636     Args:
1637         in_str: the string to dedent
1638
1639     Returns:
1640         A string with tab indentation removed or None on error.
1641
1642     .. note::
1643
1644         Inspired by analogous Scala function.
1645
1646     >>> dedent('\t\ttest\\n\t\ting')
1647     'test\\ning'
1648     """
1649     if not is_string(in_str):
1650         return None
1651     line_separator = '\n'
1652     lines = [MARGIN_RE.sub('', line) for line in in_str.split(line_separator)]
1653     return line_separator.join(lines)
1654
1655
1656 def indent(in_str: str, amount: int) -> str:
1657     """
1658     Args:
1659         in_str: the string to indent
1660         amount: count of spaces to indent each line by
1661
1662     Returns:
1663         An indented string created by prepending amount spaces.
1664
1665     >>> indent('This is a test', 4)
1666     '    This is a test'
1667     """
1668     if not is_string(in_str):
1669         raise ValueError(in_str)
1670     line_separator = '\n'
1671     lines = [" " * amount + line for line in in_str.split(line_separator)]
1672     return line_separator.join(lines)
1673
1674
1675 def sprintf(*args, **kwargs) -> str:
1676     """
1677     Args:
1678         This function uses the same syntax as the builtin print
1679         function.
1680
1681     Returns:
1682         An interpolated string capturing print output, like man(3)
1683         :code:sprintf.
1684     """
1685     ret = ""
1686
1687     sep = kwargs.pop("sep", None)
1688     if sep is not None:
1689         if not isinstance(sep, str):
1690             raise TypeError("sep must be None or a string")
1691
1692     end = kwargs.pop("end", None)
1693     if end is not None:
1694         if not isinstance(end, str):
1695             raise TypeError("end must be None or a string")
1696
1697     if kwargs:
1698         raise TypeError("invalid keyword arguments to sprint()")
1699
1700     if sep is None:
1701         sep = " "
1702     if end is None:
1703         end = "\n"
1704     for i, arg in enumerate(args):
1705         if i:
1706             ret += sep
1707         if isinstance(arg, str):
1708             ret += arg
1709         else:
1710             ret += str(arg)
1711     ret += end
1712     return ret
1713
1714
1715 def strip_ansi_sequences(in_str: str) -> str:
1716     """
1717     Args:
1718         in_str: the string to strip
1719
1720     Returns:
1721         in_str with recognized ANSI escape sequences removed.
1722
1723     .. warning::
1724         This method works by using a regular expression.
1725         It works for all ANSI escape sequences I've tested with but
1726         may miss some; caveat emptor.
1727
1728     >>> import ansi as a
1729     >>> s = a.fg('blue') + 'blue!' + a.reset()
1730     >>> len(s)   # '\x1b[38;5;21mblue!\x1b[m'
1731     18
1732     >>> len(strip_ansi_sequences(s))
1733     5
1734     >>> strip_ansi_sequences(s)
1735     'blue!'
1736
1737     """
1738     return re.sub(r'\x1b\[[\d+;]*[a-z]', '', in_str)
1739
1740
1741 class SprintfStdout(contextlib.AbstractContextManager):
1742     """
1743     A context manager that captures outputs to stdout to a buffer
1744     without printing them.
1745
1746     >>> with SprintfStdout() as buf:
1747     ...     print("test")
1748     ...     print("1, 2, 3")
1749     ...
1750     >>> print(buf(), end='')
1751     test
1752     1, 2, 3
1753
1754     """
1755
1756     def __init__(self) -> None:
1757         self.destination = io.StringIO()
1758         self.recorder: contextlib.redirect_stdout
1759
1760     def __enter__(self) -> Callable[[], str]:
1761         self.recorder = contextlib.redirect_stdout(self.destination)
1762         self.recorder.__enter__()
1763         return lambda: self.destination.getvalue()
1764
1765     def __exit__(self, *args) -> Literal[False]:
1766         self.recorder.__exit__(*args)
1767         self.destination.seek(0)
1768         return False
1769
1770
1771 def capitalize_first_letter(in_str: str) -> str:
1772     """
1773     Args:
1774         in_str: the string to capitalize
1775
1776     Returns:
1777         in_str with the first character capitalized.
1778
1779     >>> capitalize_first_letter('test')
1780     'Test'
1781     >>> capitalize_first_letter("ALREADY!")
1782     'ALREADY!'
1783
1784     """
1785     return in_str[0].upper() + in_str[1:]
1786
1787
1788 def it_they(n: int) -> str:
1789     """
1790     Args:
1791         n: how many of them are there?
1792
1793     Returns:
1794         'it' if n is one or 'they' otherwize.
1795
1796     Suggested usage::
1797
1798         n = num_files_saved_to_tmp()
1799         print(f'Saved file{pluralize(n)} successfully.')
1800         print(f'{it_they(n)} {is_are(n)} located in /tmp.')
1801
1802     >>> it_they(1)
1803     'it'
1804     >>> it_they(100)
1805     'they'
1806     """
1807     if n == 1:
1808         return "it"
1809     return "they"
1810
1811
1812 def is_are(n: int) -> str:
1813     """
1814     Args:
1815         n: how many of them are there?
1816
1817     Returns:
1818         'is' if n is one or 'are' otherwize.
1819
1820     Suggested usage::
1821
1822         n = num_files_saved_to_tmp()
1823         print(f'Saved file{pluralize(n)} successfully.')
1824         print(f'{it_they(n)} {is_are(n)} located in /tmp.')
1825
1826     >>> is_are(1)
1827     'is'
1828     >>> is_are(2)
1829     'are'
1830
1831     """
1832     if n == 1:
1833         return "is"
1834     return "are"
1835
1836
1837 def pluralize(n: int) -> str:
1838     """
1839     Args:
1840         n: how many of them are there?
1841
1842     Returns:
1843         's' if n is greater than one otherwize ''.
1844
1845     Suggested usage::
1846
1847         n = num_files_saved_to_tmp()
1848         print(f'Saved file{pluralize(n)} successfully.')
1849         print(f'{it_they(n)} {is_are(n)} located in /tmp.')
1850
1851     >>> pluralize(15)
1852     's'
1853     >>> count = 1
1854     >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1855     There is 1 file.
1856     >>> count = 4
1857     >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1858     There are 4 files.
1859     """
1860     if n == 1:
1861         return ""
1862     return "s"
1863
1864
1865 def make_contractions(txt: str) -> str:
1866     """This code glues words in txt together to form (English)
1867     contractions.
1868
1869     Args:
1870         txt: the input text to be contractionized.
1871
1872     Returns:
1873         Output text identical to original input except for any
1874         recognized contractions are formed.
1875
1876     .. note::
1877         The order in which we create contractions is defined by the
1878         implementation and what I thought made more sense when writing
1879         this code.
1880
1881     >>> make_contractions('It is nice today.')
1882     "It's nice today."
1883
1884     >>> make_contractions('I can    not even...')
1885     "I can't even..."
1886
1887     >>> make_contractions('She could not see!')
1888     "She couldn't see!"
1889
1890     >>> make_contractions('But she will not go.')
1891     "But she won't go."
1892
1893     >>> make_contractions('Verily, I shall not.')
1894     "Verily, I shan't."
1895
1896     >>> make_contractions('No you cannot.')
1897     "No you can't."
1898
1899     >>> make_contractions('I said you can not go.')
1900     "I said you can't go."
1901     """
1902
1903     first_second = [
1904         (
1905             [
1906                 'are',
1907                 'could',
1908                 'did',
1909                 'has',
1910                 'have',
1911                 'is',
1912                 'must',
1913                 'should',
1914                 'was',
1915                 'were',
1916                 'would',
1917             ],
1918             ['(n)o(t)'],
1919         ),
1920         (
1921             [
1922                 "I",
1923                 "you",
1924                 "he",
1925                 "she",
1926                 "it",
1927                 "we",
1928                 "they",
1929                 "how",
1930                 "why",
1931                 "when",
1932                 "where",
1933                 "who",
1934                 "there",
1935             ],
1936             ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
1937         ),
1938     ]
1939
1940     # Special cases: can't, shan't and won't.
1941     txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
1942     txt = re.sub(
1943         r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE
1944     )
1945     txt = re.sub(
1946         r'\b(w)ill\s*(n)(o)(t)\b',
1947         r"\1\3\2'\4",
1948         txt,
1949         count=0,
1950         flags=re.IGNORECASE,
1951     )
1952
1953     for first_list, second_list in first_second:
1954         for first in first_list:
1955             for second in second_list:
1956                 # Disallow there're/where're.  They're valid English
1957                 # but sound weird.
1958                 if (first in ('there', 'where')) and second == 'a(re)':
1959                     continue
1960
1961                 pattern = fr'\b({first})\s+{second}\b'
1962                 if second == '(n)o(t)':
1963                     replacement = r"\1\2'\3"
1964                 else:
1965                     replacement = r"\1'\2"
1966                 txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)
1967
1968     return txt
1969
1970
1971 def thify(n: int) -> str:
1972     """
1973     Args:
1974         n: how many of them are there?
1975
1976     Returns:
1977         The proper cardinal suffix for a number.
1978
1979     Suggested usage::
1980
1981         attempt_count = 0
1982         while True:
1983             attempt_count += 1
1984             if try_the_thing():
1985                 break
1986             print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')
1987
1988     >>> thify(1)
1989     'st'
1990     >>> thify(33)
1991     'rd'
1992     >>> thify(16)
1993     'th'
1994     """
1995     digit = str(n)
1996     assert is_integer_number(digit)
1997     digit = digit[-1:]
1998     if digit == "1":
1999         return "st"
2000     elif digit == "2":
2001         return "nd"
2002     elif digit == "3":
2003         return "rd"
2004     else:
2005         return "th"
2006
2007
2008 def ngrams(txt: str, n: int):
2009     """
2010     Args:
2011         txt: the string to create ngrams using
2012         n: how many words per ngram created?
2013
2014     Returns:
2015         Generates the ngrams from the input string.
2016
2017     >>> [x for x in ngrams('This is a test', 2)]
2018     ['This is', 'is a', 'a test']
2019     """
2020     words = txt.split()
2021     for ngram in ngrams_presplit(words, n):
2022         ret = ''
2023         for word in ngram:
2024             ret += f'{word} '
2025         yield ret.strip()
2026
2027
2028 def ngrams_presplit(words: Sequence[str], n: int):
2029     """
2030     Same as :meth:`ngrams` but with the string pre-split.
2031     """
2032     return list_utils.ngrams(words, n)
2033
2034
2035 def bigrams(txt: str):
2036     """Generates the bigrams (n=2) of the given string."""
2037     return ngrams(txt, 2)
2038
2039
2040 def trigrams(txt: str):
2041     """Generates the trigrams (n=3) of the given string."""
2042     return ngrams(txt, 3)
2043
2044
2045 def shuffle_columns_into_list(
2046     input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
2047 ) -> Iterable[str]:
2048     """Helper to shuffle / parse columnar data and return the results as a
2049     list.
2050
2051     Args:
2052         input_lines: A sequence of strings that represents text that
2053             has been broken into columns by the caller
2054         column_specs: an iterable collection of numeric sequences that
2055             indicate one or more column numbers to copy to form the Nth
2056             position in the output list.  See example below.
2057         delim: for column_specs that indicate we should copy more than
2058             one column from the input into this position, use delim to
2059             separate source data.  Defaults to ''.
2060
2061     Returns:
2062         A list of string created by following the instructions set forth
2063         in column_specs.
2064
2065     >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
2066     >>> shuffle_columns_into_list(
2067     ...     cols,
2068     ...     [ [8], [2, 3], [5, 6, 7] ],
2069     ...     delim='!',
2070     ... )
2071     ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']
2072     """
2073     out = []
2074
2075     # Column specs map input lines' columns into outputs.
2076     # [col1, col2...]
2077     for spec in column_specs:
2078         hunk = ''
2079         for n in spec:
2080             hunk = hunk + delim + input_lines[n]
2081         hunk = hunk.strip(delim)
2082         out.append(hunk)
2083     return out
2084
2085
2086 def shuffle_columns_into_dict(
2087     input_lines: Sequence[str],
2088     column_specs: Iterable[Tuple[str, Iterable[int]]],
2089     delim='',
2090 ) -> Dict[str, str]:
2091     """Helper to shuffle / parse columnar data and return the results
2092     as a dict.
2093
2094     Args:
2095         input_lines: a sequence of strings that represents text that
2096             has been broken into columns by the caller
2097         column_specs: instructions for what dictionary keys to apply
2098             to individual or compound input column data.  See example
2099             below.
2100         delim: when forming compound output data by gluing more than
2101             one input column together, use this character to separate
2102             the source data.  Defaults to ''.
2103
2104     Returns:
2105         A dict formed by applying the column_specs instructions.
2106
2107     >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
2108     >>> shuffle_columns_into_dict(
2109     ...     cols,
2110     ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
2111     ...     delim='!',
2112     ... )
2113     {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}
2114     """
2115     out = {}
2116
2117     # Column specs map input lines' columns into outputs.
2118     # "key", [col1, col2...]
2119     for spec in column_specs:
2120         hunk = ''
2121         for n in spec[1]:
2122             hunk = hunk + delim + input_lines[n]
2123         hunk = hunk.strip(delim)
2124         out[spec[0]] = hunk
2125     return out
2126
2127
2128 def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
2129     """
2130     Interpolate a string with data from a dict.
2131
2132     Args:
2133         txt: the mad libs template
2134         values: what you and your kids chose for each category.
2135
2136     >>> interpolate_using_dict('This is a {adjective} {noun}.',
2137     ...                        {'adjective': 'good', 'noun': 'example'})
2138     'This is a good example.'
2139     """
2140     return sprintf(txt.format(**values), end='')
2141
2142
2143 def to_ascii(txt: str):
2144     """
2145     Args:
2146         txt: the input data to encode
2147
2148     Returns:
2149         txt encoded as an ASCII byte string.
2150
2151     >>> to_ascii('test')
2152     b'test'
2153
2154     >>> to_ascii(b'1, 2, 3')
2155     b'1, 2, 3'
2156     """
2157     if isinstance(txt, str):
2158         return txt.encode('ascii')
2159     if isinstance(txt, bytes):
2160         return txt
2161     raise Exception('to_ascii works with strings and bytes')
2162
2163
2164 def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
2165     """
2166     Args:
2167         txt: the input data to encode
2168
2169     Returns:
2170         txt encoded with a 64-chracter alphabet.  Similar to and compatible
2171         with uuencode/uudecode.
2172
2173     >>> to_base64('hello?')
2174     b'aGVsbG8/\\n'
2175     """
2176     return base64.encodebytes(txt.encode(encoding, errors))
2177
2178
2179 def is_base64(txt: str) -> bool:
2180     """
2181     Args:
2182         txt: the string to check
2183
2184     Returns:
2185         True if txt is a valid base64 encoded string.  This assumes
2186         txt was encoded with Python's standard base64 alphabet which
2187         is the same as what uuencode/uudecode uses).
2188
2189     >>> is_base64('test')    # all letters in the b64 alphabet
2190     True
2191
2192     >>> is_base64('another test, how do you like this one?')
2193     False
2194
2195     >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
2196     True
2197
2198     """
2199     a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
2200     alphabet = set(a.encode('ascii'))
2201     for char in to_ascii(txt.strip()):
2202         if char not in alphabet:
2203             return False
2204     return True
2205
2206
2207 def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
2208     """
2209     Args:
2210         b64: bytestring of 64-bit encoded data to decode / convert.
2211
2212     Returns:
2213         The decoded form of b64 as a normal python string.  Similar to
2214         and compatible with uuencode / uudecode.
2215
2216     >>> from_base64(b'aGVsbG8/\\n')
2217     'hello?'
2218     """
2219     return base64.decodebytes(b64).decode(encoding, errors)
2220
2221
2222 def chunk(txt: str, chunk_size: int):
2223     """
2224     Args:
2225         txt: a string to be chunked into evenly spaced pieces.
2226         chunk_size: the size of each chunk to make
2227
2228     Returns:
2229         The original string chunked into evenly spaced pieces.
2230
2231     >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
2232     '01001101 11000101 10101010 10101010 10011111 10101000'
2233     """
2234     if len(txt) % chunk_size != 0:
2235         msg = f'String to chunk\'s length ({len(txt)} is not an even multiple of chunk_size ({chunk_size})'
2236         logger.warning(msg)
2237         warnings.warn(msg, stacklevel=2)
2238     for x in range(0, len(txt), chunk_size):
2239         yield txt[x : x + chunk_size]
2240
2241
2242 def to_bitstring(txt: str, *, delimiter='') -> str:
2243     """
2244     Args:
2245         txt: the string to convert into a bitstring
2246         delimiter: character to insert between adjacent bytes.  Note that
2247             only bitstrings with delimiter='' are interpretable by
2248             :meth:`from_bitstring`.
2249
2250     Returns:
2251         txt converted to ascii/binary and then chopped into bytes.
2252
2253     >>> to_bitstring('hello?')
2254     '011010000110010101101100011011000110111100111111'
2255
2256     >>> to_bitstring('test', delimiter=' ')
2257     '01110100 01100101 01110011 01110100'
2258
2259     >>> to_bitstring(b'test')
2260     '01110100011001010111001101110100'
2261     """
2262     etxt = to_ascii(txt)
2263     bits = bin(int.from_bytes(etxt, 'big'))
2264     bits = bits[2:]
2265     return delimiter.join(chunk(bits.zfill(8 * ((len(bits) + 7) // 8)), 8))
2266
2267
2268 def is_bitstring(txt: str) -> bool:
2269     """
2270     Args:
2271         txt: the string to check
2272
2273     Returns:
2274         True if txt is a recognized bitstring and False otherwise.
2275         Note that if delimiter is non empty this code will not
2276         recognize the bitstring.
2277
2278     >>> is_bitstring('011010000110010101101100011011000110111100111111')
2279     True
2280
2281     >>> is_bitstring('1234')
2282     False
2283     """
2284     return is_binary_integer_number(f'0b{txt}')
2285
2286
2287 def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
2288     """
2289     Args:
2290         bits: the bitstring to convert back into a python string
2291         encoding: the encoding to use
2292
2293     Returns:
2294         The regular python string represented by bits.  Note that this
2295         code does not work with to_bitstring when delimiter is non-empty.
2296
2297     >>> from_bitstring('011010000110010101101100011011000110111100111111')
2298     'hello?'
2299     """
2300     n = int(bits, 2)
2301     return n.to_bytes((n.bit_length() + 7) // 8, 'big').decode(encoding, errors) or '\0'
2302
2303
2304 def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
2305     """
2306     Args:
2307         txt: an IP address to chunk up for sorting purposes
2308
2309     Returns:
2310         A tuple of IP components arranged such that the sorting of
2311         IP addresses using a normal comparator will do something sane
2312         and desireable.
2313
2314     >>> ip_v4_sort_key('10.0.0.18')
2315     (10, 0, 0, 18)
2316
2317     >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
2318     >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
2319     ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
2320     """
2321     if not is_ip_v4(txt):
2322         print(f"not IP: {txt}")
2323         return None
2324     return tuple(int(x) for x in txt.split('.'))
2325
2326
2327 def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
2328     """
2329     Args:
2330         volume: the string to chunk up for sorting purposes
2331
2332     Returns:
2333         A tuple of volume's components such that the sorting of
2334         volumes using a normal comparator will do something sane
2335         and desireable.
2336
2337     >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
2338     ('usr', 'local', 'bin')
2339
2340     >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
2341     >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
2342     ['/usr', '/usr/local', '/usr/local/bin']
2343     """
2344     return tuple(x for x in volume.split('/') if len(x) > 0)
2345
2346
2347 def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
2348     """
2349     Execute several replace operations in a row.
2350
2351     Args:
2352         in_str: the string in which to replace characters
2353         replace_set: the set of target characters to replace
2354         replacement: the character to replace any member of replace_set
2355             with
2356
2357     Returns:
2358         The string with replacements executed.
2359
2360     >>> s = 'this_is a-test!'
2361     >>> replace_all(s, ' _-!', '')
2362     'thisisatest'
2363     """
2364     for char in replace_set:
2365         in_str = in_str.replace(char, replacement)
2366     return in_str
2367
2368
2369 def replace_nth(in_str: str, source: str, target: str, nth: int):
2370     """
2371     Replaces the nth occurrance of a substring within a string.
2372
2373     Args:
2374         in_str: the string in which to run the replacement
2375         source: the substring to replace
2376         target: the replacement text
2377         nth: which occurrance of source to replace?
2378
2379     >>> replace_nth('this is a test', ' ', '-', 3)
2380     'this is a-test'
2381     """
2382     where = [m.start() for m in re.finditer(source, in_str)][nth - 1]
2383     before = in_str[:where]
2384     after = in_str[where:]
2385     after = after.replace(source, target, 1)
2386     return before + after
2387
2388
2389 if __name__ == '__main__':
2390     import doctest
2391
2392     doctest.testmod()