src/pyutils/string_utils.py

   1 #!/usr/bin/env python3
   2 # -*- coding: utf-8 -*-
   3
   4 """The MIT License (MIT)
   5
   6 Copyright (c) 2016-2020 Davide Zanotti
   7
   8 Modifications Copyright (c) 2021-2022 Scott Gasch
   9
  10 Permission is hereby granted, free of charge, to any person obtaining a copy
  11 of this software and associated documentation files (the "Software"), to deal
  12 in the Software without restriction, including without limitation the rights
  13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 copies of the Software, and to permit persons to whom the Software is
  15 furnished to do so, subject to the following conditions:
  16
  17 The above copyright notice and this permission notice shall be included in all
  18 copies or substantial portions of the Software.
  19
  20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  26 SOFTWARE.
  27
  28 This class is based on: https://github.com/daveoncode/python-string-utils.
  29 See NOTICE in the root of this module for a detailed enumeration of what
  30 work is Davide's and what work was added by Scott.
  31 """
  32
  33 import base64
  34 import contextlib  # type: ignore
  35 import datetime
  36 import io
  37 import json
  38 import logging
  39 import numbers
  40 import random
  41 import re
  42 import string
  43 import unicodedata
  44 import warnings
  45 from itertools import zip_longest
  46 from typing import (
  47     Any,
  48     Callable,
  49     Dict,
  50     Iterable,
  51     List,
  52     Literal,
  53     Optional,
  54     Sequence,
  55     Tuple,
  56 )
  57 from uuid import uuid4
  58
  59 from pyutils import list_utils
  60
  61 logger = logging.getLogger(__name__)
  62
  63 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  64
  65 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  66
  67 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  68
  69 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  70
  71 URLS_RAW_STRING = (
  72     r"([a-z-]+://)"  # scheme
  73     r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
  74     r"(www\.)?"  # www.
  75     r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
  76     r"(:\d{2,})?"  # port number
  77     r"(/[a-z\d_%+-]*)*"  # folders
  78     r"(\.[a-z\d_%+-]+)*"  # file extension
  79     r"(\?[a-z\d_+%-=]*)?"  # query string
  80     r"(#\S*)?"  # hash
  81 )
  82
  83 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
  84
  85 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
  86
  87 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
  88
  89 EMAILS_RAW_STRING = (
  90     r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
  91 )
  92
  93 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
  94
  95 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  96
  97 CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")
  98
  99 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
 100
 101 SNAKE_CASE_TEST_RE = re.compile(
 102     r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
 103 )
 104
 105 SNAKE_CASE_TEST_DASH_RE = re.compile(
 106     r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
 107 )
 108
 109 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
 110
 111 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
 112
 113 CREDIT_CARDS = {
 114     "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
 115     "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
 116     "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
 117     "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
 118     "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
 119     "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
 120 }
 121
 122 JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)
 123
 124 UUID_RE = re.compile(
 125     r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
 126 )
 127
 128 UUID_HEX_OK_RE = re.compile(
 129     r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
 130     re.IGNORECASE,
 131 )
 132
 133 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
 134
 135 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
 136
 137 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
 138
 139 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
 140
 141 MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
 142
 143 ANYWHERE_MAC_ADDRESS_RE = re.compile(
 144     r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
 145 )
 146
 147 WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)
 148
 149 HTML_RE = re.compile(
 150     r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
 151     re.IGNORECASE | re.MULTILINE | re.DOTALL,
 152 )
 153
 154 HTML_TAG_ONLY_RE = re.compile(
 155     r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
 156     re.IGNORECASE | re.MULTILINE | re.DOTALL,
 157 )
 158
 159 SPACES_RE = re.compile(r"\s")
 160
 161 NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)
 162
 163 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 164
 165 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 166
 167 NUM_SUFFIXES = {
 168     "Pb": (1024**5),
 169     "P": (1024**5),
 170     "Tb": (1024**4),
 171     "T": (1024**4),
 172     "Gb": (1024**3),
 173     "G": (1024**3),
 174     "Mb": (1024**2),
 175     "M": (1024**2),
 176     "Kb": (1024**1),
 177     "K": (1024**1),
 178 }
 179
 180 units = [
 181     "zero",
 182     "one",
 183     "two",
 184     "three",
 185     "four",
 186     "five",
 187     "six",
 188     "seven",
 189     "eight",
 190     "nine",
 191     "ten",
 192     "eleven",
 193     "twelve",
 194     "thirteen",
 195     "fourteen",
 196     "fifteen",
 197     "sixteen",
 198     "seventeen",
 199     "eighteen",
 200     "nineteen",
 201 ]
 202
 203 tens = [
 204     "",
 205     "",
 206     "twenty",
 207     "thirty",
 208     "forty",
 209     "fifty",
 210     "sixty",
 211     "seventy",
 212     "eighty",
 213     "ninety",
 214 ]
 215
 216 scales = ["hundred", "thousand", "million", "billion", "trillion"]
 217
 218 NUM_WORDS = {}
 219 NUM_WORDS["and"] = (1, 0)
 220 for i, word in enumerate(units):
 221     NUM_WORDS[word] = (1, i)
 222 for i, word in enumerate(tens):
 223     NUM_WORDS[word] = (1, i * 10)
 224 for i, word in enumerate(scales):
 225     NUM_WORDS[word] = (10 ** (i * 3 or 2), 0)
 226 NUM_WORDS['score'] = (20, 0)
 227
 228
 229 def is_none_or_empty(in_str: Optional[str]) -> bool:
 230     """
 231     Args:
 232         in_str: the string to test
 233
 234     Returns:
 235         True if the input string is either None or an empty string,
 236         False otherwise.
 237
 238     >>> is_none_or_empty("")
 239     True
 240     >>> is_none_or_empty(None)
 241     True
 242     >>> is_none_or_empty("   \t   ")
 243     True
 244     >>> is_none_or_empty('Test')
 245     False
 246     """
 247     return in_str is None or len(in_str.strip()) == 0
 248
 249
 250 def is_string(obj: Any) -> bool:
 251     """
 252     Args:
 253         in_str: the object to test
 254
 255     Returns:
 256         True if the object is a string and False otherwise.
 257
 258     >>> is_string('test')
 259     True
 260     >>> is_string(123)
 261     False
 262     >>> is_string(100.3)
 263     False
 264     >>> is_string([1, 2, 3])
 265     False
 266     """
 267     return isinstance(obj, str)
 268
 269
 270 def is_empty_string(in_str: Any) -> bool:
 271     """
 272     Args:
 273         in_str: the string to test
 274
 275     Returns:
 276         True if the string is empty and False otherwise.
 277     """
 278     return is_empty(in_str)
 279
 280
 281 def is_empty(in_str: Any) -> bool:
 282     """
 283     Args:
 284         in_str: the string to test
 285
 286     Returns:
 287         True if the string is empty and false otherwise.
 288
 289     >>> is_empty('')
 290     True
 291     >>> is_empty('    \t\t    ')
 292     True
 293     >>> is_empty('test')
 294     False
 295     >>> is_empty(100.88)
 296     False
 297     >>> is_empty([1, 2, 3])
 298     False
 299     """
 300     return is_string(in_str) and in_str.strip() == ""
 301
 302
 303 def is_full_string(in_str: Any) -> bool:
 304     """
 305     Args:
 306         in_str: the object to test
 307
 308     Returns:
 309         True if the object is a string and is not empty ('') and
 310         is not only composed of whitespace.
 311
 312     >>> is_full_string('test!')
 313     True
 314     >>> is_full_string('')
 315     False
 316     >>> is_full_string('      ')
 317     False
 318     >>> is_full_string(100.999)
 319     False
 320     >>> is_full_string({"a": 1, "b": 2})
 321     False
 322     """
 323     return is_string(in_str) and in_str.strip() != ""
 324
 325
 326 def is_number(in_str: str) -> bool:
 327     """
 328     Args:
 329         in_str: the string to test
 330
 331     Returns:
 332         True if the string contains a valid numberic value and
 333         False otherwise.
 334
 335     >>> is_number(100.5)
 336     Traceback (most recent call last):
 337     ...
 338     ValueError: 100.5
 339     >>> is_number("100.5")
 340     True
 341     >>> is_number("test")
 342     False
 343     >>> is_number("99")
 344     True
 345     >>> is_number([1, 2, 3])
 346     Traceback (most recent call last):
 347     ...
 348     ValueError: [1, 2, 3]
 349     """
 350     if not is_string(in_str):
 351         raise ValueError(in_str)
 352     return NUMBER_RE.match(in_str) is not None
 353
 354
 355 def is_integer_number(in_str: str) -> bool:
 356     """
 357     Args:
 358         in_str: the string to test
 359
 360     Returns:
 361         True if the string contains a valid (signed or unsigned,
 362         decimal, hex, or octal, regular or scientific) integral
 363         expression and False otherwise.
 364
 365     >>> is_integer_number('42')
 366     True
 367     >>> is_integer_number('42.0')
 368     False
 369     """
 370     return (
 371         (is_number(in_str) and "." not in in_str)
 372         or is_hexidecimal_integer_number(in_str)
 373         or is_octal_integer_number(in_str)
 374         or is_binary_integer_number(in_str)
 375     )
 376
 377
 378 def is_hexidecimal_integer_number(in_str: str) -> bool:
 379     """
 380     Args:
 381         in_str: the string to test
 382
 383     Returns:
 384         True if the string is a hex integer number and False otherwise.
 385
 386     >>> is_hexidecimal_integer_number('0x12345')
 387     True
 388     >>> is_hexidecimal_integer_number('0x1A3E')
 389     True
 390     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 391     False
 392     >>> is_hexidecimal_integer_number('-0xff')
 393     True
 394     >>> is_hexidecimal_integer_number('test')
 395     False
 396     >>> is_hexidecimal_integer_number(12345)  # Not a string
 397     Traceback (most recent call last):
 398     ...
 399     ValueError: 12345
 400     >>> is_hexidecimal_integer_number(101.4)
 401     Traceback (most recent call last):
 402     ...
 403     ValueError: 101.4
 404     >>> is_hexidecimal_integer_number(0x1A3E)
 405     Traceback (most recent call last):
 406     ...
 407     ValueError: 6718
 408     """
 409     if not is_string(in_str):
 410         raise ValueError(in_str)
 411     return HEX_NUMBER_RE.match(in_str) is not None
 412
 413
 414 def is_octal_integer_number(in_str: str) -> bool:
 415     """
 416     Args:
 417         in_str: the string to test
 418
 419     Returns:
 420         True if the string is a valid octal integral number and False otherwise.
 421
 422     >>> is_octal_integer_number('0o777')
 423     True
 424     >>> is_octal_integer_number('-0O115')
 425     True
 426     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 427     False
 428     >>> is_octal_integer_number('7777')  # Needs 0o
 429     False
 430     >>> is_octal_integer_number('test')
 431     False
 432     """
 433     if not is_string(in_str):
 434         raise ValueError(in_str)
 435     return OCT_NUMBER_RE.match(in_str) is not None
 436
 437
 438 def is_binary_integer_number(in_str: str) -> bool:
 439     """
 440     Args:
 441         in_str: the string to test
 442
 443     Returns:
 444         True if the string contains a binary integral number and False otherwise.
 445
 446     >>> is_binary_integer_number('0b10111')
 447     True
 448     >>> is_binary_integer_number('-0b111')
 449     True
 450     >>> is_binary_integer_number('0B10101')
 451     True
 452     >>> is_binary_integer_number('0b10102')
 453     False
 454     >>> is_binary_integer_number('0xFFF')
 455     False
 456     >>> is_binary_integer_number('test')
 457     False
 458     """
 459     if not is_string(in_str):
 460         raise ValueError(in_str)
 461     return BIN_NUMBER_RE.match(in_str) is not None
 462
 463
 464 def to_int(in_str: str) -> int:
 465     """
 466     Args:
 467         in_str: the string to convert
 468
 469     Returns:
 470         The integral value of the string or raises on error.
 471
 472     >>> to_int('1234')
 473     1234
 474     >>> to_int('test')
 475     Traceback (most recent call last):
 476     ...
 477     ValueError: invalid literal for int() with base 10: 'test'
 478     """
 479     if not is_string(in_str):
 480         raise ValueError(in_str)
 481     if is_binary_integer_number(in_str):
 482         return int(in_str, 2)
 483     if is_octal_integer_number(in_str):
 484         return int(in_str, 8)
 485     if is_hexidecimal_integer_number(in_str):
 486         return int(in_str, 16)
 487     return int(in_str)
 488
 489
 490 def number_string_to_integer(in_str: str) -> int:
 491     """Convert a string containing a written-out number into an int.
 492
 493     >>> number_string_to_integer("one hundred fifty two")
 494     152
 495
 496     >>> number_string_to_integer("ten billion two hundred million fifty four thousand three")
 497     10200054003
 498
 499     >>> number_string_to_integer("four-score and 7")
 500     87
 501
 502     >>> number_string_to_integer("fifty xyzzy three")
 503     Traceback (most recent call last):
 504     ...
 505     ValueError: Unknown word: xyzzy
 506     """
 507     if type(in_str) == int:
 508         return in_str
 509
 510     current = result = 0
 511     in_str = in_str.replace('-', ' ')
 512     for word in in_str.split():
 513         if word not in NUM_WORDS:
 514             if is_integer_number(word):
 515                 current += int(word)
 516                 continue
 517             else:
 518                 raise ValueError("Unknown word: " + word)
 519         scale, increment = NUM_WORDS[word]
 520         current = current * scale + increment
 521         if scale > 100:
 522             result += current
 523             current = 0
 524     return result + current
 525
 526
 527 def is_decimal_number(in_str: str) -> bool:
 528     """
 529     Args:
 530         in_str: the string to check
 531
 532     Returns:
 533         True if the given string represents a decimal or False
 534         otherwise.  A decimal may be signed or unsigned or use
 535         a "scientific notation".
 536
 537     .. note::
 538         We do not consider integers without a decimal point
 539         to be decimals; they return False (see example).
 540
 541     >>> is_decimal_number('42.0')
 542     True
 543     >>> is_decimal_number('42')
 544     False
 545     """
 546     return is_number(in_str) and "." in in_str
 547
 548
 549 def strip_escape_sequences(in_str: str) -> str:
 550     """
 551     Args:
 552         in_str: the string to strip of escape sequences.
 553
 554     Returns:
 555         in_str with escape sequences removed.
 556
 557     .. note::
 558         What is considered to be an "escape sequence" is defined
 559         by a regular expression.  While this gets common ones,
 560         there may exist valid sequences that it doesn't match.
 561
 562     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 563     'this is a test!'
 564     """
 565     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 566     return in_str
 567
 568
 569 def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 570     """
 571     Args:
 572         in_str: string or number to which to add thousands separator(s)
 573         separator_char: the separator character to add (defaults to comma)
 574         places: add a separator every N places (defaults to three)
 575
 576     Returns:
 577         A numeric string with thousands separators added appropriately.
 578
 579     >>> add_thousands_separator('12345678')
 580     '12,345,678'
 581     >>> add_thousands_separator(12345678)
 582     '12,345,678'
 583     >>> add_thousands_separator(12345678.99)
 584     '12,345,678.99'
 585     >>> add_thousands_separator('test')
 586     Traceback (most recent call last):
 587     ...
 588     ValueError: test
 589
 590     """
 591     if isinstance(in_str, numbers.Number):
 592         in_str = f'{in_str}'
 593     if is_number(in_str):
 594         return _add_thousands_separator(
 595             in_str, separator_char=separator_char, places=places
 596         )
 597     raise ValueError(in_str)
 598
 599
 600 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 601     decimal_part = ""
 602     if '.' in in_str:
 603         (in_str, decimal_part) = in_str.split('.')
 604     tmp = [iter(in_str[::-1])] * places
 605     ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 606     if len(decimal_part) > 0:
 607         ret += '.'
 608         ret += decimal_part
 609     return ret
 610
 611
 612 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 613     """
 614     Args:
 615         in_str: the string to test
 616         allowed_schemes: an optional list of allowed schemes (e.g.
 617             ['http', 'https', 'ftp'].  If passed, only URLs that
 618             begin with the one of the schemes passed will be considered
 619             to be valid.  Otherwise, any scheme:// will be considered
 620             valid.
 621
 622     Returns:
 623         True if in_str contains a valid URL and False otherwise.
 624
 625     >>> is_url('http://www.mysite.com')
 626     True
 627     >>> is_url('https://mysite.com')
 628     True
 629     >>> is_url('.mysite.com')
 630     False
 631     >>> is_url('scheme://username:[email protected]:8042/folder/subfolder/file.extension?param=value&param2=value2#hash')
 632     True
 633     """
 634     if not is_full_string(in_str):
 635         return False
 636
 637     valid = URL_RE.match(in_str) is not None
 638
 639     if allowed_schemes:
 640         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 641     return valid
 642
 643
 644 def is_email(in_str: Any) -> bool:
 645     """
 646     Args:
 647         in_str: the email address to check
 648
 649     Returns: True if the in_str contains a valid email (as defined by
 650         https://tools.ietf.org/html/rfc3696#section-3) or False
 651         otherwise.
 652
 653     >>> is_email('[email protected]')
 654     True
 655     >>> is_email('@gmail.com')
 656     False
 657     """
 658     if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
 659         return False
 660
 661     try:
 662         # we expect 2 tokens, one before "@" and one after, otherwise
 663         # we have an exception and the email is not valid.
 664         head, tail = in_str.split("@")
 665
 666         # head's size must be <= 64, tail <= 255, head must not start
 667         # with a dot or contain multiple consecutive dots.
 668         if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
 669             return False
 670
 671         # removes escaped spaces, so that later on the test regex will
 672         # accept the string.
 673         head = head.replace("\\ ", "")
 674         if head.startswith('"') and head.endswith('"'):
 675             head = head.replace(" ", "")[1:-1]
 676         return EMAIL_RE.match(head + "@" + tail) is not None
 677
 678     except ValueError:
 679         # borderline case in which we have multiple "@" signs but the
 680         # head part is correctly escaped.
 681         if ESCAPED_AT_SIGN.search(in_str) is not None:
 682             # replace "@" with "a" in the head
 683             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 684         return False
 685
 686
 687 def suffix_string_to_number(in_str: str) -> Optional[int]:
 688     """Takes a string like "33Gb" and converts it into a number (of bytes)
 689     like 34603008.
 690
 691     Args:
 692         in_str: the string with a suffix to be interpreted and removed.
 693
 694     Returns:
 695         An integer number of bytes or None to indicate an error.
 696
 697     >>> suffix_string_to_number('1Mb')
 698     1048576
 699     >>> suffix_string_to_number('13.1Gb')
 700     14066017894
 701     """
 702
 703     def suffix_capitalize(s: str) -> str:
 704         if len(s) == 1:
 705             return s.upper()
 706         elif len(s) == 2:
 707             return f"{s[0].upper()}{s[1].lower()}"
 708         return suffix_capitalize(s[0:1])
 709
 710     if is_string(in_str):
 711         if is_integer_number(in_str):
 712             return to_int(in_str)
 713         suffixes = [in_str[-2:], in_str[-1:]]
 714         rest = [in_str[:-2], in_str[:-1]]
 715         for x in range(len(suffixes)):
 716             s = suffixes[x]
 717             s = suffix_capitalize(s)
 718             multiplier = NUM_SUFFIXES.get(s, None)
 719             if multiplier is not None:
 720                 r = rest[x]
 721                 if is_integer_number(r):
 722                     return to_int(r) * multiplier
 723                 if is_decimal_number(r):
 724                     return int(float(r) * multiplier)
 725     return None
 726
 727
 728 def number_to_suffix_string(num: int) -> Optional[str]:
 729     """Take a number (of bytes) and returns a string like "43.8Gb".
 730
 731     Args:
 732         num: an integer number of bytes
 733
 734     Returns:
 735         A string with a suffix representing num bytes concisely or
 736         None to indicate an error.
 737
 738     >>> number_to_suffix_string(14066017894)
 739     '13.1Gb'
 740     >>> number_to_suffix_string(1024 * 1024)
 741     '1.0Mb'
 742     """
 743     d = 0.0
 744     suffix = None
 745     for (sfx, size) in NUM_SUFFIXES.items():
 746         if num >= size:
 747             d = num / size
 748             suffix = sfx
 749             break
 750     if suffix is not None:
 751         return f"{d:.1f}{suffix}"
 752     else:
 753         return f'{num:d}'
 754
 755
 756 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 757     """
 758     Args:
 759         in_str: a string to check
 760         card_type: if provided, contains the card type to validate
 761             with.  Otherwise, all known credit card number types will
 762             be accepted.
 763
 764             Supported card types are the following:
 765
 766             * VISA
 767             * MASTERCARD
 768             * AMERICAN_EXPRESS
 769             * DINERS_CLUB
 770             * DISCOVER
 771             * JCB
 772
 773     Returns:
 774         True if in_str is a valid credit card number.
 775     """
 776     if not is_full_string(in_str):
 777         return False
 778
 779     if card_type is not None:
 780         if card_type not in CREDIT_CARDS:
 781             raise KeyError(
 782                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 783             )
 784         return CREDIT_CARDS[card_type].match(in_str) is not None
 785     for c in CREDIT_CARDS:
 786         if CREDIT_CARDS[c].match(in_str) is not None:
 787             return True
 788     return False
 789
 790
 791 def is_camel_case(in_str: Any) -> bool:
 792     """
 793     Args:
 794         in_str: the string to test
 795
 796     Returns:
 797         True if the string is formatted as camel case and False otherwise.
 798         A string is considered camel case when:
 799
 800         * it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 801         * it contains both lowercase and uppercase letters
 802         * it does not start with a number
 803     """
 804     return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 805
 806
 807 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 808     """
 809     Args:
 810         in_str: the string to test
 811
 812     Returns: True if the string is snake case and False otherwise.  A
 813         string is considered snake case when:
 814
 815         * it's composed only by lowercase/uppercase letters and digits
 816         * it contains at least one underscore (or provided separator)
 817         * it does not start with a number
 818
 819     >>> is_snake_case('this_is_a_test')
 820     True
 821     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 822     True
 823     >>> is_snake_case('this-is-a-test')
 824     False
 825     >>> is_snake_case('this-is-a-test', separator='-')
 826     True
 827     """
 828     if is_full_string(in_str):
 829         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 830         re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 831         r = re_map.get(
 832             separator,
 833             re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
 834         )
 835         return r.match(in_str) is not None
 836     return False
 837
 838
 839 def is_json(in_str: Any) -> bool:
 840     """
 841     Args:
 842         in_str: the string to test
 843
 844     Returns:
 845         True if the in_str contains valid JSON and False otherwise.
 846
 847     >>> is_json('{"name": "Peter"}')
 848     True
 849     >>> is_json('[1, 2, 3]')
 850     True
 851     >>> is_json('{nope}')
 852     False
 853     """
 854     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 855         try:
 856             return isinstance(json.loads(in_str), (dict, list))
 857         except (TypeError, ValueError, OverflowError):
 858             pass
 859     return False
 860
 861
 862 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 863     """
 864     Args:
 865         in_str: the string to test
 866
 867     Returns:
 868         True if the in_str contains a valid UUID and False otherwise.
 869
 870     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 871     True
 872     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 873     False
 874     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 875     True
 876     """
 877     # string casting is used to allow UUID itself as input data type
 878     s = str(in_str)
 879     if allow_hex:
 880         return UUID_HEX_OK_RE.match(s) is not None
 881     return UUID_RE.match(s) is not None
 882
 883
 884 def is_ip_v4(in_str: Any) -> bool:
 885     """
 886     Args:
 887         in_str: the string to test
 888
 889     Returns:
 890         True if in_str contains a valid IPv4 address and False otherwise.
 891
 892     >>> is_ip_v4('255.200.100.75')
 893     True
 894     >>> is_ip_v4('nope')
 895     False
 896     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 897     False
 898     """
 899     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 900         return False
 901
 902     # checks that each entry in the ip is in the valid range (0 to 255)
 903     for token in in_str.split("."):
 904         if not 0 <= int(token) <= 255:
 905             return False
 906     return True
 907
 908
 909 def extract_ip_v4(in_str: Any) -> Optional[str]:
 910     """
 911     Args:
 912         in_str: the string to extract an IPv4 address from.
 913
 914     Returns:
 915         The first extracted IPv4 address from in_str or None if
 916         none were found or an error occurred.
 917
 918     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 919     '127.0.0.1'
 920     >>> extract_ip_v4('Your mom dresses you funny.')
 921     """
 922     if not is_full_string(in_str):
 923         return None
 924     m = ANYWHERE_IP_V4_RE.search(in_str)
 925     if m is not None:
 926         return m.group(0)
 927     return None
 928
 929
 930 def is_ip_v6(in_str: Any) -> bool:
 931     """
 932     Args:
 933         in_str: the string to test.
 934
 935     Returns:
 936         True if in_str contains a valid IPv6 address and False otherwise.
 937
 938     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 939     True
 940     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 941     False
 942     """
 943     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 944
 945
 946 def extract_ip_v6(in_str: Any) -> Optional[str]:
 947     """
 948     Args:
 949         in_str: the string from which to extract an IPv6 address.
 950
 951     Returns:
 952         The first IPv6 address found in in_str or None if no address
 953         was found or an error occurred.
 954
 955     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 956     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 957     >>> extract_ip_v6("(and she's ugly too, btw)")
 958     """
 959     if not is_full_string(in_str):
 960         return None
 961     m = ANYWHERE_IP_V6_RE.search(in_str)
 962     if m is not None:
 963         return m.group(0)
 964     return None
 965
 966
 967 def is_ip(in_str: Any) -> bool:
 968     """
 969     Args:
 970         in_str: the string to test.
 971
 972     Returns:
 973         True if in_str contains a valid IP address (either IPv4 or
 974         IPv6).
 975
 976     >>> is_ip('255.200.100.75')
 977     True
 978     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
 979     True
 980     >>> is_ip('1.2.3')
 981     False
 982     >>> is_ip('1.2.3.999')
 983     False
 984     """
 985     return is_ip_v6(in_str) or is_ip_v4(in_str)
 986
 987
 988 def extract_ip(in_str: Any) -> Optional[str]:
 989     """
 990     Args:
 991         in_str: the string from which to extract in IP address.
 992
 993     Returns:
 994         The first IP address (IPv4 or IPv6) found in in_str or
 995         None to indicate none found or an error condition.
 996
 997     >>> extract_ip('Attacker: 255.200.100.75')
 998     '255.200.100.75'
 999     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
1000     '2001:db8:85a3:0000:0000:8a2e:370:7334'
1001     >>> extract_ip('1.2.3')
1002     """
1003     ip = extract_ip_v4(in_str)
1004     if ip is None:
1005         ip = extract_ip_v6(in_str)
1006     return ip
1007
1008
1009 def is_mac_address(in_str: Any) -> bool:
1010     """
1011     Args:
1012         in_str: the string to test
1013
1014     Returns:
1015         True if in_str is a valid MAC address False otherwise.
1016
1017     >>> is_mac_address("34:29:8F:12:0D:2F")
1018     True
1019     >>> is_mac_address('34:29:8f:12:0d:2f')
1020     True
1021     >>> is_mac_address('34-29-8F-12-0D-2F')
1022     True
1023     >>> is_mac_address("test")
1024     False
1025     """
1026     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
1027
1028
1029 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
1030     """
1031     Args:
1032         in_str: the string from which to extract a MAC address.
1033
1034     Returns:
1035         The first MAC address found in in_str or None to indicate no
1036         match or an error.
1037
1038     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
1039     '34:29:8F:12:0D:2F'
1040
1041     >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
1042     'd8:5d:e2:34:54:86'
1043     """
1044     if not is_full_string(in_str):
1045         return None
1046     in_str.strip()
1047     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
1048     if m is not None:
1049         mac = m.group(0)
1050         mac.replace(":", separator)
1051         mac.replace("-", separator)
1052         return mac
1053     return None
1054
1055
1056 def is_slug(in_str: Any, separator: str = "-") -> bool:
1057     """
1058     Args:
1059         in_str: string to test
1060
1061     Returns:
1062         True if in_str is a slug string and False otherwise.
1063
1064     >>> is_slug('my-blog-post-title')
1065     True
1066     >>> is_slug('My blog post title')
1067     False
1068     """
1069     if not is_full_string(in_str):
1070         return False
1071     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
1072     return re.match(rex, in_str) is not None
1073
1074
1075 def contains_html(in_str: str) -> bool:
1076     """
1077     Args:
1078         in_str: the string to check for tags in
1079
1080     Returns:
1081         True if the given string contains HTML/XML tags and False
1082         otherwise.
1083
1084     .. warning::
1085         By design, this function matches ANY type of tag, so don't expect
1086         to use it as an HTML validator.  It's a quick sanity check at
1087         best.  See something like BeautifulSoup for a more full-featuered
1088         HTML parser.
1089
1090     >>> contains_html('my string is <strong>bold</strong>')
1091     True
1092     >>> contains_html('my string is not bold')
1093     False
1094
1095     """
1096     if not is_string(in_str):
1097         raise ValueError(in_str)
1098     return HTML_RE.search(in_str) is not None
1099
1100
1101 def words_count(in_str: str) -> int:
1102     """
1103     Args:
1104         in_str: the string to count words in
1105
1106     Returns:
1107         The number of words contained in the given string.
1108
1109     .. note::
1110
1111         This method is "smart" in that it does consider only sequences
1112         of one or more letter and/or numbers to be "words".  Thus a
1113         string like this: "! @ # % ... []" will return zero.  Moreover
1114         it is aware of punctuation, so the count for a string like
1115         "one,two,three.stop" will be 4 not 1 (even if there are no spaces
1116         in the string).
1117
1118     >>> words_count('hello world')
1119     2
1120     >>> words_count('one,two,three.stop')
1121     4
1122     """
1123     if not is_string(in_str):
1124         raise ValueError(in_str)
1125     return len(WORDS_COUNT_RE.findall(in_str))
1126
1127
1128 def word_count(in_str: str) -> int:
1129     """
1130     Args:
1131         in_str: the string to count words in
1132
1133     Returns:
1134         The number of words contained in the given string.
1135
1136     .. note::
1137
1138         This method is "smart" in that it does consider only sequences
1139         of one or more letter and/or numbers to be "words".  Thus a
1140         string like this: "! @ # % ... []" will return zero.  Moreover
1141         it is aware of punctuation, so the count for a string like
1142         "one,two,three.stop" will be 4 not 1 (even if there are no spaces
1143         in the string).
1144
1145     >>> word_count('hello world')
1146     2
1147     >>> word_count('one,two,three.stop')
1148     4
1149     """
1150     return words_count(in_str)
1151
1152
1153 def generate_uuid(omit_dashes: bool = False) -> str:
1154     """
1155     Args:
1156         omit_dashes: should we omit the dashes in the generated UUID?
1157
1158     Returns:
1159         A generated UUID string (using `uuid.uuid4()`) with or without
1160         dashes per the omit_dashes arg.
1161
1162     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
1163     generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
1164     """
1165     uid = uuid4()
1166     if omit_dashes:
1167         return uid.hex
1168     return str(uid)
1169
1170
1171 def generate_random_alphanumeric_string(size: int) -> str:
1172     """
1173     Args:
1174         size: number of characters to generate
1175
1176     Returns:
1177         A string of the specified size containing random characters
1178         (uppercase/lowercase ascii letters and digits).
1179
1180     >>> random.seed(22)
1181     >>> generate_random_alphanumeric_string(9)
1182     '96ipbNClS'
1183     """
1184     if size < 1:
1185         raise ValueError("size must be >= 1")
1186     chars = string.ascii_letters + string.digits
1187     buffer = [random.choice(chars) for _ in range(size)]
1188     return from_char_list(buffer)
1189
1190
1191 def reverse(in_str: str) -> str:
1192     """
1193     Args:
1194         in_str: the string to reverse
1195
1196     Returns:
1197         The reversed (chracter by character) string.
1198
1199     >>> reverse('test')
1200     'tset'
1201     """
1202     if not is_string(in_str):
1203         raise ValueError(in_str)
1204     return in_str[::-1]
1205
1206
1207 def camel_case_to_snake_case(in_str, *, separator="_"):
1208     """
1209     Args:
1210         in_str: the camel case string to convert
1211
1212     Returns:
1213         A snake case string equivalent to the camel case input or the
1214         original string if it is not a valid camel case string or some
1215         other error occurs.
1216
1217     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
1218     'mac_address_extractor_factory'
1219     >>> camel_case_to_snake_case('Luke Skywalker')
1220     'Luke Skywalker'
1221     """
1222     if not is_string(in_str):
1223         raise ValueError(in_str)
1224     if not is_camel_case(in_str):
1225         return in_str
1226     return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower()
1227
1228
1229 def snake_case_to_camel_case(
1230     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
1231 ) -> str:
1232     """
1233     Args:
1234         in_str: the snake case string to convert
1235
1236     Returns:
1237         A camel case string that is equivalent to the snake case string
1238         provided or the original string back again if it is not valid
1239         snake case or another error occurs.
1240
1241     >>> snake_case_to_camel_case('this_is_a_test')
1242     'ThisIsATest'
1243     >>> snake_case_to_camel_case('Han Solo')
1244     'Han Solo'
1245     """
1246     if not is_string(in_str):
1247         raise ValueError(in_str)
1248     if not is_snake_case(in_str, separator=separator):
1249         return in_str
1250     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
1251     if not upper_case_first:
1252         tokens[0] = tokens[0].lower()
1253     return from_char_list(tokens)
1254
1255
1256 def to_char_list(in_str: str) -> List[str]:
1257     """
1258     Args:
1259         in_str: the string to split into a char list
1260
1261     Returns:
1262         A list of strings of length one each.
1263
1264     >>> to_char_list('test')
1265     ['t', 'e', 's', 't']
1266     """
1267     if not is_string(in_str):
1268         return []
1269     return list(in_str)
1270
1271
1272 def from_char_list(in_list: List[str]) -> str:
1273     """
1274     Args:
1275         in_list: A list of characters to convert into a string.
1276
1277     Returns:
1278         The string resulting from gluing the characters in in_list
1279         together.
1280
1281     >>> from_char_list(['t', 'e', 's', 't'])
1282     'test'
1283     """
1284     return "".join(in_list)
1285
1286
1287 def shuffle(in_str: str) -> Optional[str]:
1288     """
1289     Args:
1290         in_str: a string to shuffle randomly by character
1291
1292     Returns:
1293         A new string containing same chars of the given one but in
1294         a randomized order.  Note that in rare cases this could result
1295         in the same original string as no check is done.  Returns
1296         None to indicate error conditions.
1297
1298     >>> random.seed(22)
1299     >>> shuffle('awesome')
1300     'meosaew'
1301     """
1302     if not is_string(in_str):
1303         return None
1304     chars = to_char_list(in_str)
1305     random.shuffle(chars)
1306     return from_char_list(chars)
1307
1308
1309 def scramble(in_str: str) -> Optional[str]:
1310     """
1311     Args:
1312         in_str: a string to shuffle randomly by character
1313
1314     Returns:
1315         A new string containing same chars of the given one but in
1316         a randomized order.  Note that in rare cases this could result
1317         in the same original string as no check is done.  Returns
1318         None to indicate error conditions.
1319
1320     >>> random.seed(22)
1321     >>> scramble('awesome')
1322     'meosaew'
1323     """
1324     return shuffle(in_str)
1325
1326
1327 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
1328     """
1329     Args:
1330         in_str: the string to strip tags from
1331         keep_tag_content: should we keep the inner contents of tags?
1332
1333     Returns:
1334         A string with all HTML tags removed (optionally with tag contents
1335         preserved).
1336
1337     .. note::
1338         This method uses simple regular expressions to strip tags and is
1339         not a full fledged HTML parser by any means.  Consider using
1340         something like BeautifulSoup if your needs are more than this
1341         simple code can fulfill.
1342
1343     >>> strip_html('test: <a href="foo/bar">click here</a>')
1344     'test: '
1345     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
1346     'test: click here'
1347     """
1348     if not is_string(in_str):
1349         raise ValueError(in_str)
1350     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
1351     return r.sub("", in_str)
1352
1353
1354 def asciify(in_str: str) -> str:
1355     """
1356     Args:
1357         in_str: the string to asciify.
1358
1359     Returns:
1360         An output string roughly equivalent to the original string
1361         where all content to are ascii-only.  This is accomplished
1362         by translating all non-ascii chars into their closest possible
1363         ASCII representation (eg: ó -> o, Ë -> E, ç -> c...).
1364
1365     .. warning::
1366         Some chars may be lost if impossible to translate.
1367
1368     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
1369     'eeuuooaaeynAAACIINOE'
1370     """
1371     if not is_string(in_str):
1372         raise ValueError(in_str)
1373
1374     # "NFKD" is the algorithm which is able to successfully translate
1375     # the most of non-ascii chars.
1376     normalized = unicodedata.normalize("NFKD", in_str)
1377
1378     # encode string forcing ascii and ignore any errors
1379     # (unrepresentable chars will be stripped out)
1380     ascii_bytes = normalized.encode("ascii", "ignore")
1381
1382     # turns encoded bytes into an utf-8 string
1383     return ascii_bytes.decode("utf-8")
1384
1385
1386 def slugify(in_str: str, *, separator: str = "-") -> str:
1387     """
1388     Args:
1389         in_str: the string to slugify
1390         separator: the character to use during sligification (default
1391             is a dash)
1392
1393     Returns:
1394         The converted string.  The returned string has the following properties:
1395
1396         * it has no spaces
1397         * all letters are in lower case
1398         * all punctuation signs and non alphanumeric chars are removed
1399         * words are divided using provided separator
1400         * all chars are encoded as ascii (by using :meth:`asciify`)
1401         * is safe for URL
1402
1403     >>> slugify('Top 10 Reasons To Love Dogs!!!')
1404     'top-10-reasons-to-love-dogs'
1405     >>> slugify('Mönstér Mägnët')
1406     'monster-magnet'
1407     """
1408     if not is_string(in_str):
1409         raise ValueError(in_str)
1410
1411     # replace any character that is NOT letter or number with spaces
1412     out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()
1413
1414     # replace spaces with join sign
1415     out = SPACES_RE.sub(separator, out)
1416
1417     # normalize joins (remove duplicates)
1418     out = re.sub(re.escape(separator) + r"+", separator, out)
1419     return asciify(out)
1420
1421
1422 def to_bool(in_str: str) -> bool:
1423     """
1424     Args:
1425         in_str: the string to convert to boolean
1426
1427     Returns:
1428         A boolean equivalent of the original string based on its contents.
1429         All conversion is case insensitive.  A positive boolean (True) is
1430         returned if the string value is any of the following:
1431
1432         * "true"
1433         * "t"
1434         * "1"
1435         * "yes"
1436         * "y"
1437         * "on"
1438
1439         Otherwise False is returned.
1440
1441     >>> to_bool('True')
1442     True
1443
1444     >>> to_bool('1')
1445     True
1446
1447     >>> to_bool('yes')
1448     True
1449
1450     >>> to_bool('no')
1451     False
1452
1453     >>> to_bool('huh?')
1454     False
1455
1456     >>> to_bool('on')
1457     True
1458     """
1459     if not is_string(in_str):
1460         raise ValueError(in_str)
1461     return in_str.lower() in ("true", "1", "yes", "y", "t", "on")
1462
1463
1464 def to_date(in_str: str) -> Optional[datetime.date]:
1465     """
1466     Args:
1467         in_str: the string to convert into a date
1468
1469     Returns:
1470         The datetime.date the string contained or None to indicate
1471         an error.  This parser is relatively clever; see
1472         :class:`datetimez.dateparse_utils` docs for details.
1473
1474     >>> to_date('9/11/2001')
1475     datetime.date(2001, 9, 11)
1476     >>> to_date('xyzzy')
1477     """
1478     import pyutils.datetimez.dateparse_utils as du
1479
1480     try:
1481         d = du.DateParser()  # type: ignore
1482         d.parse(in_str)
1483         return d.get_date()
1484     except du.ParseException:  # type: ignore
1485         msg = f'Unable to parse date {in_str}.'
1486         logger.warning(msg)
1487     return None
1488
1489
1490 def extract_date(in_str: Any) -> Optional[datetime.datetime]:
1491     """Finds and extracts a date from the string, if possible.
1492
1493     Args:
1494         in_str: the string to extract a date from
1495
1496     Returns:
1497         a datetime if date was found, otherwise None
1498
1499     >>> extract_date("filename.txt    dec 13, 2022")
1500     datetime.datetime(2022, 12, 13, 0, 0)
1501
1502     >>> extract_date("Dear Santa, please get me a pony.")
1503
1504     """
1505     import itertools
1506
1507     import pyutils.datetimez.dateparse_utils as du
1508
1509     d = du.DateParser()  # type: ignore
1510     chunks = in_str.split()
1511     for ngram in itertools.chain(
1512         list_utils.ngrams(chunks, 5),
1513         list_utils.ngrams(chunks, 4),
1514         list_utils.ngrams(chunks, 3),
1515         list_utils.ngrams(chunks, 2),
1516     ):
1517         try:
1518             expr = " ".join(ngram)
1519             logger.debug(f"Trying {expr}")
1520             if d.parse(expr):
1521                 return d.get_datetime()
1522         except du.ParseException:  # type: ignore
1523             pass
1524     return None
1525
1526
1527 def is_valid_date(in_str: str) -> bool:
1528     """
1529     Args:
1530         in_str: the string to check
1531
1532     Returns:
1533         True if the string represents a valid date that we can recognize
1534         and False otherwise.  This parser is relatively clever; see
1535         :class:`datetimez.dateparse_utils` docs for details.
1536
1537     >>> is_valid_date('1/2/2022')
1538     True
1539     >>> is_valid_date('christmas')
1540     True
1541     >>> is_valid_date('next wednesday')
1542     True
1543     >>> is_valid_date('xyzzy')
1544     False
1545     """
1546     import pyutils.datetimez.dateparse_utils as dp
1547
1548     try:
1549         d = dp.DateParser()  # type: ignore
1550         _ = d.parse(in_str)
1551         return True
1552     except dp.ParseException:  # type: ignore
1553         msg = f'Unable to parse date {in_str}.'
1554         logger.warning(msg)
1555     return False
1556
1557
1558 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1559     """
1560     Args:
1561         in_str: string to parse into a datetime
1562
1563     Returns:
1564         A python datetime parsed from in_str or None to indicate
1565         an error.  This parser is relatively clever; see
1566         :class:`datetimez.dateparse_utils` docs for details.
1567
1568     >>> to_datetime('7/20/1969 02:56 GMT')
1569     datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
1570     """
1571     import pyutils.datetimez.dateparse_utils as dp
1572
1573     try:
1574         d = dp.DateParser()  # type: ignore
1575         dt = d.parse(in_str)
1576         if isinstance(dt, datetime.datetime):
1577             return dt
1578     except Exception:
1579         msg = f'Unable to parse datetime {in_str}.'
1580         logger.warning(msg)
1581     return None
1582
1583
1584 def valid_datetime(in_str: str) -> bool:
1585     """
1586     Args:
1587         in_str: the string to check
1588
1589     Returns:
1590         True if in_str contains a valid datetime and False otherwise.
1591         This parser is relatively clever; see
1592         :class:`datetimez.dateparse_utils` docs for details.
1593
1594     >>> valid_datetime('next wednesday at noon')
1595     True
1596     >>> valid_datetime('3 weeks ago at midnight')
1597     True
1598     >>> valid_datetime('next easter at 5:00 am')
1599     True
1600     >>> valid_datetime('sometime soon')
1601     False
1602     """
1603     _ = to_datetime(in_str)
1604     if _ is not None:
1605         return True
1606     msg = f'Unable to parse datetime {in_str}.'
1607     logger.warning(msg)
1608     return False
1609
1610
1611 def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
1612     """
1613     Args:
1614         in_str: the string to squeeze
1615         character_to_squeeze: the character to remove runs of
1616             more than one in a row (default = space)
1617
1618     Returns: A "squeezed string" where runs of more than one
1619         character_to_squeeze into one.
1620
1621     >>> squeeze(' this        is       a    test    ')
1622     ' this is a test '
1623
1624     >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
1625     'one|!|two|!|three'
1626
1627     """
1628     return re.sub(
1629         r'(' + re.escape(character_to_squeeze) + r')+',
1630         character_to_squeeze,
1631         in_str,
1632     )
1633
1634
1635 def dedent(in_str: str) -> Optional[str]:
1636     """
1637     Args:
1638         in_str: the string to dedent
1639
1640     Returns:
1641         A string with tab indentation removed or None on error.
1642
1643     .. note::
1644
1645         Inspired by analogous Scala function.
1646
1647     >>> dedent('\t\ttest\\n\t\ting')
1648     'test\\ning'
1649     """
1650     if not is_string(in_str):
1651         return None
1652     line_separator = '\n'
1653     lines = [MARGIN_RE.sub('', line) for line in in_str.split(line_separator)]
1654     return line_separator.join(lines)
1655
1656
1657 def indent(in_str: str, amount: int) -> str:
1658     """
1659     Args:
1660         in_str: the string to indent
1661         amount: count of spaces to indent each line by
1662
1663     Returns:
1664         An indented string created by prepending amount spaces.
1665
1666     >>> indent('This is a test', 4)
1667     '    This is a test'
1668     """
1669     if not is_string(in_str):
1670         raise ValueError(in_str)
1671     line_separator = '\n'
1672     lines = [" " * amount + line for line in in_str.split(line_separator)]
1673     return line_separator.join(lines)
1674
1675
1676 def sprintf(*args, **kwargs) -> str:
1677     """
1678     Args:
1679         This function uses the same syntax as the builtin print
1680         function.
1681
1682     Returns:
1683         An interpolated string capturing print output, like man(3)
1684         `sprintf`.
1685     """
1686     ret = ""
1687
1688     sep = kwargs.pop("sep", None)
1689     if sep is not None:
1690         if not isinstance(sep, str):
1691             raise TypeError("sep must be None or a string")
1692
1693     end = kwargs.pop("end", None)
1694     if end is not None:
1695         if not isinstance(end, str):
1696             raise TypeError("end must be None or a string")
1697
1698     if kwargs:
1699         raise TypeError("invalid keyword arguments to sprint()")
1700
1701     if sep is None:
1702         sep = " "
1703     if end is None:
1704         end = "\n"
1705     for i, arg in enumerate(args):
1706         if i:
1707             ret += sep
1708         if isinstance(arg, str):
1709             ret += arg
1710         else:
1711             ret += str(arg)
1712     ret += end
1713     return ret
1714
1715
1716 def strip_ansi_sequences(in_str: str) -> str:
1717     """
1718     Args:
1719         in_str: the string to strip
1720
1721     Returns:
1722         in_str with recognized ANSI escape sequences removed.
1723
1724     .. warning::
1725         This method works by using a regular expression.
1726         It works for all ANSI escape sequences I've tested with but
1727         may miss some; caveat emptor.
1728
1729     >>> import ansi as a
1730     >>> s = a.fg('blue') + 'blue!' + a.reset()
1731     >>> len(s)   # '\x1b[38;5;21mblue!\x1b[m'
1732     18
1733     >>> len(strip_ansi_sequences(s))
1734     5
1735     >>> strip_ansi_sequences(s)
1736     'blue!'
1737
1738     """
1739     return re.sub(r'\x1b\[[\d+;]*[a-z]', '', in_str)
1740
1741
1742 class SprintfStdout(contextlib.AbstractContextManager):
1743     """
1744     A context manager that captures outputs to stdout to a buffer
1745     without printing them.
1746
1747     >>> with SprintfStdout() as buf:
1748     ...     print("test")
1749     ...     print("1, 2, 3")
1750     ...
1751     >>> print(buf(), end='')
1752     test
1753     1, 2, 3
1754
1755     """
1756
1757     def __init__(self) -> None:
1758         self.destination = io.StringIO()
1759         self.recorder: contextlib.redirect_stdout
1760
1761     def __enter__(self) -> Callable[[], str]:
1762         self.recorder = contextlib.redirect_stdout(self.destination)
1763         self.recorder.__enter__()
1764         return lambda: self.destination.getvalue()
1765
1766     def __exit__(self, *args) -> Literal[False]:
1767         self.recorder.__exit__(*args)
1768         self.destination.seek(0)
1769         return False
1770
1771
1772 def capitalize_first_letter(in_str: str) -> str:
1773     """
1774     Args:
1775         in_str: the string to capitalize
1776
1777     Returns:
1778         in_str with the first character capitalized.
1779
1780     >>> capitalize_first_letter('test')
1781     'Test'
1782     >>> capitalize_first_letter("ALREADY!")
1783     'ALREADY!'
1784
1785     """
1786     return in_str[0].upper() + in_str[1:]
1787
1788
1789 def it_they(n: int) -> str:
1790     """
1791     Args:
1792         n: how many of them are there?
1793
1794     Returns:
1795         'it' if n is one or 'they' otherwize.
1796
1797     Suggested usage::
1798
1799         n = num_files_saved_to_tmp()
1800         print(f'Saved file{pluralize(n)} successfully.')
1801         print(f'{it_they(n)} {is_are(n)} located in /tmp.')
1802
1803     >>> it_they(1)
1804     'it'
1805     >>> it_they(100)
1806     'they'
1807     """
1808     if n == 1:
1809         return "it"
1810     return "they"
1811
1812
1813 def is_are(n: int) -> str:
1814     """
1815     Args:
1816         n: how many of them are there?
1817
1818     Returns:
1819         'is' if n is one or 'are' otherwize.
1820
1821     Suggested usage::
1822
1823         n = num_files_saved_to_tmp()
1824         print(f'Saved file{pluralize(n)} successfully.')
1825         print(f'{it_they(n)} {is_are(n)} located in /tmp.')
1826
1827     >>> is_are(1)
1828     'is'
1829     >>> is_are(2)
1830     'are'
1831
1832     """
1833     if n == 1:
1834         return "is"
1835     return "are"
1836
1837
1838 def pluralize(n: int) -> str:
1839     """
1840     Args:
1841         n: how many of them are there?
1842
1843     Returns:
1844         's' if n is greater than one otherwize ''.
1845
1846     Suggested usage::
1847
1848         n = num_files_saved_to_tmp()
1849         print(f'Saved file{pluralize(n)} successfully.')
1850         print(f'{it_they(n)} {is_are(n)} located in /tmp.')
1851
1852     >>> pluralize(15)
1853     's'
1854     >>> count = 1
1855     >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1856     There is 1 file.
1857     >>> count = 4
1858     >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1859     There are 4 files.
1860     """
1861     if n == 1:
1862         return ""
1863     return "s"
1864
1865
1866 def make_contractions(txt: str) -> str:
1867     """This code glues words in txt together to form (English)
1868     contractions.
1869
1870     Args:
1871         txt: the input text to be contractionized.
1872
1873     Returns:
1874         Output text identical to original input except for any
1875         recognized contractions are formed.
1876
1877     .. note::
1878         The order in which we create contractions is defined by the
1879         implementation and what I thought made more sense when writing
1880         this code.
1881
1882     >>> make_contractions('It is nice today.')
1883     "It's nice today."
1884
1885     >>> make_contractions('I can    not even...')
1886     "I can't even..."
1887
1888     >>> make_contractions('She could not see!')
1889     "She couldn't see!"
1890
1891     >>> make_contractions('But she will not go.')
1892     "But she won't go."
1893
1894     >>> make_contractions('Verily, I shall not.')
1895     "Verily, I shan't."
1896
1897     >>> make_contractions('No you cannot.')
1898     "No you can't."
1899
1900     >>> make_contractions('I said you can not go.')
1901     "I said you can't go."
1902     """
1903
1904     first_second = [
1905         (
1906             [
1907                 'are',
1908                 'could',
1909                 'did',
1910                 'has',
1911                 'have',
1912                 'is',
1913                 'must',
1914                 'should',
1915                 'was',
1916                 'were',
1917                 'would',
1918             ],
1919             ['(n)o(t)'],
1920         ),
1921         (
1922             [
1923                 "I",
1924                 "you",
1925                 "he",
1926                 "she",
1927                 "it",
1928                 "we",
1929                 "they",
1930                 "how",
1931                 "why",
1932                 "when",
1933                 "where",
1934                 "who",
1935                 "there",
1936             ],
1937             ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
1938         ),
1939     ]
1940
1941     # Special cases: can't, shan't and won't.
1942     txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
1943     txt = re.sub(
1944         r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE
1945     )
1946     txt = re.sub(
1947         r'\b(w)ill\s*(n)(o)(t)\b',
1948         r"\1\3\2'\4",
1949         txt,
1950         count=0,
1951         flags=re.IGNORECASE,
1952     )
1953
1954     for first_list, second_list in first_second:
1955         for first in first_list:
1956             for second in second_list:
1957                 # Disallow there're/where're.  They're valid English
1958                 # but sound weird.
1959                 if (first in ('there', 'where')) and second == 'a(re)':
1960                     continue
1961
1962                 pattern = fr'\b({first})\s+{second}\b'
1963                 if second == '(n)o(t)':
1964                     replacement = r"\1\2'\3"
1965                 else:
1966                     replacement = r"\1'\2"
1967                 txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)
1968
1969     return txt
1970
1971
1972 def thify(n: int) -> str:
1973     """
1974     Args:
1975         n: how many of them are there?
1976
1977     Returns:
1978         The proper cardinal suffix for a number.
1979
1980     Suggested usage::
1981
1982         attempt_count = 0
1983         while True:
1984             attempt_count += 1
1985             if try_the_thing():
1986                 break
1987             print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')
1988
1989     >>> thify(1)
1990     'st'
1991     >>> thify(33)
1992     'rd'
1993     >>> thify(16)
1994     'th'
1995     """
1996     digit = str(n)
1997     assert is_integer_number(digit)
1998     digit = digit[-1:]
1999     if digit == "1":
2000         return "st"
2001     elif digit == "2":
2002         return "nd"
2003     elif digit == "3":
2004         return "rd"
2005     else:
2006         return "th"
2007
2008
2009 def ngrams(txt: str, n: int):
2010     """
2011     Args:
2012         txt: the string to create ngrams using
2013         n: how many words per ngram created?
2014
2015     Returns:
2016         Generates the ngrams from the input string.
2017
2018     >>> [x for x in ngrams('This is a test', 2)]
2019     ['This is', 'is a', 'a test']
2020     """
2021     words = txt.split()
2022     for ngram in ngrams_presplit(words, n):
2023         ret = ''
2024         for word in ngram:
2025             ret += f'{word} '
2026         yield ret.strip()
2027
2028
2029 def ngrams_presplit(words: Sequence[str], n: int):
2030     """
2031     Same as :meth:`ngrams` but with the string pre-split.
2032     """
2033     return list_utils.ngrams(words, n)
2034
2035
2036 def bigrams(txt: str):
2037     """Generates the bigrams (n=2) of the given string.
2038
2039     >>> [x for x in bigrams('this is a test')]
2040     ['this is', 'is a', 'a test']
2041     """
2042     return ngrams(txt, 2)
2043
2044
2045 def trigrams(txt: str):
2046     """Generates the trigrams (n=3) of the given string."""
2047     return ngrams(txt, 3)
2048
2049
2050 def shuffle_columns_into_list(
2051     input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
2052 ) -> Iterable[str]:
2053     """Helper to shuffle / parse columnar data and return the results as a
2054     list.
2055
2056     Args:
2057         input_lines: A sequence of strings that represents text that
2058             has been broken into columns by the caller
2059         column_specs: an iterable collection of numeric sequences that
2060             indicate one or more column numbers to copy to form the Nth
2061             position in the output list.  See example below.
2062         delim: for column_specs that indicate we should copy more than
2063             one column from the input into this position, use delim to
2064             separate source data.  Defaults to ''.
2065
2066     Returns:
2067         A list of string created by following the instructions set forth
2068         in column_specs.
2069
2070     >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
2071     >>> shuffle_columns_into_list(
2072     ...     cols,
2073     ...     [ [8], [2, 3], [5, 6, 7] ],
2074     ...     delim='!',
2075     ... )
2076     ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']
2077     """
2078     out = []
2079
2080     # Column specs map input lines' columns into outputs.
2081     # [col1, col2...]
2082     for spec in column_specs:
2083         hunk = ''
2084         for n in spec:
2085             hunk = hunk + delim + input_lines[n]
2086         hunk = hunk.strip(delim)
2087         out.append(hunk)
2088     return out
2089
2090
2091 def shuffle_columns_into_dict(
2092     input_lines: Sequence[str],
2093     column_specs: Iterable[Tuple[str, Iterable[int]]],
2094     delim='',
2095 ) -> Dict[str, str]:
2096     """Helper to shuffle / parse columnar data and return the results
2097     as a dict.
2098
2099     Args:
2100         input_lines: a sequence of strings that represents text that
2101             has been broken into columns by the caller
2102         column_specs: instructions for what dictionary keys to apply
2103             to individual or compound input column data.  See example
2104             below.
2105         delim: when forming compound output data by gluing more than
2106             one input column together, use this character to separate
2107             the source data.  Defaults to ''.
2108
2109     Returns:
2110         A dict formed by applying the column_specs instructions.
2111
2112     >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
2113     >>> shuffle_columns_into_dict(
2114     ...     cols,
2115     ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
2116     ...     delim='!',
2117     ... )
2118     {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}
2119     """
2120     out = {}
2121
2122     # Column specs map input lines' columns into outputs.
2123     # "key", [col1, col2...]
2124     for spec in column_specs:
2125         hunk = ''
2126         for n in spec[1]:
2127             hunk = hunk + delim + input_lines[n]
2128         hunk = hunk.strip(delim)
2129         out[spec[0]] = hunk
2130     return out
2131
2132
2133 def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
2134     """
2135     Interpolate a string with data from a dict.
2136
2137     Args:
2138         txt: the mad libs template
2139         values: what you and your kids chose for each category.
2140
2141     >>> interpolate_using_dict('This is a {adjective} {noun}.',
2142     ...                        {'adjective': 'good', 'noun': 'example'})
2143     'This is a good example.'
2144     """
2145     return sprintf(txt.format(**values), end='')
2146
2147
2148 def to_ascii(txt: str):
2149     """
2150     Args:
2151         txt: the input data to encode
2152
2153     Returns:
2154         txt encoded as an ASCII byte string.
2155
2156     >>> to_ascii('test')
2157     b'test'
2158
2159     >>> to_ascii(b'1, 2, 3')
2160     b'1, 2, 3'
2161     """
2162     if isinstance(txt, str):
2163         return txt.encode('ascii')
2164     if isinstance(txt, bytes):
2165         return txt
2166     raise Exception('to_ascii works with strings and bytes')
2167
2168
2169 def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
2170     """
2171     Args:
2172         txt: the input data to encode
2173
2174     Returns:
2175         txt encoded with a 64-chracter alphabet.  Similar to and compatible
2176         with uuencode/uudecode.
2177
2178     >>> to_base64('hello?')
2179     b'aGVsbG8/\\n'
2180     """
2181     return base64.encodebytes(txt.encode(encoding, errors))
2182
2183
2184 def is_base64(txt: str) -> bool:
2185     """
2186     Args:
2187         txt: the string to check
2188
2189     Returns:
2190         True if txt is a valid base64 encoded string.  This assumes
2191         txt was encoded with Python's standard base64 alphabet which
2192         is the same as what uuencode/uudecode uses).
2193
2194     >>> is_base64('test')    # all letters in the b64 alphabet
2195     True
2196
2197     >>> is_base64('another test, how do you like this one?')
2198     False
2199
2200     >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
2201     True
2202
2203     """
2204     a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
2205     alphabet = set(a.encode('ascii'))
2206     for char in to_ascii(txt.strip()):
2207         if char not in alphabet:
2208             return False
2209     return True
2210
2211
2212 def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
2213     """
2214     Args:
2215         b64: bytestring of 64-bit encoded data to decode / convert.
2216
2217     Returns:
2218         The decoded form of b64 as a normal python string.  Similar to
2219         and compatible with uuencode / uudecode.
2220
2221     >>> from_base64(b'aGVsbG8/\\n')
2222     'hello?'
2223     """
2224     return base64.decodebytes(b64).decode(encoding, errors)
2225
2226
2227 def chunk(txt: str, chunk_size: int):
2228     """
2229     Args:
2230         txt: a string to be chunked into evenly spaced pieces.
2231         chunk_size: the size of each chunk to make
2232
2233     Returns:
2234         The original string chunked into evenly spaced pieces.
2235
2236     >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
2237     '01001101 11000101 10101010 10101010 10011111 10101000'
2238     """
2239     if len(txt) % chunk_size != 0:
2240         msg = f'String to chunk\'s length ({len(txt)} is not an even multiple of chunk_size ({chunk_size})'
2241         logger.warning(msg)
2242         warnings.warn(msg, stacklevel=2)
2243     for x in range(0, len(txt), chunk_size):
2244         yield txt[x : x + chunk_size]
2245
2246
2247 def to_bitstring(txt: str, *, delimiter='') -> str:
2248     """
2249     Args:
2250         txt: the string to convert into a bitstring
2251         delimiter: character to insert between adjacent bytes.  Note that
2252             only bitstrings with delimiter='' are interpretable by
2253             :meth:`from_bitstring`.
2254
2255     Returns:
2256         txt converted to ascii/binary and then chopped into bytes.
2257
2258     >>> to_bitstring('hello?')
2259     '011010000110010101101100011011000110111100111111'
2260
2261     >>> to_bitstring('test', delimiter=' ')
2262     '01110100 01100101 01110011 01110100'
2263
2264     >>> to_bitstring(b'test')
2265     '01110100011001010111001101110100'
2266     """
2267     etxt = to_ascii(txt)
2268     bits = bin(int.from_bytes(etxt, 'big'))
2269     bits = bits[2:]
2270     return delimiter.join(chunk(bits.zfill(8 * ((len(bits) + 7) // 8)), 8))
2271
2272
2273 def is_bitstring(txt: str) -> bool:
2274     """
2275     Args:
2276         txt: the string to check
2277
2278     Returns:
2279         True if txt is a recognized bitstring and False otherwise.
2280         Note that if delimiter is non empty this code will not
2281         recognize the bitstring.
2282
2283     >>> is_bitstring('011010000110010101101100011011000110111100111111')
2284     True
2285
2286     >>> is_bitstring('1234')
2287     False
2288     """
2289     return is_binary_integer_number(f'0b{txt}')
2290
2291
2292 def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
2293     """
2294     Args:
2295         bits: the bitstring to convert back into a python string
2296         encoding: the encoding to use
2297
2298     Returns:
2299         The regular python string represented by bits.  Note that this
2300         code does not work with to_bitstring when delimiter is non-empty.
2301
2302     >>> from_bitstring('011010000110010101101100011011000110111100111111')
2303     'hello?'
2304     """
2305     n = int(bits, 2)
2306     return n.to_bytes((n.bit_length() + 7) // 8, 'big').decode(encoding, errors) or '\0'
2307
2308
2309 def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
2310     """
2311     Args:
2312         txt: an IP address to chunk up for sorting purposes
2313
2314     Returns:
2315         A tuple of IP components arranged such that the sorting of
2316         IP addresses using a normal comparator will do something sane
2317         and desireable.
2318
2319     >>> ip_v4_sort_key('10.0.0.18')
2320     (10, 0, 0, 18)
2321
2322     >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
2323     >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
2324     ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
2325     """
2326     if not is_ip_v4(txt):
2327         print(f"not IP: {txt}")
2328         return None
2329     return tuple(int(x) for x in txt.split('.'))
2330
2331
2332 def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
2333     """
2334     Args:
2335         volume: the string to chunk up for sorting purposes
2336
2337     Returns:
2338         A tuple of volume's components such that the sorting of
2339         volumes using a normal comparator will do something sane
2340         and desireable.
2341
2342     >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
2343     ('usr', 'local', 'bin')
2344
2345     >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
2346     >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
2347     ['/usr', '/usr/local', '/usr/local/bin']
2348     """
2349     return tuple(x for x in volume.split('/') if len(x) > 0)
2350
2351
2352 def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
2353     """
2354     Execute several replace operations in a row.
2355
2356     Args:
2357         in_str: the string in which to replace characters
2358         replace_set: the set of target characters to replace
2359         replacement: the character to replace any member of replace_set
2360             with
2361
2362     Returns:
2363         The string with replacements executed.
2364
2365     >>> s = 'this_is a-test!'
2366     >>> replace_all(s, ' _-!', '')
2367     'thisisatest'
2368     """
2369     for char in replace_set:
2370         in_str = in_str.replace(char, replacement)
2371     return in_str
2372
2373
2374 def replace_nth(in_str: str, source: str, target: str, nth: int):
2375     """
2376     Replaces the nth occurrance of a substring within a string.
2377
2378     Args:
2379         in_str: the string in which to run the replacement
2380         source: the substring to replace
2381         target: the replacement text
2382         nth: which occurrance of source to replace?
2383
2384     >>> replace_nth('this is a test', ' ', '-', 3)
2385     'this is a-test'
2386     """
2387     where = [m.start() for m in re.finditer(source, in_str)][nth - 1]
2388     before = in_str[:where]
2389     after = in_str[where:]
2390     after = after.replace(source, target, 1)
2391     return before + after
2392
2393
2394 if __name__ == '__main__':
2395     import doctest
2396
2397     doctest.testmod()