string_utils.py

   1 #!/usr/bin/env python3
   2
   3 import base64
   4 import contextlib
   5 import datetime
   6 import io
   7 from itertools import zip_longest
   8 import json
   9 import logging
  10 import numbers
  11 import random
  12 import re
  13 import string
  14 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
  15 import unicodedata
  16 from uuid import uuid4
  17 import warnings
  18
  19 import list_utils
  20
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
  22
  23 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  24
  25 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  26
  27 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  28
  29 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  30
# Pieces of the URL pattern, concatenated in order; the trailing comment on
# each piece names the URL component it is meant to match.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    # NOTE(review): inside the class below, "%-=" is a character *range*
    # (0x25 through 0x3D), not three literals; that range is what lets
    # characters such as "&", ".", "/" and ":" through in a query string.
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Whole-string URL test (anchored at both ends, case-insensitive).
URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

# Unanchored variant of the same pattern, wrapped in one capturing group.
URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)

# Matches "@" runs that are followed later by a double quote, or a
# backslash-escaped "@"; is_email() uses it to neutralise such signs.
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# Local part, "@", then a domain made of lowercase letters, digits and
# dashes, ending in a 2-4 letter lowercase suffix.
EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"

# Whole-string email test (case-sensitive: no re.IGNORECASE here).
EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))

# Unanchored variant of the same pattern, wrapped in one capturing group.
EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  54
# Letters (and digits after the first case change) only, with at least one
# lower->upper or upper->lower transition somewhere in the string.
CAMEL_CASE_TEST_RE = re.compile(
    r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
)

# Captures the character(s) sitting right before an uppercase letter; used
# by camel_case_to_snake_case() to insert the separator after them.
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Snake case with "_": letters first (or leading underscores followed by an
# alphanumeric), at least one underscore, case-insensitive.
SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

# Same shape with "-" as the separator.  There is no leading "^" here; the
# visible caller (is_snake_case) uses .match(), which anchors at the start.
SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

# A separator followed by one lowercase letter or digit (two groups).
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
  72
# Card-number patterns keyed by the card type names accepted by
# is_credit_card().  Each pattern matches a bare digit string (no spaces or
# dashes) and checks only the leading digits and the overall length.
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}
  81
# Cheap pre-check used by is_json(): the text must open with "[" or "{" and
# close with "]" or "}" (surrounding whitespace allowed).  The bracket kinds
# are not required to pair up; the real validation is json.loads().
JSON_WRAPPER_RE = re.compile(
    r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
)

# Canonical dashed UUID text: 8-4-4-4-12 hex digits, case-insensitive.
UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# Same layout with every dash optional, so the 32-digit hex form also matches.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)
  94
# "Shallow" IPv4 shape check: four dot-separated groups of 1-3 digits.  The
# 0-255 range of each group is NOT checked here; is_ip_v4() does that.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

# Unanchored variant, for locating an IPv4-looking chunk inside longer text.
ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# Eight colon-separated groups of 0-4 characters.  Note the class is
# [a-z\d], so letters beyond "f" are accepted too, and the "::" shorthand
# (fewer than seven colons) is not matched.
IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

# Unanchored variant of the pattern above.
ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)

# Six two-digit hex groups separated by ":" or "-" (each separator may be
# either character), case-insensitive.
MAC_ADDRESS_RE = re.compile(
    r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE
)

# Unanchored variant, for locating a MAC address inside longer text.
ANYWHERE_MAC_ADDRESS_RE = re.compile(
    r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)
 110
# One "word": a run of letters/digits (underscore excluded) together with
# any non-word characters around it.  words_count() counts the matches.
WORDS_COUNT_RE = re.compile(
    r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
)

# An opening tag optionally followed by content up to a closing tag, or an
# HTML comment, or a doctype.  Used when tag content must go with the tag.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Opening tags, closing tags, comments and doctypes only -- never the text
# between tags.  Used when tag content must be kept.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# A single whitespace character.
SPACES_RE = re.compile(r"\s")

# A run of characters that are neither letters nor digits, or a run of
# underscores (which \w would otherwise let through).
NO_LETTERS_OR_NUMBERS_RE = re.compile(
    r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
)

# Leading whitespace at the start of the string, excluding line breaks.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 132
 133 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 134
# Size suffixes and their multipliers (powers of 1024), as looked up by
# suffix_string_to_number() and walked by number_to_suffix_string().
# Order matters: number_to_suffix_string() iterates this dict in insertion
# order and stops at the first entry whose size fits, so entries run from
# largest to smallest with the two-letter spelling ahead of the one-letter.
NUM_SUFFIXES = {
    "Pb": (1024 ** 5),
    "P": (1024 ** 5),
    "Tb": (1024 ** 4),
    "T": (1024 ** 4),
    "Gb": (1024 ** 3),
    "G": (1024 ** 3),
    "Mb": (1024 ** 2),
    "M": (1024 ** 2),
    "Kb": (1024 ** 1),
    "K": (1024 ** 1),
}
 147
 148
 149 def is_none_or_empty(in_str: Optional[str]) -> bool:
 150     """
 151     Returns true if the input string is either None or an empty string.
 152
 153     >>> is_none_or_empty("")
 154     True
 155     >>> is_none_or_empty(None)
 156     True
 157     >>> is_none_or_empty("   \t   ")
 158     True
 159     >>> is_none_or_empty('Test')
 160     False
 161     """
 162     return in_str is None or len(in_str.strip()) == 0
 163
 164
 165 def is_string(obj: Any) -> bool:
 166     """
 167     Checks if an object is a string.
 168
 169     >>> is_string('test')
 170     True
 171     >>> is_string(123)
 172     False
 173     >>> is_string(100.3)
 174     False
 175     >>> is_string([1, 2, 3])
 176     False
 177     """
 178     return isinstance(obj, str)
 179
 180
def is_empty_string(in_str: Any) -> bool:
    """Alias of is_empty(): returns whatever is_empty(in_str) returns."""
    return is_empty(in_str)
 183
 184
 185 def is_empty(in_str: Any) -> bool:
 186     """
 187     Checks if input is a string and empty or only whitespace.
 188
 189     >>> is_empty('')
 190     True
 191     >>> is_empty('    \t\t    ')
 192     True
 193     >>> is_empty('test')
 194     False
 195     >>> is_empty(100.88)
 196     False
 197     >>> is_empty([1, 2, 3])
 198     False
 199     """
 200     return is_string(in_str) and in_str.strip() == ""
 201
 202
 203 def is_full_string(in_str: Any) -> bool:
 204     """
 205     Checks that input is a string and is not empty ('') or only whitespace.
 206
 207     >>> is_full_string('test!')
 208     True
 209     >>> is_full_string('')
 210     False
 211     >>> is_full_string('      ')
 212     False
 213     >>> is_full_string(100.999)
 214     False
 215     >>> is_full_string({"a": 1, "b": 2})
 216     False
 217     """
 218     return is_string(in_str) and in_str.strip() != ""
 219
 220
 221 def is_number(in_str: str) -> bool:
 222     """
 223     Checks if a string is a valid number.
 224
 225     >>> is_number(100.5)
 226     Traceback (most recent call last):
 227     ...
 228     ValueError: 100.5
 229     >>> is_number("100.5")
 230     True
 231     >>> is_number("test")
 232     False
 233     >>> is_number("99")
 234     True
 235     >>> is_number([1, 2, 3])
 236     Traceback (most recent call last):
 237     ...
 238     ValueError: [1, 2, 3]
 239     """
 240     if not is_string(in_str):
 241         raise ValueError(in_str)
 242     return NUMBER_RE.match(in_str) is not None
 243
 244
 245 def is_integer_number(in_str: str) -> bool:
 246     """
 247     Checks whether the given string represents an integer or not.
 248
 249     An integer may be signed or unsigned or use a "scientific notation".
 250
 251     >>> is_integer_number('42')
 252     True
 253     >>> is_integer_number('42.0')
 254     False
 255     """
 256     return (
 257         (is_number(in_str) and "." not in in_str) or
 258         is_hexidecimal_integer_number(in_str) or
 259         is_octal_integer_number(in_str) or
 260         is_binary_integer_number(in_str)
 261     )
 262
 263
 264 def is_hexidecimal_integer_number(in_str: str) -> bool:
 265     """
 266     Checks whether a string is a hex integer number.
 267
 268     >>> is_hexidecimal_integer_number('0x12345')
 269     True
 270     >>> is_hexidecimal_integer_number('0x1A3E')
 271     True
 272     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 273     False
 274     >>> is_hexidecimal_integer_number('-0xff')
 275     True
 276     >>> is_hexidecimal_integer_number('test')
 277     False
 278     >>> is_hexidecimal_integer_number(12345)  # Not a string
 279     Traceback (most recent call last):
 280     ...
 281     ValueError: 12345
 282     >>> is_hexidecimal_integer_number(101.4)
 283     Traceback (most recent call last):
 284     ...
 285     ValueError: 101.4
 286     >>> is_hexidecimal_integer_number(0x1A3E)
 287     Traceback (most recent call last):
 288     ...
 289     ValueError: 6718
 290     """
 291     if not is_string(in_str):
 292         raise ValueError(in_str)
 293     return HEX_NUMBER_RE.match(in_str) is not None
 294
 295
 296 def is_octal_integer_number(in_str: str) -> bool:
 297     """
 298     Checks whether a string is an octal number.
 299
 300     >>> is_octal_integer_number('0o777')
 301     True
 302     >>> is_octal_integer_number('-0O115')
 303     True
 304     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 305     False
 306     >>> is_octal_integer_number('7777')  # Needs 0o
 307     False
 308     >>> is_octal_integer_number('test')
 309     False
 310     """
 311     if not is_string(in_str):
 312         raise ValueError(in_str)
 313     return OCT_NUMBER_RE.match(in_str) is not None
 314
 315
 316 def is_binary_integer_number(in_str: str) -> bool:
 317     """
 318     Returns whether a string contains a binary number.
 319
 320     >>> is_binary_integer_number('0b10111')
 321     True
 322     >>> is_binary_integer_number('-0b111')
 323     True
 324     >>> is_binary_integer_number('0B10101')
 325     True
 326     >>> is_binary_integer_number('0b10102')
 327     False
 328     >>> is_binary_integer_number('0xFFF')
 329     False
 330     >>> is_binary_integer_number('test')
 331     False
 332     """
 333     if not is_string(in_str):
 334         raise ValueError(in_str)
 335     return BIN_NUMBER_RE.match(in_str) is not None
 336
 337
 338 def to_int(in_str: str) -> int:
 339     """Returns the integral value of the string or raises on error.
 340
 341     >>> to_int('1234')
 342     1234
 343     >>> to_int('test')
 344     Traceback (most recent call last):
 345     ...
 346     ValueError: invalid literal for int() with base 10: 'test'
 347     """
 348     if not is_string(in_str):
 349         raise ValueError(in_str)
 350     if is_binary_integer_number(in_str):
 351         return int(in_str, 2)
 352     if is_octal_integer_number(in_str):
 353         return int(in_str, 8)
 354     if is_hexidecimal_integer_number(in_str):
 355         return int(in_str, 16)
 356     return int(in_str)
 357
 358
 359 def is_decimal_number(in_str: str) -> bool:
 360     """
 361     Checks whether the given string represents a decimal or not.
 362
 363     A decimal may be signed or unsigned or use a "scientific notation".
 364
 365     >>> is_decimal_number('42.0')
 366     True
 367     >>> is_decimal_number('42')
 368     False
 369     """
 370     return is_number(in_str) and "." in in_str
 371
 372
 373 def strip_escape_sequences(in_str: str) -> str:
 374     """
 375     Remove escape sequences in the input string.
 376
 377     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 378     'this is a test!'
 379     """
 380     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 381     return in_str
 382
 383
 384 def add_thousands_separator(
 385         in_str: str,
 386         *,
 387         separator_char = ',',
 388         places = 3
 389 ) -> str:
 390     """
 391     Add thousands separator to a numeric string.  Also handles numbers.
 392
 393     >>> add_thousands_separator('12345678')
 394     '12,345,678'
 395     >>> add_thousands_separator(12345678)
 396     '12,345,678'
 397     >>> add_thousands_separator(12345678.99)
 398     '12,345,678.99'
 399     >>> add_thousands_separator('test')
 400     Traceback (most recent call last):
 401     ...
 402     ValueError: test
 403
 404     """
 405     if isinstance(in_str, numbers.Number):
 406         in_str = f'{in_str}'
 407     if is_number(in_str):
 408         return _add_thousands_separator(
 409             in_str,
 410             separator_char = separator_char,
 411             places = places
 412         )
 413     raise ValueError(in_str)
 414
 415
 416 def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
 417     decimal_part = ""
 418     if '.' in in_str:
 419         (in_str, decimal_part) = in_str.split('.')
 420     tmp = [iter(in_str[::-1])] * places
 421     ret = separator_char.join(
 422         "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 423     if len(decimal_part) > 0:
 424         ret += '.'
 425         ret += decimal_part
 426     return ret
 427
 428
# Full url example:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 431 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 432     """
 433     Check if a string is a valid url.
 434
 435     >>> is_url('http://www.mysite.com')
 436     True
 437     >>> is_url('https://mysite.com')
 438     True
 439     >>> is_url('.mysite.com')
 440     False
 441     """
 442     if not is_full_string(in_str):
 443         return False
 444
 445     valid = URL_RE.match(in_str) is not None
 446
 447     if allowed_schemes:
 448         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 449     return valid
 450
 451
def is_email(in_str: Any) -> bool:
    """
    Check if a string is a valid email.

    Non-strings, blank strings, strings longer than 320 characters and
    strings starting with "." are rejected up front; the rest is split at
    "@" and the cleaned-up result is tested against EMAIL_RE.

    Reference: https://tools.ietf.org/html/rfc3696#section-3

    >>> is_email('my.email@the-provider.com')
    True
    >>> is_email('@gmail.com')
    False
    """
    if (
        not is_full_string(in_str)
        or len(in_str) > 320
        or in_str.startswith(".")
    ):
        return False

    try:
        # we expect 2 tokens, one before "@" and one after, otherwise
        # the unpacking raises ValueError and we land in the except
        # branch below.
        head, tail = in_str.split("@")

        # head's size must be <= 64, tail <= 255, head must not end
        # with a dot (a leading dot was rejected above) or contain
        # multiple consecutive dots.
        if (
            len(head) > 64
            or len(tail) > 255
            or head.endswith(".")
            or (".." in head)
        ):
            return False

        # removes escaped spaces, so that later on the test regex will
        # accept the string.
        head = head.replace("\\ ", "")
        # a fully quoted head loses its quotes and any spaces inside them.
        if head.startswith('"') and head.endswith('"'):
            head = head.replace(" ", "")[1:-1]
        return EMAIL_RE.match(head + "@" + tail) is not None

    except ValueError:
        # borderline case in which we have multiple "@" signs but the
        # head part is correctly escaped.
        if ESCAPED_AT_SIGN.search(in_str) is not None:
            # replace the escaped/quoted "@" with "a" and validate the
            # rewritten string from scratch.
            return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
        return False
 499
 500
 501 def suffix_string_to_number(in_str: str) -> Optional[int]:
 502     """Take a string like "33Gb" and convert it into a number (of bytes)
 503     like 34603008.  Return None if the input string is not valid.
 504
 505     >>> suffix_string_to_number('1Mb')
 506     1048576
 507     >>> suffix_string_to_number('13.1Gb')
 508     14066017894
 509     """
 510     def suffix_capitalize(s: str) -> str:
 511         if len(s) == 1:
 512             return s.upper()
 513         elif len(s) == 2:
 514             return f"{s[0].upper()}{s[1].lower()}"
 515         return suffix_capitalize(s[0:1])
 516
 517     if is_string(in_str):
 518         if is_integer_number(in_str):
 519             return to_int(in_str)
 520         suffixes = [in_str[-2:], in_str[-1:]]
 521         rest = [in_str[:-2], in_str[:-1]]
 522         for x in range(len(suffixes)):
 523             s = suffixes[x]
 524             s = suffix_capitalize(s)
 525             multiplier = NUM_SUFFIXES.get(s, None)
 526             if multiplier is not None:
 527                 r = rest[x]
 528                 if is_integer_number(r):
 529                     return to_int(r) * multiplier
 530                 if is_decimal_number(r):
 531                     return int(float(r) * multiplier)
 532     return None
 533
 534
 535 def number_to_suffix_string(num: int) -> Optional[str]:
 536     """Take a number (of bytes) and returns a string like "43.8Gb".
 537     Returns none if the input is invalid.
 538
 539     >>> number_to_suffix_string(14066017894)
 540     '13.1Gb'
 541     >>> number_to_suffix_string(1024 * 1024)
 542     '1.0Mb'
 543
 544     """
 545     d = 0.0
 546     suffix = None
 547     for (sfx, size) in NUM_SUFFIXES.items():
 548         if num >= size:
 549             d = num / size
 550             suffix = sfx
 551             break
 552     if suffix is not None:
 553         return f"{d:.1f}{suffix}"
 554     else:
 555         return f'{num:d}'
 556
 557
 558 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 559     """
 560     Checks if a string is a valid credit card number.
 561     If card type is provided then it checks against that specific type only,
 562     otherwise any known credit card number will be accepted.
 563
 564     Supported card types are the following:
 565
 566     - VISA
 567     - MASTERCARD
 568     - AMERICAN_EXPRESS
 569     - DINERS_CLUB
 570     - DISCOVER
 571     - JCB
 572     """
 573     if not is_full_string(in_str):
 574         return False
 575
 576     if card_type is not None:
 577         if card_type not in CREDIT_CARDS:
 578             raise KeyError(
 579                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 580             )
 581         return CREDIT_CARDS[card_type].match(in_str) is not None
 582     for c in CREDIT_CARDS:
 583         if CREDIT_CARDS[c].match(in_str) is not None:
 584             return True
 585     return False
 586
 587
 588 def is_camel_case(in_str: Any) -> bool:
 589     """
 590     Checks if a string is formatted as camel case.
 591
 592     A string is considered camel case when:
 593
 594     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 595     - it contains both lowercase and uppercase letters
 596     - it does not start with a number
 597     """
 598     return (
 599         is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 600     )
 601
 602
 603 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 604     """
 605     Checks if a string is formatted as "snake case".
 606
 607     A string is considered snake case when:
 608
 609     - it's composed only by lowercase/uppercase letters and digits
 610     - it contains at least one underscore (or provided separator)
 611     - it does not start with a number
 612
 613     >>> is_snake_case('this_is_a_test')
 614     True
 615     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 616     True
 617     >>> is_snake_case('this-is-a-test')
 618     False
 619     >>> is_snake_case('this-is-a-test', separator='-')
 620     True
 621
 622     """
 623     if is_full_string(in_str):
 624         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 625         re_template = (
 626             r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 627         )
 628         r = re_map.get(
 629             separator,
 630             re.compile(
 631                 re_template.format(sign=re.escape(separator)), re.IGNORECASE
 632             ),
 633         )
 634         return r.match(in_str) is not None
 635     return False
 636
 637
 638 def is_json(in_str: Any) -> bool:
 639     """
 640     Check if a string is a valid json.
 641
 642     >>> is_json('{"name": "Peter"}')
 643     True
 644     >>> is_json('[1, 2, 3]')
 645     True
 646     >>> is_json('{nope}')
 647     False
 648     """
 649     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 650         try:
 651             return isinstance(json.loads(in_str), (dict, list))
 652         except (TypeError, ValueError, OverflowError):
 653             pass
 654     return False
 655
 656
 657 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 658     """
 659     Check if a string is a valid UUID.
 660
 661     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 662     True
 663     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 664     False
 665     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 666     True
 667     """
 668     # string casting is used to allow UUID itself as input data type
 669     s = str(in_str)
 670     if allow_hex:
 671         return UUID_HEX_OK_RE.match(s) is not None
 672     return UUID_RE.match(s) is not None
 673
 674
 675 def is_ip_v4(in_str: Any) -> bool:
 676     """
 677     Checks if a string is a valid ip v4.
 678
 679     >>> is_ip_v4('255.200.100.75')
 680     True
 681     >>> is_ip_v4('nope')
 682     False
 683     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 684     False
 685     """
 686     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 687         return False
 688
 689     # checks that each entry in the ip is in the valid range (0 to 255)
 690     for token in in_str.split("."):
 691         if not 0 <= int(token) <= 255:
 692             return False
 693     return True
 694
 695
 696 def extract_ip_v4(in_str: Any) -> Optional[str]:
 697     """
 698     Extracts the IPv4 chunk of a string or None.
 699
 700     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 701     '127.0.0.1'
 702     >>> extract_ip_v4('Your mom dresses you funny.')
 703     """
 704     if not is_full_string(in_str):
 705         return None
 706     m = ANYWHERE_IP_V4_RE.search(in_str)
 707     if m is not None:
 708         return m.group(0)
 709     return None
 710
 711
 712 def is_ip_v6(in_str: Any) -> bool:
 713     """
 714     Checks if a string is a valid ip v6.
 715
 716     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 717     True
 718     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 719     False
 720     """
 721     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 722
 723
 724 def extract_ip_v6(in_str: Any) -> Optional[str]:
 725     """
 726     Extract IPv6 chunk or None.
 727
 728     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 729     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 730     >>> extract_ip_v6("(and she's ugly too, btw)")
 731     """
 732     if not is_full_string(in_str):
 733         return None
 734     m = ANYWHERE_IP_V6_RE.search(in_str)
 735     if m is not None:
 736         return m.group(0)
 737     return None
 738
 739
 740 def is_ip(in_str: Any) -> bool:
 741     """
 742     Checks if a string is a valid ip (either v4 or v6).
 743
 744     >>> is_ip('255.200.100.75')
 745     True
 746     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
 747     True
 748     >>> is_ip('1.2.3')
 749     False
 750     >>> is_ip('1.2.3.999')
 751     False
 752     """
 753     return is_ip_v6(in_str) or is_ip_v4(in_str)
 754
 755
 756 def extract_ip(in_str: Any) -> Optional[str]:
 757     """
 758     Extract the IP address or None.
 759
 760     >>> extract_ip('Attacker: 255.200.100.75')
 761     '255.200.100.75'
 762     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 763     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 764     >>> extract_ip('1.2.3')
 765
 766     """
 767     ip = extract_ip_v4(in_str)
 768     if ip is None:
 769         ip = extract_ip_v6(in_str)
 770     return ip
 771
 772
 773 def is_mac_address(in_str: Any) -> bool:
 774     """Return True if in_str is a valid MAC address false otherwise.
 775
 776     >>> is_mac_address("34:29:8F:12:0D:2F")
 777     True
 778     >>> is_mac_address('34:29:8f:12:0d:2f')
 779     True
 780     >>> is_mac_address('34-29-8F-12-0D-2F')
 781     True
 782     >>> is_mac_address("test")
 783     False
 784     """
 785     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 786
 787
 788 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 789     """
 790     Extract the MAC address from in_str.
 791
 792     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
 793     '34:29:8F:12:0D:2F'
 794
 795     >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
 796     'd8:5d:e2:34:54:86'
 797
 798     """
 799     if not is_full_string(in_str):
 800         return None
 801     in_str.strip()
 802     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
 803     if m is not None:
 804         mac = m.group(0)
 805         mac.replace(":", separator)
 806         mac.replace("-", separator)
 807         return mac
 808     return None
 809
 810
 811 def is_slug(in_str: Any, separator: str = "-") -> bool:
 812     """
 813     Checks if a given string is a slug (as created by `slugify()`).
 814
 815     >>> is_slug('my-blog-post-title')
 816     True
 817     >>> is_slug('My blog post title')
 818     False
 819
 820     """
 821     if not is_full_string(in_str):
 822         return False
 823     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 824     return re.match(rex, in_str) is not None
 825
 826
 827 def contains_html(in_str: str) -> bool:
 828     """
 829     Checks if the given string contains HTML/XML tags.
 830
 831     By design, this function matches ANY type of tag, so don't expect to use it
 832     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 833
 834     >>> contains_html('my string is <strong>bold</strong>')
 835     True
 836     >>> contains_html('my string is not bold')
 837     False
 838
 839     """
 840     if not is_string(in_str):
 841         raise ValueError(in_str)
 842     return HTML_RE.search(in_str) is not None
 843
 844
 845 def words_count(in_str: str) -> int:
 846     """
 847     Returns the number of words contained into the given string.
 848
 849     This method is smart, it does consider only sequence of one or more letter and/or numbers
 850     as "words", so a string like this: "! @ # % ... []" will return zero!
 851     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 852     will be 4 not 1 (even if there are no spaces in the string).
 853
 854     >>> words_count('hello world')
 855     2
 856     >>> words_count('one,two,three.stop')
 857     4
 858
 859     """
 860     if not is_string(in_str):
 861         raise ValueError(in_str)
 862     return len(WORDS_COUNT_RE.findall(in_str))
 863
 864
 865 def generate_uuid(omit_dashes: bool = False) -> str:
 866     """
 867     Generated an UUID string (using `uuid.uuid4()`).
 868
 869     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 870     generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 871
 872     """
 873     uid = uuid4()
 874     if omit_dashes:
 875         return uid.hex
 876     return str(uid)
 877
 878
 879 def generate_random_alphanumeric_string(size: int) -> str:
 880     """
 881     Returns a string of the specified size containing random
 882     characters (uppercase/lowercase ascii letters and digits).
 883
 884     random_string(9) # possible output: "cx3QQbzYg"
 885
 886     """
 887     if size < 1:
 888         raise ValueError("size must be >= 1")
 889     chars = string.ascii_letters + string.digits
 890     buffer = [random.choice(chars) for _ in range(size)]
 891     return from_char_list(buffer)
 892
 893
 894 def reverse(in_str: str) -> str:
 895     """
 896     Returns the string with its chars reversed.
 897
 898     >>> reverse('test')
 899     'tset'
 900
 901     """
 902     if not is_string(in_str):
 903         raise ValueError(in_str)
 904     return in_str[::-1]
 905
 906
 907 def camel_case_to_snake_case(in_str, *, separator="_"):
 908     """
 909     Convert a camel case string into a snake case one.
 910     (The original string is returned if is not a valid camel case string)
 911
 912     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
 913     'mac_address_extractor_factory'
 914     >>> camel_case_to_snake_case('Luke Skywalker')
 915     'Luke Skywalker'
 916     """
 917     if not is_string(in_str):
 918         raise ValueError(in_str)
 919     if not is_camel_case(in_str):
 920         return in_str
 921     return CAMEL_CASE_REPLACE_RE.sub(
 922         lambda m: m.group(1) + separator, in_str
 923     ).lower()
 924
 925
 926 def snake_case_to_camel_case(
 927     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 928 ) -> str:
 929     """
 930     Convert a snake case string into a camel case one.
 931     (The original string is returned if is not a valid snake case string)
 932
 933     >>> snake_case_to_camel_case('this_is_a_test')
 934     'ThisIsATest'
 935     >>> snake_case_to_camel_case('Han Solo')
 936     'Han Solo'
 937     """
 938     if not is_string(in_str):
 939         raise ValueError(in_str)
 940     if not is_snake_case(in_str, separator=separator):
 941         return in_str
 942     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 943     if not upper_case_first:
 944         tokens[0] = tokens[0].lower()
 945     return from_char_list(tokens)
 946
 947
 948 def to_char_list(in_str: str) -> List[str]:
 949     """Convert a string into a list of chars.
 950
 951     >>> to_char_list('test')
 952     ['t', 'e', 's', 't']
 953     """
 954     if not is_string(in_str):
 955         return []
 956     return list(in_str)
 957
 958
 959 def from_char_list(in_list: List[str]) -> str:
 960     """Convert a char list into a string.
 961
 962     >>> from_char_list(['t', 'e', 's', 't'])
 963     'test'
 964     """
 965     return "".join(in_list)
 966
 967
 968 def shuffle(in_str: str) -> str:
 969     """Return a new string containing same chars of the given one but in
 970     a randomized order.
 971     """
 972     if not is_string(in_str):
 973         raise ValueError(in_str)
 974
 975     # turn the string into a list of chars
 976     chars = to_char_list(in_str)
 977     random.shuffle(chars)
 978     return from_char_list(chars)
 979
 980
 981 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 982     """
 983     Remove html code contained into the given string.
 984
 985     >>> strip_html('test: <a href="foo/bar">click here</a>')
 986     'test: '
 987     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
 988     'test: click here'
 989     """
 990     if not is_string(in_str):
 991         raise ValueError(in_str)
 992     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
 993     return r.sub("", in_str)
 994
 995
 996 def asciify(in_str: str) -> str:
 997     """
 998     Force string content to be ascii-only by translating all non-ascii
 999     chars into the closest possible representation (eg: ó -> o, Ë ->
1000     E, ç -> c...).
1001
1002     N.B. Some chars may be lost if impossible to translate.
1003
1004     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
1005     'eeuuooaaeynAAACIINOE'
1006     """
1007     if not is_string(in_str):
1008         raise ValueError(in_str)
1009
1010     # "NFKD" is the algorithm which is able to successfully translate
1011     # the most of non-ascii chars.
1012     normalized = unicodedata.normalize("NFKD", in_str)
1013
1014     # encode string forcing ascii and ignore any errors
1015     # (unrepresentable chars will be stripped out)
1016     ascii_bytes = normalized.encode("ascii", "ignore")
1017
1018     # turns encoded bytes into an utf-8 string
1019     return ascii_bytes.decode("utf-8")
1020
1021
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Converts a string into a "slug" using provided separator.
    The returned string has the following properties:

    - it has no spaces
    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using provided separator
    - all chars are encoded as ascii (by using `asciify()`)
    - is safe for URL

    The separator is always inserted literally and may be longer than
    one character.  Raises ValueError when is_string() rejects in_str.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # replace spaces with join sign; a callable replacement keeps the
    # separator literal even when it contains backslashes
    out = SPACES_RE.sub(lambda _match: separator, out)

    # normalize joins (remove duplicates).  The separator is grouped so
    # that the repetition applies to the whole separator and not just
    # to its last character.
    if separator:
        duplicated = r"(?:" + re.escape(separator) + r"){2,}"
        out = re.sub(duplicated, lambda _match: separator, out)
    return asciify(out)
1051
1052
def to_bool(in_str: str) -> bool:
    """
    Interpret the content of a string as a boolean (CASE INSENSITIVE).

    True is returned when the lower-cased string is one of:

    - "true"
    - "t"
    - "1"
    - "yes"
    - "y"
    - "on"

    Any other content yields False.  Raises ValueError when
    is_string() rejects in_str.

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    truthy_spellings = ("true", "1", "yes", "y", "t", "on")
    lowered = in_str.lower()
    return lowered in truthy_spellings
1089
1090
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Parse in_str with dateparse_utils.DateParser and return the value
    of the parser's get_date().  When the parser raises ParseException
    a warning is logged and None is returned instead.  See DateParser
    docs for the accepted formats.
    """
    import dateparse.dateparse_utils as dp
    try:
        parser = dp.DateParser()
        parser.parse(in_str)
        result = parser.get_date()
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        result = None
    return result
1104
1105
def valid_date(in_str: str) -> bool:
    """
    Report whether dateparse_utils.DateParser can parse in_str.

    Returns True when parsing completes; when the parser raises
    ParseException a warning is logged and False is returned.
    """
    import dateparse.dateparse_utils as dp
    try:
        dp.DateParser().parse(in_str)
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        return False
    return True
1119
1120
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Parse in_str with dateparse_utils.DateParser and return the
    resulting datetime.  See DateParser docs for more info.

    None is returned when the parser raises (a warning is logged in
    that case) or when the parse result is not a datetime.datetime.
    """
    import dateparse.dateparse_utils as dp
    try:
        d = dp.DateParser()
        dt = d.parse(in_str)
        if isinstance(dt, datetime.datetime):
            return dt
    # ParseException is what to_date()/valid_date() expect from the
    # same parse() call; ValueError is kept for backward compatibility.
    except (ValueError, dp.ParseException):
        msg = f'Unable to parse datetime {in_str}.'
        logger.warning(msg)
    return None
1135
1136
def valid_datetime(in_str: str) -> bool:
    """
    Report whether to_datetime() can turn in_str into a datetime.

    Returns True when to_datetime() yields a value; otherwise logs a
    warning and returns False.
    """
    if to_datetime(in_str) is None:
        logger.warning(f'Unable to parse datetime {in_str}.')
        return False
    return True
1147
1148
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Squeeze runs of more than one character_to_squeeze into one.

    character_to_squeeze may be longer than a single character and is
    always treated literally, both when searching and when writing the
    replacement (so it may safely contain backslashes or other regex
    metacharacters).

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    """
    pattern = r'(' + re.escape(character_to_squeeze) + r')+'
    # A callable replacement is inserted verbatim; a plain string would
    # be parsed as a regex replacement template and choke on backslashes.
    return re.sub(pattern, lambda _match: character_to_squeeze, in_str)
1165
1166
def dedent(in_str: str) -> str:
    """
    Strip the margin from every line of a multi line string (inspired
    by the analogous Scala function).

    The string is split on newlines, whatever MARGIN_RE matches is
    removed from each line, and the lines are joined back together.
    NOTE(review): MARGIN_RE is defined elsewhere in this module; the
    original description says it removes tab indentation -- confirm
    against its definition.  Raises ValueError when is_string()
    rejects in_str.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    return newline.join(
        MARGIN_RE.sub('', row) for row in in_str.split(newline)
    )
1176
1177
def indent(in_str: str, amount: int) -> str:
    """
    Prefix every newline-separated line of in_str with amount spaces.

    Raises ValueError when is_string() rejects in_str.

    >>> indent('This is a test', 4)
    '    This is a test'

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    padding = " " * amount
    return newline.join(padding + row for row in in_str.split(newline))
1191
1192
def sprintf(*args, **kwargs) -> str:
    """Format the arguments the way print() would, but return the text
    as a string instead of writing it anywhere.

    The positional arguments are converted with str() (strings are
    used as-is) and joined with sep; end is appended afterwards.  Only
    the keyword arguments sep (default " ") and end (default a
    newline) are accepted; a non-string value for either, or any other
    keyword, raises TypeError.
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # anything still left in kwargs is a keyword we do not understand
    if kwargs:
        raise TypeError("invalid keyword arguments to sprint()")

    joiner = " " if sep is None else sep
    terminator = "\n" if end is None else end
    pieces = [arg if isinstance(arg, str) else str(arg) for arg in args]
    return joiner.join(pieces) + terminator
1223
1224
class SprintfStdout(object):
    """
    Context manager that records everything written to stdout while
    its body runs.  Entering yields a zero-argument callable that
    returns the text recorded so far.

    with SprintfStdout() as buf:
        print("test")
    assert buf() == 'test\\n'
    """
    def __init__(self) -> None:
        # in-memory buffer that receives the redirected output
        self.destination = io.StringIO()
        # the active contextlib.redirect_stdout, set by __enter__
        self.recorder = None

    def __enter__(self) -> Callable[[], str]:
        redirect = contextlib.redirect_stdout(self.destination)
        self.recorder = redirect
        redirect.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *exc_info) -> None:
        # restore the previous stdout, then rewind the buffer
        self.recorder.__exit__(*exc_info)
        self.destination.seek(0)
        # falling off the end returns None: exceptions are not suppressed
1248
1249
def is_are(n: int) -> str:
    """Pick the verb form that agrees with a count: "is" when n equals
    1, "are" for every other value.

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
1262
1263
def pluralize(n: int) -> str:
    """Pick the plural suffix that agrees with a count: the empty
    string when n equals 1, "s" for every other value.

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.

    """
    return "" if n == 1 else "s"
1280
1281
def thify(n: int) -> str:
    """Return the proper ordinal suffix for a number.

    The suffix follows the last digit ("st", "nd", "rd" for 1, 2, 3
    and "th" otherwise), except that numbers ending in 11, 12 or 13
    always take "th".

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'
    >>> thify(21)
    'st'

    """
    digits = str(n)
    assert is_integer_number(digits)
    # eleventh, twelfth, thirteenth (and 111th, 212th, ...) are irregular
    if digits[-2:] in ("11", "12", "13"):
        return "th"
    suffix_by_last_digit = {"1": "st", "2": "nd", "3": "rd"}
    return suffix_by_last_digit.get(digits[-1:], "th")
1304
1305
def ngrams(txt: str, n: int):
    """Generate the ngrams of a string.

    txt is split on whitespace, the word list is handed to
    ngrams_presplit(), and every group it produces is yielded as one
    space-separated string.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    """
    for group in ngrams_presplit(txt.split(), n):
        # each word gets a trailing space; strip() drops the final one
        yield ''.join(f'{word} ' for word in group).strip()
1319
1320
def ngrams_presplit(words: Sequence[str], n: int):
    """Return the ngrams of an already-split sequence of words.

    Thin wrapper: the work is delegated to list_utils.ngrams(words, n)
    and its result is returned unchanged.
    """
    return list_utils.ngrams(words, n)
1323
1324
def bigrams(txt: str):
    """Return the 2-grams of txt; shorthand for ngrams(txt, 2)."""
    return ngrams(txt, 2)
1327
1328
def trigrams(txt: str):
    """Return the 3-grams of txt; shorthand for ngrams(txt, 3)."""
    return ngrams(txt, 3)
1331
1332
def shuffle_columns_into_list(
        input_lines: Iterable[str],
        column_specs: Iterable[Iterable[int]],
        delim=''
) -> Iterable[str]:
    """Rearrange columnar data into a list of strings.

    input_lines holds the columns and is accessed by index.  Each
    entry of column_specs is a series of column numbers; the selected
    columns are glued together with delim and become one element of
    the returned list.  Any delim characters left at either end of an
    assembled element are stripped.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']

    """
    # Every selected column is prefixed with delim; the strip() then
    # removes the leading one (and any other delim chars at the edges).
    return [
        ''.join(delim + input_lines[index] for index in spec).strip(delim)
        for spec in column_specs
    ]
1363
1364
def shuffle_columns_into_dict(
        input_lines: Iterable[str],
        column_specs: Iterable[Tuple[str, Iterable[int]]],
        delim=''
) -> Dict[str, str]:
    """Rearrange columnar data into a dict of strings.

    input_lines holds the columns and is accessed by index.  Each
    entry of column_specs pairs a key with a series of column numbers;
    the selected columns are glued together with delim and stored
    under that key.  Any delim characters left at either end of an
    assembled value are stripped.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}

    """
    result = {}
    for spec in column_specs:
        # spec is ("key", [col1, col2...])
        columns = spec[1]
        result[spec[0]] = ''.join(
            delim + input_lines[index] for index in columns
        ).strip(delim)
    return result
1393
1394
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Interpolate a string with data from a dict.

    txt is a str.format() template; every {name} placeholder is filled
    from values[name].  A placeholder with no matching key raises
    KeyError (str.format behavior).

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'

    """
    return txt.format(**values)
1404
1405
def to_ascii(x: str) -> bytes:
    """Encode as ascii bytes string.

    A str (or str subclass) is encoded with the ascii codec, so
    non-ascii content raises UnicodeEncodeError.  A bytes object is
    returned unchanged.  Anything else raises TypeError.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'

    """
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    raise TypeError('to_ascii works with strings and bytes')
1421
1422
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode txt and then encode the bytes with a 64-character
    alphabet.

    The result is a bytes object produced by base64.encodebytes, so
    it ends with a newline.  txt may also already be bytes, in which
    case encoding and errors are not used.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    """
    if isinstance(txt, (bytes, bytearray)):
        raw = bytes(txt)
    else:
        raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
1432
1433
def is_base64(txt: str) -> bool:
    """Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet).

    This is an alphabet check only: surrounding whitespace, embedded
    line breaks (as emitted by to_base64 for long inputs) and up to
    two trailing '=' padding characters are tolerated; every other
    character must belong to the base64 alphabet.  Text that is not
    ascii is never base64, so it yields False.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # Padding is ok.
    True

    """
    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = set(a.encode('ascii'))
    try:
        data = to_ascii(txt.strip())
    except UnicodeEncodeError:
        return False

    # base64.encodebytes breaks its output into lines; ignore the breaks
    data = data.replace(b'\r', b'').replace(b'\n', b'')

    # valid base64 carries at most two '=' and only at the very end
    body = data.rstrip(b'=')
    if len(data) - len(body) > 2:
        return False
    return all(byte in alphabet for byte in body)
1454
1455
def from_base64(b64: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert base64 encoded string back to normal strings.

    b64 may be given as bytes (what to_base64 returns) or as an ascii
    str; the decoded bytes are then decoded to text with encoding and
    errors.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    """
    # base64.decodebytes only accepts bytes-like objects
    raw = b64.encode('ascii') if isinstance(b64, str) else b64
    return base64.decodebytes(raw).decode(encoding, errors)
1464
1465
def chunk(txt: str, chunk_size: int) -> Iterable[str]:
    """Chunk up a string.

    Generates consecutive slices of txt that are chunk_size long.
    When len(txt) is not a multiple of chunk_size a warning is issued
    (through both the module logger and the warnings module) and the
    last slice is shorter.  Because this is a generator, the check
    runs when iteration starts, not when chunk() is called.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    """
    if len(txt) % chunk_size != 0:
        msg = f"String to chunk's length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})"
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for start in range(0, len(txt), chunk_size):
        yield txt[start:start + chunk_size]
1479
1480
def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
    """Encode txt and then chop it into bytes.  Note: only bitstrings
    with delimiter='' are interpretable by from_bitstring.

    A str is encoded using encoding and errors; bytes are used as
    they are.  Every byte becomes exactly eight binary digits (so
    leading NUL bytes are kept) and the groups are joined with
    delimiter.  Empty input gives an empty string.  Any other type
    raises TypeError.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    """
    if isinstance(txt, (bytes, bytearray)):
        raw = bytes(txt)
    elif isinstance(txt, str):
        raw = txt.encode(encoding, errors)
    else:
        raise TypeError('to_bitstring works with strings and bytes')
    # Formatting byte by byte keeps every byte 8 digits wide; going
    # through one big integer would lose leading zero bytes.
    return delimiter.join(f'{byte:08b}' for byte in raw)
1504
1505
def is_bitstring(txt: str) -> bool:
    """Is this a bitstring?

    The answer is whatever is_binary_integer_number() says about txt
    once a '0b' prefix has been put in front of it.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    """
    prefixed = f'0b{txt}'
    return is_binary_integer_number(prefixed)
1517
1518
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    bits is read as one big-endian binary number.  Whole leading
    groups of eight zeros are preserved as NUL bytes, so the output
    of to_bitstring round-trips.  An empty bitstring decodes to an
    empty string; a short all-zero bitstring decodes to a single NUL.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    """
    if not bits:
        return ''
    n = int(bits, 2)
    # The numeric value alone cannot tell how many leading zero bytes
    # there were, so also take the length of the input into account.
    byte_count = max((n.bit_length() + 7) // 8, len(bits) // 8)
    return n.to_bytes(byte_count, 'big').decode(encoding, errors) or '\0'
1528
1529
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple for sorting purposes.

    The dotted address is split on '.' and every part is converted to
    an int.  When is_ip_v4() rejects txt a warning is logged and None
    is returned.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']

    """
    if not is_ip_v4(txt):
        # report through the module logger, like the other helpers here
        logger.warning(f"not IP: {txt}")
        return None
    return tuple(int(octet) for octet in txt.split('.'))
1545
1546
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Chunk up a file path so that parent/ancestor paths sort before
    children/descendant paths.

    The path is split on '/' and the empty pieces (produced by
    leading, trailing or doubled slashes) are discarded.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']

    """
    return tuple(part for part in volume.split('/') if part)
1560
1561
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Run one str.replace() per character of replace_set, in order,
    each time substituting replacement, and return the final string.

    The replacements are applied one after another, so text inserted
    by an earlier step can be rewritten by a later one.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'

    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
1573
1574
# When the file is executed as a script, run the doctests embedded in
# this module's docstrings.
if __name__ == '__main__':
    import doctest
    doctest.testmod()