string_utils.py

   1 #!/usr/bin/env python3
   2
   3 import base64
   4 import contextlib
   5 import datetime
   6 import io
   7 from itertools import zip_longest
   8 import json
   9 import logging
  10 import numbers
  11 import random
  12 import re
  13 import string
  14 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
  15 import unicodedata
  16 from uuid import uuid4
  17
  18 import list_utils
  19
  20 logger = logging.getLogger(__name__)
  21
  22 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  23
  24 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  25
  26 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  27
  28 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  29
# Building blocks of a URL, concatenated into a single pattern.  Every piece
# is its own capturing group so the parts can be read back from a match.
# NOTE(review): in the query-string class, "%-=" is parsed as a character
# range ('%' through '='), which is what admits '&', '.', '/', ':' and the
# digits there.  Escaping that dash would change which query strings match.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Anchored form: the whole string has to be one URL (case-insensitive).
URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

# Unanchored form: matches a URL anywhere inside a larger text.
URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
  45
# "@" characters that must not be treated as the head/domain separator:
# a run of "@" that is followed, somewhere later, by a double quote, or an
# "@" preceded by a backslash.  is_email() replaces these and re-validates.
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# Local part (letters, digits and a set of punctuation), "@", then a domain
# made of lower-case letters, digits and hyphens ending in a 2-4 letter TLD.
# NOTE(review): the patterns below are compiled without re.IGNORECASE, so
# an upper-case domain does not match -- confirm this is intended.
EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"

# Anchored form: the whole string has to be one address.
EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))

# Unanchored form: matches an address anywhere inside a larger text.
EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  53
# Camel case test: letters, then at least one lower/upper or upper/lower
# transition, then letters or digits.  A leading digit cannot match.
CAMEL_CASE_TEST_RE = re.compile(
    r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
)

# A lower-case letter, or a run of upper-case letters, that sits right
# before an upper-case letter: the spots where a separator gets inserted.
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Snake case test with "_": letters (optionally followed by digits) and an
# underscore, or leading underscores followed by letters/digits.
SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

# Same test with "-" as the separator.  There is no leading "^" here, but
# is_snake_case() applies the pattern with .match(), which anchors the start.
SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

# An underscore followed by one lower-case letter or digit.
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

# A dash followed by one lower-case letter or digit.
SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
  71
# Card type name -> anchored, digits-only pattern for that card's numbers.
# The patterns check only the leading digits and the total length; they do
# not perform any checksum validation.
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),  # 4..., 13 or 16 digits
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),  # 51-55..., 16 digits
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),  # 34/37..., 15 digits
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),  # 300-305/36/38..., 14 digits
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),  # 6011/65..., 16 digits
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),  # 2131/1800 (15) or 35... (16)
}
  80
# Cheap pre-check for JSON text: ignoring surrounding whitespace, the text
# starts with "[" or "{" and ends with "}" or "]".  It does not verify that
# the opening and closing brackets are of the same kind.
JSON_WRAPPER_RE = re.compile(
    r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
)

# Canonical UUID text: 8-4-4-4-12 hex digits separated by dashes.
UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# Same layout, but every dash is optional, so the bare 32-digit hex form
# is accepted as well.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)
  93
  94 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
  95
  96 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
  97
  98 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
  99
 100 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
 101
# MAC address: six pairs of hex digits separated by ":" or "-"
# (case-insensitive).  The separators are not required to be all the same.
MAC_ADDRESS_RE = re.compile(
    r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE
)

# Unanchored variant used to locate a MAC address inside a larger text.
ANYWHERE_MAC_ADDRESS_RE = re.compile(
    r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)
 109
# One "word": a run of word characters other than "_", together with any
# non-word characters around it.  words_count() counts the matches.
WORDS_COUNT_RE = re.compile(
    r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
)

# An opening tag (optionally namespace-prefixed) plus, when present, the
# content up to a closing tag; or a comment; or a doctype declaration.
# NOTE(review): "<!--.*-->" and "<!doctype.*>" are greedy and compiled with
# DOTALL, so they extend to the last "-->" / ">" in the text.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Tags only (opening, closing, comment, doctype) without the content that
# sits between an opening and a closing tag.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# A single whitespace character.
SPACES_RE = re.compile(r"\s")

# A run of characters that are neither letters nor digits, or a run of
# underscores.
NO_LETTERS_OR_NUMBERS_RE = re.compile(
    r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
)

# Leading whitespace other than carriage return / line feed.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 131
 132 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 133
# Size suffix -> multiplier, in powers of 1024.  Each magnitude is listed
# with and without the trailing "b".  Entries go from largest to smallest;
# number_to_suffix_string() iterates in this order and stops at the first
# size that fits, so the ordering matters.
NUM_SUFFIXES = {
    "Pb": (1024 ** 5),
    "P": (1024 ** 5),
    "Tb": (1024 ** 4),
    "T": (1024 ** 4),
    "Gb": (1024 ** 3),
    "G": (1024 ** 3),
    "Mb": (1024 ** 2),
    "M": (1024 ** 2),
    "Kb": (1024 ** 1),
    "K": (1024 ** 1),
}
 146
 147
 148 def is_none_or_empty(in_str: Optional[str]) -> bool:
 149     """
 150     Returns true if the input string is either None or an empty string.
 151
 152     >>> is_none_or_empty("")
 153     True
 154     >>> is_none_or_empty(None)
 155     True
 156     >>> is_none_or_empty("   \t   ")
 157     True
 158     >>> is_none_or_empty('Test')
 159     False
 160     """
 161     return in_str is None or len(in_str.strip()) == 0
 162
 163
 164 def is_string(obj: Any) -> bool:
 165     """
 166     Checks if an object is a string.
 167
 168     >>> is_string('test')
 169     True
 170     >>> is_string(123)
 171     False
 172     >>> is_string(100.3)
 173     False
 174     >>> is_string([1, 2, 3])
 175     False
 176     """
 177     return isinstance(obj, str)
 178
 179
 180 def is_empty_string(in_str: Any) -> bool:
 181     return is_empty(in_str)
 182
 183
 184 def is_empty(in_str: Any) -> bool:
 185     """
 186     Checks if input is a string and empty or only whitespace.
 187
 188     >>> is_empty('')
 189     True
 190     >>> is_empty('    \t\t    ')
 191     True
 192     >>> is_empty('test')
 193     False
 194     >>> is_empty(100.88)
 195     False
 196     >>> is_empty([1, 2, 3])
 197     False
 198     """
 199     return is_string(in_str) and in_str.strip() == ""
 200
 201
 202 def is_full_string(in_str: Any) -> bool:
 203     """
 204     Checks that input is a string and is not empty ('') or only whitespace.
 205
 206     >>> is_full_string('test!')
 207     True
 208     >>> is_full_string('')
 209     False
 210     >>> is_full_string('      ')
 211     False
 212     >>> is_full_string(100.999)
 213     False
 214     >>> is_full_string({"a": 1, "b": 2})
 215     False
 216     """
 217     return is_string(in_str) and in_str.strip() != ""
 218
 219
 220 def is_number(in_str: str) -> bool:
 221     """
 222     Checks if a string is a valid number.
 223
 224     >>> is_number(100.5)
 225     Traceback (most recent call last):
 226     ...
 227     ValueError: 100.5
 228     >>> is_number("100.5")
 229     True
 230     >>> is_number("test")
 231     False
 232     >>> is_number("99")
 233     True
 234     >>> is_number([1, 2, 3])
 235     Traceback (most recent call last):
 236     ...
 237     ValueError: [1, 2, 3]
 238     """
 239     if not is_string(in_str):
 240         raise ValueError(in_str)
 241     return NUMBER_RE.match(in_str) is not None
 242
 243
 244 def is_integer_number(in_str: str) -> bool:
 245     """
 246     Checks whether the given string represents an integer or not.
 247
 248     An integer may be signed or unsigned or use a "scientific notation".
 249
 250     >>> is_integer_number('42')
 251     True
 252     >>> is_integer_number('42.0')
 253     False
 254     """
 255     return (
 256         (is_number(in_str) and "." not in in_str) or
 257         is_hexidecimal_integer_number(in_str) or
 258         is_octal_integer_number(in_str) or
 259         is_binary_integer_number(in_str)
 260     )
 261
 262
 263 def is_hexidecimal_integer_number(in_str: str) -> bool:
 264     """
 265     Checks whether a string is a hex integer number.
 266
 267     >>> is_hexidecimal_integer_number('0x12345')
 268     True
 269     >>> is_hexidecimal_integer_number('0x1A3E')
 270     True
 271     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 272     False
 273     >>> is_hexidecimal_integer_number('-0xff')
 274     True
 275     >>> is_hexidecimal_integer_number('test')
 276     False
 277     >>> is_hexidecimal_integer_number(12345)  # Not a string
 278     Traceback (most recent call last):
 279     ...
 280     ValueError: 12345
 281     >>> is_hexidecimal_integer_number(101.4)
 282     Traceback (most recent call last):
 283     ...
 284     ValueError: 101.4
 285     >>> is_hexidecimal_integer_number(0x1A3E)
 286     Traceback (most recent call last):
 287     ...
 288     ValueError: 6718
 289     """
 290     if not is_string(in_str):
 291         raise ValueError(in_str)
 292     return HEX_NUMBER_RE.match(in_str) is not None
 293
 294
 295 def is_octal_integer_number(in_str: str) -> bool:
 296     """
 297     Checks whether a string is an octal number.
 298
 299     >>> is_octal_integer_number('0o777')
 300     True
 301     >>> is_octal_integer_number('-0O115')
 302     True
 303     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 304     False
 305     >>> is_octal_integer_number('7777')  # Needs 0o
 306     False
 307     >>> is_octal_integer_number('test')
 308     False
 309     """
 310     if not is_string(in_str):
 311         raise ValueError(in_str)
 312     return OCT_NUMBER_RE.match(in_str) is not None
 313
 314
 315 def is_binary_integer_number(in_str: str) -> bool:
 316     """
 317     Returns whether a string contains a binary number.
 318
 319     >>> is_binary_integer_number('0b10111')
 320     True
 321     >>> is_binary_integer_number('-0b111')
 322     True
 323     >>> is_binary_integer_number('0B10101')
 324     True
 325     >>> is_binary_integer_number('0b10102')
 326     False
 327     >>> is_binary_integer_number('0xFFF')
 328     False
 329     >>> is_binary_integer_number('test')
 330     False
 331     """
 332     if not is_string(in_str):
 333         raise ValueError(in_str)
 334     return BIN_NUMBER_RE.match(in_str) is not None
 335
 336
 337 def to_int(in_str: str) -> int:
 338     """Returns the integral value of the string or raises on error.
 339
 340     >>> to_int('1234')
 341     1234
 342     >>> to_int('test')
 343     Traceback (most recent call last):
 344     ...
 345     ValueError: invalid literal for int() with base 10: 'test'
 346     """
 347     if not is_string(in_str):
 348         raise ValueError(in_str)
 349     if is_binary_integer_number(in_str):
 350         return int(in_str, 2)
 351     if is_octal_integer_number(in_str):
 352         return int(in_str, 8)
 353     if is_hexidecimal_integer_number(in_str):
 354         return int(in_str, 16)
 355     return int(in_str)
 356
 357
 358 def is_decimal_number(in_str: str) -> bool:
 359     """
 360     Checks whether the given string represents a decimal or not.
 361
 362     A decimal may be signed or unsigned or use a "scientific notation".
 363
 364     >>> is_decimal_number('42.0')
 365     True
 366     >>> is_decimal_number('42')
 367     False
 368     """
 369     return is_number(in_str) and "." in in_str
 370
 371
 372 def strip_escape_sequences(in_str: str) -> str:
 373     """
 374     Remove escape sequences in the input string.
 375
 376     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 377     'this is a test!'
 378     """
 379     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 380     return in_str
 381
 382
 383 def add_thousands_separator(
 384         in_str: str,
 385         *,
 386         separator_char = ',',
 387         places = 3
 388 ) -> str:
 389     """
 390     Add thousands separator to a numeric string.  Also handles numbers.
 391
 392     >>> add_thousands_separator('12345678')
 393     '12,345,678'
 394     >>> add_thousands_separator(12345678)
 395     '12,345,678'
 396     >>> add_thousands_separator(12345678.99)
 397     '12,345,678.99'
 398     >>> add_thousands_separator('test')
 399     Traceback (most recent call last):
 400     ...
 401     ValueError: test
 402
 403     """
 404     if isinstance(in_str, numbers.Number):
 405         in_str = f'{in_str}'
 406     if is_number(in_str):
 407         return _add_thousands_separator(
 408             in_str,
 409             separator_char = separator_char,
 410             places = places
 411         )
 412     raise ValueError(in_str)
 413
 414
 415 def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
 416     decimal_part = ""
 417     if '.' in in_str:
 418         (in_str, decimal_part) = in_str.split('.')
 419     tmp = [iter(in_str[::-1])] * places
 420     ret = separator_char.join(
 421         "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 422     if len(decimal_part) > 0:
 423         ret += '.'
 424         ret += decimal_part
 425     return ret
 426
 427
# Full url example:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 430 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 431     """
 432     Check if a string is a valid url.
 433
 434     >>> is_url('http://www.mysite.com')
 435     True
 436     >>> is_url('https://mysite.com')
 437     True
 438     >>> is_url('.mysite.com')
 439     False
 440     """
 441     if not is_full_string(in_str):
 442         return False
 443
 444     valid = URL_RE.match(in_str) is not None
 445
 446     if allowed_schemes:
 447         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 448     return valid
 449
 450
 451 def is_email(in_str: Any) -> bool:
 452     """
 453     Check if a string is a valid email.
 454
 455     Reference: https://tools.ietf.org/html/rfc3696#section-3
 456
 457     >>> is_email('[email protected]')
 458     True
 459     >>> is_email('@gmail.com')
 460     False
 461     """
 462     if (
 463         not is_full_string(in_str)
 464         or len(in_str) > 320
 465         or in_str.startswith(".")
 466     ):
 467         return False
 468
 469     try:
 470         # we expect 2 tokens, one before "@" and one after, otherwise
 471         # we have an exception and the email is not valid.
 472         head, tail = in_str.split("@")
 473
 474         # head's size must be <= 64, tail <= 255, head must not start
 475         # with a dot or contain multiple consecutive dots.
 476         if (
 477             len(head) > 64
 478             or len(tail) > 255
 479             or head.endswith(".")
 480             or (".." in head)
 481         ):
 482             return False
 483
 484         # removes escaped spaces, so that later on the test regex will
 485         # accept the string.
 486         head = head.replace("\\ ", "")
 487         if head.startswith('"') and head.endswith('"'):
 488             head = head.replace(" ", "")[1:-1]
 489         return EMAIL_RE.match(head + "@" + tail) is not None
 490
 491     except ValueError:
 492         # borderline case in which we have multiple "@" signs but the
 493         # head part is correctly escaped.
 494         if ESCAPED_AT_SIGN.search(in_str) is not None:
 495             # replace "@" with "a" in the head
 496             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 497         return False
 498
 499
 500 def suffix_string_to_number(in_str: str) -> Optional[int]:
 501     """Take a string like "33Gb" and convert it into a number (of bytes)
 502     like 34603008.  Return None if the input string is not valid.
 503
 504     >>> suffix_string_to_number('1Mb')
 505     1048576
 506     >>> suffix_string_to_number('13.1Gb')
 507     14066017894
 508     """
 509     def suffix_capitalize(s: str) -> str:
 510         if len(s) == 1:
 511             return s.upper()
 512         elif len(s) == 2:
 513             return f"{s[0].upper()}{s[1].lower()}"
 514         return suffix_capitalize(s[0:1])
 515
 516     if is_string(in_str):
 517         if is_integer_number(in_str):
 518             return to_int(in_str)
 519         suffixes = [in_str[-2:], in_str[-1:]]
 520         rest = [in_str[:-2], in_str[:-1]]
 521         for x in range(len(suffixes)):
 522             s = suffixes[x]
 523             s = suffix_capitalize(s)
 524             multiplier = NUM_SUFFIXES.get(s, None)
 525             if multiplier is not None:
 526                 r = rest[x]
 527                 if is_integer_number(r):
 528                     return to_int(r) * multiplier
 529                 if is_decimal_number(r):
 530                     return int(float(r) * multiplier)
 531     return None
 532
 533
 534 def number_to_suffix_string(num: int) -> Optional[str]:
 535     """Take a number (of bytes) and returns a string like "43.8Gb".
 536     Returns none if the input is invalid.
 537
 538     >>> number_to_suffix_string(14066017894)
 539     '13.1Gb'
 540     >>> number_to_suffix_string(1024 * 1024)
 541     '1.0Mb'
 542
 543     """
 544     d = 0.0
 545     suffix = None
 546     for (sfx, size) in NUM_SUFFIXES.items():
 547         if num >= size:
 548             d = num / size
 549             suffix = sfx
 550             break
 551     if suffix is not None:
 552         return f"{d:.1f}{suffix}"
 553     else:
 554         return f'{num:d}'
 555
 556
 557 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 558     """
 559     Checks if a string is a valid credit card number.
 560     If card type is provided then it checks against that specific type only,
 561     otherwise any known credit card number will be accepted.
 562
 563     Supported card types are the following:
 564
 565     - VISA
 566     - MASTERCARD
 567     - AMERICAN_EXPRESS
 568     - DINERS_CLUB
 569     - DISCOVER
 570     - JCB
 571     """
 572     if not is_full_string(in_str):
 573         return False
 574
 575     if card_type is not None:
 576         if card_type not in CREDIT_CARDS:
 577             raise KeyError(
 578                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 579             )
 580         return CREDIT_CARDS[card_type].match(in_str) is not None
 581     for c in CREDIT_CARDS:
 582         if CREDIT_CARDS[c].match(in_str) is not None:
 583             return True
 584     return False
 585
 586
 587 def is_camel_case(in_str: Any) -> bool:
 588     """
 589     Checks if a string is formatted as camel case.
 590
 591     A string is considered camel case when:
 592
 593     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 594     - it contains both lowercase and uppercase letters
 595     - it does not start with a number
 596     """
 597     return (
 598         is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 599     )
 600
 601
 602 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 603     """
 604     Checks if a string is formatted as "snake case".
 605
 606     A string is considered snake case when:
 607
 608     - it's composed only by lowercase/uppercase letters and digits
 609     - it contains at least one underscore (or provided separator)
 610     - it does not start with a number
 611
 612     >>> is_snake_case('this_is_a_test')
 613     True
 614     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 615     True
 616     >>> is_snake_case('this-is-a-test')
 617     False
 618     >>> is_snake_case('this-is-a-test', separator='-')
 619     True
 620
 621     """
 622     if is_full_string(in_str):
 623         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 624         re_template = (
 625             r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 626         )
 627         r = re_map.get(
 628             separator,
 629             re.compile(
 630                 re_template.format(sign=re.escape(separator)), re.IGNORECASE
 631             ),
 632         )
 633         return r.match(in_str) is not None
 634     return False
 635
 636
 637 def is_json(in_str: Any) -> bool:
 638     """
 639     Check if a string is a valid json.
 640
 641     >>> is_json('{"name": "Peter"}')
 642     True
 643     >>> is_json('[1, 2, 3]')
 644     True
 645     >>> is_json('{nope}')
 646     False
 647     """
 648     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 649         try:
 650             return isinstance(json.loads(in_str), (dict, list))
 651         except (TypeError, ValueError, OverflowError):
 652             pass
 653     return False
 654
 655
 656 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 657     """
 658     Check if a string is a valid UUID.
 659
 660     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 661     True
 662     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 663     False
 664     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 665     True
 666     """
 667     # string casting is used to allow UUID itself as input data type
 668     s = str(in_str)
 669     if allow_hex:
 670         return UUID_HEX_OK_RE.match(s) is not None
 671     return UUID_RE.match(s) is not None
 672
 673
 674 def is_ip_v4(in_str: Any) -> bool:
 675     """
 676     Checks if a string is a valid ip v4.
 677
 678     >>> is_ip_v4('255.200.100.75')
 679     True
 680     >>> is_ip_v4('nope')
 681     False
 682     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 683     False
 684     """
 685     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 686         return False
 687
 688     # checks that each entry in the ip is in the valid range (0 to 255)
 689     for token in in_str.split("."):
 690         if not 0 <= int(token) <= 255:
 691             return False
 692     return True
 693
 694
 695 def extract_ip_v4(in_str: Any) -> Optional[str]:
 696     """
 697     Extracts the IPv4 chunk of a string or None.
 698
 699     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 700     '127.0.0.1'
 701     >>> extract_ip_v4('Your mom dresses you funny.')
 702     """
 703     if not is_full_string(in_str):
 704         return None
 705     m = ANYWHERE_IP_V4_RE.search(in_str)
 706     if m is not None:
 707         return m.group(0)
 708     return None
 709
 710
 711 def is_ip_v6(in_str: Any) -> bool:
 712     """
 713     Checks if a string is a valid ip v6.
 714
 715     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 716     True
 717     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 718     False
 719     """
 720     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 721
 722
 723 def extract_ip_v6(in_str: Any) -> Optional[str]:
 724     """
 725     Extract IPv6 chunk or None.
 726
 727     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 728     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 729     >>> extract_ip_v6("(and she's ugly too, btw)")
 730     """
 731     if not is_full_string(in_str):
 732         return None
 733     m = ANYWHERE_IP_V6_RE.search(in_str)
 734     if m is not None:
 735         return m.group(0)
 736     return None
 737
 738
 739 def is_ip(in_str: Any) -> bool:
 740     """
 741     Checks if a string is a valid ip (either v4 or v6).
 742
 743     >>> is_ip('255.200.100.75')
 744     True
 745     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
 746     True
 747     >>> is_ip('1.2.3')
 748     False
 749     >>> is_ip('1.2.3.999')
 750     False
 751     """
 752     return is_ip_v6(in_str) or is_ip_v4(in_str)
 753
 754
 755 def extract_ip(in_str: Any) -> Optional[str]:
 756     """
 757     Extract the IP address or None.
 758
 759     >>> extract_ip('Attacker: 255.200.100.75')
 760     '255.200.100.75'
 761     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 762     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 763     >>> extract_ip('1.2.3')
 764
 765     """
 766     ip = extract_ip_v4(in_str)
 767     if ip is None:
 768         ip = extract_ip_v6(in_str)
 769     return ip
 770
 771
 772 def is_mac_address(in_str: Any) -> bool:
 773     """Return True if in_str is a valid MAC address false otherwise.
 774
 775     >>> is_mac_address("34:29:8F:12:0D:2F")
 776     True
 777     >>> is_mac_address('34:29:8f:12:0d:2f')
 778     True
 779     >>> is_mac_address('34-29-8F-12-0D-2F')
 780     True
 781     >>> is_mac_address("test")
 782     False
 783     """
 784     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 785
 786
 787 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 788     """
 789     Extract the MAC address from in_str.
 790
 791     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
 792     '34:29:8F:12:0D:2F'
 793
 794     >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
 795     'd8:5d:e2:34:54:86'
 796
 797     """
 798     if not is_full_string(in_str):
 799         return None
 800     in_str.strip()
 801     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
 802     if m is not None:
 803         mac = m.group(0)
 804         mac.replace(":", separator)
 805         mac.replace("-", separator)
 806         return mac
 807     return None
 808
 809
 810 def is_slug(in_str: Any, separator: str = "-") -> bool:
 811     """
 812     Checks if a given string is a slug (as created by `slugify()`).
 813
 814     >>> is_slug('my-blog-post-title')
 815     True
 816     >>> is_slug('My blog post title')
 817     False
 818
 819     """
 820     if not is_full_string(in_str):
 821         return False
 822     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 823     return re.match(rex, in_str) is not None
 824
 825
 826 def contains_html(in_str: str) -> bool:
 827     """
 828     Checks if the given string contains HTML/XML tags.
 829
 830     By design, this function matches ANY type of tag, so don't expect to use it
 831     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 832
 833     >>> contains_html('my string is <strong>bold</strong>')
 834     True
 835     >>> contains_html('my string is not bold')
 836     False
 837
 838     """
 839     if not is_string(in_str):
 840         raise ValueError(in_str)
 841     return HTML_RE.search(in_str) is not None
 842
 843
 844 def words_count(in_str: str) -> int:
 845     """
 846     Returns the number of words contained into the given string.
 847
 848     This method is smart, it does consider only sequence of one or more letter and/or numbers
 849     as "words", so a string like this: "! @ # % ... []" will return zero!
 850     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 851     will be 4 not 1 (even if there are no spaces in the string).
 852
 853     >>> words_count('hello world')
 854     2
 855     >>> words_count('one,two,three.stop')
 856     4
 857
 858     """
 859     if not is_string(in_str):
 860         raise ValueError(in_str)
 861     return len(WORDS_COUNT_RE.findall(in_str))
 862
 863
 864 def generate_uuid(as_hex: bool = False) -> str:
 865     """
 866     Generated an UUID string (using `uuid.uuid4()`).
 867
 868     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 869     generate_uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 870
 871     """
 872     uid = uuid4()
 873     if as_hex:
 874         return uid.hex
 875     return str(uid)
 876
 877
 878 def generate_random_alphanumeric_string(size: int) -> str:
 879     """
 880     Returns a string of the specified size containing random
 881     characters (uppercase/lowercase ascii letters and digits).
 882
 883     random_string(9) # possible output: "cx3QQbzYg"
 884
 885     """
 886     if size < 1:
 887         raise ValueError("size must be >= 1")
 888     chars = string.ascii_letters + string.digits
 889     buffer = [random.choice(chars) for _ in range(size)]
 890     return from_char_list(buffer)
 891
 892
 893 def reverse(in_str: str) -> str:
 894     """
 895     Returns the string with its chars reversed.
 896
 897     >>> reverse('test')
 898     'tset'
 899
 900     """
 901     if not is_string(in_str):
 902         raise ValueError(in_str)
 903     return in_str[::-1]
 904
 905
 906 def camel_case_to_snake_case(in_str, *, separator="_"):
 907     """
 908     Convert a camel case string into a snake case one.
 909     (The original string is returned if is not a valid camel case string)
 910
 911     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
 912     'mac_address_extractor_factory'
 913     >>> camel_case_to_snake_case('Luke Skywalker')
 914     'Luke Skywalker'
 915     """
 916     if not is_string(in_str):
 917         raise ValueError(in_str)
 918     if not is_camel_case(in_str):
 919         return in_str
 920     return CAMEL_CASE_REPLACE_RE.sub(
 921         lambda m: m.group(1) + separator, in_str
 922     ).lower()
 923
 924
 925 def snake_case_to_camel_case(
 926     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 927 ) -> str:
 928     """
 929     Convert a snake case string into a camel case one.
 930     (The original string is returned if is not a valid snake case string)
 931
 932     >>> snake_case_to_camel_case('this_is_a_test')
 933     'ThisIsATest'
 934     >>> snake_case_to_camel_case('Han Solo')
 935     'Han Solo'
 936     """
 937     if not is_string(in_str):
 938         raise ValueError(in_str)
 939     if not is_snake_case(in_str, separator=separator):
 940         return in_str
 941     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 942     if not upper_case_first:
 943         tokens[0] = tokens[0].lower()
 944     return from_char_list(tokens)
 945
 946
 947 def to_char_list(in_str: str) -> List[str]:
 948     """Convert a string into a list of chars.
 949
 950     >>> to_char_list('test')
 951     ['t', 'e', 's', 't']
 952     """
 953     if not is_string(in_str):
 954         return []
 955     return list(in_str)
 956
 957
 958 def from_char_list(in_list: List[str]) -> str:
 959     """Convert a char list into a string.
 960
 961     >>> from_char_list(['t', 'e', 's', 't'])
 962     'test'
 963     """
 964     return "".join(in_list)
 965
 966
 967 def shuffle(in_str: str) -> str:
 968     """Return a new string containing same chars of the given one but in
 969     a randomized order.
 970     """
 971     if not is_string(in_str):
 972         raise ValueError(in_str)
 973
 974     # turn the string into a list of chars
 975     chars = to_char_list(in_str)
 976     random.shuffle(chars)
 977     return from_char_list(chars)
 978
 979
 980 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 981     """
 982     Remove html code contained into the given string.
 983
 984     >>> strip_html('test: <a href="foo/bar">click here</a>')
 985     'test: '
 986     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
 987     'test: click here'
 988     """
 989     if not is_string(in_str):
 990         raise ValueError(in_str)
 991     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
 992     return r.sub("", in_str)
 993
 994
 995 def asciify(in_str: str) -> str:
 996     """
 997     Force string content to be ascii-only by translating all non-ascii
 998     chars into the closest possible representation (eg: ó -> o, Ë ->
 999     E, ç -> c...).
1000
1001     N.B. Some chars may be lost if impossible to translate.
1002
1003     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
1004     'eeuuooaaeynAAACIINOE'
1005     """
1006     if not is_string(in_str):
1007         raise ValueError(in_str)
1008
1009     # "NFKD" is the algorithm which is able to successfully translate
1010     # the most of non-ascii chars.
1011     normalized = unicodedata.normalize("NFKD", in_str)
1012
1013     # encode string forcing ascii and ignore any errors
1014     # (unrepresentable chars will be stripped out)
1015     ascii_bytes = normalized.encode("ascii", "ignore")
1016
1017     # turns encoded bytes into an utf-8 string
1018     return ascii_bytes.decode("utf-8")
1019
1020
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Converts a string into a "slug" using provided separator.
    The returned string has the following properties:

    - it has no spaces
    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using provided separator
    - all chars are encoded as ascii (by using `asciify()`)
    - is safe for URL

    The separator is always treated as literal text: it may be longer
    than one character, may contain regex or backslash characters, and
    may be empty (in which case the words are simply run together).

    Raises ValueError if in_str is rejected by is_string().

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # A callable replacement inserts the separator verbatim; a plain
    # replacement string would have its backslashes interpreted as
    # regex escapes / group references.
    def literal_separator(_match) -> str:
        return separator

    # replace spaces with join sign
    out = SPACES_RE.sub(literal_separator, out)

    # normalize joins (remove duplicates).  The separator is wrapped in a
    # group so that "+" repeats the WHOLE separator rather than only its
    # last character; an empty separator has nothing to de-duplicate (and
    # would otherwise yield an invalid pattern).
    if separator:
        duplicates = r"(?:{})+".format(re.escape(separator))
        out = re.sub(duplicates, literal_separator, out)
    return asciify(out)
1050
1051
def to_bool(in_str: str) -> bool:
    """
    Interpret a string as a boolean (CASE INSENSITIVE).

    True is returned when the lower-cased string is exactly one of:

    - "true"
    - "1"
    - "yes"
    - "y"
    - "t"
    - "on"

    Any other string yields False.  Raises ValueError if in_str is
    rejected by is_string().

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True

    """
    if is_string(in_str):
        truthy = ("true", "1", "yes", "y", "t", "on")
        return in_str.lower() in truthy
    raise ValueError(in_str)
1088
1089
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Parse a date string with DateParser (see the DateParser docs for the
    accepted formats).

    Returns the value of DateParser.get_date() after a successful parse,
    or None (after logging a warning) when dp.ParseException is raised.
    """
    import dateparse.dateparse_utils as dp
    result: Optional[datetime.date] = None
    try:
        parser = dp.DateParser()
        parser.parse(in_str)
        result = parser.get_date()
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
    return result
1102
1103
def valid_date(in_str: str) -> bool:
    """
    Report whether DateParser can parse the string.

    Returns True when DateParser.parse() completes, False (after logging
    a warning) when it raises dp.ParseException.
    """
    import dateparse.dateparse_utils as dp
    parsed_ok = False
    try:
        parser = dp.DateParser()
        parser.parse(in_str)
        parsed_ok = True
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
    return parsed_ok
1116
1117
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Parses a datetime string.  See DateParser docs for more info.

    Returns the parsed value when DateParser.parse() produces a
    datetime.datetime.  Returns None when parsing fails (a warning is
    logged) or when the parse result is not a datetime.datetime.
    """
    import dateparse.dateparse_utils as dp
    try:
        d = dp.DateParser()
        dt = d.parse(in_str)
    except (ValueError, dp.ParseException):
        # The sibling helpers to_date()/valid_date() treat
        # dp.ParseException as the parser's failure signal; handle it here
        # too so a bad string yields None instead of propagating.
        logger.warning(f'Unable to parse datetime {in_str}.')
        return None
    if isinstance(dt, datetime.datetime):
        return dt
    return None
1131
1132
def valid_datetime(in_str: str) -> bool:
    """
    Report whether to_datetime() can turn the string into a datetime.

    Returns True when to_datetime() yields a value; otherwise logs a
    warning and returns False.
    """
    if to_datetime(in_str) is None:
        logger.warning(f'Unable to parse datetime {in_str}.')
        return False
    return True
1142
1143
def dedent(in_str: str) -> str:
    """
    Strip the leading margin from every line of a multi line string
    (inspired by the analogous Scala function).

    The text is split on newlines, whatever MARGIN_RE matches is removed
    from each line, and the lines are joined back with newlines.

    Raises ValueError if in_str is rejected by is_string().
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    cleaned = []
    for line in in_str.split(newline):
        cleaned.append(MARGIN_RE.sub('', line))
    return newline.join(cleaned)
1153
1154
def indent(in_str: str, amount: int) -> str:
    """
    Prefix every newline-separated line of in_str with amount spaces.

    Raises ValueError if in_str is rejected by is_string().

    >>> indent('This is a test', 4)
    '    This is a test'

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    padding = " " * amount
    return '\n'.join(padding + line for line in in_str.split('\n'))
1168
1169
def sprintf(*args, **kwargs) -> str:
    """String printf, like in C.

    Builds the text that print(*args, sep=sep, end=end) would write and
    returns it instead of writing it anywhere.

    Args:
        *args: values to render; non-string values are converted with str().
        sep: keyword-only; text placed between values (default: one space).
        end: keyword-only; text appended after the last value (default:
            a newline).

    Raises:
        TypeError: if sep or end is neither None nor a string, or if any
            other keyword argument is supplied.

    >>> sprintf('a', 1, sep='-', end='')
    'a-1'
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # Anything left over is an unsupported keyword.
    if kwargs:
        raise TypeError("invalid keyword arguments to sprintf()")

    if sep is None:
        sep = " "
    if end is None:
        end = "\n"

    # Strings are used as-is; everything else goes through str().
    pieces = [arg if isinstance(arg, str) else str(arg) for arg in args]
    return sep.join(pieces) + end
1200
1201
class SprintfStdout(object):
    """
    Context manager that collects everything written to stdout inside
    its with-block into an in-memory buffer.

    The value bound by "as" is a zero-argument callable returning the
    text captured so far:

    with SprintfStdout() as buf:
        print("test")
    print(buf())

    'test\n'
    """
    def __init__(self) -> None:
        # In-memory sink that receives the redirected output.
        self.destination = io.StringIO()
        # The active contextlib.redirect_stdout object; set by __enter__.
        self.recorder = None

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *args) -> None:
        # Undo the redirection, then rewind the buffer.  Falling off the
        # end returns None, so exceptions from the with-block propagate.
        self.recorder.__exit__(*args)
        self.destination.seek(0)
1225
1226
def is_are(n: int) -> str:
    """Pick the verb form matching a count: 'is' for exactly 1, else 'are'.

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
1239
1240
def pluralize(n: int) -> str:
    """Return the plural suffix for a count: '' for exactly 1, else 's'.

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.

    """
    return "" if n == 1 else "s"
1257
1258
def thify(n: int) -> str:
    """Return the proper English ordinal suffix for a number.

    Numbers ending in 1, 2 or 3 take 'st', 'nd' and 'rd' respectively,
    except when they end in 11, 12 or 13, which (like everything else)
    take 'th'.

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'

    """
    digits = str(n)
    assert is_integer_number(digits)

    # The "teens" are irregular: 11th, 12th, 13th (also 111th, 212th, ...).
    if digits[-2:] in ("11", "12", "13"):
        return "th"

    suffix_by_last_digit = {"1": "st", "2": "nd", "3": "rd"}
    return suffix_by_last_digit.get(digits[-1:], "th")
1281
1282
def ngrams(txt: str, n: int):
    """Generate the word n-grams of a string.

    txt is split on whitespace; each group produced by ngrams_presplit()
    is yielded as its words separated by single spaces.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    """
    tokens = txt.split()
    for gram in ngrams_presplit(tokens, n):
        # Follow every word with a space, then trim the surplus whitespace.
        yield ''.join(f'{word} ' for word in gram).strip()
1296
1297
def ngrams_presplit(words: Sequence[str], n: int):
    """Return list_utils.ngrams(words, n) for an already-split word sequence."""
    grams = list_utils.ngrams(words, n)
    return grams
1300
1301
def bigrams(txt: str):
    """Shorthand for ngrams(txt, 2)."""
    size = 2
    return ngrams(txt, size)
1304
1305
def trigrams(txt: str):
    """Shorthand for ngrams(txt, 3)."""
    size = 3
    return ngrams(txt, size)
1308
1309
def shuffle_columns_into_list(
        input_lines: Sequence[str],
        column_specs: Iterable[Iterable[int]],
        delim=''
) -> List[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.  The column_specs argument is an iterable collection of
    numeric sequences that indicate one or more column numbers to
    copy.

    Args:
        input_lines: the column values; indexed by the numbers in
            column_specs, so it must support indexing.
        column_specs: one iterable of column indexes per output item.
        delim: text placed between the columns merged into one item.

    Returns:
        One string per spec: the selected columns joined with delim.
        Column values are copied verbatim, even when they begin or end
        with characters that also appear in delim.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']

    """
    out = []

    # Column specs map input lines' columns into outputs.
    # [col1, col2...]
    for spec in column_specs:
        # join() puts delim only BETWEEN columns, so nothing needs to be
        # stripped afterwards (stripping would eat data characters).
        out.append(delim.join(input_lines[n] for n in spec))
    return out
1340
1341
def shuffle_columns_into_dict(
        input_lines: Sequence[str],
        column_specs: Iterable[Tuple[str, Iterable[int]]],
        delim=''
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: the column values; indexed by the numbers in
            column_specs, so it must support indexing.
        column_specs: (key, column indexes) pairs, one per output entry.
        delim: text placed between the columns merged into one value.

    Returns:
        A dict mapping each key to its selected columns joined with
        delim.  Column values are copied verbatim, even when they begin
        or end with characters that also appear in delim.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}

    """
    out = {}

    # Column specs map input lines' columns into outputs.
    # "key", [col1, col2...]
    for spec in column_specs:
        # join() puts delim only BETWEEN columns, so nothing needs to be
        # stripped afterwards (stripping would eat data characters).
        out[spec[0]] = delim.join(input_lines[n] for n in spec[1])
    return out
1370
1371
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the {name} placeholders of txt from the entries of values.

    The substitution is done by str.format(**values); the result is then
    passed through sprintf() with an empty end string.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'

    """
    rendered = txt.format(**values)
    return sprintf(rendered, end='')
1381
1382
def to_ascii(x: str) -> bytes:
    """Encode as ascii bytes string.

    Accepts either a str (which is encoded as ascii) or bytes (which are
    returned unchanged).  Subclasses of str and bytes are accepted too.

    Raises:
        UnicodeEncodeError: if a str argument contains non-ascii chars.
        TypeError: if x is neither a str nor bytes.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'

    """
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    raise TypeError('to_ascii works with strings and bytes')
1398
1399
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode txt and then encode the bytes with a 64-character
    alphabet.  This is compatible with uudecode.

    Args:
        txt: the text to encode.
        encoding: codec used to turn txt into bytes first.
        errors: error handler passed to str.encode().

    Returns:
        The base64 data as bytes (not str), exactly as produced by
        base64.encodebytes(): non-empty output ends with a newline.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
1409
1410
def is_base64(txt: str) -> bool:
    """Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet which is the same as what uuencode uses).

    Accepts str or bytes.  Surrounding whitespace and line breaks inside
    the data are ignored, and up to two '=' padding characters are
    allowed at the very end.  Text that cannot be encoded as ascii is
    never base64, so it yields False rather than an exception.  Note
    that this only checks the characters used; it does not validate the
    overall length.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # Trailing padding is ok.
    True

    """
    trimmed = txt.strip()
    if isinstance(trimmed, str):
        try:
            raw = trimmed.encode('ascii')
        except UnicodeEncodeError:
            return False
    else:
        raw = bytes(trimmed)

    # Encoders such as base64.encodebytes() wrap long output over several
    # lines; the line breaks are not part of the payload.
    raw = raw.replace(b'\r', b'').replace(b'\n', b'')

    # '=' is only legal as padding: at most two, at the end, and only
    # after at least one real character.
    payload = raw.rstrip(b'=')
    padding = len(raw) - len(payload)
    if padding > 2 or (padding and not payload):
        return False

    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = frozenset(a.encode('ascii'))
    return all(char in alphabet for char in payload)
1431
1432
def from_base64(b64: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert base64 encoded string back to normal strings.

    Args:
        b64: the base64 data, as bytes (e.g. the output of to_base64())
            or as an ascii-only str.
        encoding: codec used to decode the resulting bytes into a str.
        errors: error handler passed to bytes.decode().

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    >>> from_base64('aGVsbG8/')
    'hello?'

    """
    # base64.decodebytes() only accepts bytes-like input; base64 text is
    # pure ascii, so a str can be converted losslessly first.
    if isinstance(b64, str):
        b64 = b64.encode('ascii')
    return base64.decodebytes(b64).decode(encoding, errors)
1441
1442
def chunk(txt: str, chunk_size):
    """Chunk up a string.

    Generator yielding consecutive slices of txt, each chunk_size long;
    the last slice is shorter when len(txt) is not a multiple of
    chunk_size (a warning is logged in that case, once iteration starts).

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    """
    length = len(txt)
    if length % chunk_size != 0:
        logger.warning(
            f"String to chunk's length ({length}) is not an even multiple of chunk_size ({chunk_size})")
    for start in range(0, length, chunk_size):
        yield txt[start:start + chunk_size]
1455
1456
def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
    """Encode txt and then chop it into bytes.  Note: only bitstrings
    with delimiter='' are interpretable by from_bitstring.

    Args:
        txt: text to convert; bytes are accepted too and used as-is.
        delimiter: text placed between the 8-bit groups.
        encoding: codec used to turn a str into bytes.
        errors: error handler passed to str.encode().

    Returns:
        Eight '0'/'1' characters per input byte, including leading zero
        bytes.  Empty input yields a single zero byte ('00000000').

    Raises:
        TypeError: if txt is neither a str nor bytes.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    """
    if isinstance(txt, str):
        data = txt.encode(encoding, errors)
    elif isinstance(txt, (bytes, bytearray)):
        data = bytes(txt)
    else:
        raise TypeError('to_bitstring works with strings and bytes')

    # Historical behaviour: nothing at all is rendered as one zero byte.
    if not data:
        data = b'\x00'

    # Formatting byte by byte keeps every byte exactly 8 bits wide, so
    # leading zero bytes are not lost.
    return delimiter.join(format(byte, '08b') for byte in data)
1480
1481
def is_bitstring(txt: str) -> bool:
    """Report whether txt is a bitstring.

    The check prefixes txt with '0b' and asks
    is_binary_integer_number() whether the result is a binary literal.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    """
    candidate = '0b{}'.format(txt)
    return is_binary_integer_number(candidate)
1493
1494
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    Args:
        bits: a string of '0'/'1' characters, 8 per byte, with no
            delimiter.
        encoding: codec used to decode the resulting bytes.
        errors: error handler passed to bytes.decode().

    Leading zero bits are significant: every 8 characters of bits
    produce one byte, so text starting with NUL bytes is preserved.

    Raises:
        ValueError: if bits is not a valid base-2 number (e.g. empty).

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    """
    n = int(bits, 2)

    # Work out how many digits were supplied.  int() tolerates a few
    # decorations (surrounding whitespace, underscores, a sign, a "0b"
    # prefix) which must not be counted as digits.
    if isinstance(bits, (bytes, bytearray)):
        bits = bits.decode('ascii')
    digits = bits.strip().replace('_', '').lstrip('+-')
    if digits[:2] in ('0b', '0B'):
        digits = digits[2:]

    # Size the output from the digit count rather than from
    # n.bit_length(), which would discard leading zero bytes.
    byte_count = max((len(digits) + 7) // 8, (n.bit_length() + 7) // 8)
    return n.to_bytes(byte_count, 'big').decode(encoding, errors) or '\0'
1504
1505
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple for sorting purposes.

    Returns:
        The dot-separated parts of txt as a tuple of ints, or None (after
        logging a warning) when is_ip_v4() rejects txt.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']

    """
    if not is_ip_v4(txt):
        # Report through the module logger, like the rest of this file,
        # instead of writing to stdout.
        logger.warning(f"not IP: {txt}")
        return None
    return tuple(int(octet) for octet in txt.split('.'))
1521
1522
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Chunk up a file path so that parent/ancestor paths sort before
    children/descendant paths.

    The path is split on '/' and empty components (from leading,
    trailing or doubled slashes) are dropped.  A path's tuple is a
    prefix of the tuples of everything beneath it, and a prefix sorts
    first.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']

    """
    return tuple(part for part in volume.split('/') if part)
1536
1537
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    import doctest
    doctest.testmod()