string_utils.py

   1 #!/usr/bin/env python3
   2
   3 import base64
   4 import contextlib
   5 import datetime
   6 import io
   7 from itertools import zip_longest
   8 import json
   9 import logging
  10 import numbers
  11 import random
  12 import re
  13 import string
  14 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
  15 import unicodedata
  16 from uuid import uuid4
  17
  18 import list_utils
  19
  20 logger = logging.getLogger(__name__)
  21
  22 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  23
  24 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  25
  26 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  27
  28 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  29
# Pattern fragments for a URL, concatenated in the order the parts appear.
# Every part except the scheme and the domain is optional.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    # NOTE(review): inside the class below "%-=" parses as a character range
    # ('%' through '='), which is what admits '&' and '=' in query strings;
    # confirm that this is intentional before touching it.
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Anchored variant: the whole string must be a single URL (case-insensitive).
URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

# Unanchored variant wrapped in one extra capturing group, for locating URLs
# inside longer text (case-insensitive).
URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)

# Matches a run of "@" that is followed, later in the string, by a double
# quote, or a single backslash-escaped "@".  is_email() uses it to neutralize
# extra "@" signs before re-validating an address.
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# Local part (letters, digits and a set of punctuation), "@", then a domain.
# Compiled without re.IGNORECASE below, so the domain part only accepts
# lowercase letters, and the final label is limited to 2-4 letters.
EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"

# Anchored variant: the whole string must be one email address.
EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))

# Unanchored variant wrapped in a capturing group, for finding addresses
# inside longer text.
EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  53
# Whole-string camel case test: ASCII letters first, at least one
# lower->upper or upper->lower transition, then letters and digits only.
CAMEL_CASE_TEST_RE = re.compile(
    r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
)

# Captures a lowercase letter or a run of uppercase letters that is
# immediately followed by an uppercase letter, i.e. a camel case word boundary.
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Whole-string snake case test with "_" as separator (case-insensitive):
# either letters, optional digits and an underscore followed by more
# letters/digits/underscores, or leading underscores followed by content.
SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

# Same test with "-" as separator.
# NOTE(review): unlike SNAKE_CASE_TEST_RE this pattern has no leading "^";
# it is only anchored at the start when used through .match().
SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

# Captures an underscore and the lowercase letter or digit that follows it.
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

# Captures a dash and the lowercase letter or digit that follows it.
SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")

# Per-brand card number patterns keyed by the card type names accepted by
# is_credit_card().  They match bare digit strings only (no spaces or dashes)
# and check prefix and length; no checksum is computed by these patterns.
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}

# Cheap pre-check used before parsing JSON: the text, ignoring surrounding
# whitespace, starts with "[" or "{" and ends with "]" or "}".  The opening
# and closing brackets are not required to be of the same kind.
JSON_WRAPPER_RE = re.compile(
    r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
)

# Canonical dashed UUID layout: 8-4-4-4-12 hex digits (case-insensitive).
UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# Same layout with every dash optional, so the plain 32-digit hex form is
# accepted as well.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)
  93
  94 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
  95
  96 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
  97
  98 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
  99
 100 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
 101
 102 MAC_ADDRESS_RE = re.compile(
 103     r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE
 104 )
 105
 106 ANYWHERE_MAC_ADDRESS_RE = re.compile(
 107     r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
 108 )
 109
# One "word" for words_count(): a run of letters/digits (word characters
# other than "_"), together with any surrounding non-word characters.
WORDS_COUNT_RE = re.compile(
    r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
)

# An opening tag (optionally namespaced) with, when present, everything up to
# a closing tag; or an HTML comment; or a doctype declaration.  Used both to
# detect markup and to strip tags together with their content.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Opening tags, closing tags, comments and doctype declarations on their own,
# without the text between tags.  Used to strip tags while keeping content.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# A single whitespace character.
SPACES_RE = re.compile(r"\s")

# A run of characters that are not word characters, or a run of underscores.
NO_LETTERS_OR_NUMBERS_RE = re.compile(
    r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
)

# Leading whitespace at the start of the string, excluding CR and LF
# (re.MULTILINE is not set, so "^" only matches at the very beginning).
MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 131
 132 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 133
 134 NUM_SUFFIXES = {
 135     "Pb": (1024 ** 5),
 136     "P": (1024 ** 5),
 137     "Tb": (1024 ** 4),
 138     "T": (1024 ** 4),
 139     "Gb": (1024 ** 3),
 140     "G": (1024 ** 3),
 141     "Mb": (1024 ** 2),
 142     "M": (1024 ** 2),
 143     "Kb": (1024 ** 1),
 144     "K": (1024 ** 1),
 145 }
 146
 147
 148 def is_none_or_empty(in_str: Optional[str]) -> bool:
 149     """
 150     Returns true if the input string is either None or an empty string.
 151
 152     >>> is_none_or_empty("")
 153     True
 154     >>> is_none_or_empty(None)
 155     True
 156     >>> is_none_or_empty("   \t   ")
 157     True
 158     >>> is_none_or_empty('Test')
 159     False
 160     """
 161     return in_str is None or len(in_str.strip()) == 0
 162
 163
 164 def is_string(obj: Any) -> bool:
 165     """
 166     Checks if an object is a string.
 167
 168     >>> is_string('test')
 169     True
 170     >>> is_string(123)
 171     False
 172     >>> is_string(100.3)
 173     False
 174     >>> is_string([1, 2, 3])
 175     False
 176     """
 177     return isinstance(obj, str)
 178
 179
 180 def is_empty_string(in_str: Any) -> bool:
 181     return is_empty(in_str)
 182
 183
 184 def is_empty(in_str: Any) -> bool:
 185     """
 186     Checks if input is a string and empty or only whitespace.
 187
 188     >>> is_empty('')
 189     True
 190     >>> is_empty('    \t\t    ')
 191     True
 192     >>> is_empty('test')
 193     False
 194     >>> is_empty(100.88)
 195     False
 196     >>> is_empty([1, 2, 3])
 197     False
 198     """
 199     return is_string(in_str) and in_str.strip() == ""
 200
 201
 202 def is_full_string(in_str: Any) -> bool:
 203     """
 204     Checks that input is a string and is not empty ('') or only whitespace.
 205
 206     >>> is_full_string('test!')
 207     True
 208     >>> is_full_string('')
 209     False
 210     >>> is_full_string('      ')
 211     False
 212     >>> is_full_string(100.999)
 213     False
 214     >>> is_full_string({"a": 1, "b": 2})
 215     False
 216     """
 217     return is_string(in_str) and in_str.strip() != ""
 218
 219
 220 def is_number(in_str: str) -> bool:
 221     """
 222     Checks if a string is a valid number.
 223
 224     >>> is_number(100.5)
 225     Traceback (most recent call last):
 226     ...
 227     ValueError: 100.5
 228     >>> is_number("100.5")
 229     True
 230     >>> is_number("test")
 231     False
 232     >>> is_number("99")
 233     True
 234     >>> is_number([1, 2, 3])
 235     Traceback (most recent call last):
 236     ...
 237     ValueError: [1, 2, 3]
 238     """
 239     if not is_string(in_str):
 240         raise ValueError(in_str)
 241     return NUMBER_RE.match(in_str) is not None
 242
 243
 244 def is_integer_number(in_str: str) -> bool:
 245     """
 246     Checks whether the given string represents an integer or not.
 247
 248     An integer may be signed or unsigned or use a "scientific notation".
 249
 250     >>> is_integer_number('42')
 251     True
 252     >>> is_integer_number('42.0')
 253     False
 254     """
 255     return (
 256         (is_number(in_str) and "." not in in_str) or
 257         is_hexidecimal_integer_number(in_str) or
 258         is_octal_integer_number(in_str) or
 259         is_binary_integer_number(in_str)
 260     )
 261
 262
 263 def is_hexidecimal_integer_number(in_str: str) -> bool:
 264     """
 265     Checks whether a string is a hex integer number.
 266
 267     >>> is_hexidecimal_integer_number('0x12345')
 268     True
 269     >>> is_hexidecimal_integer_number('0x1A3E')
 270     True
 271     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 272     False
 273     >>> is_hexidecimal_integer_number('-0xff')
 274     True
 275     >>> is_hexidecimal_integer_number('test')
 276     False
 277     >>> is_hexidecimal_integer_number(12345)  # Not a string
 278     Traceback (most recent call last):
 279     ...
 280     ValueError: 12345
 281     >>> is_hexidecimal_integer_number(101.4)
 282     Traceback (most recent call last):
 283     ...
 284     ValueError: 101.4
 285     >>> is_hexidecimal_integer_number(0x1A3E)
 286     Traceback (most recent call last):
 287     ...
 288     ValueError: 6718
 289     """
 290     if not is_string(in_str):
 291         raise ValueError(in_str)
 292     return HEX_NUMBER_RE.match(in_str) is not None
 293
 294
 295 def is_octal_integer_number(in_str: str) -> bool:
 296     """
 297     Checks whether a string is an octal number.
 298
 299     >>> is_octal_integer_number('0o777')
 300     True
 301     >>> is_octal_integer_number('-0O115')
 302     True
 303     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 304     False
 305     >>> is_octal_integer_number('7777')  # Needs 0o
 306     False
 307     >>> is_octal_integer_number('test')
 308     False
 309     """
 310     if not is_string(in_str):
 311         raise ValueError(in_str)
 312     return OCT_NUMBER_RE.match(in_str) is not None
 313
 314
 315 def is_binary_integer_number(in_str: str) -> bool:
 316     """
 317     Returns whether a string contains a binary number.
 318
 319     >>> is_binary_integer_number('0b10111')
 320     True
 321     >>> is_binary_integer_number('-0b111')
 322     True
 323     >>> is_binary_integer_number('0B10101')
 324     True
 325     >>> is_binary_integer_number('0b10102')
 326     False
 327     >>> is_binary_integer_number('0xFFF')
 328     False
 329     >>> is_binary_integer_number('test')
 330     False
 331     """
 332     if not is_string(in_str):
 333         raise ValueError(in_str)
 334     return BIN_NUMBER_RE.match(in_str) is not None
 335
 336
 337 def to_int(in_str: str) -> int:
 338     """Returns the integral value of the string or raises on error.
 339
 340     >>> to_int('1234')
 341     1234
 342     >>> to_int('test')
 343     Traceback (most recent call last):
 344     ...
 345     ValueError: invalid literal for int() with base 10: 'test'
 346     """
 347     if not is_string(in_str):
 348         raise ValueError(in_str)
 349     if is_binary_integer_number(in_str):
 350         return int(in_str, 2)
 351     if is_octal_integer_number(in_str):
 352         return int(in_str, 8)
 353     if is_hexidecimal_integer_number(in_str):
 354         return int(in_str, 16)
 355     return int(in_str)
 356
 357
 358 def is_decimal_number(in_str: str) -> bool:
 359     """
 360     Checks whether the given string represents a decimal or not.
 361
 362     A decimal may be signed or unsigned or use a "scientific notation".
 363
 364     >>> is_decimal_number('42.0')
 365     True
 366     >>> is_decimal_number('42')
 367     False
 368     """
 369     return is_number(in_str) and "." in in_str
 370
 371
 372 def strip_escape_sequences(in_str: str) -> str:
 373     """
 374     Remove escape sequences in the input string.
 375
 376     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 377     'this is a test!'
 378     """
 379     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 380     return in_str
 381
 382
 383 def add_thousands_separator(
 384         in_str: str,
 385         *,
 386         separator_char = ',',
 387         places = 3
 388 ) -> str:
 389     """
 390     Add thousands separator to a numeric string.  Also handles numbers.
 391
 392     >>> add_thousands_separator('12345678')
 393     '12,345,678'
 394     >>> add_thousands_separator(12345678)
 395     '12,345,678'
 396     >>> add_thousands_separator(12345678.99)
 397     '12,345,678.99'
 398     >>> add_thousands_separator('test')
 399     Traceback (most recent call last):
 400     ...
 401     ValueError: test
 402
 403     """
 404     if isinstance(in_str, numbers.Number):
 405         in_str = f'{in_str}'
 406     if is_number(in_str):
 407         return _add_thousands_separator(
 408             in_str,
 409             separator_char = separator_char,
 410             places = places
 411         )
 412     raise ValueError(in_str)
 413
 414
 415 def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
 416     decimal_part = ""
 417     if '.' in in_str:
 418         (in_str, decimal_part) = in_str.split('.')
 419     tmp = [iter(in_str[::-1])] * places
 420     ret = separator_char.join(
 421         "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 422     if len(decimal_part) > 0:
 423         ret += '.'
 424         ret += decimal_part
 425     return ret
 426
 427
 428 # Full url example:
 429 # scheme://username:[email protected]:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 430 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 431     """
 432     Check if a string is a valid url.
 433
 434     >>> is_url('http://www.mysite.com')
 435     True
 436     >>> is_url('https://mysite.com')
 437     True
 438     >>> is_url('.mysite.com')
 439     False
 440     """
 441     if not is_full_string(in_str):
 442         return False
 443
 444     valid = URL_RE.match(in_str) is not None
 445
 446     if allowed_schemes:
 447         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 448     return valid
 449
 450
 451 def is_email(in_str: Any) -> bool:
 452     """
 453     Check if a string is a valid email.
 454
 455     Reference: https://tools.ietf.org/html/rfc3696#section-3
 456
 457     >>> is_email('[email protected]')
 458     True
 459     >>> is_email('@gmail.com')
 460     False
 461     """
 462     if (
 463         not is_full_string(in_str)
 464         or len(in_str) > 320
 465         or in_str.startswith(".")
 466     ):
 467         return False
 468
 469     try:
 470         # we expect 2 tokens, one before "@" and one after, otherwise
 471         # we have an exception and the email is not valid.
 472         head, tail = in_str.split("@")
 473
 474         # head's size must be <= 64, tail <= 255, head must not start
 475         # with a dot or contain multiple consecutive dots.
 476         if (
 477             len(head) > 64
 478             or len(tail) > 255
 479             or head.endswith(".")
 480             or (".." in head)
 481         ):
 482             return False
 483
 484         # removes escaped spaces, so that later on the test regex will
 485         # accept the string.
 486         head = head.replace("\\ ", "")
 487         if head.startswith('"') and head.endswith('"'):
 488             head = head.replace(" ", "")[1:-1]
 489         return EMAIL_RE.match(head + "@" + tail) is not None
 490
 491     except ValueError:
 492         # borderline case in which we have multiple "@" signs but the
 493         # head part is correctly escaped.
 494         if ESCAPED_AT_SIGN.search(in_str) is not None:
 495             # replace "@" with "a" in the head
 496             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 497         return False
 498
 499
 500 def suffix_string_to_number(in_str: str) -> Optional[int]:
 501     """Take a string like "33Gb" and convert it into a number (of bytes)
 502     like 34603008.  Return None if the input string is not valid.
 503
 504     >>> suffix_string_to_number('1Mb')
 505     1048576
 506     >>> suffix_string_to_number('13.1Gb')
 507     14066017894
 508     """
 509     def suffix_capitalize(s: str) -> str:
 510         if len(s) == 1:
 511             return s.upper()
 512         elif len(s) == 2:
 513             return f"{s[0].upper()}{s[1].lower()}"
 514         return suffix_capitalize(s[0:1])
 515
 516     if is_string(in_str):
 517         if is_integer_number(in_str):
 518             return to_int(in_str)
 519         suffixes = [in_str[-2:], in_str[-1:]]
 520         rest = [in_str[:-2], in_str[:-1]]
 521         for x in range(len(suffixes)):
 522             s = suffixes[x]
 523             s = suffix_capitalize(s)
 524             multiplier = NUM_SUFFIXES.get(s, None)
 525             if multiplier is not None:
 526                 r = rest[x]
 527                 if is_integer_number(r):
 528                     return to_int(r) * multiplier
 529                 if is_decimal_number(r):
 530                     return int(float(r) * multiplier)
 531     return None
 532
 533
 534 def number_to_suffix_string(num: int) -> Optional[str]:
 535     """Take a number (of bytes) and returns a string like "43.8Gb".
 536     Returns none if the input is invalid.
 537
 538     >>> number_to_suffix_string(14066017894)
 539     '13.1Gb'
 540     >>> number_to_suffix_string(1024 * 1024)
 541     '1.0Mb'
 542
 543     """
 544     d = 0.0
 545     suffix = None
 546     for (sfx, size) in NUM_SUFFIXES.items():
 547         if num >= size:
 548             d = num / size
 549             suffix = sfx
 550             break
 551     if suffix is not None:
 552         return f"{d:.1f}{suffix}"
 553     else:
 554         return f'{num:d}'
 555
 556
 557 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 558     """
 559     Checks if a string is a valid credit card number.
 560     If card type is provided then it checks against that specific type only,
 561     otherwise any known credit card number will be accepted.
 562
 563     Supported card types are the following:
 564
 565     - VISA
 566     - MASTERCARD
 567     - AMERICAN_EXPRESS
 568     - DINERS_CLUB
 569     - DISCOVER
 570     - JCB
 571     """
 572     if not is_full_string(in_str):
 573         return False
 574
 575     if card_type is not None:
 576         if card_type not in CREDIT_CARDS:
 577             raise KeyError(
 578                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 579             )
 580         return CREDIT_CARDS[card_type].match(in_str) is not None
 581     for c in CREDIT_CARDS:
 582         if CREDIT_CARDS[c].match(in_str) is not None:
 583             return True
 584     return False
 585
 586
 587 def is_camel_case(in_str: Any) -> bool:
 588     """
 589     Checks if a string is formatted as camel case.
 590
 591     A string is considered camel case when:
 592
 593     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 594     - it contains both lowercase and uppercase letters
 595     - it does not start with a number
 596     """
 597     return (
 598         is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 599     )
 600
 601
 602 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 603     """
 604     Checks if a string is formatted as "snake case".
 605
 606     A string is considered snake case when:
 607
 608     - it's composed only by lowercase/uppercase letters and digits
 609     - it contains at least one underscore (or provided separator)
 610     - it does not start with a number
 611
 612     >>> is_snake_case('this_is_a_test')
 613     True
 614     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 615     True
 616     >>> is_snake_case('this-is-a-test')
 617     False
 618     >>> is_snake_case('this-is-a-test', separator='-')
 619     True
 620
 621     """
 622     if is_full_string(in_str):
 623         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 624         re_template = (
 625             r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 626         )
 627         r = re_map.get(
 628             separator,
 629             re.compile(
 630                 re_template.format(sign=re.escape(separator)), re.IGNORECASE
 631             ),
 632         )
 633         return r.match(in_str) is not None
 634     return False
 635
 636
 637 def is_json(in_str: Any) -> bool:
 638     """
 639     Check if a string is a valid json.
 640
 641     >>> is_json('{"name": "Peter"}')
 642     True
 643     >>> is_json('[1, 2, 3]')
 644     True
 645     >>> is_json('{nope}')
 646     False
 647     """
 648     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 649         try:
 650             return isinstance(json.loads(in_str), (dict, list))
 651         except (TypeError, ValueError, OverflowError):
 652             pass
 653     return False
 654
 655
 656 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 657     """
 658     Check if a string is a valid UUID.
 659
 660     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 661     True
 662     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 663     False
 664     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 665     True
 666     """
 667     # string casting is used to allow UUID itself as input data type
 668     s = str(in_str)
 669     if allow_hex:
 670         return UUID_HEX_OK_RE.match(s) is not None
 671     return UUID_RE.match(s) is not None
 672
 673
 674 def is_ip_v4(in_str: Any) -> bool:
 675     """
 676     Checks if a string is a valid ip v4.
 677
 678     >>> is_ip_v4('255.200.100.75')
 679     True
 680     >>> is_ip_v4('nope')
 681     False
 682     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 683     False
 684     """
 685     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 686         return False
 687
 688     # checks that each entry in the ip is in the valid range (0 to 255)
 689     for token in in_str.split("."):
 690         if not 0 <= int(token) <= 255:
 691             return False
 692     return True
 693
 694
 695 def extract_ip_v4(in_str: Any) -> Optional[str]:
 696     """
 697     Extracts the IPv4 chunk of a string or None.
 698
 699     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 700     '127.0.0.1'
 701     >>> extract_ip_v4('Your mom dresses you funny.')
 702     """
 703     if not is_full_string(in_str):
 704         return None
 705     m = ANYWHERE_IP_V4_RE.search(in_str)
 706     if m is not None:
 707         return m.group(0)
 708     return None
 709
 710
 711 def is_ip_v6(in_str: Any) -> bool:
 712     """
 713     Checks if a string is a valid ip v6.
 714
 715     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 716     True
 717     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 718     False
 719     """
 720     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 721
 722
 723 def extract_ip_v6(in_str: Any) -> Optional[str]:
 724     """
 725     Extract IPv6 chunk or None.
 726
 727     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 728     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 729     >>> extract_ip_v6("(and she's ugly too, btw)")
 730     """
 731     if not is_full_string(in_str):
 732         return None
 733     m = ANYWHERE_IP_V6_RE.search(in_str)
 734     if m is not None:
 735         return m.group(0)
 736     return None
 737
 738
 739 def is_ip(in_str: Any) -> bool:
 740     """
 741     Checks if a string is a valid ip (either v4 or v6).
 742
 743     >>> is_ip('255.200.100.75')
 744     True
 745     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
 746     True
 747     >>> is_ip('1.2.3')
 748     False
 749     >>> is_ip('1.2.3.999')
 750     False
 751     """
 752     return is_ip_v6(in_str) or is_ip_v4(in_str)
 753
 754
 755 def extract_ip(in_str: Any) -> Optional[str]:
 756     """
 757     Extract the IP address or None.
 758
 759     >>> extract_ip('Attacker: 255.200.100.75')
 760     '255.200.100.75'
 761     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 762     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 763     >>> extract_ip('1.2.3')
 764
 765     """
 766     ip = extract_ip_v4(in_str)
 767     if ip is None:
 768         ip = extract_ip_v6(in_str)
 769     return ip
 770
 771
 772 def is_mac_address(in_str: Any) -> bool:
 773     """Return True if in_str is a valid MAC address false otherwise.
 774
 775     >>> is_mac_address("34:29:8F:12:0D:2F")
 776     True
 777     >>> is_mac_address('34:29:8f:12:0d:2f')
 778     True
 779     >>> is_mac_address('34-29-8F-12-0D-2F')
 780     True
 781     >>> is_mac_address("test")
 782     False
 783     """
 784     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 785
 786
 787 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 788     """
 789     Extract the MAC address from in_str.
 790
 791     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
 792     '34:29:8F:12:0D:2F'
 793
 794     """
 795     if not is_full_string(in_str):
 796         return None
 797     in_str.strip()
 798     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
 799     if m is not None:
 800         mac = m.group(0)
 801         mac.replace(":", separator)
 802         mac.replace("-", separator)
 803         return mac
 804     return None
 805
 806
 807 def is_slug(in_str: Any, separator: str = "-") -> bool:
 808     """
 809     Checks if a given string is a slug (as created by `slugify()`).
 810
 811     >>> is_slug('my-blog-post-title')
 812     True
 813     >>> is_slug('My blog post title')
 814     False
 815
 816     """
 817     if not is_full_string(in_str):
 818         return False
 819     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 820     return re.match(rex, in_str) is not None
 821
 822
 823 def contains_html(in_str: str) -> bool:
 824     """
 825     Checks if the given string contains HTML/XML tags.
 826
 827     By design, this function matches ANY type of tag, so don't expect to use it
 828     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 829
 830     >>> contains_html('my string is <strong>bold</strong>')
 831     True
 832     >>> contains_html('my string is not bold')
 833     False
 834
 835     """
 836     if not is_string(in_str):
 837         raise ValueError(in_str)
 838     return HTML_RE.search(in_str) is not None
 839
 840
 841 def words_count(in_str: str) -> int:
 842     """
 843     Returns the number of words contained into the given string.
 844
 845     This method is smart, it does consider only sequence of one or more letter and/or numbers
 846     as "words", so a string like this: "! @ # % ... []" will return zero!
 847     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 848     will be 4 not 1 (even if there are no spaces in the string).
 849
 850     >>> words_count('hello world')
 851     2
 852     >>> words_count('one,two,three.stop')
 853     4
 854
 855     """
 856     if not is_string(in_str):
 857         raise ValueError(in_str)
 858     return len(WORDS_COUNT_RE.findall(in_str))
 859
 860
 861 def generate_uuid(as_hex: bool = False) -> str:
 862     """
 863     Generated an UUID string (using `uuid.uuid4()`).
 864
 865     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 866     generate_uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 867
 868     """
 869     uid = uuid4()
 870     if as_hex:
 871         return uid.hex
 872     return str(uid)
 873
 874
 875 def generate_random_alphanumeric_string(size: int) -> str:
 876     """
 877     Returns a string of the specified size containing random
 878     characters (uppercase/lowercase ascii letters and digits).
 879
 880     random_string(9) # possible output: "cx3QQbzYg"
 881
 882     """
 883     if size < 1:
 884         raise ValueError("size must be >= 1")
 885     chars = string.ascii_letters + string.digits
 886     buffer = [random.choice(chars) for _ in range(size)]
 887     return from_char_list(buffer)
 888
 889
 890 def reverse(in_str: str) -> str:
 891     """
 892     Returns the string with its chars reversed.
 893
 894     >>> reverse('test')
 895     'tset'
 896
 897     """
 898     if not is_string(in_str):
 899         raise ValueError(in_str)
 900     return in_str[::-1]
 901
 902
 903 def camel_case_to_snake_case(in_str, *, separator="_"):
 904     """
 905     Convert a camel case string into a snake case one.
 906     (The original string is returned if is not a valid camel case string)
 907
 908     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
 909     'mac_address_extractor_factory'
 910     >>> camel_case_to_snake_case('Luke Skywalker')
 911     'Luke Skywalker'
 912     """
 913     if not is_string(in_str):
 914         raise ValueError(in_str)
 915     if not is_camel_case(in_str):
 916         return in_str
 917     return CAMEL_CASE_REPLACE_RE.sub(
 918         lambda m: m.group(1) + separator, in_str
 919     ).lower()
 920
 921
 922 def snake_case_to_camel_case(
 923     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 924 ) -> str:
 925     """
 926     Convert a snake case string into a camel case one.
 927     (The original string is returned if is not a valid snake case string)
 928
 929     >>> snake_case_to_camel_case('this_is_a_test')
 930     'ThisIsATest'
 931     >>> snake_case_to_camel_case('Han Solo')
 932     'Han Solo'
 933     """
 934     if not is_string(in_str):
 935         raise ValueError(in_str)
 936     if not is_snake_case(in_str, separator=separator):
 937         return in_str
 938     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 939     if not upper_case_first:
 940         tokens[0] = tokens[0].lower()
 941     return from_char_list(tokens)
 942
 943
 944 def to_char_list(in_str: str) -> List[str]:
 945     """Convert a string into a list of chars.
 946
 947     >>> to_char_list('test')
 948     ['t', 'e', 's', 't']
 949     """
 950     if not is_string(in_str):
 951         return []
 952     return list(in_str)
 953
 954
 955 def from_char_list(in_list: List[str]) -> str:
 956     """Convert a char list into a string.
 957
 958     >>> from_char_list(['t', 'e', 's', 't'])
 959     'test'
 960     """
 961     return "".join(in_list)
 962
 963
 964 def shuffle(in_str: str) -> str:
 965     """Return a new string containing same chars of the given one but in
 966     a randomized order.
 967     """
 968     if not is_string(in_str):
 969         raise ValueError(in_str)
 970
 971     # turn the string into a list of chars
 972     chars = to_char_list(in_str)
 973     random.shuffle(chars)
 974     return from_char_list(chars)
 975
 976
 977 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 978     """
 979     Remove html code contained into the given string.
 980
 981     >>> strip_html('test: <a href="foo/bar">click here</a>')
 982     'test: '
 983     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
 984     'test: click here'
 985     """
 986     if not is_string(in_str):
 987         raise ValueError(in_str)
 988     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
 989     return r.sub("", in_str)
 990
 991
 992 def asciify(in_str: str) -> str:
 993     """
 994     Force string content to be ascii-only by translating all non-ascii
 995     chars into the closest possible representation (eg: ó -> o, Ë ->
 996     E, ç -> c...).
 997
 998     N.B. Some chars may be lost if impossible to translate.
 999
1000     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
1001     'eeuuooaaeynAAACIINOE'
1002     """
1003     if not is_string(in_str):
1004         raise ValueError(in_str)
1005
1006     # "NFKD" is the algorithm which is able to successfully translate
1007     # the most of non-ascii chars.
1008     normalized = unicodedata.normalize("NFKD", in_str)
1009
1010     # encode string forcing ascii and ignore any errors
1011     # (unrepresentable chars will be stripped out)
1012     ascii_bytes = normalized.encode("ascii", "ignore")
1013
1014     # turns encoded bytes into an utf-8 string
1015     return ascii_bytes.decode("utf-8")
1016
1017
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Converts a string into a "slug" using provided separator.
    The returned string has the following properties:

    - it has no spaces
    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using provided separator
    - all chars are encoded as ascii (by using `asciify()`)
    - is safe for URL

    The separator is always inserted literally: it may be longer than
    one char, may contain regex metacharacters or backslashes, and may
    be empty (words are then simply glued together).

    Raises ValueError if `is_string` rejects the argument.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # replace spaces with join sign; a callable replacement keeps the
    # separator from being parsed as a regex replacement template
    # (a separator containing a backslash would otherwise be an error)
    out = SPACES_RE.sub(lambda _match: separator, out)

    # normalize joins (remove duplicates).  The escaped separator is
    # wrapped in a group so the repetition applies to the WHOLE separator
    # and not only to its last char; an empty separator has nothing to
    # de-duplicate (and would be an invalid pattern).
    if separator:
        out = re.sub(
            "(?:" + re.escape(separator) + "){2,}",
            lambda _match: separator,
            out,
        )
    return asciify(out)
1047
1048
def to_bool(in_str: str) -> bool:
    """
    Interpret a string as a boolean (CASE INSENSITIVE).

    True is returned when the lower-cased string is one of:

    - "true"
    - "t"
    - "1"
    - "yes"
    - "y"
    - "on"

    Any other string yields False.  Raises ValueError if `is_string`
    rejects the argument.

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True

    """
    if is_string(in_str):
        return in_str.lower() in ("true", "1", "yes", "y", "t", "on")
    raise ValueError(in_str)
1085
1086
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Parse a date string with DateParser (see its docs for the accepted
    formats).

    Returns whatever the parser's get_date() reports, or None (after
    logging a warning) when the parser raises ParseException.
    """
    import dateparse.dateparse_utils as dp
    try:
        parser = dp.DateParser()
        parser.parse(in_str)
        return parser.get_date()
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        return None
1099
1100
def valid_date(in_str: str) -> bool:
    """
    Tell whether DateParser accepts the string: True if parsing it
    succeeds, False (after logging a warning) if the parser raises
    ParseException.
    """
    import dateparse.dateparse_utils as dp
    try:
        # Only success or failure matters here; the parsed value is unused.
        dp.DateParser().parse(in_str)
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        return False
    return True
1113
1114
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Parses a datetime string.  See DateParser docs for more info.

    Returns the parsed value when the parser produced a
    datetime.datetime; returns None when it produced anything else or
    when parsing failed (a warning is logged in the failure case).
    """
    import dateparse.dateparse_utils as dp
    try:
        parser = dp.DateParser()
        parsed = parser.parse(in_str)
    except (dp.ParseException, ValueError):
        # to_date / valid_date treat ParseException as the parser's failure
        # signal, so it is handled here as well; ValueError is still caught
        # for backward compatibility.
        logger.warning(f'Unable to parse datetime {in_str}.')
        return None
    if isinstance(parsed, datetime.datetime):
        return parsed
    return None
1128
1129
def valid_datetime(in_str: str) -> bool:
    """
    Tell whether the string can be parsed into a datetime, i.e. whether
    to_datetime() returns something other than None for it.  A warning
    is logged when it cannot.
    """
    if to_datetime(in_str) is None:
        logger.warning(f'Unable to parse datetime {in_str}.')
        return False
    return True
1139
1140
def dedent(in_str: str) -> str:
    """
    Strip the margin (whatever MARGIN_RE matches) from every line of a
    multi line string (inspired by analogous Scala function).

    Lines are split and re-joined on '\\n'.  Raises ValueError if
    `is_string` rejects the argument.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    return newline.join(
        MARGIN_RE.sub('', row) for row in in_str.split(newline)
    )
1150
1151
def indent(in_str: str, amount: int) -> str:
    """
    Indent a string by prepending `amount` spaces to each of its lines
    (lines are split and re-joined on '\\n').

    Raises ValueError if `is_string` rejects the argument.

    >>> indent('This is a test', 4)
    '    This is a test'

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    padding = " " * amount
    newline = '\n'
    return newline.join(padding + row for row in in_str.split(newline))
1165
1166
def sprintf(*args, **kwargs) -> str:
    """String printf, like in C: build the text that print() would emit
    and return it instead of writing it anywhere.

    Positional arguments are converted with str() (strings are used as
    they are) and joined with `sep`; `end` is appended to the result.

    Keyword arguments:
        sep: string placed between arguments; None (default) means " ".
        end: string appended at the end; None (default) means a newline.

    Raises TypeError if sep / end is neither None nor a string, or if
    any other keyword argument is passed.

    >>> sprintf('a', 1, 2.5)
    'a 1 2.5\\n'
    >>> sprintf('x', 'y', sep='-', end='')
    'x-y'
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # Anything still left in kwargs is not a supported option.
    if kwargs:
        raise TypeError("invalid keyword arguments to sprintf()")

    if sep is None:
        sep = " "
    if end is None:
        end = "\n"

    pieces = (arg if isinstance(arg, str) else str(arg) for arg in args)
    return sep.join(pieces) + end
1197
1198
class SprintfStdout:
    """
    Context manager that captures everything written to stdout while
    its body runs.  Entering it yields a zero-argument callable that
    returns the text captured so far.

    with SprintfStdout() as buf:
        print("test")
    print(buf())

    'test\\n'
    """
    def __init__(self) -> None:
        # In-memory buffer that receives the redirected output.
        self.destination = io.StringIO()
        # The active contextlib.redirect_stdout; set by __enter__.
        self.recorder = None

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *exc_info) -> None:
        self.recorder.__exit__(*exc_info)
        self.destination.seek(0)
        # Falling off the end returns None, so exceptions raised inside
        # the with-body are not suppressed.
1222
1223
def is_are(n: int) -> str:
    """Pick the verb form that agrees with a count: 'is' for exactly
    one, 'are' for anything else.

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
1236
1237
def pluralize(n: int) -> str:
    """Return the plural suffix for a count: '' for exactly one, 's'
    for anything else.

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.

    """
    return "" if n == 1 else "s"
1254
1255
def thify(n: int) -> str:
    """Return the proper ordinal suffix for a number ('st', 'nd', 'rd'
    or 'th').

    Numbers ending in 11, 12 or 13 take 'th' (11th, 112th, ...); other
    than that the suffix depends only on the last digit.

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'
    >>> thify(21)
    'st'

    """
    digits = str(n)
    # Kept as an assert so callers that pass a non-integer still get the
    # same AssertionError as before.
    assert is_integer_number(digits)

    # The "teens" are irregular: eleventh, twelfth, thirteenth.
    if digits[-2:] in ("11", "12", "13"):
        return "th"

    suffixes = {"1": "st", "2": "nd", "3": "rd"}
    return suffixes.get(digits[-1:], "th")
1278
1279
def ngrams(txt: str, n: int):
    """Generate the ngrams of a string: the text is split on whitespace
    and each group of words produced by ngrams_presplit() is yielded as
    a single space-joined string.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    """
    tokens = txt.split()
    for gram in ngrams_presplit(tokens, n):
        yield ' '.join(f'{word}' for word in gram).strip()
1293
1294
def ngrams_presplit(words: Sequence[str], n: int):
    """Return the ngrams of an already-split sequence of words, as
    computed by list_utils.ngrams().
    """
    grams = list_utils.ngrams(words, n)
    return grams
1297
1298
def bigrams(txt: str):
    """Return the 2-word ngrams of a string (see ngrams())."""
    return ngrams(txt, n=2)
1301
1302
def trigrams(txt: str):
    """Return the 3-word ngrams of a string (see ngrams())."""
    return ngrams(txt, n=3)
1305
1306
def shuffle_columns_into_list(
        input_lines: Sequence[str],
        column_specs: Iterable[Iterable[int]],
        delim=''
) -> List[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.  The column_specs argument is an iterable collection of
    numeric sequences that indicate one or more column numbers to
    copy.

    Each spec produces one output string: the selected columns of
    input_lines (which must support indexing), joined with delim.  The
    column contents are copied verbatim, even when they begin or end
    with characters that also occur in delim.

    Raises IndexError if a spec refers to a column that does not exist.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']

    """
    # Column specs map input lines' columns into outputs.
    # [col1, col2...]
    # str.join puts delim only BETWEEN columns, so nothing needs to be
    # stripped afterwards (stripping would also eat data characters).
    return [
        delim.join(input_lines[n] for n in spec)
        for spec in column_specs
    ]
1337
1338
def shuffle_columns_into_dict(
        input_lines: Sequence[str],
        column_specs: Iterable[Tuple[str, Iterable[int]]],
        delim=''
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Each spec is a (key, column numbers) pair and produces one entry:
    the selected columns of input_lines (which must support indexing),
    joined with delim, stored under key.  The column contents are copied
    verbatim, even when they begin or end with characters that also
    occur in delim.  If a key appears more than once the last spec wins.

    Raises IndexError if a spec refers to a column that does not exist.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}

    """
    # Column specs map input lines' columns into outputs.
    # "key", [col1, col2...]
    # str.join puts delim only BETWEEN columns, so nothing needs to be
    # stripped afterwards (stripping would also eat data characters).
    return {
        spec[0]: delim.join(input_lines[n] for n in spec[1])
        for spec in column_specs
    }
1367
1368
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the `{name}` placeholders of a string with the values found
    under the same names in a dict (str.format semantics).

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'

    """
    interpolated = txt.format(**values)
    return interpolated
1378
1379
def to_ascii(x: str) -> bytes:
    """Encode as ascii bytes string.

    A str (or str subclass) is encoded with the ascii codec; bytes (or a
    bytes subclass) are returned unchanged.

    Raises UnicodeEncodeError if a str contains non-ascii chars and
    TypeError for any other kind of argument.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'

    """
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    # TypeError is still an Exception, so existing handlers keep working.
    raise TypeError('to_ascii works with strings and bytes')
1395
1396
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode txt to bytes (using encoding / errors) and then encode those
    bytes with the standard 64-character alphabet.

    The result is a bytes object produced by base64.encodebytes(), so it
    ends with a newline.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
1406
1407
def is_base64(txt: str) -> bool:
    """Determine whether a string (or bytes object) looks base64 encoded,
    i.e. is made only of chars from Python's standard base64 alphabet.

    Surrounding whitespace, line breaks between lines of payload and up
    to two trailing '=' padding chars are accepted.  Text containing
    non-ascii chars is never base64, so False is returned for it.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # So is '=' padding.
    True

    """
    if isinstance(txt, str):
        try:
            raw = txt.encode('ascii')
        except UnicodeEncodeError:
            # The base64 alphabet is pure ascii.
            return False
    else:
        raw = txt

    # Long payloads are wrapped over several lines; glue them together.
    payload = b''.join(raw.strip().splitlines())

    # Padding may only appear at the very end, and at most twice.
    unpadded = payload.rstrip(b'=')
    if len(payload) - len(unpadded) > 2:
        return False

    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = set(a.encode('ascii'))
    return all(char in alphabet for char in unpadded)
1428
1429
def from_base64(b64: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert base64 encoded data back to a normal string.

    b64 may be a bytes-like object or an ascii-only str; the decoded
    bytes are then turned into text using encoding / errors.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    >>> from_base64('aGVsbG8/')
    'hello?'

    """
    if isinstance(b64, str):
        # base64.decodebytes only accepts bytes-like input.
        b64 = b64.encode('ascii')
    return base64.decodebytes(b64).decode(encoding, errors)
1438
1439
def chunk(txt: str, chunk_size):
    """Chunk up a string: generate consecutive pieces of txt, each
    chunk_size long (the last one is shorter when the length of txt is
    not a multiple of chunk_size, in which case a warning is logged).

    This is a generator, so nothing happens until it is iterated.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    """
    if len(txt) % chunk_size != 0:
        logger.warning(
            f'String to chunk\'s length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})'
        )
    for start in range(0, len(txt), chunk_size):
        yield txt[start:start + chunk_size]
1452
1453
def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
    """Encode txt and then render every byte as 8 binary digits, joined
    with delimiter.  Note: only bitstrings with delimiter='' are
    interpretable by from_bitstring.

    Every byte of the encoded text is rendered, including leading zero
    bytes.  For backward compatibility an empty txt is rendered as a
    single zero byte ('00000000').

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring('\\0a')
    '0000000001100001'

    """
    encoded = txt.encode(encoding, errors)
    if not encoded:
        # Legacy behavior: no bytes at all is shown as one zero byte.
        encoded = b'\0'
    # Formatting byte by byte (instead of converting the whole buffer to
    # one big int) keeps leading zero bytes in the output.
    return delimiter.join(f'{byte:08b}' for byte in encoded)
1473
1474
def is_bitstring(txt: str) -> bool:
    """Is this a bitstring?  The text is prefixed with '0b' and the
    answer is whatever is_binary_integer_number() says about the result.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    """
    candidate = f'0b{txt}'
    return is_binary_integer_number(candidate)
1486
1487
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    Every full group of 8 binary digits in bits yields one byte, so
    leading zero bytes are preserved.  A bitstring too short to yield
    any byte (e.g. '0') decodes to '\\0', as it always did.

    Raises ValueError if bits is not a valid base-2 number.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    >>> from_bitstring('0000000001100001')
    '\\x00a'

    """
    n = int(bits, 2)
    # The value alone cannot tell how many leading zero bytes there were,
    # so the byte count is also derived from the number of binary digits
    # (rounded down, which keeps an optional '0b' prefix from adding a
    # byte); whichever count is larger wins.
    needed = (n.bit_length() + 7) // 8
    written = (bits.count('0') + bits.count('1')) // 8
    return n.to_bytes(max(needed, written), 'big').decode(encoding, errors) or '\0'
1497
1498
# When this module is executed as a script (rather than imported), run the
# doctest examples embedded in its docstrings.
if __name__ == '__main__':
    import doctest
    doctest.testmod()