string_utils.py

   1 #!/usr/bin/env python3
   2
   3 import base64
   4 import contextlib
   5 import datetime
   6 import io
   7 from itertools import zip_longest
   8 import json
   9 import logging
  10 import numbers
  11 import random
  12 import re
  13 import string
  14 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
  15 import unicodedata
  16 from uuid import uuid4
  17 import warnings
  18
  19 import list_utils
  20
# Module-level logger, named after this module so that handlers and levels
# can be configured per module by the application.
logger = logging.getLogger(__name__)
  22
  23 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  24
  25 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  26
  27 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  28
  29 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  30
  31 URLS_RAW_STRING = (
  32     r"([a-z-]+://)"  # scheme
  33     r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
  34     r"(www\.)?"  # www.
  35     r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
  36     r"(:\d{2,})?"  # port number
  37     r"(/[a-z\d_%+-]*)*"  # folders
  38     r"(\.[a-z\d_%+-]+)*"  # file extension
  39     r"(\?[a-z\d_+%-=]*)?"  # query string
  40     r"(#\S*)?"  # hash
  41 )
  42
  43 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
  44
  45 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
  46
  47 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
  48
  49 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
  50
  51 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
  52
  53 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  54
  55 CAMEL_CASE_TEST_RE = re.compile(
  56     r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
  57 )
  58
  59 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
  60
  61 SNAKE_CASE_TEST_RE = re.compile(
  62     r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
  63 )
  64
  65 SNAKE_CASE_TEST_DASH_RE = re.compile(
  66     r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
  67 )
  68
  69 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
  70
  71 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
  72
  73 CREDIT_CARDS = {
  74     "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
  75     "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
  76     "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
  77     "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
  78     "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
  79     "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
  80 }
  81
  82 JSON_WRAPPER_RE = re.compile(
  83     r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
  84 )
  85
  86 UUID_RE = re.compile(
  87     r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
  88 )
  89
  90 UUID_HEX_OK_RE = re.compile(
  91     r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
  92     re.IGNORECASE,
  93 )
  94
  95 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
  96
  97 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
  98
  99 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
 100
 101 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
 102
 103 MAC_ADDRESS_RE = re.compile(
 104     r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE
 105 )
 106
 107 ANYWHERE_MAC_ADDRESS_RE = re.compile(
 108     r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
 109 )
 110
 111 WORDS_COUNT_RE = re.compile(
 112     r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
 113 )
 114
 115 HTML_RE = re.compile(
 116     r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
 117     re.IGNORECASE | re.MULTILINE | re.DOTALL,
 118 )
 119
 120 HTML_TAG_ONLY_RE = re.compile(
 121     r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
 122     re.IGNORECASE | re.MULTILINE | re.DOTALL,
 123 )
 124
 125 SPACES_RE = re.compile(r"\s")
 126
 127 NO_LETTERS_OR_NUMBERS_RE = re.compile(
 128     r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
 129 )
 130
 131 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 132
 133 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 134
 135 NUM_SUFFIXES = {
 136     "Pb": (1024 ** 5),
 137     "P": (1024 ** 5),
 138     "Tb": (1024 ** 4),
 139     "T": (1024 ** 4),
 140     "Gb": (1024 ** 3),
 141     "G": (1024 ** 3),
 142     "Mb": (1024 ** 2),
 143     "M": (1024 ** 2),
 144     "Kb": (1024 ** 1),
 145     "K": (1024 ** 1),
 146 }
 147
 148
 149 def is_none_or_empty(in_str: Optional[str]) -> bool:
 150     """
 151     Returns true if the input string is either None or an empty string.
 152
 153     >>> is_none_or_empty("")
 154     True
 155     >>> is_none_or_empty(None)
 156     True
 157     >>> is_none_or_empty("   \t   ")
 158     True
 159     >>> is_none_or_empty('Test')
 160     False
 161     """
 162     return in_str is None or len(in_str.strip()) == 0
 163
 164
def is_string(obj: Any) -> bool:
    """
    Tell whether an object is a ``str`` instance (subclasses included).

    This is the type guard the other helpers in this module call before
    treating their argument as text.

    Args:
        obj: any object.

    Returns:
        True if ``obj`` is a ``str``, False otherwise.

    >>> is_string('test')
    True
    >>> is_string(123)
    False
    >>> is_string(100.3)
    False
    >>> is_string([1, 2, 3])
    False
    """
    return isinstance(obj, str)
 179
 180
def is_empty_string(in_str: Any) -> bool:
    """
    Alias of is_empty(): delegates to it and returns its result unchanged.
    """
    return is_empty(in_str)
 183
 184
 185 def is_empty(in_str: Any) -> bool:
 186     """
 187     Checks if input is a string and empty or only whitespace.
 188
 189     >>> is_empty('')
 190     True
 191     >>> is_empty('    \t\t    ')
 192     True
 193     >>> is_empty('test')
 194     False
 195     >>> is_empty(100.88)
 196     False
 197     >>> is_empty([1, 2, 3])
 198     False
 199     """
 200     return is_string(in_str) and in_str.strip() == ""
 201
 202
 203 def is_full_string(in_str: Any) -> bool:
 204     """
 205     Checks that input is a string and is not empty ('') or only whitespace.
 206
 207     >>> is_full_string('test!')
 208     True
 209     >>> is_full_string('')
 210     False
 211     >>> is_full_string('      ')
 212     False
 213     >>> is_full_string(100.999)
 214     False
 215     >>> is_full_string({"a": 1, "b": 2})
 216     False
 217     """
 218     return is_string(in_str) and in_str.strip() != ""
 219
 220
 221 def is_number(in_str: str) -> bool:
 222     """
 223     Checks if a string is a valid number.
 224
 225     >>> is_number(100.5)
 226     Traceback (most recent call last):
 227     ...
 228     ValueError: 100.5
 229     >>> is_number("100.5")
 230     True
 231     >>> is_number("test")
 232     False
 233     >>> is_number("99")
 234     True
 235     >>> is_number([1, 2, 3])
 236     Traceback (most recent call last):
 237     ...
 238     ValueError: [1, 2, 3]
 239     """
 240     if not is_string(in_str):
 241         raise ValueError(in_str)
 242     return NUMBER_RE.match(in_str) is not None
 243
 244
 245 def is_integer_number(in_str: str) -> bool:
 246     """
 247     Checks whether the given string represents an integer or not.
 248
 249     An integer may be signed or unsigned or use a "scientific notation".
 250
 251     >>> is_integer_number('42')
 252     True
 253     >>> is_integer_number('42.0')
 254     False
 255     """
 256     return (
 257         (is_number(in_str) and "." not in in_str) or
 258         is_hexidecimal_integer_number(in_str) or
 259         is_octal_integer_number(in_str) or
 260         is_binary_integer_number(in_str)
 261     )
 262
 263
 264 def is_hexidecimal_integer_number(in_str: str) -> bool:
 265     """
 266     Checks whether a string is a hex integer number.
 267
 268     >>> is_hexidecimal_integer_number('0x12345')
 269     True
 270     >>> is_hexidecimal_integer_number('0x1A3E')
 271     True
 272     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 273     False
 274     >>> is_hexidecimal_integer_number('-0xff')
 275     True
 276     >>> is_hexidecimal_integer_number('test')
 277     False
 278     >>> is_hexidecimal_integer_number(12345)  # Not a string
 279     Traceback (most recent call last):
 280     ...
 281     ValueError: 12345
 282     >>> is_hexidecimal_integer_number(101.4)
 283     Traceback (most recent call last):
 284     ...
 285     ValueError: 101.4
 286     >>> is_hexidecimal_integer_number(0x1A3E)
 287     Traceback (most recent call last):
 288     ...
 289     ValueError: 6718
 290     """
 291     if not is_string(in_str):
 292         raise ValueError(in_str)
 293     return HEX_NUMBER_RE.match(in_str) is not None
 294
 295
 296 def is_octal_integer_number(in_str: str) -> bool:
 297     """
 298     Checks whether a string is an octal number.
 299
 300     >>> is_octal_integer_number('0o777')
 301     True
 302     >>> is_octal_integer_number('-0O115')
 303     True
 304     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 305     False
 306     >>> is_octal_integer_number('7777')  # Needs 0o
 307     False
 308     >>> is_octal_integer_number('test')
 309     False
 310     """
 311     if not is_string(in_str):
 312         raise ValueError(in_str)
 313     return OCT_NUMBER_RE.match(in_str) is not None
 314
 315
 316 def is_binary_integer_number(in_str: str) -> bool:
 317     """
 318     Returns whether a string contains a binary number.
 319
 320     >>> is_binary_integer_number('0b10111')
 321     True
 322     >>> is_binary_integer_number('-0b111')
 323     True
 324     >>> is_binary_integer_number('0B10101')
 325     True
 326     >>> is_binary_integer_number('0b10102')
 327     False
 328     >>> is_binary_integer_number('0xFFF')
 329     False
 330     >>> is_binary_integer_number('test')
 331     False
 332     """
 333     if not is_string(in_str):
 334         raise ValueError(in_str)
 335     return BIN_NUMBER_RE.match(in_str) is not None
 336
 337
 338 def to_int(in_str: str) -> int:
 339     """Returns the integral value of the string or raises on error.
 340
 341     >>> to_int('1234')
 342     1234
 343     >>> to_int('test')
 344     Traceback (most recent call last):
 345     ...
 346     ValueError: invalid literal for int() with base 10: 'test'
 347     """
 348     if not is_string(in_str):
 349         raise ValueError(in_str)
 350     if is_binary_integer_number(in_str):
 351         return int(in_str, 2)
 352     if is_octal_integer_number(in_str):
 353         return int(in_str, 8)
 354     if is_hexidecimal_integer_number(in_str):
 355         return int(in_str, 16)
 356     return int(in_str)
 357
 358
 359 def is_decimal_number(in_str: str) -> bool:
 360     """
 361     Checks whether the given string represents a decimal or not.
 362
 363     A decimal may be signed or unsigned or use a "scientific notation".
 364
 365     >>> is_decimal_number('42.0')
 366     True
 367     >>> is_decimal_number('42')
 368     False
 369     """
 370     return is_number(in_str) and "." in in_str
 371
 372
 373 def strip_escape_sequences(in_str: str) -> str:
 374     """
 375     Remove escape sequences in the input string.
 376
 377     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 378     'this is a test!'
 379     """
 380     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 381     return in_str
 382
 383
 384 def add_thousands_separator(
 385         in_str: str,
 386         *,
 387         separator_char = ',',
 388         places = 3
 389 ) -> str:
 390     """
 391     Add thousands separator to a numeric string.  Also handles numbers.
 392
 393     >>> add_thousands_separator('12345678')
 394     '12,345,678'
 395     >>> add_thousands_separator(12345678)
 396     '12,345,678'
 397     >>> add_thousands_separator(12345678.99)
 398     '12,345,678.99'
 399     >>> add_thousands_separator('test')
 400     Traceback (most recent call last):
 401     ...
 402     ValueError: test
 403
 404     """
 405     if isinstance(in_str, numbers.Number):
 406         in_str = f'{in_str}'
 407     if is_number(in_str):
 408         return _add_thousands_separator(
 409             in_str,
 410             separator_char = separator_char,
 411             places = places
 412         )
 413     raise ValueError(in_str)
 414
 415
 416 def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
 417     decimal_part = ""
 418     if '.' in in_str:
 419         (in_str, decimal_part) = in_str.split('.')
 420     tmp = [iter(in_str[::-1])] * places
 421     ret = separator_char.join(
 422         "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 423     if len(decimal_part) > 0:
 424         ret += '.'
 425         ret += decimal_part
 426     return ret
 427
 428
# Full url example:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 431 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 432     """
 433     Check if a string is a valid url.
 434
 435     >>> is_url('http://www.mysite.com')
 436     True
 437     >>> is_url('https://mysite.com')
 438     True
 439     >>> is_url('.mysite.com')
 440     False
 441     """
 442     if not is_full_string(in_str):
 443         return False
 444
 445     valid = URL_RE.match(in_str) is not None
 446
 447     if allowed_schemes:
 448         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 449     return valid
 450
 451
def is_email(in_str: Any) -> bool:
    """
    Check if a string is a valid email.

    Reference: https://tools.ietf.org/html/rfc3696#section-3

    The check is layered: cheap length and dot-placement tests first, then
    the address (with quoting/escaping in the local part neutralised) is
    matched against EMAIL_RE.  Inputs that are not non-blank strings yield
    False.

    >>> is_email('my.email@the-provider.com')
    True
    >>> is_email('@gmail.com')
    False
    """
    # Reject non-strings, blank strings, over-long addresses (more than 320
    # characters) and addresses starting with a dot.
    if (
        not is_full_string(in_str)
        or len(in_str) > 320
        or in_str.startswith(".")
    ):
        return False

    try:
        # we expect 2 tokens, one before "@" and one after, otherwise
        # the unpacking raises ValueError, which is handled below.
        head, tail = in_str.split("@")

        # head's size must be <= 64, tail <= 255, head must not end
        # with a dot or contain multiple consecutive dots (a leading
        # dot was already rejected above).
        if (
            len(head) > 64
            or len(tail) > 255
            or head.endswith(".")
            or (".." in head)
        ):
            return False

        # removes escaped spaces, so that later on the test regex will
        # accept the string.
        head = head.replace("\\ ", "")
        # a fully double-quoted head may contain spaces: drop them
        # together with the surrounding quotes before matching.
        if head.startswith('"') and head.endswith('"'):
            head = head.replace(" ", "")[1:-1]
        return EMAIL_RE.match(head + "@" + tail) is not None

    except ValueError:
        # reached when the string does not split into exactly two parts
        # on "@".  Borderline case: multiple "@" signs but the head
        # part is correctly escaped.
        if ESCAPED_AT_SIGN.search(in_str) is not None:
            # replace every escaped/quoted "@" with "a" and validate
            # the resulting string again.
            return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
        return False
 499
 500
 501 def suffix_string_to_number(in_str: str) -> Optional[int]:
 502     """Take a string like "33Gb" and convert it into a number (of bytes)
 503     like 34603008.  Return None if the input string is not valid.
 504
 505     >>> suffix_string_to_number('1Mb')
 506     1048576
 507     >>> suffix_string_to_number('13.1Gb')
 508     14066017894
 509     """
 510     def suffix_capitalize(s: str) -> str:
 511         if len(s) == 1:
 512             return s.upper()
 513         elif len(s) == 2:
 514             return f"{s[0].upper()}{s[1].lower()}"
 515         return suffix_capitalize(s[0:1])
 516
 517     if is_string(in_str):
 518         if is_integer_number(in_str):
 519             return to_int(in_str)
 520         suffixes = [in_str[-2:], in_str[-1:]]
 521         rest = [in_str[:-2], in_str[:-1]]
 522         for x in range(len(suffixes)):
 523             s = suffixes[x]
 524             s = suffix_capitalize(s)
 525             multiplier = NUM_SUFFIXES.get(s, None)
 526             if multiplier is not None:
 527                 r = rest[x]
 528                 if is_integer_number(r):
 529                     return to_int(r) * multiplier
 530                 if is_decimal_number(r):
 531                     return int(float(r) * multiplier)
 532     return None
 533
 534
 535 def number_to_suffix_string(num: int) -> Optional[str]:
 536     """Take a number (of bytes) and returns a string like "43.8Gb".
 537     Returns none if the input is invalid.
 538
 539     >>> number_to_suffix_string(14066017894)
 540     '13.1Gb'
 541     >>> number_to_suffix_string(1024 * 1024)
 542     '1.0Mb'
 543
 544     """
 545     d = 0.0
 546     suffix = None
 547     for (sfx, size) in NUM_SUFFIXES.items():
 548         if num >= size:
 549             d = num / size
 550             suffix = sfx
 551             break
 552     if suffix is not None:
 553         return f"{d:.1f}{suffix}"
 554     else:
 555         return f'{num:d}'
 556
 557
 558 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 559     """
 560     Checks if a string is a valid credit card number.
 561     If card type is provided then it checks against that specific type only,
 562     otherwise any known credit card number will be accepted.
 563
 564     Supported card types are the following:
 565
 566     - VISA
 567     - MASTERCARD
 568     - AMERICAN_EXPRESS
 569     - DINERS_CLUB
 570     - DISCOVER
 571     - JCB
 572     """
 573     if not is_full_string(in_str):
 574         return False
 575
 576     if card_type is not None:
 577         if card_type not in CREDIT_CARDS:
 578             raise KeyError(
 579                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 580             )
 581         return CREDIT_CARDS[card_type].match(in_str) is not None
 582     for c in CREDIT_CARDS:
 583         if CREDIT_CARDS[c].match(in_str) is not None:
 584             return True
 585     return False
 586
 587
 588 def is_camel_case(in_str: Any) -> bool:
 589     """
 590     Checks if a string is formatted as camel case.
 591
 592     A string is considered camel case when:
 593
 594     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 595     - it contains both lowercase and uppercase letters
 596     - it does not start with a number
 597     """
 598     return (
 599         is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 600     )
 601
 602
 603 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 604     """
 605     Checks if a string is formatted as "snake case".
 606
 607     A string is considered snake case when:
 608
 609     - it's composed only by lowercase/uppercase letters and digits
 610     - it contains at least one underscore (or provided separator)
 611     - it does not start with a number
 612
 613     >>> is_snake_case('this_is_a_test')
 614     True
 615     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 616     True
 617     >>> is_snake_case('this-is-a-test')
 618     False
 619     >>> is_snake_case('this-is-a-test', separator='-')
 620     True
 621
 622     """
 623     if is_full_string(in_str):
 624         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 625         re_template = (
 626             r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 627         )
 628         r = re_map.get(
 629             separator,
 630             re.compile(
 631                 re_template.format(sign=re.escape(separator)), re.IGNORECASE
 632             ),
 633         )
 634         return r.match(in_str) is not None
 635     return False
 636
 637
 638 def is_json(in_str: Any) -> bool:
 639     """
 640     Check if a string is a valid json.
 641
 642     >>> is_json('{"name": "Peter"}')
 643     True
 644     >>> is_json('[1, 2, 3]')
 645     True
 646     >>> is_json('{nope}')
 647     False
 648     """
 649     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 650         try:
 651             return isinstance(json.loads(in_str), (dict, list))
 652         except (TypeError, ValueError, OverflowError):
 653             pass
 654     return False
 655
 656
 657 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 658     """
 659     Check if a string is a valid UUID.
 660
 661     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 662     True
 663     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 664     False
 665     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 666     True
 667     """
 668     # string casting is used to allow UUID itself as input data type
 669     s = str(in_str)
 670     if allow_hex:
 671         return UUID_HEX_OK_RE.match(s) is not None
 672     return UUID_RE.match(s) is not None
 673
 674
 675 def is_ip_v4(in_str: Any) -> bool:
 676     """
 677     Checks if a string is a valid ip v4.
 678
 679     >>> is_ip_v4('255.200.100.75')
 680     True
 681     >>> is_ip_v4('nope')
 682     False
 683     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 684     False
 685     """
 686     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 687         return False
 688
 689     # checks that each entry in the ip is in the valid range (0 to 255)
 690     for token in in_str.split("."):
 691         if not 0 <= int(token) <= 255:
 692             return False
 693     return True
 694
 695
 696 def extract_ip_v4(in_str: Any) -> Optional[str]:
 697     """
 698     Extracts the IPv4 chunk of a string or None.
 699
 700     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 701     '127.0.0.1'
 702     >>> extract_ip_v4('Your mom dresses you funny.')
 703     """
 704     if not is_full_string(in_str):
 705         return None
 706     m = ANYWHERE_IP_V4_RE.search(in_str)
 707     if m is not None:
 708         return m.group(0)
 709     return None
 710
 711
 712 def is_ip_v6(in_str: Any) -> bool:
 713     """
 714     Checks if a string is a valid ip v6.
 715
 716     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 717     True
 718     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 719     False
 720     """
 721     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 722
 723
 724 def extract_ip_v6(in_str: Any) -> Optional[str]:
 725     """
 726     Extract IPv6 chunk or None.
 727
 728     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 729     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 730     >>> extract_ip_v6("(and she's ugly too, btw)")
 731     """
 732     if not is_full_string(in_str):
 733         return None
 734     m = ANYWHERE_IP_V6_RE.search(in_str)
 735     if m is not None:
 736         return m.group(0)
 737     return None
 738
 739
 740 def is_ip(in_str: Any) -> bool:
 741     """
 742     Checks if a string is a valid ip (either v4 or v6).
 743
 744     >>> is_ip('255.200.100.75')
 745     True
 746     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
 747     True
 748     >>> is_ip('1.2.3')
 749     False
 750     >>> is_ip('1.2.3.999')
 751     False
 752     """
 753     return is_ip_v6(in_str) or is_ip_v4(in_str)
 754
 755
 756 def extract_ip(in_str: Any) -> Optional[str]:
 757     """
 758     Extract the IP address or None.
 759
 760     >>> extract_ip('Attacker: 255.200.100.75')
 761     '255.200.100.75'
 762     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 763     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 764     >>> extract_ip('1.2.3')
 765
 766     """
 767     ip = extract_ip_v4(in_str)
 768     if ip is None:
 769         ip = extract_ip_v6(in_str)
 770     return ip
 771
 772
 773 def is_mac_address(in_str: Any) -> bool:
 774     """Return True if in_str is a valid MAC address false otherwise.
 775
 776     >>> is_mac_address("34:29:8F:12:0D:2F")
 777     True
 778     >>> is_mac_address('34:29:8f:12:0d:2f')
 779     True
 780     >>> is_mac_address('34-29-8F-12-0D-2F')
 781     True
 782     >>> is_mac_address("test")
 783     False
 784     """
 785     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 786
 787
 788 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 789     """
 790     Extract the MAC address from in_str.
 791
 792     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
 793     '34:29:8F:12:0D:2F'
 794
 795     >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
 796     'd8:5d:e2:34:54:86'
 797
 798     """
 799     if not is_full_string(in_str):
 800         return None
 801     in_str.strip()
 802     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
 803     if m is not None:
 804         mac = m.group(0)
 805         mac.replace(":", separator)
 806         mac.replace("-", separator)
 807         return mac
 808     return None
 809
 810
 811 def is_slug(in_str: Any, separator: str = "-") -> bool:
 812     """
 813     Checks if a given string is a slug (as created by `slugify()`).
 814
 815     >>> is_slug('my-blog-post-title')
 816     True
 817     >>> is_slug('My blog post title')
 818     False
 819
 820     """
 821     if not is_full_string(in_str):
 822         return False
 823     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 824     return re.match(rex, in_str) is not None
 825
 826
 827 def contains_html(in_str: str) -> bool:
 828     """
 829     Checks if the given string contains HTML/XML tags.
 830
 831     By design, this function matches ANY type of tag, so don't expect to use it
 832     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 833
 834     >>> contains_html('my string is <strong>bold</strong>')
 835     True
 836     >>> contains_html('my string is not bold')
 837     False
 838
 839     """
 840     if not is_string(in_str):
 841         raise ValueError(in_str)
 842     return HTML_RE.search(in_str) is not None
 843
 844
 845 def words_count(in_str: str) -> int:
 846     """
 847     Returns the number of words contained into the given string.
 848
 849     This method is smart, it does consider only sequence of one or more letter and/or numbers
 850     as "words", so a string like this: "! @ # % ... []" will return zero!
 851     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 852     will be 4 not 1 (even if there are no spaces in the string).
 853
 854     >>> words_count('hello world')
 855     2
 856     >>> words_count('one,two,three.stop')
 857     4
 858
 859     """
 860     if not is_string(in_str):
 861         raise ValueError(in_str)
 862     return len(WORDS_COUNT_RE.findall(in_str))
 863
 864
 865 def generate_uuid(omit_dashes: bool = False) -> str:
 866     """
 867     Generated an UUID string (using `uuid.uuid4()`).
 868
 869     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 870     generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 871
 872     """
 873     uid = uuid4()
 874     if omit_dashes:
 875         return uid.hex
 876     return str(uid)
 877
 878
 879 def generate_random_alphanumeric_string(size: int) -> str:
 880     """
 881     Returns a string of the specified size containing random
 882     characters (uppercase/lowercase ascii letters and digits).
 883
 884     random_string(9) # possible output: "cx3QQbzYg"
 885
 886     """
 887     if size < 1:
 888         raise ValueError("size must be >= 1")
 889     chars = string.ascii_letters + string.digits
 890     buffer = [random.choice(chars) for _ in range(size)]
 891     return from_char_list(buffer)
 892
 893
 894 def reverse(in_str: str) -> str:
 895     """
 896     Returns the string with its chars reversed.
 897
 898     >>> reverse('test')
 899     'tset'
 900
 901     """
 902     if not is_string(in_str):
 903         raise ValueError(in_str)
 904     return in_str[::-1]
 905
 906
 907 def camel_case_to_snake_case(in_str, *, separator="_"):
 908     """
 909     Convert a camel case string into a snake case one.
 910     (The original string is returned if is not a valid camel case string)
 911
 912     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
 913     'mac_address_extractor_factory'
 914     >>> camel_case_to_snake_case('Luke Skywalker')
 915     'Luke Skywalker'
 916     """
 917     if not is_string(in_str):
 918         raise ValueError(in_str)
 919     if not is_camel_case(in_str):
 920         return in_str
 921     return CAMEL_CASE_REPLACE_RE.sub(
 922         lambda m: m.group(1) + separator, in_str
 923     ).lower()
 924
 925
 926 def snake_case_to_camel_case(
 927     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 928 ) -> str:
 929     """
 930     Convert a snake case string into a camel case one.
 931     (The original string is returned if is not a valid snake case string)
 932
 933     >>> snake_case_to_camel_case('this_is_a_test')
 934     'ThisIsATest'
 935     >>> snake_case_to_camel_case('Han Solo')
 936     'Han Solo'
 937     """
 938     if not is_string(in_str):
 939         raise ValueError(in_str)
 940     if not is_snake_case(in_str, separator=separator):
 941         return in_str
 942     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 943     if not upper_case_first:
 944         tokens[0] = tokens[0].lower()
 945     return from_char_list(tokens)
 946
 947
 948 def to_char_list(in_str: str) -> List[str]:
 949     """Convert a string into a list of chars.
 950
 951     >>> to_char_list('test')
 952     ['t', 'e', 's', 't']
 953     """
 954     if not is_string(in_str):
 955         return []
 956     return list(in_str)
 957
 958
def from_char_list(in_list: List[str]) -> str:
    """Concatenate the strings of a list into a single string.

    The items are joined with no separator, in list order; the items are
    typically single characters but longer strings are joined the same
    way.

    Args:
        in_list: the strings to concatenate.

    Returns:
        The concatenation of all items.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    return "".join(in_list)
 966
 967
 968 def shuffle(in_str: str) -> str:
 969     """Return a new string containing same chars of the given one but in
 970     a randomized order.
 971     """
 972     if not is_string(in_str):
 973         raise ValueError(in_str)
 974
 975     # turn the string into a list of chars
 976     chars = to_char_list(in_str)
 977     random.shuffle(chars)
 978     return from_char_list(chars)
 979
 980
 981 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 982     """
 983     Remove html code contained into the given string.
 984
 985     >>> strip_html('test: <a href="foo/bar">click here</a>')
 986     'test: '
 987     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
 988     'test: click here'
 989     """
 990     if not is_string(in_str):
 991         raise ValueError(in_str)
 992     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
 993     return r.sub("", in_str)
 994
 995
 996 def asciify(in_str: str) -> str:
 997     """
 998     Force string content to be ascii-only by translating all non-ascii
 999     chars into the closest possible representation (eg: ó -> o, Ë ->
1000     E, ç -> c...).
1001
1002     N.B. Some chars may be lost if impossible to translate.
1003
1004     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
1005     'eeuuooaaeynAAACIINOE'
1006     """
1007     if not is_string(in_str):
1008         raise ValueError(in_str)
1009
1010     # "NFKD" is the algorithm which is able to successfully translate
1011     # the most of non-ascii chars.
1012     normalized = unicodedata.normalize("NFKD", in_str)
1013
1014     # encode string forcing ascii and ignore any errors
1015     # (unrepresentable chars will be stripped out)
1016     ascii_bytes = normalized.encode("ascii", "ignore")
1017
1018     # turns encoded bytes into an utf-8 string
1019     return ascii_bytes.decode("utf-8")
1020
1021
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Turn a string into a "slug" whose words are joined by the separator.
    The resulting string:

    - contains no spaces
    - is entirely lower case
    - has every punctuation sign / non alphanumeric char removed
    - has its words divided by the provided separator
    - is ascii only (courtesy of `asciify()`)
    - is safe to use in a URL

    Raises ValueError when the argument does not pass is_string().

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    lowered = in_str.lower()

    # everything that is neither a letter nor a number becomes a blank
    words = NO_LETTERS_OR_NUMBERS_RE.sub(" ", lowered).strip()

    # blanks become the join sign
    joined = SPACES_RE.sub(separator, words)

    # collapse runs of the join sign into a single one
    repeated_separator = re.escape(separator) + r"+"
    collapsed = re.sub(repeated_separator, separator, joined)
    return asciify(collapsed)
1051
1052
def to_bool(in_str: str) -> bool:
    """
    Interpret the content of a string as a boolean (CASE INSENSITIVE).

    True is returned when the lower-cased string is one of:

    - "true"
    - "1"
    - "yes"
    - "y"
    - "t"
    - "on"

    Anything else yields False.  Raises ValueError when the argument
    does not pass is_string().

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True

    """
    if is_string(in_str):
        truthy = ("true", "1", "yes", "y", "t", "on")
        return in_str.lower() in truthy
    raise ValueError(in_str)
1089
1090
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Parse a date string with DateParser (see its docs for the accepted
    formats).

    Returns the parsed date, or None -- after logging and issuing a
    warning -- when the parser raises ParseException.
    """
    import dateparse.dateparse_utils as dp
    try:
        parser = dp.DateParser()
        parser.parse(in_str)
        return parser.get_date()
    except dp.ParseException:
        message = f'Unable to parse date {in_str}.'
        logger.warning(message)
        warnings.warn(message)
        return None
1105
1106
def valid_date(in_str: str) -> bool:
    """
    Tell whether DateParser is able to parse the string.

    Returns True on a successful parse; on ParseException a warning is
    logged and issued and False is returned.
    """
    import dateparse.dateparse_utils as dp
    try:
        parser = dp.DateParser()
        parser.parse(in_str)
    except dp.ParseException:
        message = f'Unable to parse date {in_str}.'
        logger.warning(message)
        warnings.warn(message)
        return False
    else:
        return True
1121
1122
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Parses a datetime string.  See DateParser docs for more info.

    Returns the value produced by DateParser.parse() when it is a
    datetime.datetime (or a subclass of it).  Returns None when the parse
    result is of another type, or -- after logging and issuing a warning --
    when parsing fails.
    """
    import dateparse.dateparse_utils as dp
    try:
        parser = dp.DateParser()
        dt = parser.parse(in_str)
        if isinstance(dt, datetime.datetime):
            return dt
    # to_date() / valid_date() treat dp.ParseException as the parse-failure
    # signal; catch it here too so an unparsable string yields None instead
    # of propagating.  ValueError is still caught as before.
    except (dp.ParseException, ValueError):
        msg = f'Unable to parse datetime {in_str}.'
        logger.warning(msg)
        warnings.warn(msg)
    return None
1138
1139
def valid_datetime(in_str: str) -> bool:
    """
    Tell whether to_datetime() can turn the string into a datetime.

    When it cannot, a warning is logged and issued and False is returned.
    """
    parsed = to_datetime(in_str)
    if parsed is None:
        message = f'Unable to parse datetime {in_str}.'
        logger.warning(message)
        warnings.warn(message)
        return False
    return True
1151
1152
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Collapse every run of consecutive character_to_squeeze occurrences
    into a single occurrence.  The thing to squeeze may be longer than one
    character.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    """
    # one or more back-to-back copies of the (escaped) target
    run_of_targets = f'({re.escape(character_to_squeeze)})+'
    return re.sub(run_of_targets, character_to_squeeze, in_str)
1169
1170
def dedent(in_str: str) -> str:
    """
    Strip the leading margin (as matched by MARGIN_RE) from every line of a
    multi line string (inspired by the analogous Scala function).

    Raises ValueError when the argument does not pass is_string().
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    return newline.join(
        MARGIN_RE.sub('', row) for row in in_str.split(newline)
    )
1180
1181
def indent(in_str: str, amount: int) -> str:
    """
    Prefix every line of the string with amount spaces.

    Raises ValueError when the argument does not pass is_string().

    >>> indent('This is a test', 4)
    '    This is a test'

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    padding = " " * amount
    return newline.join(padding + row for row in in_str.split(newline))
1195
1196
def sprintf(*args, **kwargs) -> str:
    """Build the text that print() would emit for the same arguments and
    return it instead of writing it anywhere.

    Positional arguments are converted with str() and joined with sep; end
    is appended after the last one.

    Keyword arguments:
        sep: string placed between arguments (default: a single space).
        end: string appended at the end (default: a newline).

    Raises:
        TypeError: if sep or end is neither None nor a string, or if any
            other keyword argument is passed.

    >>> sprintf('a', 1, 2.5)
    'a 1 2.5\\n'
    >>> sprintf('x', 'y', sep='-', end='')
    'x-y'

    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # anything still left in kwargs is unsupported
    if kwargs:
        raise TypeError("invalid keyword arguments to sprintf()")

    if sep is None:
        sep = " "
    if end is None:
        end = "\n"

    # strings are used as-is, everything else goes through str()
    pieces = [arg if isinstance(arg, str) else str(arg) for arg in args]
    return sep.join(pieces) + end
1227
1228
class SprintfStdout:
    """
    Context manager that records whatever is written to stdout inside
    its body.  Entering it yields a callable returning the text captured
    so far.

    with SprintfStdout() as buf:
        print("test")
    print(buf())

    'test\n'
    """
    def __init__(self) -> None:
        # set in __enter__: the active contextlib.redirect_stdout
        self.recorder = None
        # in-memory buffer that receives the redirected output
        self.destination = io.StringIO()

    def __enter__(self) -> Callable[[], str]:
        redirect = contextlib.redirect_stdout(self.destination)
        self.recorder = redirect
        redirect.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *args) -> None:
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # falling through returns None, so exceptions are not suppressed
1252
1253
def is_are(n: int) -> str:
    """Pick the verb form that agrees with a count: 'is' for exactly
    one, 'are' for anything else.

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
1266
1267
def pluralize(n: int) -> str:
    """Pick the plural suffix that agrees with a count: nothing for
    exactly one, 's' for anything else.

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.

    """
    return "" if n == 1 else "s"
1284
1285
def thify(n: int) -> str:
    """Return the proper ordinal suffix ('st', 'nd', 'rd' or 'th') for a
    number.

    Numbers ending in 11, 12 or 13 take 'th' (11th, 112th, 213th) even
    though their last digit alone would suggest otherwise.

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'
    >>> thify(22)
    'nd'

    """
    digits = str(n)
    assert is_integer_number(digits)

    # the "teens" are irregular: check the last two digits first
    if digits[-2:] in ("11", "12", "13"):
        return "th"

    suffix_by_last_digit = {"1": "st", "2": "nd", "3": "rd"}
    return suffix_by_last_digit.get(digits[-1:], "th")
1308
1309
def ngrams(txt: str, n: int):
    """Generate the ngrams of a string, each one rendered as its words
    joined by single spaces.  The text is tokenized on whitespace.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    """
    tokens = txt.split()
    for gram in ngrams_presplit(tokens, n):
        yield ' '.join(f'{token}' for token in gram).strip()
1323
1324
def ngrams_presplit(words: Sequence[str], n: int):
    """Return the ngrams of an already tokenized sequence of words.

    Thin wrapper: the words and n are handed to list_utils.ngrams and its
    result is returned untouched.
    """
    grams = list_utils.ngrams(words, n)
    return grams
1327
1328
def bigrams(txt: str):
    """Return the 2-grams of txt; shorthand for ngrams(txt, 2)."""
    size = 2
    return ngrams(txt, size)
1331
1332
def trigrams(txt: str):
    """Return the 3-grams of txt; shorthand for ngrams(txt, 3)."""
    size = 3
    return ngrams(txt, size)
1335
1336
def shuffle_columns_into_list(
        input_lines: Sequence[str],
        column_specs: Iterable[Iterable[int]],
        delim: str = ''
) -> List[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.

    Args:
        input_lines: the columns to pick from; must support indexing.
        column_specs: an iterable of numeric sequences; each sequence names
            one or more column numbers whose values are joined (in the
            given order) into one output item.
        delim: string placed between the joined columns.  Any characters
            of delim found at either end of an output item are stripped.

    Returns:
        One string per entry of column_specs, in the same order.

    Raises:
        IndexError: if a column number is out of range for input_lines.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']

    """
    # Column specs map input lines' columns into outputs.
    # [col1, col2...]
    return [
        delim.join(input_lines[n] for n in spec).strip(delim)
        for spec in column_specs
    ]
1367
1368
def shuffle_columns_into_dict(
        input_lines: Sequence[str],
        column_specs: Iterable[Tuple[str, Iterable[int]]],
        delim: str = ''
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: the columns to pick from; must support indexing.
        column_specs: an iterable of (key, column numbers) pairs; the
            values of the named columns are joined (in the given order)
            and stored under key.
        delim: string placed between the joined columns.  Any characters
            of delim found at either end of a value are stripped.

    Returns:
        A dict with one entry per spec; a key that appears more than once
        keeps the value of its last spec.

    Raises:
        IndexError: if a column number is out of range for input_lines.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}

    """
    # Column specs map input lines' columns into outputs.
    # "key", [col1, col2...]
    return {
        spec[0]: delim.join(input_lines[n] for n in spec[1]).strip(delim)
        for spec in column_specs
    }
1397
1398
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Interpolate a string with data from a dict.

    Every {name} replacement field in txt is filled in with values[name],
    following the rules of str.format().

    Raises:
        KeyError: if txt references a name that is missing from values.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'

    """
    # str.format() already returns the finished string; no further
    # processing is needed.
    return txt.format(**values)
1408
1409
def to_ascii(x: str) -> bytes:
    """Encode as ascii bytes string.

    A str (or str subclass) is encoded with the ascii codec; bytes (or a
    bytes subclass) are returned unchanged.

    Raises:
        UnicodeEncodeError: if a str contains non-ascii characters.
        TypeError: if x is neither a str nor bytes.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'

    """
    # isinstance (rather than an exact type comparison) so that subclasses
    # of str / bytes are accepted too
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    raise TypeError('to_ascii works with strings and bytes')
1425
1426
def to_base64(txt: str, *, encoding: str = 'utf-8', errors: str = 'surrogatepass') -> bytes:
    """Encode txt and then encode the bytes with a 64-character
    alphabet.  This is compatible with uudecode.

    Args:
        txt: the text to encode.
        encoding: codec used to turn txt into bytes first.
        errors: error handler passed to str.encode().

    Returns:
        The base64 representation as bytes (not str), exactly as produced
        by base64.encodebytes(), i.e. including its trailing newline.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
1436
1437
def is_base64(txt: str) -> bool:
    """Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet which is the same as what uuencode uses).

    Accepts str or bytes.  Surrounding whitespace is ignored, as are the
    line breaks base64.encodebytes() inserts into long output, and up to
    two trailing '=' padding characters are allowed.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # So is '=' padding at the end.
    True

    """
    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    # iterating over bytes yields ints, so the alphabet is a set of ints
    alphabet = set(a.encode('ascii'))

    payload = to_ascii(txt.strip())
    # encodebytes() wraps its output, so line breaks may appear anywhere
    payload = payload.replace(b'\n', b'').replace(b'\r', b'')

    # padding is only legal at the very end, and there are at most two
    body = payload.rstrip(b'=')
    if len(payload) - len(body) > 2:
        return False
    return all(byte in alphabet for byte in body)
1458
1459
def from_base64(b64: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert base64 encoded string back to normal strings.

    Args:
        b64: the base64 data, either as bytes (what to_base64 returns) or
            as an ascii-only str.
        encoding: codec used to decode the resulting bytes into a str.
        errors: error handler passed to bytes.decode().

    Raises:
        UnicodeEncodeError: if a str argument contains non-ascii chars.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    >>> from_base64('aGVsbG8/')
    'hello?'

    """
    # base64.decodebytes() only takes bytes-like input; convert text first
    raw = b64.encode('ascii') if isinstance(b64, str) else b64
    return base64.decodebytes(raw).decode(encoding, errors)
1468
1469
def chunk(txt: str, chunk_size: int) -> Iterable[str]:
    """Chunk up a string.

    Generator yielding consecutive slices of txt that are chunk_size long.
    When len(txt) is not a multiple of chunk_size a warning is logged and
    issued (on first iteration) and the final slice is shorter.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    """
    if len(txt) % chunk_size != 0:
        msg = f"String to chunk's length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})"
        logger.warning(msg)
        warnings.warn(msg)
    for start in range(0, len(txt), chunk_size):
        yield txt[start:start + chunk_size]
1483
1484
def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
    """Encode txt and then chop it into bytes.  Note: only bitstrings
    with delimiter='' are interpretable by from_bitstring.

    Args:
        txt: the text to convert; bytes are accepted too and used as-is.
        delimiter: string placed between the 8-bit groups.
        encoding: codec used to turn a str into bytes.
        errors: error handler passed to str.encode().

    Returns:
        Eight '0'/'1' characters per byte, most significant bit first.
        Leading zero bytes are preserved.  Empty input yields
        '00000000'.

    Raises:
        TypeError: if txt is neither a str nor bytes.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    >>> to_bitstring(b'\\x00a')
    '0000000001100001'

    """
    if isinstance(txt, bytes):
        data = txt
    elif isinstance(txt, str):
        data = txt.encode(encoding, errors)
    else:
        raise TypeError('to_bitstring works with strings and bytes')

    if not data:
        # kept for backward compatibility: empty input has always been
        # rendered as a single zero byte
        return '0' * 8

    # format byte by byte so that leading zero bytes are not lost
    return delimiter.join(f'{byte:08b}' for byte in data)
1508
1509
def is_bitstring(txt: str) -> bool:
    """Tell whether txt is a bitstring, i.e. whether it reads as a binary
    integer literal once prefixed with '0b'.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    """
    candidate = f'0b{txt}'
    return is_binary_integer_number(candidate)
1521
1522
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    Args:
        bits: a string of '0' / '1' characters, such as the output of
            to_bitstring with delimiter=''.
        encoding: codec used to decode the bytes.
        errors: error handler passed to bytes.decode().

    Leading zero bits are significant: every full group of eight zeros at
    the front becomes a NUL character in the result.

    Raises:
        ValueError: if bits is not a valid base-2 number (e.g. empty).

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    >>> from_bitstring('0000000001100001')
    '\\x00a'

    """
    n = int(bits, 2)
    if isinstance(bits, str) and set(bits) <= {'0', '1'}:
        # plain bitstring: size the result from the number of digits so
        # that leading zero bytes survive the round trip
        num_bytes = (len(bits) + 7) // 8
    else:
        # other spellings int() accepts (prefix, whitespace, underscores):
        # fall back to the minimal width of the value
        num_bytes = (n.bit_length() + 7) // 8
    # a zero value of minimal width decodes to '', reported as NUL instead
    return n.to_bytes(num_bytes, 'big').decode(encoding, errors) or '\0'
1532
1533
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple for sorting purposes.

    Returns:
        The dot-separated parts of the address as a tuple of ints, or None
        (after logging a warning) when txt does not pass is_ip_v4().

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']

    """
    if not is_ip_v4(txt):
        # report through the module logger rather than writing to stdout
        logger.warning(f"not IP: {txt}")
        return None
    return tuple(int(part) for part in txt.split('.'))
1549
1550
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Chunk up a file path so that parent/ancestor paths sort before
    children/descendant paths.

    The path is split on '/' and empty components (from leading, trailing
    or doubled slashes) are dropped, so a parent's key is always a prefix
    of its descendants' keys.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']

    """
    return tuple(part for part in volume.split('/') if part)
1564
1565
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Run one str.replace() per item of replace_set, each on the result
    of the previous one, swapping the item for replacement.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'

    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
1577
1578
# When executed as a script (rather than imported), run the doctests
# embedded in this module's docstrings.
if __name__ == '__main__':
    import doctest
    doctest.testmod()