string_utils.py

   1 #!/usr/bin/env python3
   2
   3 import base64
   4 import contextlib
   5 import datetime
   6 import io
   7 from itertools import zip_longest
   8 import json
   9 import logging
  10 import numbers
  11 import random
  12 import re
  13 import string
  14 from typing import (
  15     Any,
  16     Callable,
  17     Dict,
  18     Iterable,
  19     List,
  20     Optional,
  21     Sequence,
  22     Tuple,
  23 )
  24 import unicodedata
  25 from uuid import uuid4
  26 import warnings
  27
  28 import list_utils
  29
  30 logger = logging.getLogger(__name__)
  31
  32 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  33
  34 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  35
  36 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  37
  38 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  39
# URL pattern assembled from one capture group per component.  It lists
# lowercase letters only because both compiled forms below add re.IGNORECASE.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    # NOTE: "%-=" inside the class below is a character RANGE (0x25..0x3D),
    # which is what lets "&", "." and "=" through in the query string.
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Anchored form: the whole string has to be exactly one URL.
URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

# Unanchored form, wrapped in one extra capture group.
URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
  55
# Matches either a run of "@" that is followed (after any non-quote
# characters) by a double quote, or a backslash-escaped "@".  Used by
# is_email() to neutralise extra "@" signs in the local part.
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# local-part "@" domain.  The domain classes are lowercase-only and the
# compiled forms below do NOT set re.IGNORECASE; the final label is limited
# to 2-4 letters.
EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"

# Anchored form: the whole string has to be exactly one address.
EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))

# Unanchored form, wrapped in one capture group.
EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  63
# Whole-string test: ASCII letters/digits only, must start with a letter and
# contain at least one lower->upper or upper->lower transition.
CAMEL_CASE_TEST_RE = re.compile(
    r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
)

# Captures the character (or run of capitals) sitting right before an
# uppercase letter; the lookahead leaves that uppercase letter unconsumed.
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Whole-string snake_case test (case-insensitive): either letters, optional
# digits and then an underscore, or leading underscores followed by content.
SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

# Same test with "-" as the separator.  NOTE: unlike the underscore variant
# this pattern has no leading "^"; is_snake_case() applies it with .match(),
# which anchors at the start anyway.
SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

# A separator followed by one lowercase letter or digit (two groups).
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
  81
# Card-type name -> anchored pattern over a digits-only card number.  These
# check prefix and length only; no checksum is computed here.
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}

# Cheap shape check: text that starts with "[" or "{" and ends with "]" or
# "}" (surrounding whitespace allowed).  The bracket kinds are not paired.
JSON_WRAPPER_RE = re.compile(
    r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
)

# Canonical 8-4-4-4-12 hexadecimal UUID with mandatory dashes.
UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# Same layout, but every dash is optional (accepts the 32-char hex form).
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)
 103
 104 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
 105
 106 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
 107
 108 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
 109
 110 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
 111
# Six hex byte pairs separated by ":" or "-" (separators may be mixed).
MAC_ADDRESS_RE = re.compile(
    r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE
)

# Unanchored variant of MAC_ADDRESS_RE.
ANYWHERE_MAC_ADDRESS_RE = re.compile(
    r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)

# One "word": a run of letters/digits (underscore excluded) together with
# any surrounding non-word characters.
WORDS_COUNT_RE = re.compile(
    r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
)

# An opening tag plus, optionally, everything up to a closing tag; also
# matches comments and doctype declarations.  DOTALL lets it span lines.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Tags only (opening or closing), comments and doctype declarations; the
# text between an opening and a closing tag is NOT part of the match.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# A single whitespace character.
SPACES_RE = re.compile(r"\s")

# A run of characters that are neither letters nor digits, or a run of
# underscores (underscore is handled separately because \w includes it).
NO_LETTERS_OR_NUMBERS_RE = re.compile(
    r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
)

# Leading horizontal whitespace (whitespace other than CR / LF).
MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 141
 142 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 143
# Size suffix -> multiplier in bytes (powers of 1024).  Both the two-letter
# ("Kb") and one-letter ("K") spellings are listed.
# Insertion order matters: number_to_suffix_string() walks this dict and
# stops at the first size that fits, so larger units must stay first.
NUM_SUFFIXES = {
    "Pb": (1024 ** 5),
    "P": (1024 ** 5),
    "Tb": (1024 ** 4),
    "T": (1024 ** 4),
    "Gb": (1024 ** 3),
    "G": (1024 ** 3),
    "Mb": (1024 ** 2),
    "M": (1024 ** 2),
    "Kb": (1024 ** 1),
    "K": (1024 ** 1),
}
 156
 157
 158 def is_none_or_empty(in_str: Optional[str]) -> bool:
 159     """
 160     Returns true if the input string is either None or an empty string.
 161
 162     >>> is_none_or_empty("")
 163     True
 164     >>> is_none_or_empty(None)
 165     True
 166     >>> is_none_or_empty("   \t   ")
 167     True
 168     >>> is_none_or_empty('Test')
 169     False
 170     """
 171     return in_str is None or len(in_str.strip()) == 0
 172
 173
 174 def is_string(obj: Any) -> bool:
 175     """
 176     Checks if an object is a string.
 177
 178     >>> is_string('test')
 179     True
 180     >>> is_string(123)
 181     False
 182     >>> is_string(100.3)
 183     False
 184     >>> is_string([1, 2, 3])
 185     False
 186     """
 187     return isinstance(obj, str)
 188
 189
 190 def is_empty_string(in_str: Any) -> bool:
 191     return is_empty(in_str)
 192
 193
 194 def is_empty(in_str: Any) -> bool:
 195     """
 196     Checks if input is a string and empty or only whitespace.
 197
 198     >>> is_empty('')
 199     True
 200     >>> is_empty('    \t\t    ')
 201     True
 202     >>> is_empty('test')
 203     False
 204     >>> is_empty(100.88)
 205     False
 206     >>> is_empty([1, 2, 3])
 207     False
 208     """
 209     return is_string(in_str) and in_str.strip() == ""
 210
 211
 212 def is_full_string(in_str: Any) -> bool:
 213     """
 214     Checks that input is a string and is not empty ('') or only whitespace.
 215
 216     >>> is_full_string('test!')
 217     True
 218     >>> is_full_string('')
 219     False
 220     >>> is_full_string('      ')
 221     False
 222     >>> is_full_string(100.999)
 223     False
 224     >>> is_full_string({"a": 1, "b": 2})
 225     False
 226     """
 227     return is_string(in_str) and in_str.strip() != ""
 228
 229
 230 def is_number(in_str: str) -> bool:
 231     """
 232     Checks if a string is a valid number.
 233
 234     >>> is_number(100.5)
 235     Traceback (most recent call last):
 236     ...
 237     ValueError: 100.5
 238     >>> is_number("100.5")
 239     True
 240     >>> is_number("test")
 241     False
 242     >>> is_number("99")
 243     True
 244     >>> is_number([1, 2, 3])
 245     Traceback (most recent call last):
 246     ...
 247     ValueError: [1, 2, 3]
 248     """
 249     if not is_string(in_str):
 250         raise ValueError(in_str)
 251     return NUMBER_RE.match(in_str) is not None
 252
 253
 254 def is_integer_number(in_str: str) -> bool:
 255     """
 256     Checks whether the given string represents an integer or not.
 257
 258     An integer may be signed or unsigned or use a "scientific notation".
 259
 260     >>> is_integer_number('42')
 261     True
 262     >>> is_integer_number('42.0')
 263     False
 264     """
 265     return (
 266         (is_number(in_str) and "." not in in_str)
 267         or is_hexidecimal_integer_number(in_str)
 268         or is_octal_integer_number(in_str)
 269         or is_binary_integer_number(in_str)
 270     )
 271
 272
 273 def is_hexidecimal_integer_number(in_str: str) -> bool:
 274     """
 275     Checks whether a string is a hex integer number.
 276
 277     >>> is_hexidecimal_integer_number('0x12345')
 278     True
 279     >>> is_hexidecimal_integer_number('0x1A3E')
 280     True
 281     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 282     False
 283     >>> is_hexidecimal_integer_number('-0xff')
 284     True
 285     >>> is_hexidecimal_integer_number('test')
 286     False
 287     >>> is_hexidecimal_integer_number(12345)  # Not a string
 288     Traceback (most recent call last):
 289     ...
 290     ValueError: 12345
 291     >>> is_hexidecimal_integer_number(101.4)
 292     Traceback (most recent call last):
 293     ...
 294     ValueError: 101.4
 295     >>> is_hexidecimal_integer_number(0x1A3E)
 296     Traceback (most recent call last):
 297     ...
 298     ValueError: 6718
 299     """
 300     if not is_string(in_str):
 301         raise ValueError(in_str)
 302     return HEX_NUMBER_RE.match(in_str) is not None
 303
 304
 305 def is_octal_integer_number(in_str: str) -> bool:
 306     """
 307     Checks whether a string is an octal number.
 308
 309     >>> is_octal_integer_number('0o777')
 310     True
 311     >>> is_octal_integer_number('-0O115')
 312     True
 313     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 314     False
 315     >>> is_octal_integer_number('7777')  # Needs 0o
 316     False
 317     >>> is_octal_integer_number('test')
 318     False
 319     """
 320     if not is_string(in_str):
 321         raise ValueError(in_str)
 322     return OCT_NUMBER_RE.match(in_str) is not None
 323
 324
 325 def is_binary_integer_number(in_str: str) -> bool:
 326     """
 327     Returns whether a string contains a binary number.
 328
 329     >>> is_binary_integer_number('0b10111')
 330     True
 331     >>> is_binary_integer_number('-0b111')
 332     True
 333     >>> is_binary_integer_number('0B10101')
 334     True
 335     >>> is_binary_integer_number('0b10102')
 336     False
 337     >>> is_binary_integer_number('0xFFF')
 338     False
 339     >>> is_binary_integer_number('test')
 340     False
 341     """
 342     if not is_string(in_str):
 343         raise ValueError(in_str)
 344     return BIN_NUMBER_RE.match(in_str) is not None
 345
 346
 347 def to_int(in_str: str) -> int:
 348     """Returns the integral value of the string or raises on error.
 349
 350     >>> to_int('1234')
 351     1234
 352     >>> to_int('test')
 353     Traceback (most recent call last):
 354     ...
 355     ValueError: invalid literal for int() with base 10: 'test'
 356     """
 357     if not is_string(in_str):
 358         raise ValueError(in_str)
 359     if is_binary_integer_number(in_str):
 360         return int(in_str, 2)
 361     if is_octal_integer_number(in_str):
 362         return int(in_str, 8)
 363     if is_hexidecimal_integer_number(in_str):
 364         return int(in_str, 16)
 365     return int(in_str)
 366
 367
 368 def is_decimal_number(in_str: str) -> bool:
 369     """
 370     Checks whether the given string represents a decimal or not.
 371
 372     A decimal may be signed or unsigned or use a "scientific notation".
 373
 374     >>> is_decimal_number('42.0')
 375     True
 376     >>> is_decimal_number('42')
 377     False
 378     """
 379     return is_number(in_str) and "." in in_str
 380
 381
 382 def strip_escape_sequences(in_str: str) -> str:
 383     """
 384     Remove escape sequences in the input string.
 385
 386     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 387     'this is a test!'
 388     """
 389     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 390     return in_str
 391
 392
 393 def add_thousands_separator(
 394     in_str: str, *, separator_char=',', places=3
 395 ) -> str:
 396     """
 397     Add thousands separator to a numeric string.  Also handles numbers.
 398
 399     >>> add_thousands_separator('12345678')
 400     '12,345,678'
 401     >>> add_thousands_separator(12345678)
 402     '12,345,678'
 403     >>> add_thousands_separator(12345678.99)
 404     '12,345,678.99'
 405     >>> add_thousands_separator('test')
 406     Traceback (most recent call last):
 407     ...
 408     ValueError: test
 409
 410     """
 411     if isinstance(in_str, numbers.Number):
 412         in_str = f'{in_str}'
 413     if is_number(in_str):
 414         return _add_thousands_separator(
 415             in_str, separator_char=separator_char, places=places
 416         )
 417     raise ValueError(in_str)
 418
 419
 420 def _add_thousands_separator(
 421     in_str: str, *, separator_char=',', places=3
 422 ) -> str:
 423     decimal_part = ""
 424     if '.' in in_str:
 425         (in_str, decimal_part) = in_str.split('.')
 426     tmp = [iter(in_str[::-1])] * places
 427     ret = separator_char.join(
 428         "".join(x) for x in zip_longest(*tmp, fillvalue="")
 429     )[::-1]
 430     if len(decimal_part) > 0:
 431         ret += '.'
 432         ret += decimal_part
 433     return ret
 434
 435
# Full url example:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 438 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 439     """
 440     Check if a string is a valid url.
 441
 442     >>> is_url('http://www.mysite.com')
 443     True
 444     >>> is_url('https://mysite.com')
 445     True
 446     >>> is_url('.mysite.com')
 447     False
 448     """
 449     if not is_full_string(in_str):
 450         return False
 451
 452     valid = URL_RE.match(in_str) is not None
 453
 454     if allowed_schemes:
 455         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 456     return valid
 457
 458
def is_email(in_str: Any) -> bool:
    """
    Check if a string is a valid email.

    Reference: https://tools.ietf.org/html/rfc3696#section-3

    The check is: overall length <= 320, no leading dot, exactly one
    unescaped "@", local part <= 64 chars, domain <= 255 chars, local part
    not ending with a dot and without "..", and finally a match against
    EMAIL_RE.  Non-strings and blank strings yield False.

    >>> is_email('my.email@the-provider.com')
    True
    >>> is_email('@gmail.com')
    False
    """
    if (
        not is_full_string(in_str)
        or len(in_str) > 320
        or in_str.startswith(".")
    ):
        return False

    try:
        # we expect 2 tokens, one before "@" and one after, otherwise
        # the unpacking raises ValueError and we land in the handler below.
        head, tail = in_str.split("@")

        # head's size must be <= 64, tail <= 255, head must not end
        # with a dot or contain multiple consecutive dots (a leading dot
        # was already rejected above).
        if (
            len(head) > 64
            or len(tail) > 255
            or head.endswith(".")
            or (".." in head)
        ):
            return False

        # removes escaped spaces, so that later on the test regex will
        # accept the string.
        head = head.replace("\\ ", "")
        # a quoted local part: drop the spaces and the surrounding quotes.
        if head.startswith('"') and head.endswith('"'):
            head = head.replace(" ", "")[1:-1]
        return EMAIL_RE.match(head + "@" + tail) is not None

    except ValueError:
        # borderline case in which we have multiple "@" signs but the
        # head part is correctly escaped.
        if ESCAPED_AT_SIGN.search(in_str) is not None:
            # replace the escaped/quoted "@" with "a" and validate again
            return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
        return False
 506
 507
 508 def suffix_string_to_number(in_str: str) -> Optional[int]:
 509     """Take a string like "33Gb" and convert it into a number (of bytes)
 510     like 34603008.  Return None if the input string is not valid.
 511
 512     >>> suffix_string_to_number('1Mb')
 513     1048576
 514     >>> suffix_string_to_number('13.1Gb')
 515     14066017894
 516     """
 517
 518     def suffix_capitalize(s: str) -> str:
 519         if len(s) == 1:
 520             return s.upper()
 521         elif len(s) == 2:
 522             return f"{s[0].upper()}{s[1].lower()}"
 523         return suffix_capitalize(s[0:1])
 524
 525     if is_string(in_str):
 526         if is_integer_number(in_str):
 527             return to_int(in_str)
 528         suffixes = [in_str[-2:], in_str[-1:]]
 529         rest = [in_str[:-2], in_str[:-1]]
 530         for x in range(len(suffixes)):
 531             s = suffixes[x]
 532             s = suffix_capitalize(s)
 533             multiplier = NUM_SUFFIXES.get(s, None)
 534             if multiplier is not None:
 535                 r = rest[x]
 536                 if is_integer_number(r):
 537                     return to_int(r) * multiplier
 538                 if is_decimal_number(r):
 539                     return int(float(r) * multiplier)
 540     return None
 541
 542
 543 def number_to_suffix_string(num: int) -> Optional[str]:
 544     """Take a number (of bytes) and returns a string like "43.8Gb".
 545     Returns none if the input is invalid.
 546
 547     >>> number_to_suffix_string(14066017894)
 548     '13.1Gb'
 549     >>> number_to_suffix_string(1024 * 1024)
 550     '1.0Mb'
 551
 552     """
 553     d = 0.0
 554     suffix = None
 555     for (sfx, size) in NUM_SUFFIXES.items():
 556         if num >= size:
 557             d = num / size
 558             suffix = sfx
 559             break
 560     if suffix is not None:
 561         return f"{d:.1f}{suffix}"
 562     else:
 563         return f'{num:d}'
 564
 565
 566 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 567     """
 568     Checks if a string is a valid credit card number.
 569     If card type is provided then it checks against that specific type only,
 570     otherwise any known credit card number will be accepted.
 571
 572     Supported card types are the following:
 573
 574     - VISA
 575     - MASTERCARD
 576     - AMERICAN_EXPRESS
 577     - DINERS_CLUB
 578     - DISCOVER
 579     - JCB
 580     """
 581     if not is_full_string(in_str):
 582         return False
 583
 584     if card_type is not None:
 585         if card_type not in CREDIT_CARDS:
 586             raise KeyError(
 587                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 588             )
 589         return CREDIT_CARDS[card_type].match(in_str) is not None
 590     for c in CREDIT_CARDS:
 591         if CREDIT_CARDS[c].match(in_str) is not None:
 592             return True
 593     return False
 594
 595
 596 def is_camel_case(in_str: Any) -> bool:
 597     """
 598     Checks if a string is formatted as camel case.
 599
 600     A string is considered camel case when:
 601
 602     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 603     - it contains both lowercase and uppercase letters
 604     - it does not start with a number
 605     """
 606     return (
 607         is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 608     )
 609
 610
 611 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 612     """
 613     Checks if a string is formatted as "snake case".
 614
 615     A string is considered snake case when:
 616
 617     - it's composed only by lowercase/uppercase letters and digits
 618     - it contains at least one underscore (or provided separator)
 619     - it does not start with a number
 620
 621     >>> is_snake_case('this_is_a_test')
 622     True
 623     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 624     True
 625     >>> is_snake_case('this-is-a-test')
 626     False
 627     >>> is_snake_case('this-is-a-test', separator='-')
 628     True
 629
 630     """
 631     if is_full_string(in_str):
 632         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 633         re_template = (
 634             r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 635         )
 636         r = re_map.get(
 637             separator,
 638             re.compile(
 639                 re_template.format(sign=re.escape(separator)), re.IGNORECASE
 640             ),
 641         )
 642         return r.match(in_str) is not None
 643     return False
 644
 645
 646 def is_json(in_str: Any) -> bool:
 647     """
 648     Check if a string is a valid json.
 649
 650     >>> is_json('{"name": "Peter"}')
 651     True
 652     >>> is_json('[1, 2, 3]')
 653     True
 654     >>> is_json('{nope}')
 655     False
 656     """
 657     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 658         try:
 659             return isinstance(json.loads(in_str), (dict, list))
 660         except (TypeError, ValueError, OverflowError):
 661             pass
 662     return False
 663
 664
 665 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 666     """
 667     Check if a string is a valid UUID.
 668
 669     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 670     True
 671     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 672     False
 673     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 674     True
 675     """
 676     # string casting is used to allow UUID itself as input data type
 677     s = str(in_str)
 678     if allow_hex:
 679         return UUID_HEX_OK_RE.match(s) is not None
 680     return UUID_RE.match(s) is not None
 681
 682
 683 def is_ip_v4(in_str: Any) -> bool:
 684     """
 685     Checks if a string is a valid ip v4.
 686
 687     >>> is_ip_v4('255.200.100.75')
 688     True
 689     >>> is_ip_v4('nope')
 690     False
 691     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 692     False
 693     """
 694     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 695         return False
 696
 697     # checks that each entry in the ip is in the valid range (0 to 255)
 698     for token in in_str.split("."):
 699         if not 0 <= int(token) <= 255:
 700             return False
 701     return True
 702
 703
 704 def extract_ip_v4(in_str: Any) -> Optional[str]:
 705     """
 706     Extracts the IPv4 chunk of a string or None.
 707
 708     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 709     '127.0.0.1'
 710     >>> extract_ip_v4('Your mom dresses you funny.')
 711     """
 712     if not is_full_string(in_str):
 713         return None
 714     m = ANYWHERE_IP_V4_RE.search(in_str)
 715     if m is not None:
 716         return m.group(0)
 717     return None
 718
 719
 720 def is_ip_v6(in_str: Any) -> bool:
 721     """
 722     Checks if a string is a valid ip v6.
 723
 724     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 725     True
 726     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 727     False
 728     """
 729     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 730
 731
 732 def extract_ip_v6(in_str: Any) -> Optional[str]:
 733     """
 734     Extract IPv6 chunk or None.
 735
 736     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 737     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 738     >>> extract_ip_v6("(and she's ugly too, btw)")
 739     """
 740     if not is_full_string(in_str):
 741         return None
 742     m = ANYWHERE_IP_V6_RE.search(in_str)
 743     if m is not None:
 744         return m.group(0)
 745     return None
 746
 747
 748 def is_ip(in_str: Any) -> bool:
 749     """
 750     Checks if a string is a valid ip (either v4 or v6).
 751
 752     >>> is_ip('255.200.100.75')
 753     True
 754     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
 755     True
 756     >>> is_ip('1.2.3')
 757     False
 758     >>> is_ip('1.2.3.999')
 759     False
 760     """
 761     return is_ip_v6(in_str) or is_ip_v4(in_str)
 762
 763
 764 def extract_ip(in_str: Any) -> Optional[str]:
 765     """
 766     Extract the IP address or None.
 767
 768     >>> extract_ip('Attacker: 255.200.100.75')
 769     '255.200.100.75'
 770     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 771     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 772     >>> extract_ip('1.2.3')
 773
 774     """
 775     ip = extract_ip_v4(in_str)
 776     if ip is None:
 777         ip = extract_ip_v6(in_str)
 778     return ip
 779
 780
 781 def is_mac_address(in_str: Any) -> bool:
 782     """Return True if in_str is a valid MAC address false otherwise.
 783
 784     >>> is_mac_address("34:29:8F:12:0D:2F")
 785     True
 786     >>> is_mac_address('34:29:8f:12:0d:2f')
 787     True
 788     >>> is_mac_address('34-29-8F-12-0D-2F')
 789     True
 790     >>> is_mac_address("test")
 791     False
 792     """
 793     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 794
 795
 796 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 797     """
 798     Extract the MAC address from in_str.
 799
 800     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
 801     '34:29:8F:12:0D:2F'
 802
 803     >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
 804     'd8:5d:e2:34:54:86'
 805
 806     """
 807     if not is_full_string(in_str):
 808         return None
 809     in_str.strip()
 810     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
 811     if m is not None:
 812         mac = m.group(0)
 813         mac.replace(":", separator)
 814         mac.replace("-", separator)
 815         return mac
 816     return None
 817
 818
 819 def is_slug(in_str: Any, separator: str = "-") -> bool:
 820     """
 821     Checks if a given string is a slug (as created by `slugify()`).
 822
 823     >>> is_slug('my-blog-post-title')
 824     True
 825     >>> is_slug('My blog post title')
 826     False
 827
 828     """
 829     if not is_full_string(in_str):
 830         return False
 831     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 832     return re.match(rex, in_str) is not None
 833
 834
 835 def contains_html(in_str: str) -> bool:
 836     """
 837     Checks if the given string contains HTML/XML tags.
 838
 839     By design, this function matches ANY type of tag, so don't expect to use it
 840     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 841
 842     >>> contains_html('my string is <strong>bold</strong>')
 843     True
 844     >>> contains_html('my string is not bold')
 845     False
 846
 847     """
 848     if not is_string(in_str):
 849         raise ValueError(in_str)
 850     return HTML_RE.search(in_str) is not None
 851
 852
 853 def words_count(in_str: str) -> int:
 854     """
 855     Returns the number of words contained into the given string.
 856
 857     This method is smart, it does consider only sequence of one or more letter and/or numbers
 858     as "words", so a string like this: "! @ # % ... []" will return zero!
 859     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 860     will be 4 not 1 (even if there are no spaces in the string).
 861
 862     >>> words_count('hello world')
 863     2
 864     >>> words_count('one,two,three.stop')
 865     4
 866
 867     """
 868     if not is_string(in_str):
 869         raise ValueError(in_str)
 870     return len(WORDS_COUNT_RE.findall(in_str))
 871
 872
 873 def generate_uuid(omit_dashes: bool = False) -> str:
 874     """
 875     Generated an UUID string (using `uuid.uuid4()`).
 876
 877     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 878     generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 879
 880     """
 881     uid = uuid4()
 882     if omit_dashes:
 883         return uid.hex
 884     return str(uid)
 885
 886
 887 def generate_random_alphanumeric_string(size: int) -> str:
 888     """
 889     Returns a string of the specified size containing random
 890     characters (uppercase/lowercase ascii letters and digits).
 891
 892     random_string(9) # possible output: "cx3QQbzYg"
 893
 894     """
 895     if size < 1:
 896         raise ValueError("size must be >= 1")
 897     chars = string.ascii_letters + string.digits
 898     buffer = [random.choice(chars) for _ in range(size)]
 899     return from_char_list(buffer)
 900
 901
 902 def reverse(in_str: str) -> str:
 903     """
 904     Returns the string with its chars reversed.
 905
 906     >>> reverse('test')
 907     'tset'
 908
 909     """
 910     if not is_string(in_str):
 911         raise ValueError(in_str)
 912     return in_str[::-1]
 913
 914
 915 def camel_case_to_snake_case(in_str, *, separator="_"):
 916     """
 917     Convert a camel case string into a snake case one.
 918     (The original string is returned if is not a valid camel case string)
 919
 920     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
 921     'mac_address_extractor_factory'
 922     >>> camel_case_to_snake_case('Luke Skywalker')
 923     'Luke Skywalker'
 924     """
 925     if not is_string(in_str):
 926         raise ValueError(in_str)
 927     if not is_camel_case(in_str):
 928         return in_str
 929     return CAMEL_CASE_REPLACE_RE.sub(
 930         lambda m: m.group(1) + separator, in_str
 931     ).lower()
 932
 933
 934 def snake_case_to_camel_case(
 935     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 936 ) -> str:
 937     """
 938     Convert a snake case string into a camel case one.
 939     (The original string is returned if is not a valid snake case string)
 940
 941     >>> snake_case_to_camel_case('this_is_a_test')
 942     'ThisIsATest'
 943     >>> snake_case_to_camel_case('Han Solo')
 944     'Han Solo'
 945     """
 946     if not is_string(in_str):
 947         raise ValueError(in_str)
 948     if not is_snake_case(in_str, separator=separator):
 949         return in_str
 950     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 951     if not upper_case_first:
 952         tokens[0] = tokens[0].lower()
 953     return from_char_list(tokens)
 954
 955
 956 def to_char_list(in_str: str) -> List[str]:
 957     """Convert a string into a list of chars.
 958
 959     >>> to_char_list('test')
 960     ['t', 'e', 's', 't']
 961     """
 962     if not is_string(in_str):
 963         return []
 964     return list(in_str)
 965
 966
 967 def from_char_list(in_list: List[str]) -> str:
 968     """Convert a char list into a string.
 969
 970     >>> from_char_list(['t', 'e', 's', 't'])
 971     'test'
 972     """
 973     return "".join(in_list)
 974
 975
 976 def shuffle(in_str: str) -> str:
 977     """Return a new string containing same chars of the given one but in
 978     a randomized order.
 979     """
 980     if not is_string(in_str):
 981         raise ValueError(in_str)
 982
 983     # turn the string into a list of chars
 984     chars = to_char_list(in_str)
 985     random.shuffle(chars)
 986     return from_char_list(chars)
 987
 988
 989 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 990     """
 991     Remove html code contained into the given string.
 992
 993     >>> strip_html('test: <a href="foo/bar">click here</a>')
 994     'test: '
 995     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
 996     'test: click here'
 997     """
 998     if not is_string(in_str):
 999         raise ValueError(in_str)
1000     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
1001     return r.sub("", in_str)
1002
1003
def asciify(in_str: str) -> str:
    """
    Make a string ascii-only by mapping every non-ascii character to its
    closest ascii representation (eg: ó -> o, Ë -> E, ç -> c...).

    N.B. Characters that cannot be translated are dropped.

    Raises:
        ValueError: if the argument is not a string.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # NFKD splits accented characters into base character + combining marks;
    # encoding to ascii with "ignore" then drops whatever is not ascii, and
    # the remaining bytes are decoded back into a str.
    return (
        unicodedata.normalize("NFKD", in_str)
        .encode("ascii", "ignore")
        .decode("utf-8")
    )
1028
1029
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Converts a string into a "slug" using provided separator.
    The returned string has the following properties:

    - it has no spaces
    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using provided separator
    - all chars are encoded as ascii (by using `asciify()`)
    - is safe for URL

    The separator may be more than one character long and may contain
    characters that are special to regular expressions (including
    backslashes); it is always inserted verbatim.  An empty separator
    simply joins the words together.

    Raises ValueError if in_str does not pass is_string().

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # replace spaces with join sign; a callable replacement inserts the
    # separator literally instead of treating it as a regex template
    # (where a backslash would be read as an escape sequence)
    out = SPACES_RE.sub(lambda _match: separator, out)

    # normalize joins (remove duplicates).  The separator is wrapped in
    # a group so that a multi-character separator is repeated as a
    # whole, not just its last character.  Nothing to collapse when the
    # separator is empty.
    if separator:
        out = re.sub(
            r"(?:" + re.escape(separator) + r")+",
            lambda _match: separator,
            out,
        )
    return asciify(out)
1059
1060
def to_bool(in_str: str) -> bool:
    """
    Interpret a string as a boolean (CASE INSENSITIVE).

    True is returned when the lowercased string is exactly one of:

    - "true"
    - "1"
    - "yes"
    - "y"
    - "t"
    - "on"

    Any other string yields False.  Raises ValueError if the argument
    does not pass is_string().

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    truthy = ("true", "1", "yes", "y", "t", "on")
    normalized = in_str.lower()
    return normalized in truthy
1097
1098
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Parse a date out of a string.  See DateParser docs for details.

    Returns whatever the parser's get_date() reports after a
    successful parse.  If the parser raises ParseException a warning
    is logged and None is returned.
    """
    import dateparse.dateparse_utils as dp

    try:
        parser = dp.DateParser()
        parser.parse(in_str)
        result = parser.get_date()
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        result = None
    return result
1113
1114
def valid_date(in_str: str) -> bool:
    """
    Tell whether the string can be parsed as a date.

    Returns True when DateParser.parse() completes without raising
    ParseException; otherwise logs a warning and returns False.
    """
    import dateparse.dateparse_utils as dp

    try:
        dp.DateParser().parse(in_str)
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        return False
    return True
1129
1130
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Parses a datetime string.  See DateParser docs for more info.

    Returns the parsed value when the parser produces a
    datetime.datetime (or a subclass of it).  Returns None when the
    parser produces anything else, or when parsing fails; a failure
    is also logged as a warning.
    """
    import dateparse.dateparse_utils as dp

    try:
        parsed = dp.DateParser().parse(in_str)
    except (dp.ParseException, ValueError):
        # to_date() and valid_date() treat dp.ParseException as the
        # parser's failure signal, so handle it here as well;
        # ValueError is still caught as it was before.
        # NOTE(review): confirm whether ParseException derives from
        # ValueError -- if it does, listing both is merely redundant.
        logger.warning(f'Unable to parse datetime {in_str}.')
        return None
    if isinstance(parsed, datetime.datetime):
        return parsed
    return None
1146
1147
def valid_datetime(in_str: str) -> bool:
    """
    Tell whether the string can be parsed as a datetime.

    Returns True when to_datetime() yields a value; otherwise logs a
    warning and returns False.
    """
    parsed = to_datetime(in_str)
    if parsed is None:
        logger.warning(f'Unable to parse datetime {in_str}.')
        return False
    return True
1158
1159
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Squeeze runs of more than one character_to_squeeze into one.

    character_to_squeeze may be longer than one character, in which
    case consecutive repetitions of the whole sequence are collapsed.
    It is always matched and re-inserted literally, so regex
    metacharacters and backslashes in it are safe.  An empty
    character_to_squeeze leaves the input unchanged.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    """
    if not character_to_squeeze:
        return in_str
    return re.sub(
        r'(?:' + re.escape(character_to_squeeze) + r')+',
        # A callable replacement is inserted verbatim; a plain string
        # would be processed as a template and choke on backslashes.
        lambda _match: character_to_squeeze,
        in_str,
    )
1176
1177
def dedent(in_str: str) -> str:
    """
    Strip the leading margin (tab indentation) from every line of a
    multi line string (inspired by analogous Scala function).

    The text is split on newlines, whatever MARGIN_RE matches is
    removed from each line, and the lines are joined back together.

    Raises ValueError if the argument does not pass is_string().
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return '\n'.join(MARGIN_RE.sub('', row) for row in in_str.split('\n'))
1187
1188
def indent(in_str: str, amount: int) -> str:
    """
    Indent a string by putting `amount` spaces in front of each of
    its newline-separated lines.

    Raises ValueError if in_str does not pass is_string().

    >>> indent('This is a test', 4)
    '    This is a test'

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    padding = " " * amount
    return '\n'.join(padding + row for row in in_str.split('\n'))
1202
1203
def sprintf(*args, **kwargs) -> str:
    """Build the text that print() would write and return it as a
    string instead.

    Positional arguments are converted with str() and joined with
    `sep`; `end` is appended to the result.

    Keyword arguments:
        sep: separator placed between arguments; None (the default)
            means a single space.
        end: text appended after the last argument; None (the
            default) means a newline.

    Raises TypeError if sep or end is neither None nor a string, or if
    any other keyword argument is given.

    >>> sprintf('a', 1, sep='-', end='')
    'a-1'

    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # anything still left in kwargs is an unsupported option
    if kwargs:
        raise TypeError("invalid keyword arguments to sprintf()")

    if sep is None:
        sep = " "
    if end is None:
        end = "\n"
    pieces = (arg if isinstance(arg, str) else str(arg) for arg in args)
    return sep.join(pieces) + end
1234
1235
class SprintfStdout(object):
    """
    Context manager that diverts everything written to stdout into an
    in-memory buffer.  Entering the context hands back a zero-argument
    callable which returns the text captured so far.

    with SprintfStdout() as buf:
        print("test")
    captured = buf()

    Here captured is the word test followed by a newline.
    """

    def __init__(self) -> None:
        # redirect_stdout context; created when the block is entered
        self.recorder = None
        # buffer receiving the redirected output
        self.destination = io.StringIO()

    def __enter__(self) -> Callable[[], str]:
        redirect = contextlib.redirect_stdout(self.destination)
        self.recorder = redirect
        redirect.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *args) -> None:
        # Restore the real stdout, then rewind the buffer.  Falling off
        # the end returns None, so exceptions are not suppressed.
        self.recorder.__exit__(*args)
        self.destination.seek(0)
1260
1261
def is_are(n: int) -> str:
    """Pick the verb form that agrees with a count: 'is' for exactly
    one, 'are' for anything else.

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
1274
1275
def pluralize(n: int) -> str:
    """Return the plural suffix for a count: an empty string for
    exactly one, 's' for anything else.

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.

    """
    return "" if n == 1 else "s"
1292
1293
def thify(n: int) -> str:
    """Return the proper ordinal suffix for a number.

    Numbers whose last two digits are 11, 12 or 13 take 'th'
    (11th, 112th); otherwise the last digit decides: 1 -> 'st',
    2 -> 'nd', 3 -> 'rd', anything else -> 'th'.

    Raises ValueError if n is not an integer (or the text of one).

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'

    """
    # Go through str() on purpose: int(1.5) would silently truncate,
    # whereas int('1.5') rejects the value.
    try:
        magnitude = abs(int(str(n)))
    except ValueError:
        raise ValueError(f"thify expects an integer, got {n!r}") from None

    # The "teens" are irregular: 11th, 12th, 13th (also 111th, ...).
    if magnitude % 100 in (11, 12, 13):
        return "th"
    return {1: "st", 2: "nd", 3: "rd"}.get(magnitude % 10, "th")
1316
1317
def ngrams(txt: str, n: int):
    """Generate the word n-grams of a string, each one rendered as its
    words separated by single spaces.

    The text is split on whitespace and the grouping into n-grams is
    delegated to ngrams_presplit().

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    """
    for gram in ngrams_presplit(txt.split(), n):
        yield ' '.join(gram)
1331
1332
def ngrams_presplit(words: Sequence[str], n: int):
    """Produce the n-grams of an already tokenized sequence of words
    by handing it to list_utils.ngrams() and returning its result.
    """
    grams = list_utils.ngrams(words, n)
    return grams
1335
1336
def bigrams(txt: str):
    """Shorthand for ngrams(txt, 2): the two-word n-grams of txt."""
    width = 2
    return ngrams(txt, width)
1339
1340
def trigrams(txt: str):
    """Shorthand for ngrams(txt, 3): the three-word n-grams of txt."""
    width = 3
    return ngrams(txt, width)
1343
1344
def shuffle_columns_into_list(
    input_lines: Iterable[str], column_specs: Iterable[Iterable[int]], delim=''
) -> Iterable[str]:
    """Rearrange / combine columnar data and return the pieces as a
    list.  Each entry of column_specs is a sequence of column numbers
    (indexes into input_lines); the selected columns are glued
    together with delim and become one element of the result.

    Note that any characters found in delim are stripped from both
    ends of every combined element.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']

    """
    # One output element per spec: prefix every selected column with
    # the delimiter, concatenate, then trim delimiter chars off the ends.
    return [
        ''.join(delim + input_lines[index] for index in spec).strip(delim)
        for spec in column_specs
    ]
1373
1374
def shuffle_columns_into_dict(
    input_lines: Iterable[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim='',
) -> Dict[str, str]:
    """Rearrange / combine columnar data and return the pieces as a
    dict.  Each entry of column_specs is a (key, column numbers)
    pair; the selected columns of input_lines are glued together with
    delim and stored under key.

    Note that any characters found in delim are stripped from both
    ends of every combined value.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}

    """
    # spec[0] is the output key, spec[1] the columns feeding its value.
    return {
        spec[0]: ''.join(
            delim + input_lines[index] for index in spec[1]
        ).strip(delim)
        for spec in column_specs
    }
1403
1404
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the {name} placeholders of a string with the matching
    entries of a dict, using str.format().

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'

    """
    # Equivalent to sprintf(<formatted text>, end=''): a single string
    # argument with an empty terminator comes back unchanged.
    return txt.format(**values)
1414
1415
def to_ascii(x: str) -> bytes:
    """Encode as ascii bytes string.

    A str (or str subclass) is encoded with the ascii codec; a bytes
    object (or bytes subclass) is returned unchanged.

    Raises TypeError for any other type and UnicodeEncodeError if a
    str contains non-ascii characters.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'

    """
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    raise TypeError('to_ascii works with strings and bytes')
1431
1432
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode txt and then encode the bytes with a 64-character
    alphabet.  This is compatible with uudecode.

    txt is first turned into bytes using `encoding` / `errors`, then
    passed to base64.encodebytes().  The result is a bytes object (not
    a str); non-empty output ends with a newline.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    """
    return base64.encodebytes(txt.encode(encoding, errors))
1442
1443
def is_base64(txt: str) -> bool:
    """Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet which is the same as what uuencode uses).

    Accepts str or bytes.  Surrounding whitespace and line breaks
    inside the data are ignored, and up to two trailing '=' padding
    characters are allowed, so the output of base64.encodebytes() is
    recognized.  This is a character-level check only: it does not
    verify that the data has a decodable length.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64(b'aGVsbG8=\\n')    # So is '=' padding.
    True

    """
    stripped = txt.strip()
    if isinstance(stripped, str):
        try:
            raw = stripped.encode('ascii')
        except UnicodeEncodeError:
            # non-ascii characters are never part of the alphabet
            return False
    else:
        raw = bytes(stripped)

    # base64.encodebytes() wraps long output across several lines
    raw = raw.replace(b'\r', b'').replace(b'\n', b'')

    # '=' is only legal as padding: at most two, at the very end, and
    # only after some actual data
    body = raw.rstrip(b'=')
    padding = len(raw) - len(body)
    if padding > 2 or (padding and not body):
        return False

    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = set(a.encode('ascii'))
    return all(byte in alphabet for byte in body)
1464
1465
def from_base64(b64: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert base64 encoded string back to normal strings.

    b64 may be bytes (such as the output of to_base64) or an ascii
    str holding the same data.  The decoded bytes are turned into a
    str using `encoding` / `errors`.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    >>> from_base64('aGVsbG8/')
    'hello?'

    """
    if isinstance(b64, str):
        # base64.decodebytes() only accepts bytes-like objects
        b64 = b64.encode('ascii')
    return base64.decodebytes(b64).decode(encoding, errors)
1474
1475
def chunk(txt: str, chunk_size):
    """Chunk up a string.

    This is a generator yielding consecutive slices of txt that are
    chunk_size characters long.  If the length of txt is not a
    multiple of chunk_size a warning is both logged and issued via
    warnings.warn() when iteration starts, and the final chunk is
    shorter than the others.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    """
    if len(txt) % chunk_size != 0:
        msg = f'String to chunk\'s length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for x in range(0, len(txt), chunk_size):
        yield txt[x : x + chunk_size]
1489
1490
def to_bitstring(
    txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass'
) -> str:
    """Encode txt and then chop it into bytes.  Note: only bitstrings
    with delimiter='' are interpretable by from_bitstring.

    A str is encoded using `encoding` / `errors`; bytes are used as
    they are.  Every byte becomes eight binary digits (leading zero
    bytes included) and the groups are joined with `delimiter`.  An
    empty input yields '00000000'.

    Raises TypeError if txt is neither a str nor bytes.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    """
    if isinstance(txt, str):
        data = txt.encode(encoding, errors)
    elif isinstance(txt, (bytes, bytearray)):
        data = bytes(txt)
    else:
        raise TypeError('to_bitstring works with strings and bytes')

    if not data:
        # Historical behavior: empty input is rendered as a single
        # zero byte rather than as an empty string.
        return '0' * 8
    # Formatting byte by byte keeps leading zero bytes, which a
    # round-trip through one big integer would drop.
    return delimiter.join(format(byte, '08b') for byte in data)
1511
1512
def is_bitstring(txt: str) -> bool:
    """Tell whether txt looks like a bitstring.

    The text is prefixed with '0b' and the verdict is delegated to
    is_binary_integer_number().

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    """
    candidate = f'0b{txt}'
    return is_binary_integer_number(candidate)
1524
1525
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Turn a bitstring back into bytes and decode those into a str.

    The bits are read as one big-endian integer, so leading zero
    bytes are not preserved.  A value of zero decodes to a single NUL
    character.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    """
    value = int(bits, 2)
    byte_count = (value.bit_length() + 7) // 8
    decoded = value.to_bytes(byte_count, 'big').decode(encoding, errors)
    if decoded:
        return decoded
    # zero needs no bytes at all; stand in a NUL for the empty result
    return '\0'
1538
1539
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple for sorting purposes.

    Returns a tuple with the integer value of each dot-separated
    part.  If txt is rejected by is_ip_v4() a note is printed to
    stdout and None is returned instead.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']

    """
    if not is_ip_v4(txt):
        print(f"not IP: {txt}")
        return None
    return tuple(int(octet) for octet in txt.split('.'))
1555
1556
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Chunk up a file path so that parent/ancestor paths sort before
    children/descendant paths.

    The path is split on '/' and empty components (from leading,
    trailing or doubled slashes) are dropped; the remaining components
    are returned as a tuple.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']

    """
    return tuple(part for part in volume.split('/') if part)
1570
1571
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Replace every occurrence of each character in replace_set with
    replacement.

    The replacements are applied one character at a time, in the
    order the characters appear in replace_set, each on the result of
    the previous one.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'

    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
1583
1584
if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod

    testmod()