string_utils.py

   1 #!/usr/bin/env python3
   2
   3 """The MIT License (MIT)
   4
   5 Copyright (c) 2016-2020 Davide Zanotti
   6 Modifications Copyright (c) 2021-2022 Scott Gasch
   7
   8 Permission is hereby granted, free of charge, to any person obtaining a copy
   9 of this software and associated documentation files (the "Software"), to deal
  10 in the Software without restriction, including without limitation the rights
  11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12 copies of the Software, and to permit persons to whom the Software is
  13 furnished to do so, subject to the following conditions:
  14
  15 The above copyright notice and this permission notice shall be included in all
  16 copies or substantial portions of the Software.
  17
  18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  24 SOFTWARE.
  25
  26 This class is based on: https://github.com/daveoncode/python-string-utils.
  27 """
  28
  29 import base64
  30 import contextlib  # type: ignore
  31 import datetime
  32 import io
  33 from itertools import zip_longest
  34 import json
  35 import logging
  36 import numbers
  37 import random
  38 import re
  39 import string
  40 from typing import (
  41     Any,
  42     Callable,
  43     Dict,
  44     Iterable,
  45     List,
  46     Optional,
  47     Sequence,
  48     Tuple,
  49 )
  50 import unicodedata
  51 from uuid import uuid4
  52 import warnings
  53
  54 import list_utils
  55
  56 logger = logging.getLogger(__name__)
  57
  58 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  59
  60 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  61
  62 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  63
  64 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  65
# URL pattern assembled from adjacent raw-string fragments, one per URL part.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    # NOTE: "%-=" inside the class below is a character range (from "%" to
    # "="), so it also admits characters such as "&", "." and "/".
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Anchored variant: the whole string has to be a URL.
URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

# Unanchored variant wrapped in one extra capture group, for use on free text.
URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)

# "@" signs that is_email() rewrites before re-validating: a backslash-escaped
# "\@", or a run of "@" that has a double quote somewhere after it.
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# Local part: letters, digits and a fixed set of punctuation.  Domain part:
# lower-case letters, digits and dashes, ending in a 2-4 letter suffix.  The
# patterns built from this are compiled without re.IGNORECASE, so upper-case
# characters in the domain do not match.
EMAILS_RAW_STRING = (
    r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
)

# Anchored variant: the whole string has to be an e-mail address.
EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))

# Unanchored variant wrapped in a capture group.
EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))

# Letters and digits only, starting with letters, with at least one
# lower/upper (or upper/lower) transition.
CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

# Captures the character(s) right before an upper-case letter; used by
# camel_case_to_snake_case() to insert the separator after them.
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Either letters (then optional digits) followed by an underscore, or leading
# underscores followed by letters/digits; the rest is letters, digits and
# underscores.
SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

# Same shape as SNAKE_CASE_TEST_RE with "-" as the separator.  There is no
# leading "^" here; is_snake_case() applies it with match(), which starts at
# the beginning of the string.
SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

# An underscore followed by a letter or digit (two capture groups).
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

# A dash followed by a letter or digit (two capture groups).
SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")

# Per-brand number patterns (digits only, no spaces or dashes).  They check
# prefix and length only.
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),  # 13 or 16 digits, starts with 4
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),  # 16 digits, starts with 51-55
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),  # 15 digits, starts with 34 or 37
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),  # 14 digits, starts with 300-305, 36 or 38
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),  # 16 digits, starts with 6011 or 65
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),  # 15 digits (2131/1800) or 16 digits (35)
}

# Text that, ignoring surrounding whitespace, opens with "[" or "{" and closes
# with "]" or "}".  The brackets are not required to pair up.
JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)

# Hex groups of 8-4-4-4-12 characters separated by dashes.
UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# Same groups as UUID_RE, but every dash is optional.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# Four dot-separated groups of 1-3 digits.  "Shallow" because the 0-255 range
# is not checked here; is_ip_v4() does that afterwards.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

# Unanchored variant of SHALLOW_IP_V4_RE for searching inside text.
ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
 131
 132 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
 133
 134 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
 135
# Six pairs of hexadecimal digits separated by ":" or "-" (each separator is
# matched on its own, so the two may be mixed).
MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)

# Unanchored variant of MAC_ADDRESS_RE for searching inside text.
ANYWHERE_MAC_ADDRESS_RE = re.compile(
    r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)

# One "word": a run of word characters other than "_", together with any
# non-word characters around it.
WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)

# An opening tag, optionally followed by content up to a closing tag, or a
# comment, or a doctype declaration.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# A single opening tag, closing tag, comment or doctype declaration, without
# the content between tags.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# A single whitespace character.
SPACES_RE = re.compile(r"\s")

# A run of characters that are not word characters, or a run of underscores.
NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Whitespace other than CR/LF at the start of the string (re.MULTILINE is not
# set, so "^" only matches at the very beginning).
MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 159
 160 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 161
 162 NUM_SUFFIXES = {
 163     "Pb": (1024 ** 5),
 164     "P": (1024 ** 5),
 165     "Tb": (1024 ** 4),
 166     "T": (1024 ** 4),
 167     "Gb": (1024 ** 3),
 168     "G": (1024 ** 3),
 169     "Mb": (1024 ** 2),
 170     "M": (1024 ** 2),
 171     "Kb": (1024 ** 1),
 172     "K": (1024 ** 1),
 173 }
 174
 175
 176 def is_none_or_empty(in_str: Optional[str]) -> bool:
 177     """
 178     Returns true if the input string is either None or an empty string.
 179
 180     >>> is_none_or_empty("")
 181     True
 182     >>> is_none_or_empty(None)
 183     True
 184     >>> is_none_or_empty("   \t   ")
 185     True
 186     >>> is_none_or_empty('Test')
 187     False
 188     """
 189     return in_str is None or len(in_str.strip()) == 0
 190
 191
 192 def is_string(obj: Any) -> bool:
 193     """
 194     Checks if an object is a string.
 195
 196     >>> is_string('test')
 197     True
 198     >>> is_string(123)
 199     False
 200     >>> is_string(100.3)
 201     False
 202     >>> is_string([1, 2, 3])
 203     False
 204     """
 205     return isinstance(obj, str)
 206
 207
def is_empty_string(in_str: Any) -> bool:
    """
    Alias of is_empty(): hands in_str to it and returns its result.
    """
    return is_empty(in_str)
 210
 211
 212 def is_empty(in_str: Any) -> bool:
 213     """
 214     Checks if input is a string and empty or only whitespace.
 215
 216     >>> is_empty('')
 217     True
 218     >>> is_empty('    \t\t    ')
 219     True
 220     >>> is_empty('test')
 221     False
 222     >>> is_empty(100.88)
 223     False
 224     >>> is_empty([1, 2, 3])
 225     False
 226     """
 227     return is_string(in_str) and in_str.strip() == ""
 228
 229
 230 def is_full_string(in_str: Any) -> bool:
 231     """
 232     Checks that input is a string and is not empty ('') or only whitespace.
 233
 234     >>> is_full_string('test!')
 235     True
 236     >>> is_full_string('')
 237     False
 238     >>> is_full_string('      ')
 239     False
 240     >>> is_full_string(100.999)
 241     False
 242     >>> is_full_string({"a": 1, "b": 2})
 243     False
 244     """
 245     return is_string(in_str) and in_str.strip() != ""
 246
 247
 248 def is_number(in_str: str) -> bool:
 249     """
 250     Checks if a string is a valid number.
 251
 252     >>> is_number(100.5)
 253     Traceback (most recent call last):
 254     ...
 255     ValueError: 100.5
 256     >>> is_number("100.5")
 257     True
 258     >>> is_number("test")
 259     False
 260     >>> is_number("99")
 261     True
 262     >>> is_number([1, 2, 3])
 263     Traceback (most recent call last):
 264     ...
 265     ValueError: [1, 2, 3]
 266     """
 267     if not is_string(in_str):
 268         raise ValueError(in_str)
 269     return NUMBER_RE.match(in_str) is not None
 270
 271
 272 def is_integer_number(in_str: str) -> bool:
 273     """
 274     Checks whether the given string represents an integer or not.
 275
 276     An integer may be signed or unsigned or use a "scientific notation".
 277
 278     >>> is_integer_number('42')
 279     True
 280     >>> is_integer_number('42.0')
 281     False
 282     """
 283     return (
 284         (is_number(in_str) and "." not in in_str)
 285         or is_hexidecimal_integer_number(in_str)
 286         or is_octal_integer_number(in_str)
 287         or is_binary_integer_number(in_str)
 288     )
 289
 290
 291 def is_hexidecimal_integer_number(in_str: str) -> bool:
 292     """
 293     Checks whether a string is a hex integer number.
 294
 295     >>> is_hexidecimal_integer_number('0x12345')
 296     True
 297     >>> is_hexidecimal_integer_number('0x1A3E')
 298     True
 299     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 300     False
 301     >>> is_hexidecimal_integer_number('-0xff')
 302     True
 303     >>> is_hexidecimal_integer_number('test')
 304     False
 305     >>> is_hexidecimal_integer_number(12345)  # Not a string
 306     Traceback (most recent call last):
 307     ...
 308     ValueError: 12345
 309     >>> is_hexidecimal_integer_number(101.4)
 310     Traceback (most recent call last):
 311     ...
 312     ValueError: 101.4
 313     >>> is_hexidecimal_integer_number(0x1A3E)
 314     Traceback (most recent call last):
 315     ...
 316     ValueError: 6718
 317     """
 318     if not is_string(in_str):
 319         raise ValueError(in_str)
 320     return HEX_NUMBER_RE.match(in_str) is not None
 321
 322
 323 def is_octal_integer_number(in_str: str) -> bool:
 324     """
 325     Checks whether a string is an octal number.
 326
 327     >>> is_octal_integer_number('0o777')
 328     True
 329     >>> is_octal_integer_number('-0O115')
 330     True
 331     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 332     False
 333     >>> is_octal_integer_number('7777')  # Needs 0o
 334     False
 335     >>> is_octal_integer_number('test')
 336     False
 337     """
 338     if not is_string(in_str):
 339         raise ValueError(in_str)
 340     return OCT_NUMBER_RE.match(in_str) is not None
 341
 342
 343 def is_binary_integer_number(in_str: str) -> bool:
 344     """
 345     Returns whether a string contains a binary number.
 346
 347     >>> is_binary_integer_number('0b10111')
 348     True
 349     >>> is_binary_integer_number('-0b111')
 350     True
 351     >>> is_binary_integer_number('0B10101')
 352     True
 353     >>> is_binary_integer_number('0b10102')
 354     False
 355     >>> is_binary_integer_number('0xFFF')
 356     False
 357     >>> is_binary_integer_number('test')
 358     False
 359     """
 360     if not is_string(in_str):
 361         raise ValueError(in_str)
 362     return BIN_NUMBER_RE.match(in_str) is not None
 363
 364
 365 def to_int(in_str: str) -> int:
 366     """Returns the integral value of the string or raises on error.
 367
 368     >>> to_int('1234')
 369     1234
 370     >>> to_int('test')
 371     Traceback (most recent call last):
 372     ...
 373     ValueError: invalid literal for int() with base 10: 'test'
 374     """
 375     if not is_string(in_str):
 376         raise ValueError(in_str)
 377     if is_binary_integer_number(in_str):
 378         return int(in_str, 2)
 379     if is_octal_integer_number(in_str):
 380         return int(in_str, 8)
 381     if is_hexidecimal_integer_number(in_str):
 382         return int(in_str, 16)
 383     return int(in_str)
 384
 385
 386 def is_decimal_number(in_str: str) -> bool:
 387     """
 388     Checks whether the given string represents a decimal or not.
 389
 390     A decimal may be signed or unsigned or use a "scientific notation".
 391
 392     >>> is_decimal_number('42.0')
 393     True
 394     >>> is_decimal_number('42')
 395     False
 396     """
 397     return is_number(in_str) and "." in in_str
 398
 399
 400 def strip_escape_sequences(in_str: str) -> str:
 401     """
 402     Remove escape sequences in the input string.
 403
 404     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 405     'this is a test!'
 406     """
 407     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 408     return in_str
 409
 410
 411 def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 412     """
 413     Add thousands separator to a numeric string.  Also handles numbers.
 414
 415     >>> add_thousands_separator('12345678')
 416     '12,345,678'
 417     >>> add_thousands_separator(12345678)
 418     '12,345,678'
 419     >>> add_thousands_separator(12345678.99)
 420     '12,345,678.99'
 421     >>> add_thousands_separator('test')
 422     Traceback (most recent call last):
 423     ...
 424     ValueError: test
 425
 426     """
 427     if isinstance(in_str, numbers.Number):
 428         in_str = f'{in_str}'
 429     if is_number(in_str):
 430         return _add_thousands_separator(
 431             in_str, separator_char=separator_char, places=places
 432         )
 433     raise ValueError(in_str)
 434
 435
 436 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 437     decimal_part = ""
 438     if '.' in in_str:
 439         (in_str, decimal_part) = in_str.split('.')
 440     tmp = [iter(in_str[::-1])] * places
 441     ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 442     if len(decimal_part) > 0:
 443         ret += '.'
 444         ret += decimal_part
 445     return ret
 446
 447
# Full url example:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 450 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 451     """
 452     Check if a string is a valid url.
 453
 454     >>> is_url('http://www.mysite.com')
 455     True
 456     >>> is_url('https://mysite.com')
 457     True
 458     >>> is_url('.mysite.com')
 459     False
 460     """
 461     if not is_full_string(in_str):
 462         return False
 463
 464     valid = URL_RE.match(in_str) is not None
 465
 466     if allowed_schemes:
 467         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 468     return valid
 469
 470
 471 def is_email(in_str: Any) -> bool:
 472     """
 473     Check if a string is a valid email.
 474
 475     Reference: https://tools.ietf.org/html/rfc3696#section-3
 476
 477     >>> is_email('[email protected]')
 478     True
 479     >>> is_email('@gmail.com')
 480     False
 481     """
 482     if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
 483         return False
 484
 485     try:
 486         # we expect 2 tokens, one before "@" and one after, otherwise
 487         # we have an exception and the email is not valid.
 488         head, tail = in_str.split("@")
 489
 490         # head's size must be <= 64, tail <= 255, head must not start
 491         # with a dot or contain multiple consecutive dots.
 492         if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
 493             return False
 494
 495         # removes escaped spaces, so that later on the test regex will
 496         # accept the string.
 497         head = head.replace("\\ ", "")
 498         if head.startswith('"') and head.endswith('"'):
 499             head = head.replace(" ", "")[1:-1]
 500         return EMAIL_RE.match(head + "@" + tail) is not None
 501
 502     except ValueError:
 503         # borderline case in which we have multiple "@" signs but the
 504         # head part is correctly escaped.
 505         if ESCAPED_AT_SIGN.search(in_str) is not None:
 506             # replace "@" with "a" in the head
 507             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 508         return False
 509
 510
 511 def suffix_string_to_number(in_str: str) -> Optional[int]:
 512     """Take a string like "33Gb" and convert it into a number (of bytes)
 513     like 34603008.  Return None if the input string is not valid.
 514
 515     >>> suffix_string_to_number('1Mb')
 516     1048576
 517     >>> suffix_string_to_number('13.1Gb')
 518     14066017894
 519     """
 520
 521     def suffix_capitalize(s: str) -> str:
 522         if len(s) == 1:
 523             return s.upper()
 524         elif len(s) == 2:
 525             return f"{s[0].upper()}{s[1].lower()}"
 526         return suffix_capitalize(s[0:1])
 527
 528     if is_string(in_str):
 529         if is_integer_number(in_str):
 530             return to_int(in_str)
 531         suffixes = [in_str[-2:], in_str[-1:]]
 532         rest = [in_str[:-2], in_str[:-1]]
 533         for x in range(len(suffixes)):
 534             s = suffixes[x]
 535             s = suffix_capitalize(s)
 536             multiplier = NUM_SUFFIXES.get(s, None)
 537             if multiplier is not None:
 538                 r = rest[x]
 539                 if is_integer_number(r):
 540                     return to_int(r) * multiplier
 541                 if is_decimal_number(r):
 542                     return int(float(r) * multiplier)
 543     return None
 544
 545
 546 def number_to_suffix_string(num: int) -> Optional[str]:
 547     """Take a number (of bytes) and returns a string like "43.8Gb".
 548     Returns none if the input is invalid.
 549
 550     >>> number_to_suffix_string(14066017894)
 551     '13.1Gb'
 552     >>> number_to_suffix_string(1024 * 1024)
 553     '1.0Mb'
 554
 555     """
 556     d = 0.0
 557     suffix = None
 558     for (sfx, size) in NUM_SUFFIXES.items():
 559         if num >= size:
 560             d = num / size
 561             suffix = sfx
 562             break
 563     if suffix is not None:
 564         return f"{d:.1f}{suffix}"
 565     else:
 566         return f'{num:d}'
 567
 568
 569 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 570     """
 571     Checks if a string is a valid credit card number.
 572     If card type is provided then it checks against that specific type only,
 573     otherwise any known credit card number will be accepted.
 574
 575     Supported card types are the following:
 576
 577     - VISA
 578     - MASTERCARD
 579     - AMERICAN_EXPRESS
 580     - DINERS_CLUB
 581     - DISCOVER
 582     - JCB
 583     """
 584     if not is_full_string(in_str):
 585         return False
 586
 587     if card_type is not None:
 588         if card_type not in CREDIT_CARDS:
 589             raise KeyError(
 590                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 591             )
 592         return CREDIT_CARDS[card_type].match(in_str) is not None
 593     for c in CREDIT_CARDS:
 594         if CREDIT_CARDS[c].match(in_str) is not None:
 595             return True
 596     return False
 597
 598
 599 def is_camel_case(in_str: Any) -> bool:
 600     """
 601     Checks if a string is formatted as camel case.
 602
 603     A string is considered camel case when:
 604
 605     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 606     - it contains both lowercase and uppercase letters
 607     - it does not start with a number
 608     """
 609     return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 610
 611
 612 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 613     """
 614     Checks if a string is formatted as "snake case".
 615
 616     A string is considered snake case when:
 617
 618     - it's composed only by lowercase/uppercase letters and digits
 619     - it contains at least one underscore (or provided separator)
 620     - it does not start with a number
 621
 622     >>> is_snake_case('this_is_a_test')
 623     True
 624     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 625     True
 626     >>> is_snake_case('this-is-a-test')
 627     False
 628     >>> is_snake_case('this-is-a-test', separator='-')
 629     True
 630
 631     """
 632     if is_full_string(in_str):
 633         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 634         re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 635         r = re_map.get(
 636             separator,
 637             re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
 638         )
 639         return r.match(in_str) is not None
 640     return False
 641
 642
 643 def is_json(in_str: Any) -> bool:
 644     """
 645     Check if a string is a valid json.
 646
 647     >>> is_json('{"name": "Peter"}')
 648     True
 649     >>> is_json('[1, 2, 3]')
 650     True
 651     >>> is_json('{nope}')
 652     False
 653     """
 654     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 655         try:
 656             return isinstance(json.loads(in_str), (dict, list))
 657         except (TypeError, ValueError, OverflowError):
 658             pass
 659     return False
 660
 661
 662 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 663     """
 664     Check if a string is a valid UUID.
 665
 666     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 667     True
 668     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 669     False
 670     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 671     True
 672     """
 673     # string casting is used to allow UUID itself as input data type
 674     s = str(in_str)
 675     if allow_hex:
 676         return UUID_HEX_OK_RE.match(s) is not None
 677     return UUID_RE.match(s) is not None
 678
 679
 680 def is_ip_v4(in_str: Any) -> bool:
 681     """
 682     Checks if a string is a valid ip v4.
 683
 684     >>> is_ip_v4('255.200.100.75')
 685     True
 686     >>> is_ip_v4('nope')
 687     False
 688     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 689     False
 690     """
 691     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 692         return False
 693
 694     # checks that each entry in the ip is in the valid range (0 to 255)
 695     for token in in_str.split("."):
 696         if not 0 <= int(token) <= 255:
 697             return False
 698     return True
 699
 700
 701 def extract_ip_v4(in_str: Any) -> Optional[str]:
 702     """
 703     Extracts the IPv4 chunk of a string or None.
 704
 705     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 706     '127.0.0.1'
 707     >>> extract_ip_v4('Your mom dresses you funny.')
 708     """
 709     if not is_full_string(in_str):
 710         return None
 711     m = ANYWHERE_IP_V4_RE.search(in_str)
 712     if m is not None:
 713         return m.group(0)
 714     return None
 715
 716
 717 def is_ip_v6(in_str: Any) -> bool:
 718     """
 719     Checks if a string is a valid ip v6.
 720
 721     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 722     True
 723     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 724     False
 725     """
 726     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 727
 728
 729 def extract_ip_v6(in_str: Any) -> Optional[str]:
 730     """
 731     Extract IPv6 chunk or None.
 732
 733     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 734     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 735     >>> extract_ip_v6("(and she's ugly too, btw)")
 736     """
 737     if not is_full_string(in_str):
 738         return None
 739     m = ANYWHERE_IP_V6_RE.search(in_str)
 740     if m is not None:
 741         return m.group(0)
 742     return None
 743
 744
 745 def is_ip(in_str: Any) -> bool:
 746     """
 747     Checks if a string is a valid ip (either v4 or v6).
 748
 749     >>> is_ip('255.200.100.75')
 750     True
 751     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
 752     True
 753     >>> is_ip('1.2.3')
 754     False
 755     >>> is_ip('1.2.3.999')
 756     False
 757     """
 758     return is_ip_v6(in_str) or is_ip_v4(in_str)
 759
 760
 761 def extract_ip(in_str: Any) -> Optional[str]:
 762     """
 763     Extract the IP address or None.
 764
 765     >>> extract_ip('Attacker: 255.200.100.75')
 766     '255.200.100.75'
 767     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 768     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 769     >>> extract_ip('1.2.3')
 770
 771     """
 772     ip = extract_ip_v4(in_str)
 773     if ip is None:
 774         ip = extract_ip_v6(in_str)
 775     return ip
 776
 777
 778 def is_mac_address(in_str: Any) -> bool:
 779     """Return True if in_str is a valid MAC address false otherwise.
 780
 781     >>> is_mac_address("34:29:8F:12:0D:2F")
 782     True
 783     >>> is_mac_address('34:29:8f:12:0d:2f')
 784     True
 785     >>> is_mac_address('34-29-8F-12-0D-2F')
 786     True
 787     >>> is_mac_address("test")
 788     False
 789     """
 790     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 791
 792
 793 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 794     """
 795     Extract the MAC address from in_str.
 796
 797     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
 798     '34:29:8F:12:0D:2F'
 799
 800     >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
 801     'd8:5d:e2:34:54:86'
 802
 803     """
 804     if not is_full_string(in_str):
 805         return None
 806     in_str.strip()
 807     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
 808     if m is not None:
 809         mac = m.group(0)
 810         mac.replace(":", separator)
 811         mac.replace("-", separator)
 812         return mac
 813     return None
 814
 815
 816 def is_slug(in_str: Any, separator: str = "-") -> bool:
 817     """
 818     Checks if a given string is a slug (as created by `slugify()`).
 819
 820     >>> is_slug('my-blog-post-title')
 821     True
 822     >>> is_slug('My blog post title')
 823     False
 824
 825     """
 826     if not is_full_string(in_str):
 827         return False
 828     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 829     return re.match(rex, in_str) is not None
 830
 831
 832 def contains_html(in_str: str) -> bool:
 833     """
 834     Checks if the given string contains HTML/XML tags.
 835
 836     By design, this function matches ANY type of tag, so don't expect to use it
 837     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 838
 839     >>> contains_html('my string is <strong>bold</strong>')
 840     True
 841     >>> contains_html('my string is not bold')
 842     False
 843
 844     """
 845     if not is_string(in_str):
 846         raise ValueError(in_str)
 847     return HTML_RE.search(in_str) is not None
 848
 849
 850 def words_count(in_str: str) -> int:
 851     """
 852     Returns the number of words contained into the given string.
 853
 854     This method is smart, it does consider only sequence of one or more letter and/or numbers
 855     as "words", so a string like this: "! @ # % ... []" will return zero!
 856     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 857     will be 4 not 1 (even if there are no spaces in the string).
 858
 859     >>> words_count('hello world')
 860     2
 861     >>> words_count('one,two,three.stop')
 862     4
 863
 864     """
 865     if not is_string(in_str):
 866         raise ValueError(in_str)
 867     return len(WORDS_COUNT_RE.findall(in_str))
 868
 869
 870 def generate_uuid(omit_dashes: bool = False) -> str:
 871     """
 872     Generated an UUID string (using `uuid.uuid4()`).
 873
 874     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 875     generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 876
 877     """
 878     uid = uuid4()
 879     if omit_dashes:
 880         return uid.hex
 881     return str(uid)
 882
 883
 884 def generate_random_alphanumeric_string(size: int) -> str:
 885     """
 886     Returns a string of the specified size containing random
 887     characters (uppercase/lowercase ascii letters and digits).
 888
 889     random_string(9) # possible output: "cx3QQbzYg"
 890
 891     """
 892     if size < 1:
 893         raise ValueError("size must be >= 1")
 894     chars = string.ascii_letters + string.digits
 895     buffer = [random.choice(chars) for _ in range(size)]
 896     return from_char_list(buffer)
 897
 898
 899 def reverse(in_str: str) -> str:
 900     """
 901     Returns the string with its chars reversed.
 902
 903     >>> reverse('test')
 904     'tset'
 905
 906     """
 907     if not is_string(in_str):
 908         raise ValueError(in_str)
 909     return in_str[::-1]
 910
 911
 912 def camel_case_to_snake_case(in_str, *, separator="_"):
 913     """
 914     Convert a camel case string into a snake case one.
 915     (The original string is returned if is not a valid camel case string)
 916
 917     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
 918     'mac_address_extractor_factory'
 919     >>> camel_case_to_snake_case('Luke Skywalker')
 920     'Luke Skywalker'
 921     """
 922     if not is_string(in_str):
 923         raise ValueError(in_str)
 924     if not is_camel_case(in_str):
 925         return in_str
 926     return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower()
 927
 928
 929 def snake_case_to_camel_case(
 930     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 931 ) -> str:
 932     """
 933     Convert a snake case string into a camel case one.
 934     (The original string is returned if is not a valid snake case string)
 935
 936     >>> snake_case_to_camel_case('this_is_a_test')
 937     'ThisIsATest'
 938     >>> snake_case_to_camel_case('Han Solo')
 939     'Han Solo'
 940     """
 941     if not is_string(in_str):
 942         raise ValueError(in_str)
 943     if not is_snake_case(in_str, separator=separator):
 944         return in_str
 945     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 946     if not upper_case_first:
 947         tokens[0] = tokens[0].lower()
 948     return from_char_list(tokens)
 949
 950
 951 def to_char_list(in_str: str) -> List[str]:
 952     """Convert a string into a list of chars.
 953
 954     >>> to_char_list('test')
 955     ['t', 'e', 's', 't']
 956     """
 957     if not is_string(in_str):
 958         return []
 959     return list(in_str)
 960
 961
 962 def from_char_list(in_list: List[str]) -> str:
 963     """Convert a char list into a string.
 964
 965     >>> from_char_list(['t', 'e', 's', 't'])
 966     'test'
 967     """
 968     return "".join(in_list)
 969
 970
 971 def shuffle(in_str: str) -> str:
 972     """Return a new string containing same chars of the given one but in
 973     a randomized order.
 974     """
 975     if not is_string(in_str):
 976         raise ValueError(in_str)
 977
 978     # turn the string into a list of chars
 979     chars = to_char_list(in_str)
 980     random.shuffle(chars)
 981     return from_char_list(chars)
 982
 983
 984 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 985     """
 986     Remove html code contained into the given string.
 987
 988     >>> strip_html('test: <a href="foo/bar">click here</a>')
 989     'test: '
 990     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
 991     'test: click here'
 992     """
 993     if not is_string(in_str):
 994         raise ValueError(in_str)
 995     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
 996     return r.sub("", in_str)
 997
 998
 999 def asciify(in_str: str) -> str:
1000     """
1001     Force string content to be ascii-only by translating all non-ascii
1002     chars into the closest possible representation (eg: ó -> o, Ë ->
1003     E, ç -> c...).
1004
1005     N.B. Some chars may be lost if impossible to translate.
1006
1007     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
1008     'eeuuooaaeynAAACIINOE'
1009     """
1010     if not is_string(in_str):
1011         raise ValueError(in_str)
1012
1013     # "NFKD" is the algorithm which is able to successfully translate
1014     # the most of non-ascii chars.
1015     normalized = unicodedata.normalize("NFKD", in_str)
1016
1017     # encode string forcing ascii and ignore any errors
1018     # (unrepresentable chars will be stripped out)
1019     ascii_bytes = normalized.encode("ascii", "ignore")
1020
1021     # turns encoded bytes into an utf-8 string
1022     return ascii_bytes.decode("utf-8")
1023
1024
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Turn a string into a "slug" whose words are joined by separator.

    Steps, in order:

    - the input is lower-cased
    - every match of NO_LETTERS_OR_NUMBERS_RE becomes a space and the
      result is stripped
    - every match of SPACES_RE becomes the separator
    - repeated separators are collapsed
    - the result is passed through `asciify()`

    Raises ValueError if in_str is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    lowered = in_str.lower()
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", lowered).strip()
    joined = SPACES_RE.sub(separator, spaced)

    # collapse runs of the separator down to a single occurrence
    separator_run = re.escape(separator) + r"+"
    deduped = re.sub(separator_run, separator, joined)
    return asciify(deduped)
1054
1055
def to_bool(in_str: str) -> bool:
    """
    Interpret a string as a boolean, ignoring case.

    True is returned when the lower-cased string is one of:

    - "true"
    - "1"
    - "yes"
    - "y"
    - "t"
    - "on"

    Any other string gives False.  Raises ValueError if in_str is not
    a string.

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    truthy_spellings = ("true", "1", "yes", "y", "t", "on")
    return in_str.lower() in truthy_spellings
1092
1093
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Parse a date string with dateparse's DateParser (see its docs for
    the accepted formats).

    Returns the parser's get_date() result, or None after logging a
    warning when the parser raises ParseException.
    """
    import dateparse.dateparse_utils as dp

    try:
        parser = dp.DateParser()
        parser.parse(in_str)
        return parser.get_date()
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        return None
1108
1109
def valid_date(in_str: str) -> bool:
    """
    Report whether DateParser can parse the string.

    Returns True when parsing succeeds; logs a warning and returns
    False when the parser raises ParseException.
    """
    import dateparse.dateparse_utils as dp

    try:
        dp.DateParser().parse(in_str)
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        return False
    return True
1124
1125
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Parses a datetime string.  See DateParser docs for more info.

    Returns:
        The parsed datetime.datetime, or None when the input cannot be
        parsed or the parser produced something that is not a
        datetime.datetime.  A warning is logged on parse failure.
    """
    import dateparse.dateparse_utils as dp

    try:
        parsed = dp.DateParser().parse(in_str)
    except (dp.ParseException, ValueError):
        # to_date/valid_date treat dp.ParseException as the parser's
        # failure signal; catch it here too (ValueError is kept for
        # backward compatibility) so bad input yields None instead of
        # an exception escaping to the caller.
        logger.warning(f'Unable to parse datetime {in_str}.')
        return None
    # isinstance (rather than an exact type comparison) also accepts
    # datetime.datetime subclasses; plain datetime.date is still rejected.
    return parsed if isinstance(parsed, datetime.datetime) else None
1141
1142
def valid_datetime(in_str: str) -> bool:
    """
    Report whether to_datetime() can turn the string into a datetime.

    Logs a warning and returns False when to_datetime() returns None.
    """
    if to_datetime(in_str) is None:
        logger.warning(f'Unable to parse datetime {in_str}.')
        return False
    return True
1153
1154
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Squeeze runs of more than one character_to_squeeze into one.

    Args:
        in_str: the text to process.
        character_to_squeeze: the character (or multi-character token)
            whose consecutive repetitions are collapsed.  It is matched
            literally, never as a regular expression.

    Returns:
        in_str with every run of the token replaced by one occurrence.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    """
    run_of_tokens = r'(' + re.escape(character_to_squeeze) + r')+'
    # Use a callable replacement: re.sub interprets backslashes in a
    # replacement *string* as escapes, so squeezing a token containing a
    # backslash would raise re.error or insert the wrong text.
    return re.sub(run_of_tokens, lambda _match: character_to_squeeze, in_str)
1171
1172
def dedent(in_str: str) -> str:
    """
    Strip the leading margin from each line of a multi line string by
    deleting whatever MARGIN_RE matches on that line (inspired by the
    analogous Scala function).

    Raises ValueError if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return '\n'.join(MARGIN_RE.sub('', row) for row in in_str.split('\n'))
1182
1183
def indent(in_str: str, amount: int) -> str:
    """
    Prefix every line of the string with amount spaces.

    Raises ValueError if in_str is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    padding = " " * amount
    return '\n'.join(padding + row for row in in_str.split('\n'))
1197
1198
def sprintf(*args, **kwargs) -> str:
    """Render arguments the way print() would, but return the text
    instead of writing it anywhere.

    Args:
        *args: the values to render; anything that is not already a
            string is converted with str().
        sep: keyword only; the string placed between values (a single
            space when omitted or None).
        end: keyword only; the string appended after the last value
            (a newline when omitted or None).

    Returns:
        The rendered string.

    Raises:
        TypeError: if sep or end is neither None nor a string, or if
            any other keyword argument is supplied.

    >>> sprintf('a', 1, sep='-', end='')
    'a-1'
    >>> sprintf('x', 2)
    'x 2\\n'
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # Anything still left in kwargs is a keyword we do not understand.
    if kwargs:
        raise TypeError("invalid keyword arguments to sprintf()")

    if sep is None:
        sep = " "
    if end is None:
        end = "\n"
    pieces = (arg if isinstance(arg, str) else str(arg) for arg in args)
    return sep.join(pieces) + end
1229
1230
class SprintfStdout:
    """
    Context manager that diverts everything written to stdout into an
    in-memory buffer.  Entering it yields a zero-argument callable that
    returns the text captured so far.

        with SprintfStdout() as buf:
            print("test")
        print(repr(buf()))    # shows 'test\\n'
    """

    def __init__(self) -> None:
        # Assigned in __enter__; declared here for type checkers only.
        self.recorder: contextlib.redirect_stdout
        self.destination = io.StringIO()

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *exc_info) -> None:
        self.recorder.__exit__(*exc_info)
        self.destination.seek(0)
        # Returning None (implicitly) means exceptions raised inside the
        # with-block are not suppressed.
1255
1256
def is_are(n: int) -> str:
    """Pick the verb form that agrees with a count: 'is' for exactly
    one, 'are' for anything else.

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
1269
1270
def pluralize(n: int) -> str:
    """Return the plural suffix for a count: '' for exactly one,
    's' for anything else.

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.

    """
    return "" if n == 1 else "s"
1287
1288
def thify(n: int) -> str:
    """Return the proper English ordinal suffix for a number.

    Args:
        n: the number; its str() form must satisfy is_integer_number.

    Returns:
        'st', 'nd', 'rd' or 'th'.

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'
    >>> thify(22)
    'nd'

    """
    digits = str(n)
    assert is_integer_number(digits)
    # Numbers ending in 11, 12 or 13 are irregular in English: they
    # take "th" (11th, 112th) even though their last digit is 1, 2 or 3.
    if digits[-2:] in ("11", "12", "13"):
        return "th"
    by_last_digit = {"1": "st", "2": "nd", "3": "rd"}
    return by_last_digit.get(digits[-1:], "th")
1311
1312
def ngrams(txt: str, n: int):
    """Generate the word ngrams of a string, each rendered as its words
    joined by single spaces.  The text is tokenized with str.split().

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    """
    for gram in ngrams_presplit(txt.split(), n):
        yield ' '.join(gram)
1326
1327
def ngrams_presplit(words: Sequence[str], n: int):
    """Return the ngrams of an already-tokenized sequence of words.

    Thin wrapper: the work is delegated to list_utils.ngrams(words, n)
    and its result is returned unchanged.
    """
    return list_utils.ngrams(words, n)
1330
1331
def bigrams(txt: str):
    """Return the two-word ngrams of txt; shorthand for ngrams(txt, 2)."""
    return ngrams(txt, 2)
1334
1335
def trigrams(txt: str):
    """Return the three-word ngrams of txt; shorthand for ngrams(txt, 3)."""
    return ngrams(txt, 3)
1338
1339
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.

    Args:
        input_lines: the column values, indexed by column number.
        column_specs: an iterable of numeric sequences; each sequence
            names one or more column numbers to copy into one output
            item.
        delim: the string placed between the columns of one output item.

    Returns:
        A list with one string per spec: the selected columns joined
        by delim.  Column text is copied verbatim, even when it starts
        or ends with characters that also occur in delim.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']

    """
    # Join rather than "prepend delim, then strip(delim)": str.strip
    # treats its argument as a set of characters and would also eat
    # legitimate leading/trailing characters of the column data.
    return [delim.join(input_lines[n] for n in spec) for spec in column_specs]
1368
1369
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim='',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: the column values, indexed by column number.
        column_specs: an iterable of (key, column numbers) pairs; each
            pair produces one dict entry.
        delim: the string placed between the columns of one entry.

    Returns:
        A dict mapping each key to its selected columns joined by
        delim.  Column text is copied verbatim, even when it starts or
        ends with characters that also occur in delim.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}

    """
    out = {}
    for spec in column_specs:
        key, columns = spec[0], spec[1]
        # Join rather than "prepend delim, then strip(delim)": str.strip
        # treats its argument as a set of characters and would also eat
        # legitimate leading/trailing characters of the column data.
        out[key] = delim.join(input_lines[n] for n in columns)
    return out
1398
1399
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the {name} placeholders of txt with the matching entries of
    the values dict (str.format semantics).

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'

    """
    filled = txt.format(**values)
    return filled
1409
1410
def to_ascii(x: str):
    """Encode as ascii bytes string.

    Args:
        x: a str to encode, or a bytes object which is returned as is.
            Subclasses of str and bytes are accepted too.

    Returns:
        The ascii-encoded bytes.

    Raises:
        UnicodeEncodeError: if x is a str containing non-ascii chars.
        TypeError: if x is neither a str nor a bytes object.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'

    """
    # isinstance (not an exact type comparison) so that subclasses of
    # str / bytes are handled rather than rejected.
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    # TypeError is a subclass of Exception, so callers that caught the
    # former bare Exception keep working.
    raise TypeError('to_ascii works with strings and bytes')
1426
1427
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Turn txt into bytes using the given encoding / errors policy and
    then base64-encode those bytes with base64.encodebytes.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
1437
1438
def is_base64(txt: str) -> bool:
    """Determine whether a string looks base64 encoded (with Python's
    standard base64 alphabet).

    This is a character-level check only: surrounding whitespace is
    ignored, up to two trailing '=' padding characters are accepted,
    and every remaining character must belong to the alphabet.

    Args:
        txt: the str (or bytes) to examine.

    Returns:
        True if txt consists solely of base64 alphabet characters plus
        optional trailing padding; False otherwise, including for text
        containing non-ascii characters.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # Trailing padding is ok.
    True

    """
    stripped = txt.strip()
    if isinstance(stripped, str):
        try:
            data = stripped.encode('ascii')
        except UnicodeEncodeError:
            # Non-ascii text can never be base64.
            return False
    else:
        data = bytes(stripped)

    # '=' is legal only as padding at the very end, at most twice.
    for _ in range(2):
        if data.endswith(b'='):
            data = data[:-1]

    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = set(a.encode('ascii'))
    return all(byte in alphabet for byte in data)
1459
1460
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Decode base64 bytes with base64.decodebytes and then decode the
    resulting bytes into a str using the given encoding / errors policy.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    """
    raw = base64.decodebytes(b64)
    return raw.decode(encoding, errors)
1469
1470
def chunk(txt: str, chunk_size):
    """Chunk up a string.

    Generator yielding consecutive slices of txt that are chunk_size
    characters long.  When len(txt) is not a multiple of chunk_size the
    last slice is shorter; in that case a warning is both logged and
    issued through the warnings module once iteration starts.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    """
    if len(txt) % chunk_size != 0:
        msg = f'String to chunk\'s length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for start in range(0, len(txt), chunk_size):
        yield txt[start : start + chunk_size]
1484
1485
def to_bitstring(
    txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass'
) -> str:
    """Encode txt and then render each byte as eight binary digits.
    Note: only bitstrings with delimiter='' are interpretable by
    from_bitstring.

    Args:
        txt: the str to encode; bytes are accepted too and used as is.
        delimiter: string placed between the 8-bit groups.
        encoding: the encoding applied to a str input.
        errors: the error policy applied when encoding a str input.

    Returns:
        The bitstring.  Empty input renders as a single zero byte
        ('00000000').

    Raises:
        TypeError: if txt is neither a str nor bytes.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    """
    if isinstance(txt, str):
        # Honor the caller's encoding / errors instead of forcing ascii.
        raw = txt.encode(encoding, errors)
    elif isinstance(txt, (bytes, bytearray)):
        raw = bytes(txt)
    else:
        raise TypeError('to_bitstring works with strings and bytes')

    if not raw:
        # Historical behavior: empty input renders as one zero byte.
        return '0' * 8

    # Format byte by byte; going through one big int would silently
    # drop leading zero bytes.
    return delimiter.join(f'{byte:08b}' for byte in raw)
1506
1507
def is_bitstring(txt: str) -> bool:
    """Report whether txt is a bitstring, i.e. whether txt with a '0b'
    prefix passes is_binary_integer_number.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    """
    prefixed = '0b{}'.format(txt)
    return is_binary_integer_number(prefixed)
1519
1520
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    Args:
        bits: a string of binary digits (anything int(bits, 2)
            accepts, e.g. with an optional '0b' prefix).
        encoding: the encoding used to decode the bytes.
        errors: the error policy used when decoding.

    Returns:
        The decoded string.  Leading all-zero bytes written out in
        bits are preserved.  If decoding yields an empty string, a
        single NUL character is returned instead.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    >>> from_bitstring('0000000001100001')
    '\\x00a'

    """
    if isinstance(bits, (bytes, bytearray)):
        # int() accepts bytes too; normalize so the digit count below works.
        bits = bits.decode('ascii')
    n = int(bits, 2)

    # Count the binary digits actually written, ignoring an optional
    # sign, '0b' prefix and '_' separators.
    digits = bits.strip().lstrip('+-')
    if digits[:2].lower() == '0b':
        digits = digits[2:]
    width = sum(1 for ch in digits if ch in '01')

    # Size the buffer from the written width, not just n.bit_length(),
    # which ignores leading zeros and would drop leading NUL bytes.
    num_bytes = max((width + 7) // 8, (n.bit_length() + 7) // 8)
    return n.to_bytes(num_bytes, 'big').decode(encoding, errors) or '\0'
1530
1531
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple for sorting purposes.

    Args:
        txt: the dotted IPv4 address.

    Returns:
        A tuple with the integer value of each dot-separated part, or
        None (after logging a warning) when is_ip_v4 rejects txt.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']

    """
    if not is_ip_v4(txt):
        # Report through the module logger like the rest of this file
        # instead of printing to stdout from library code.
        msg = f'not IP: {txt}'
        logger.warning(msg)
        return None
    return tuple(int(octet) for octet in txt.split('.'))
1547
1548
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Break a file path into a tuple of its non-empty '/'-separated
    components so that parent/ancestor paths sort before their
    children/descendants.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']

    """
    return tuple(part for part in volume.split('/') if part)
1562
1563
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Replace every occurrence of each character in replace_set with
    replacement.  The replacements run one after another, in the order
    the characters appear in replace_set.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'

    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
1575
1576
if __name__ == '__main__':
    # When this file is executed as a script, run every doctest
    # embedded in the module's docstrings.
    import doctest

    doctest.testmod()