string_utils.py

   1 #!/usr/bin/env python3
   2
   3 """The MIT License (MIT)
   4
   5 Copyright (c) 2016-2020 Davide Zanotti
   6 Modifications Copyright (c) 2021-2022 Scott Gasch
   7
   8 Permission is hereby granted, free of charge, to any person obtaining a copy
   9 of this software and associated documentation files (the "Software"), to deal
  10 in the Software without restriction, including without limitation the rights
  11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12 copies of the Software, and to permit persons to whom the Software is
  13 furnished to do so, subject to the following conditions:
  14
  15 The above copyright notice and this permission notice shall be included in all
  16 copies or substantial portions of the Software.
  17
  18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  24 SOFTWARE.
  25
  26 This class is based on: https://github.com/daveoncode/python-string-utils.
  27 """
  28
  29 import base64
  30 import contextlib  # type: ignore
  31 import datetime
  32 import io
  33 import json
  34 import logging
  35 import numbers
  36 import random
  37 import re
  38 import string
  39 import unicodedata
  40 import warnings
  41 from itertools import zip_longest
  42 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
  43 from uuid import uuid4
  44
  45 import list_utils
  46
  47 logger = logging.getLogger(__name__)
  48
  49 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  50
  51 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  52
  53 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  54
  55 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  56
# Un-anchored URL pattern, assembled from one raw-string fragment per URL
# component (adjacent literals are concatenated).  Each fragment is its own
# capture group; everything after the domain is optional.
# NOTE(review): in the query-string class, `%-=` is a character *range*
# ('%' through '='), not three separate characters -- confirm that this is
# the intended way of admitting '&', '.', digits, etc.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Matches only when the whole string is a URL (case-insensitive).
URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

# Finds URLs embedded anywhere in a larger text (case-insensitive); the
# extra outer group captures the full URL.
URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
  72
# Matches "@" characters that are followed by a closing double quote later
# in the string, or a backslash-escaped "@"; is_email() uses it to neutralize
# extra "@" signs before re-validating.
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# Un-anchored e-mail pattern: local part, "@", then a domain made of
# lowercase letters, digits and dashes, ending in a 2-4 letter suffix.
EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"

# Matches only when the whole string is an e-mail address.  No IGNORECASE
# flag is given, so the domain part must be lowercase.
EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))

# Finds e-mail addresses embedded anywhere in a larger text.
EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  80
# Whole-string test for camel case: letters (then optionally digits) with at
# least one lowercase/uppercase transition in either direction.
CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

# Captures a lowercase letter or an uppercase run that is immediately
# followed by an uppercase letter, i.e. the spot where a word boundary
# separator gets inserted.
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Whole-string test for snake case with "_" as the separator.
SNAKE_CASE_TEST_RE = re.compile(r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE)

# Same test with "-" as the separator.  There is no leading "^" here, so the
# start anchoring depends on callers using .match() rather than .search().
SNAKE_CASE_TEST_DASH_RE = re.compile(r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE)

# Separator ("_") followed by the first character of the next token.
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

# Separator ("-") followed by the first character of the next token.
SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
  92
# Card-type name -> whole-string pattern for that type's number format
# (leading digits and total length only; the patterns are purely syntactic,
# no checksum is computed here).
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}
 101
# Cheap pre-check for JSON: the text (ignoring surrounding whitespace) opens
# with "[" or "{" and closes with "}" or "]".  The opening and closing
# brackets are not required to be of the same kind.
JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)

# Dashed UUID: 8-4-4-4-12 hexadecimal digits, case-insensitive.
UUID_RE = re.compile(r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE)

# Same layout, but every dash is optional, so the bare 32-digit hex form
# is accepted as well.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# Shape-only IPv4 check: four dot-separated groups of 1-3 digits.  The
# 0-255 range of each group is NOT verified by this pattern.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

# Un-anchored variant used to locate an IPv4-shaped token inside a text.
ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
 114
 115 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
 116
 117 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
 118
# Whole-string MAC address: six two-digit hex groups joined by ":" or "-"
# (case-insensitive).
MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)

# Un-anchored variant used to locate a MAC address inside a larger text.
ANYWHERE_MAC_ADDRESS_RE = re.compile(r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE)

# One "word": a run of word characters other than "_", together with any
# non-word characters around it.
WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)

# An opening tag (optionally namespaced) plus, when present, the content up
# to a closing tag; also matches comments and doctype declarations.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Tags only (opening or closing), comments and doctype declarations; the
# text between tags is not part of the match.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# A single whitespace character.
SPACES_RE = re.compile(r"\s")

# A run of non-word characters, or a run of underscores.
NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Leading whitespace other than CR/LF.  No MULTILINE flag is set, so "^"
# only matches at the very start of the string.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 140
 141 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 142
 143 NUM_SUFFIXES = {
 144     "Pb": (1024 ** 5),
 145     "P": (1024 ** 5),
 146     "Tb": (1024 ** 4),
 147     "T": (1024 ** 4),
 148     "Gb": (1024 ** 3),
 149     "G": (1024 ** 3),
 150     "Mb": (1024 ** 2),
 151     "M": (1024 ** 2),
 152     "Kb": (1024 ** 1),
 153     "K": (1024 ** 1),
 154 }
 155
 156
 157 def is_none_or_empty(in_str: Optional[str]) -> bool:
 158     """
 159     Returns true if the input string is either None or an empty string.
 160
 161     >>> is_none_or_empty("")
 162     True
 163     >>> is_none_or_empty(None)
 164     True
 165     >>> is_none_or_empty("   \t   ")
 166     True
 167     >>> is_none_or_empty('Test')
 168     False
 169     """
 170     return in_str is None or len(in_str.strip()) == 0
 171
 172
 173 def is_string(obj: Any) -> bool:
 174     """
 175     Checks if an object is a string.
 176
 177     >>> is_string('test')
 178     True
 179     >>> is_string(123)
 180     False
 181     >>> is_string(100.3)
 182     False
 183     >>> is_string([1, 2, 3])
 184     False
 185     """
 186     return isinstance(obj, str)
 187
 188
 189 def is_empty_string(in_str: Any) -> bool:
 190     return is_empty(in_str)
 191
 192
 193 def is_empty(in_str: Any) -> bool:
 194     """
 195     Checks if input is a string and empty or only whitespace.
 196
 197     >>> is_empty('')
 198     True
 199     >>> is_empty('    \t\t    ')
 200     True
 201     >>> is_empty('test')
 202     False
 203     >>> is_empty(100.88)
 204     False
 205     >>> is_empty([1, 2, 3])
 206     False
 207     """
 208     return is_string(in_str) and in_str.strip() == ""
 209
 210
 211 def is_full_string(in_str: Any) -> bool:
 212     """
 213     Checks that input is a string and is not empty ('') or only whitespace.
 214
 215     >>> is_full_string('test!')
 216     True
 217     >>> is_full_string('')
 218     False
 219     >>> is_full_string('      ')
 220     False
 221     >>> is_full_string(100.999)
 222     False
 223     >>> is_full_string({"a": 1, "b": 2})
 224     False
 225     """
 226     return is_string(in_str) and in_str.strip() != ""
 227
 228
 229 def is_number(in_str: str) -> bool:
 230     """
 231     Checks if a string is a valid number.
 232
 233     >>> is_number(100.5)
 234     Traceback (most recent call last):
 235     ...
 236     ValueError: 100.5
 237     >>> is_number("100.5")
 238     True
 239     >>> is_number("test")
 240     False
 241     >>> is_number("99")
 242     True
 243     >>> is_number([1, 2, 3])
 244     Traceback (most recent call last):
 245     ...
 246     ValueError: [1, 2, 3]
 247     """
 248     if not is_string(in_str):
 249         raise ValueError(in_str)
 250     return NUMBER_RE.match(in_str) is not None
 251
 252
 253 def is_integer_number(in_str: str) -> bool:
 254     """
 255     Checks whether the given string represents an integer or not.
 256
 257     An integer may be signed or unsigned or use a "scientific notation".
 258
 259     >>> is_integer_number('42')
 260     True
 261     >>> is_integer_number('42.0')
 262     False
 263     """
 264     return (
 265         (is_number(in_str) and "." not in in_str)
 266         or is_hexidecimal_integer_number(in_str)
 267         or is_octal_integer_number(in_str)
 268         or is_binary_integer_number(in_str)
 269     )
 270
 271
 272 def is_hexidecimal_integer_number(in_str: str) -> bool:
 273     """
 274     Checks whether a string is a hex integer number.
 275
 276     >>> is_hexidecimal_integer_number('0x12345')
 277     True
 278     >>> is_hexidecimal_integer_number('0x1A3E')
 279     True
 280     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 281     False
 282     >>> is_hexidecimal_integer_number('-0xff')
 283     True
 284     >>> is_hexidecimal_integer_number('test')
 285     False
 286     >>> is_hexidecimal_integer_number(12345)  # Not a string
 287     Traceback (most recent call last):
 288     ...
 289     ValueError: 12345
 290     >>> is_hexidecimal_integer_number(101.4)
 291     Traceback (most recent call last):
 292     ...
 293     ValueError: 101.4
 294     >>> is_hexidecimal_integer_number(0x1A3E)
 295     Traceback (most recent call last):
 296     ...
 297     ValueError: 6718
 298     """
 299     if not is_string(in_str):
 300         raise ValueError(in_str)
 301     return HEX_NUMBER_RE.match(in_str) is not None
 302
 303
 304 def is_octal_integer_number(in_str: str) -> bool:
 305     """
 306     Checks whether a string is an octal number.
 307
 308     >>> is_octal_integer_number('0o777')
 309     True
 310     >>> is_octal_integer_number('-0O115')
 311     True
 312     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 313     False
 314     >>> is_octal_integer_number('7777')  # Needs 0o
 315     False
 316     >>> is_octal_integer_number('test')
 317     False
 318     """
 319     if not is_string(in_str):
 320         raise ValueError(in_str)
 321     return OCT_NUMBER_RE.match(in_str) is not None
 322
 323
 324 def is_binary_integer_number(in_str: str) -> bool:
 325     """
 326     Returns whether a string contains a binary number.
 327
 328     >>> is_binary_integer_number('0b10111')
 329     True
 330     >>> is_binary_integer_number('-0b111')
 331     True
 332     >>> is_binary_integer_number('0B10101')
 333     True
 334     >>> is_binary_integer_number('0b10102')
 335     False
 336     >>> is_binary_integer_number('0xFFF')
 337     False
 338     >>> is_binary_integer_number('test')
 339     False
 340     """
 341     if not is_string(in_str):
 342         raise ValueError(in_str)
 343     return BIN_NUMBER_RE.match(in_str) is not None
 344
 345
 346 def to_int(in_str: str) -> int:
 347     """Returns the integral value of the string or raises on error.
 348
 349     >>> to_int('1234')
 350     1234
 351     >>> to_int('test')
 352     Traceback (most recent call last):
 353     ...
 354     ValueError: invalid literal for int() with base 10: 'test'
 355     """
 356     if not is_string(in_str):
 357         raise ValueError(in_str)
 358     if is_binary_integer_number(in_str):
 359         return int(in_str, 2)
 360     if is_octal_integer_number(in_str):
 361         return int(in_str, 8)
 362     if is_hexidecimal_integer_number(in_str):
 363         return int(in_str, 16)
 364     return int(in_str)
 365
 366
 367 def is_decimal_number(in_str: str) -> bool:
 368     """
 369     Checks whether the given string represents a decimal or not.
 370
 371     A decimal may be signed or unsigned or use a "scientific notation".
 372
 373     >>> is_decimal_number('42.0')
 374     True
 375     >>> is_decimal_number('42')
 376     False
 377     """
 378     return is_number(in_str) and "." in in_str
 379
 380
 381 def strip_escape_sequences(in_str: str) -> str:
 382     """
 383     Remove escape sequences in the input string.
 384
 385     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 386     'this is a test!'
 387     """
 388     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 389     return in_str
 390
 391
 392 def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 393     """
 394     Add thousands separator to a numeric string.  Also handles numbers.
 395
 396     >>> add_thousands_separator('12345678')
 397     '12,345,678'
 398     >>> add_thousands_separator(12345678)
 399     '12,345,678'
 400     >>> add_thousands_separator(12345678.99)
 401     '12,345,678.99'
 402     >>> add_thousands_separator('test')
 403     Traceback (most recent call last):
 404     ...
 405     ValueError: test
 406
 407     """
 408     if isinstance(in_str, numbers.Number):
 409         in_str = f'{in_str}'
 410     if is_number(in_str):
 411         return _add_thousands_separator(in_str, separator_char=separator_char, places=places)
 412     raise ValueError(in_str)
 413
 414
 415 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 416     decimal_part = ""
 417     if '.' in in_str:
 418         (in_str, decimal_part) = in_str.split('.')
 419     tmp = [iter(in_str[::-1])] * places
 420     ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 421     if len(decimal_part) > 0:
 422         ret += '.'
 423         ret += decimal_part
 424     return ret
 425
 426
# Full url example:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 429 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 430     """
 431     Check if a string is a valid url.
 432
 433     >>> is_url('http://www.mysite.com')
 434     True
 435     >>> is_url('https://mysite.com')
 436     True
 437     >>> is_url('.mysite.com')
 438     False
 439     """
 440     if not is_full_string(in_str):
 441         return False
 442
 443     valid = URL_RE.match(in_str) is not None
 444
 445     if allowed_schemes:
 446         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 447     return valid
 448
 449
def is_email(in_str: Any) -> bool:
    """
    Check if a string is a valid email.

    The input must be a non-blank string of at most 320 characters that
    does not start with a dot.  It is split on "@" into a head (at most
    64 characters, not ending with a dot, no consecutive dots) and a tail
    (at most 255 characters); the recombined address must then match
    EMAIL_RE.  When the string holds several "@" signs, the ones matched
    by ESCAPED_AT_SIGN are replaced and the result is validated again.

    Reference: https://tools.ietf.org/html/rfc3696#section-3

    >>> is_email('my.email@the-provider.com')
    True
    >>> is_email('@gmail.com')
    False
    """
    if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
        return False

    try:
        # we expect 2 tokens, one before "@" and one after; any other
        # count makes the unpacking raise ValueError, which is handled
        # by the except branch below.
        head, tail = in_str.split("@")

        # head's size must be <= 64, tail <= 255, head must not end
        # with a dot or contain multiple consecutive dots.
        if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
            return False

        # removes escaped spaces, so that later on the test regex will
        # accept the string.
        head = head.replace("\\ ", "")
        if head.startswith('"') and head.endswith('"'):
            # quoted head: drop plain spaces and the surrounding quotes.
            head = head.replace(" ", "")[1:-1]
        return EMAIL_RE.match(head + "@" + tail) is not None

    except ValueError:
        # borderline case in which we have multiple "@" signs but the
        # head part is correctly escaped.
        if ESCAPED_AT_SIGN.search(in_str) is not None:
            # replace "@" with "a" in the head
            return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
        return False
 488
 489
 490 def suffix_string_to_number(in_str: str) -> Optional[int]:
 491     """Take a string like "33Gb" and convert it into a number (of bytes)
 492     like 34603008.  Return None if the input string is not valid.
 493
 494     >>> suffix_string_to_number('1Mb')
 495     1048576
 496     >>> suffix_string_to_number('13.1Gb')
 497     14066017894
 498     """
 499
 500     def suffix_capitalize(s: str) -> str:
 501         if len(s) == 1:
 502             return s.upper()
 503         elif len(s) == 2:
 504             return f"{s[0].upper()}{s[1].lower()}"
 505         return suffix_capitalize(s[0:1])
 506
 507     if is_string(in_str):
 508         if is_integer_number(in_str):
 509             return to_int(in_str)
 510         suffixes = [in_str[-2:], in_str[-1:]]
 511         rest = [in_str[:-2], in_str[:-1]]
 512         for x in range(len(suffixes)):
 513             s = suffixes[x]
 514             s = suffix_capitalize(s)
 515             multiplier = NUM_SUFFIXES.get(s, None)
 516             if multiplier is not None:
 517                 r = rest[x]
 518                 if is_integer_number(r):
 519                     return to_int(r) * multiplier
 520                 if is_decimal_number(r):
 521                     return int(float(r) * multiplier)
 522     return None
 523
 524
 525 def number_to_suffix_string(num: int) -> Optional[str]:
 526     """Take a number (of bytes) and returns a string like "43.8Gb".
 527     Returns none if the input is invalid.
 528
 529     >>> number_to_suffix_string(14066017894)
 530     '13.1Gb'
 531     >>> number_to_suffix_string(1024 * 1024)
 532     '1.0Mb'
 533
 534     """
 535     d = 0.0
 536     suffix = None
 537     for (sfx, size) in NUM_SUFFIXES.items():
 538         if num >= size:
 539             d = num / size
 540             suffix = sfx
 541             break
 542     if suffix is not None:
 543         return f"{d:.1f}{suffix}"
 544     else:
 545         return f'{num:d}'
 546
 547
 548 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 549     """
 550     Checks if a string is a valid credit card number.
 551     If card type is provided then it checks against that specific type only,
 552     otherwise any known credit card number will be accepted.
 553
 554     Supported card types are the following:
 555
 556     - VISA
 557     - MASTERCARD
 558     - AMERICAN_EXPRESS
 559     - DINERS_CLUB
 560     - DISCOVER
 561     - JCB
 562     """
 563     if not is_full_string(in_str):
 564         return False
 565
 566     if card_type is not None:
 567         if card_type not in CREDIT_CARDS:
 568             raise KeyError(
 569                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 570             )
 571         return CREDIT_CARDS[card_type].match(in_str) is not None
 572     for c in CREDIT_CARDS:
 573         if CREDIT_CARDS[c].match(in_str) is not None:
 574             return True
 575     return False
 576
 577
 578 def is_camel_case(in_str: Any) -> bool:
 579     """
 580     Checks if a string is formatted as camel case.
 581
 582     A string is considered camel case when:
 583
 584     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 585     - it contains both lowercase and uppercase letters
 586     - it does not start with a number
 587     """
 588     return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 589
 590
 591 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 592     """
 593     Checks if a string is formatted as "snake case".
 594
 595     A string is considered snake case when:
 596
 597     - it's composed only by lowercase/uppercase letters and digits
 598     - it contains at least one underscore (or provided separator)
 599     - it does not start with a number
 600
 601     >>> is_snake_case('this_is_a_test')
 602     True
 603     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 604     True
 605     >>> is_snake_case('this-is-a-test')
 606     False
 607     >>> is_snake_case('this-is-a-test', separator='-')
 608     True
 609
 610     """
 611     if is_full_string(in_str):
 612         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 613         re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 614         r = re_map.get(
 615             separator,
 616             re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
 617         )
 618         return r.match(in_str) is not None
 619     return False
 620
 621
 622 def is_json(in_str: Any) -> bool:
 623     """
 624     Check if a string is a valid json.
 625
 626     >>> is_json('{"name": "Peter"}')
 627     True
 628     >>> is_json('[1, 2, 3]')
 629     True
 630     >>> is_json('{nope}')
 631     False
 632     """
 633     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 634         try:
 635             return isinstance(json.loads(in_str), (dict, list))
 636         except (TypeError, ValueError, OverflowError):
 637             pass
 638     return False
 639
 640
 641 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 642     """
 643     Check if a string is a valid UUID.
 644
 645     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 646     True
 647     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 648     False
 649     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 650     True
 651     """
 652     # string casting is used to allow UUID itself as input data type
 653     s = str(in_str)
 654     if allow_hex:
 655         return UUID_HEX_OK_RE.match(s) is not None
 656     return UUID_RE.match(s) is not None
 657
 658
 659 def is_ip_v4(in_str: Any) -> bool:
 660     """
 661     Checks if a string is a valid ip v4.
 662
 663     >>> is_ip_v4('255.200.100.75')
 664     True
 665     >>> is_ip_v4('nope')
 666     False
 667     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 668     False
 669     """
 670     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 671         return False
 672
 673     # checks that each entry in the ip is in the valid range (0 to 255)
 674     for token in in_str.split("."):
 675         if not 0 <= int(token) <= 255:
 676             return False
 677     return True
 678
 679
 680 def extract_ip_v4(in_str: Any) -> Optional[str]:
 681     """
 682     Extracts the IPv4 chunk of a string or None.
 683
 684     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 685     '127.0.0.1'
 686     >>> extract_ip_v4('Your mom dresses you funny.')
 687     """
 688     if not is_full_string(in_str):
 689         return None
 690     m = ANYWHERE_IP_V4_RE.search(in_str)
 691     if m is not None:
 692         return m.group(0)
 693     return None
 694
 695
 696 def is_ip_v6(in_str: Any) -> bool:
 697     """
 698     Checks if a string is a valid ip v6.
 699
 700     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 701     True
 702     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 703     False
 704     """
 705     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 706
 707
 708 def extract_ip_v6(in_str: Any) -> Optional[str]:
 709     """
 710     Extract IPv6 chunk or None.
 711
 712     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 713     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 714     >>> extract_ip_v6("(and she's ugly too, btw)")
 715     """
 716     if not is_full_string(in_str):
 717         return None
 718     m = ANYWHERE_IP_V6_RE.search(in_str)
 719     if m is not None:
 720         return m.group(0)
 721     return None
 722
 723
 724 def is_ip(in_str: Any) -> bool:
 725     """
 726     Checks if a string is a valid ip (either v4 or v6).
 727
 728     >>> is_ip('255.200.100.75')
 729     True
 730     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
 731     True
 732     >>> is_ip('1.2.3')
 733     False
 734     >>> is_ip('1.2.3.999')
 735     False
 736     """
 737     return is_ip_v6(in_str) or is_ip_v4(in_str)
 738
 739
 740 def extract_ip(in_str: Any) -> Optional[str]:
 741     """
 742     Extract the IP address or None.
 743
 744     >>> extract_ip('Attacker: 255.200.100.75')
 745     '255.200.100.75'
 746     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 747     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 748     >>> extract_ip('1.2.3')
 749
 750     """
 751     ip = extract_ip_v4(in_str)
 752     if ip is None:
 753         ip = extract_ip_v6(in_str)
 754     return ip
 755
 756
 757 def is_mac_address(in_str: Any) -> bool:
 758     """Return True if in_str is a valid MAC address false otherwise.
 759
 760     >>> is_mac_address("34:29:8F:12:0D:2F")
 761     True
 762     >>> is_mac_address('34:29:8f:12:0d:2f')
 763     True
 764     >>> is_mac_address('34-29-8F-12-0D-2F')
 765     True
 766     >>> is_mac_address("test")
 767     False
 768     """
 769     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 770
 771
 772 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 773     """
 774     Extract the MAC address from in_str.
 775
 776     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
 777     '34:29:8F:12:0D:2F'
 778
 779     >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
 780     'd8:5d:e2:34:54:86'
 781
 782     """
 783     if not is_full_string(in_str):
 784         return None
 785     in_str.strip()
 786     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
 787     if m is not None:
 788         mac = m.group(0)
 789         mac.replace(":", separator)
 790         mac.replace("-", separator)
 791         return mac
 792     return None
 793
 794
 795 def is_slug(in_str: Any, separator: str = "-") -> bool:
 796     """
 797     Checks if a given string is a slug (as created by `slugify()`).
 798
 799     >>> is_slug('my-blog-post-title')
 800     True
 801     >>> is_slug('My blog post title')
 802     False
 803
 804     """
 805     if not is_full_string(in_str):
 806         return False
 807     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 808     return re.match(rex, in_str) is not None
 809
 810
 811 def contains_html(in_str: str) -> bool:
 812     """
 813     Checks if the given string contains HTML/XML tags.
 814
 815     By design, this function matches ANY type of tag, so don't expect to use it
 816     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 817
 818     >>> contains_html('my string is <strong>bold</strong>')
 819     True
 820     >>> contains_html('my string is not bold')
 821     False
 822
 823     """
 824     if not is_string(in_str):
 825         raise ValueError(in_str)
 826     return HTML_RE.search(in_str) is not None
 827
 828
 829 def words_count(in_str: str) -> int:
 830     """
 831     Returns the number of words contained into the given string.
 832
 833     This method is smart, it does consider only sequence of one or more letter and/or numbers
 834     as "words", so a string like this: "! @ # % ... []" will return zero!
 835     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 836     will be 4 not 1 (even if there are no spaces in the string).
 837
 838     >>> words_count('hello world')
 839     2
 840     >>> words_count('one,two,three.stop')
 841     4
 842
 843     """
 844     if not is_string(in_str):
 845         raise ValueError(in_str)
 846     return len(WORDS_COUNT_RE.findall(in_str))
 847
 848
 849 def generate_uuid(omit_dashes: bool = False) -> str:
 850     """
 851     Generated an UUID string (using `uuid.uuid4()`).
 852
 853     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 854     generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 855
 856     """
 857     uid = uuid4()
 858     if omit_dashes:
 859         return uid.hex
 860     return str(uid)
 861
 862
 863 def generate_random_alphanumeric_string(size: int) -> str:
 864     """
 865     Returns a string of the specified size containing random
 866     characters (uppercase/lowercase ascii letters and digits).
 867
 868     random_string(9) # possible output: "cx3QQbzYg"
 869
 870     """
 871     if size < 1:
 872         raise ValueError("size must be >= 1")
 873     chars = string.ascii_letters + string.digits
 874     buffer = [random.choice(chars) for _ in range(size)]
 875     return from_char_list(buffer)
 876
 877
 878 def reverse(in_str: str) -> str:
 879     """
 880     Returns the string with its chars reversed.
 881
 882     >>> reverse('test')
 883     'tset'
 884
 885     """
 886     if not is_string(in_str):
 887         raise ValueError(in_str)
 888     return in_str[::-1]
 889
 890
 891 def camel_case_to_snake_case(in_str, *, separator="_"):
 892     """
 893     Convert a camel case string into a snake case one.
 894     (The original string is returned if is not a valid camel case string)
 895
 896     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
 897     'mac_address_extractor_factory'
 898     >>> camel_case_to_snake_case('Luke Skywalker')
 899     'Luke Skywalker'
 900     """
 901     if not is_string(in_str):
 902         raise ValueError(in_str)
 903     if not is_camel_case(in_str):
 904         return in_str
 905     return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower()
 906
 907
 908 def snake_case_to_camel_case(
 909     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 910 ) -> str:
 911     """
 912     Convert a snake case string into a camel case one.
 913     (The original string is returned if is not a valid snake case string)
 914
 915     >>> snake_case_to_camel_case('this_is_a_test')
 916     'ThisIsATest'
 917     >>> snake_case_to_camel_case('Han Solo')
 918     'Han Solo'
 919     """
 920     if not is_string(in_str):
 921         raise ValueError(in_str)
 922     if not is_snake_case(in_str, separator=separator):
 923         return in_str
 924     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 925     if not upper_case_first:
 926         tokens[0] = tokens[0].lower()
 927     return from_char_list(tokens)
 928
 929
 930 def to_char_list(in_str: str) -> List[str]:
 931     """Convert a string into a list of chars.
 932
 933     >>> to_char_list('test')
 934     ['t', 'e', 's', 't']
 935     """
 936     if not is_string(in_str):
 937         return []
 938     return list(in_str)
 939
 940
 941 def from_char_list(in_list: List[str]) -> str:
 942     """Convert a char list into a string.
 943
 944     >>> from_char_list(['t', 'e', 's', 't'])
 945     'test'
 946     """
 947     return "".join(in_list)
 948
 949
 950 def shuffle(in_str: str) -> str:
 951     """Return a new string containing same chars of the given one but in
 952     a randomized order.
 953     """
 954     if not is_string(in_str):
 955         raise ValueError(in_str)
 956
 957     # turn the string into a list of chars
 958     chars = to_char_list(in_str)
 959     random.shuffle(chars)
 960     return from_char_list(chars)
 961
 962
 963 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 964     """
 965     Remove html code contained into the given string.
 966
 967     >>> strip_html('test: <a href="foo/bar">click here</a>')
 968     'test: '
 969     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
 970     'test: click here'
 971     """
 972     if not is_string(in_str):
 973         raise ValueError(in_str)
 974     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
 975     return r.sub("", in_str)
 976
 977
 978 def asciify(in_str: str) -> str:
 979     """
 980     Force string content to be ascii-only by translating all non-ascii
 981     chars into the closest possible representation (eg: ó -> o, Ë ->
 982     E, ç -> c...).
 983
 984     N.B. Some chars may be lost if impossible to translate.
 985
 986     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
 987     'eeuuooaaeynAAACIINOE'
 988     """
 989     if not is_string(in_str):
 990         raise ValueError(in_str)
 991
 992     # "NFKD" is the algorithm which is able to successfully translate
 993     # the most of non-ascii chars.
 994     normalized = unicodedata.normalize("NFKD", in_str)
 995
 996     # encode string forcing ascii and ignore any errors
 997     # (unrepresentable chars will be stripped out)
 998     ascii_bytes = normalized.encode("ascii", "ignore")
 999
1000     # turns encoded bytes into an utf-8 string
1001     return ascii_bytes.decode("utf-8")
1002
1003
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Converts a string into a "slug" using provided separator.
    The returned string has the following properties:

    - it has no spaces
    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using provided separator
    - all chars are encoded as ascii (by using `asciify()`)
    - is safe for URL

    Args:
        in_str: the text to convert.
        separator: the string placed between words.  It is always
            inserted literally (backslashes are not interpreted) and
            may be more than one character long or empty.

    Raises:
        ValueError: if is_string() rejects in_str.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # Replace spaces with the join sign.  A callable replacement is used
    # so that the separator is never parsed as a regex template (a
    # backslash in it would otherwise raise or be mangled).
    out = SPACES_RE.sub(lambda _match: separator, out)

    # Normalize joins (remove duplicates).  The escaped separator is
    # grouped so that "+" repeats the whole separator rather than only
    # its last character; an empty separator has nothing to normalize
    # (and would produce an invalid pattern).
    if separator:
        out = re.sub(f"(?:{re.escape(separator)})+", lambda _match: separator, out)
    return asciify(out)
1033
1034
def to_bool(in_str: str) -> bool:
    """
    Interpret the content of a string as a boolean (CASE INSENSITIVE).

    True is returned when the lower-cased string is one of:

    - "true"
    - "1"
    - "yes"
    - "y"
    - "t"
    - "on"

    Every other string yields False.

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True

    Raises:
        ValueError: if is_string() rejects in_str.
    """
    if is_string(in_str):
        return in_str.lower() in ("true", "1", "yes", "y", "t", "on")
    raise ValueError(in_str)
1071
1072
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Parse a date out of a string; see the DateParser docs for the
    accepted formats.

    Returns the value of DateParser.get_date() after a successful
    parse, or None (after logging a warning) when the parser raises
    ParseException.
    """
    import dateparse.dateparse_utils as dp  # type: ignore

    try:
        parser = dp.DateParser()
        parser.parse(in_str)
        return parser.get_date()
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        return None
1087
1088
def valid_date(in_str: str) -> bool:
    """
    Tell whether DateParser can parse the string.

    Returns True when parsing succeeds; logs a warning and returns
    False when the parser raises ParseException.
    """
    import dateparse.dateparse_utils as dp

    try:
        dp.DateParser().parse(in_str)
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        return False
    return True
1103
1104
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Parses a datetime string.  See DateParser docs for more info.

    Returns:
        The parsed datetime, or None when the string cannot be parsed
        (a warning is logged) or when the parser's result is not a
        datetime.datetime instance.
    """
    import dateparse.dateparse_utils as dp  # type: ignore

    try:
        parsed = dp.DateParser().parse(in_str)
    except (ValueError, dp.ParseException):
        # to_date() / valid_date() treat dp.ParseException as the
        # parse-failure signal for this same parse() call, so it is
        # handled here too instead of escaping to the caller; ValueError
        # is still caught for backward compatibility.
        logger.warning(f'Unable to parse datetime {in_str}.')
        return None
    return parsed if isinstance(parsed, datetime.datetime) else None
1120
1121
def valid_datetime(in_str: str) -> bool:
    """
    Tell whether to_datetime() can turn the string into a datetime.

    Logs a warning and returns False when to_datetime() yields None.
    """
    if to_datetime(in_str) is None:
        logger.warning(f'Unable to parse datetime {in_str}.')
        return False
    return True
1132
1133
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Squeeze runs of more than one character_to_squeeze into one.

    Args:
        in_str: the text to process.
        character_to_squeeze: the (possibly multi-character) unit whose
            consecutive repetitions are collapsed.  It is matched and
            re-inserted literally, so regex metacharacters and
            backslashes are safe.  An empty string leaves in_str
            unchanged.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    """
    if not character_to_squeeze:
        return in_str
    run = f'(?:{re.escape(character_to_squeeze)})+'
    # A callable replacement keeps re.sub from parsing the unit as a
    # replacement template (a lone backslash would raise re.error).
    return re.sub(run, lambda _match: character_to_squeeze, in_str)
1150
1151
def dedent(in_str: str) -> str:
    """
    Strip the leading margin (whatever MARGIN_RE matches) from every line
    of a multi line string (inspired by the analogous Scala function).

    Raises:
        ValueError: if is_string() rejects in_str.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return '\n'.join(MARGIN_RE.sub('', row) for row in in_str.split('\n'))
1161
1162
def indent(in_str: str, amount: int) -> str:
    """
    Prefix every line of the string with `amount` spaces.

    >>> indent('This is a test', 4)
    '    This is a test'

    Raises:
        ValueError: if is_string() rejects in_str.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    padding = " " * amount
    return '\n'.join(padding + row for row in in_str.split('\n'))
1176
1177
def sprintf(*args, **kwargs) -> str:
    """Format arguments the way print() would, but return the resulting
    string instead of writing it anywhere.

    Args:
        *args: values to render; non-strings are converted with str().
        sep: keyword-only; string placed between values (default " ").
        end: keyword-only; string appended at the end (default newline).

    Returns:
        The values joined by sep, followed by end.

    Raises:
        TypeError: if sep or end is neither None nor a string, or if any
            other keyword argument is supplied.

    >>> sprintf('a', 1, sep='-', end='')
    'a-1'
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # Anything still left in kwargs is unsupported.
    if kwargs:
        raise TypeError("invalid keyword arguments to sprintf()")

    if sep is None:
        sep = " "
    if end is None:
        end = "\n"
    rendered = (arg if isinstance(arg, str) else str(arg) for arg in args)
    return sep.join(rendered) + end
1208
1209
class SprintfStdout(object):
    """
    Context manager that records everything written to stdout while its
    body runs.  The value bound by `as` is a zero-argument callable that
    returns the text captured so far.

    with SprintfStdout() as buf:
        print("test")
    print(buf())

    'test\n'
    """

    def __init__(self) -> None:
        # In-memory sink that receives the redirected output.
        self.destination = io.StringIO()
        # Created in __enter__; declared here for type checkers only.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()
        return self.destination.getvalue

    def __exit__(self, *exc_info) -> None:
        self.recorder.__exit__(*exc_info)
        self.destination.seek(0)
        # Falling off the end returns None, so exceptions raised in the
        # with-body are not suppressed.
1234
1235
def capitalize_first_letter(txt: str) -> str:
    """Capitalize the first letter of a string, leaving the rest of it
    untouched.  An empty string is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''

    """
    # Slicing (rather than indexing) keeps this safe for empty input.
    return txt[:1].upper() + txt[1:]
1246
1247
def it_they(n: int) -> str:
    """Pick the pronoun matching a count: 'it' for exactly one,
    'they' for anything else.

    >>> it_they(1)
    'it'
    >>> it_they(100)
    'they'

    """
    return "it" if n == 1 else "they"
1260
1261
def is_are(n: int) -> str:
    """Pick the verb form matching a count: 'is' for exactly one,
    'are' for anything else.

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
1274
1275
def pluralize(n: int) -> str:
    """Return the plural suffix for a count: '' for exactly one,
    's' for anything else.

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.

    """
    return "" if n == 1 else "s"
1292
1293
def make_contractions(txt: str) -> str:
    """Join adjacent words into their contracted English form.

    Matching is case insensitive and the capitalization of the kept
    letters is preserved.

    >>> make_contractions('It is nice today.')
    "It's nice today."

    >>> make_contractions('I can    not even...')
    "I can't even..."

    >>> make_contractions('She could not see!')
    "She couldn't see!"

    >>> make_contractions('But she will not go.')
    "But she won't go."

    >>> make_contractions('Verily, I shall not.')
    "Verily, I shan't."

    >>> make_contractions('No you cannot.')
    "No you can't."

    >>> make_contractions('I said you can not go.')
    "I said you can't go."

    """

    def contract(pattern: str, template: str, text: str) -> str:
        # Every rewrite is global (count=0) and ignores case.
        return re.sub(pattern, template, text, count=0, flags=re.IGNORECASE)

    # Words that take a trailing "n't".
    negatable_verbs = [
        'are',
        'could',
        'did',
        'has',
        'have',
        'is',
        'must',
        'should',
        'was',
        'were',
        'would',
    ]
    negation = ['(n)o(t)']

    # Words that can absorb a following auxiliary verb.  The groups in
    # each auxiliary pattern mark the letters kept after the apostrophe.
    subjects = [
        "I",
        "you",
        "he",
        "she",
        "it",
        "we",
        "they",
        "how",
        "why",
        "when",
        "where",
        "who",
        "there",
    ]
    auxiliaries = ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)']

    # Irregular forms are rewritten first: can't, shan't and won't.
    txt = contract(r'\b(can)\s*no(t)\b', r"\1'\2", txt)
    txt = contract(r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt)
    txt = contract(r'\b(w)ill\s*(n)(o)(t)\b', r"\1\3\2'\4", txt)

    for heads, tails in ((negatable_verbs, negation), (subjects, auxiliaries)):
        for head in heads:
            for tail in tails:
                # there're / where're are valid English but sound
                # weird, so they are never produced.
                if tail == 'a(re)' and head in ('there', 'where'):
                    continue

                template = r"\1\2'\3" if tail == '(n)o(t)' else r"\1'\2"
                txt = contract(fr'\b({head})\s+{tail}\b', template, txt)

    return txt
1384
1385
def thify(n: int) -> str:
    """Return the proper ordinal suffix ('st', 'nd', 'rd' or 'th') for a
    number.

    Numbers whose tens digit is 1 (11, 12, 13, 111, ...) always take
    'th'.

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'
    >>> thify(21)
    'st'

    """
    digits = str(n)
    assert is_integer_number(digits)
    # The "teens" are irregular: 11th, 12th, 13th rather than 11st etc.
    if digits[-2:-1] == "1":
        return "th"
    return {"1": "st", "2": "nd", "3": "rd"}.get(digits[-1:], "th")
1408
1409
def ngrams(txt: str, n: int):
    """Lazily yield each n-gram of the whitespace-separated words in txt
    as a single space-joined string.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    """
    for gram in ngrams_presplit(txt.split(), n):
        yield ' '.join(f'{word}' for word in gram).strip()
1423
1424
def ngrams_presplit(words: Sequence[str], n: int):
    """Return the n-grams of an already-split sequence of words.

    This simply delegates to list_utils.ngrams(words, n) and returns
    whatever it returns (ngrams() iterates over the result and treats
    each item as an iterable of words).
    """
    return list_utils.ngrams(words, n)
1427
1428
def bigrams(txt: str):
    """Return the 2-grams of txt; shorthand for ngrams(txt, 2)."""
    return ngrams(txt, 2)
1431
1432
def trigrams(txt: str):
    """Return the 3-grams of txt; shorthand for ngrams(txt, 3)."""
    return ngrams(txt, 3)
1435
1436
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.  The column_specs argument is an iterable collection of
    numeric sequences that indicate one or more column numbers to
    copy.

    Args:
        input_lines: the source columns, indexed by the numbers in
            column_specs.
        column_specs: one iterable of column indexes per output item.
        delim: string placed between the columns merged into one output
            item.  Column contents are never trimmed, even when they
            begin or end with characters that appear in delim.

    Returns:
        A list with one joined string per spec, in spec order.

    Raises:
        IndexError: if a spec refers to a column that does not exist.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']

    """
    # Column specs map input lines' columns into outputs.
    # [col1, col2...]
    # str.join puts delim strictly between columns, so there is no
    # leading delimiter to strip off afterwards (stripping would also
    # remove legitimate data characters that happen to be in delim).
    return [delim.join(input_lines[n] for n in spec) for spec in column_specs]
1465
1466
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim='',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: the source columns, indexed by the numbers in
            column_specs.
        column_specs: (key, column indexes) pairs, one per output entry.
        delim: string placed between the columns merged into one value.
            Column contents are never trimmed, even when they begin or
            end with characters that appear in delim.

    Returns:
        A dict mapping each key to its joined columns; when a key is
        repeated the last spec wins.

    Raises:
        IndexError: if a spec refers to a column that does not exist.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}

    """
    # Column specs map input lines' columns into outputs.
    # "key", [col1, col2...]
    # str.join puts delim strictly between columns, so there is no
    # leading delimiter to strip off afterwards (stripping would also
    # remove legitimate data characters that happen to be in delim).
    return {
        spec[0]: delim.join(input_lines[n] for n in spec[1])
        for spec in column_specs
    }
1495
1496
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the {named} placeholders in txt with the matching entries
    of the values dict.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'

    """
    filled = txt.format(**values)
    return filled
1506
1507
def to_ascii(x: str) -> bytes:
    """Encode as ascii bytes string.

    Args:
        x: a str to encode, or a bytes object that is passed through
            unchanged.  Subclasses of str and bytes are accepted too.

    Raises:
        UnicodeEncodeError: if a str contains non-ascii characters.
        TypeError: if x is neither a str nor bytes.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'

    """
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    raise TypeError('to_ascii works with strings and bytes')
1523
1524
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Turn txt into bytes using the given encoding / error policy, then
    base64-encode those bytes with base64.encodebytes (64-character
    alphabet).  This is compatible with uudecode.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
1534
1535
def is_base64(txt: str) -> bool:
    """Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet which is the same as what uuencode uses).

    This is a character-set check only: every character must belong to
    the base64 alphabet, except for up to two trailing '=' padding
    characters and the line breaks that base64.encodebytes inserts.
    Surrounding whitespace is ignored.  The text is not actually decoded.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # Padding is ok.
    True

    """
    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = frozenset(a.encode('ascii'))
    data = to_ascii(txt.strip())

    # Padding may only appear at the very end and is at most two chars.
    unpadded = data.rstrip(b'=')
    if len(data) - len(unpadded) > 2:
        return False

    # encodebytes() wraps its output in lines, so newlines inside the
    # payload are legitimate.
    newline = ord('\n')
    return all(byte in alphabet or byte == newline for byte in unpadded)
1556
1557
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Decode base64 bytes with base64.decodebytes, then decode the
    resulting raw bytes into a str using the given encoding / error
    policy.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    """
    raw = base64.decodebytes(b64)
    return raw.decode(encoding, errors)
1566
1567
def chunk(txt: str, chunk_size: int):
    """Chunk up a string.

    Yields consecutive slices of txt that are chunk_size long.  When the
    length of txt is not a multiple of chunk_size a warning is logged
    and issued via warnings.warn, and the final chunk is shorter.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    """
    if len(txt) % chunk_size != 0:
        msg = (
            f"String to chunk's length ({len(txt)}) is not an even "
            f"multiple of chunk_size ({chunk_size})"
        )
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for start in range(0, len(txt), chunk_size):
        yield txt[start : start + chunk_size]
1581
1582
def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
    """Encode txt and then chop it into bytes.  Note: only bitstrings
    with delimiter='' are interpretable by from_bitstring.

    Args:
        txt: the text to convert; bytes / bytearray input is used as-is.
        delimiter: string placed between the 8-bit groups.
        encoding: encoding used to turn a str into bytes.
        errors: error policy passed to str.encode.

    Returns:
        Eight binary digits per encoded byte, most significant bit
        first.  Empty input yields '00000000'.

    Raises:
        TypeError: if txt is not a str, bytes or bytearray.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    """
    if isinstance(txt, str):
        data = txt.encode(encoding, errors)
    elif isinstance(txt, (bytes, bytearray)):
        data = bytes(txt)
    else:
        raise TypeError('to_bitstring works with strings and bytes')

    # Historical behavior: empty input is rendered as one zero byte.
    if not data:
        return '0' * 8

    # Formatting byte by byte keeps leading zero bytes, which a single
    # big-integer conversion would silently drop.
    return delimiter.join(f'{byte:08b}' for byte in data)
1601
1602
def is_bitstring(txt: str) -> bool:
    """Is this a bitstring?

    The check is delegated to is_binary_integer_number() after
    prefixing txt with '0b'.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    """
    return is_binary_integer_number(f'0b{txt}')
1614
1615
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    Args:
        bits: binary digits, in any form that int(bits, 2) accepts.
        encoding: encoding used to decode the resulting bytes.
        errors: error policy passed to bytes.decode.

    Returns:
        The decoded text.  Whole leading zero bytes in bits are
        preserved, so the output of to_bitstring round-trips.  A value
        that decodes to nothing is returned as a single NUL character.

    Raises:
        ValueError: if bits is not a valid base-2 number.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    """
    n = int(bits, 2)

    # Count the digits actually supplied (ignoring the decorations that
    # int() tolerates) so that full leading zero bytes are not lost.
    digits = bits if isinstance(bits, str) else bytes(bits).decode('ascii')
    digits = digits.strip().lstrip('+-').replace('_', '')
    if digits[:2] in ('0b', '0B'):
        digits = digits[2:]

    nbytes = max((n.bit_length() + 7) // 8, len(digits) // 8)
    return n.to_bytes(nbytes, 'big').decode(encoding, errors) or '\0'
1625
1626
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple for sorting purposes.

    Returns:
        A tuple with the numeric value of each dot-separated part, or
        None (after logging a warning) when is_ip_v4() rejects txt.
        Note that None does not compare against tuples, so sorting a
        collection that contains invalid addresses will fail.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']

    """
    if not is_ip_v4(txt):
        # Report through the module logger, like the other helpers in
        # this file, rather than writing to stdout.
        logger.warning("not IP: %s", txt)
        return None
    return tuple(int(octet) for octet in txt.split('.'))
1642
1643
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Split a file path into a tuple of its non-empty '/'-separated
    components, so that sorting by this key puts parent/ancestor paths
    ahead of their children/descendants.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']

    """
    # filter(None, ...) discards the empty pieces produced by leading,
    # trailing or doubled slashes.
    return tuple(filter(None, volume.split('/')))
1657
1658
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Replace every character found in replace_set with replacement.

    The replacements run one character at a time, in the order the
    characters appear in replace_set, each working on the result of
    the previous one.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'

    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
1670
1671
# When executed as a script, run the doctest examples embedded in this
# module's docstrings.
if __name__ == '__main__':
    import doctest

    doctest.testmod()