string_utils.py

   1 #!/usr/bin/env python3
   2 # -*- coding: utf-8 -*-
   3
   4 """The MIT License (MIT)
   5
   6 Copyright (c) 2016-2020 Davide Zanotti
   7 Modifications Copyright (c) 2021-2022 Scott Gasch
   8
   9 Permission is hereby granted, free of charge, to any person obtaining a copy
  10 of this software and associated documentation files (the "Software"), to deal
  11 in the Software without restriction, including without limitation the rights
  12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  13 copies of the Software, and to permit persons to whom the Software is
  14 furnished to do so, subject to the following conditions:
  15
  16 The above copyright notice and this permission notice shall be included in all
  17 copies or substantial portions of the Software.
  18
  19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  25 SOFTWARE.
  26
  27 This class is based on: https://github.com/daveoncode/python-string-utils.
  28 """
  29
  30 import base64
  31 import contextlib  # type: ignore
  32 import datetime
  33 import io
  34 import json
  35 import logging
  36 import numbers
  37 import random
  38 import re
  39 import string
  40 import unicodedata
  41 import warnings
  42 from itertools import zip_longest
  43 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
  44 from uuid import uuid4
  45
  46 import list_utils
  47
  48 logger = logging.getLogger(__name__)
  49
  50 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  51
  52 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  53
  54 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  55
  56 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  57
  58 URLS_RAW_STRING = (
  59     r"([a-z-]+://)"  # scheme
  60     r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
  61     r"(www\.)?"  # www.
  62     r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
  63     r"(:\d{2,})?"  # port number
  64     r"(/[a-z\d_%+-]*)*"  # folders
  65     r"(\.[a-z\d_%+-]+)*"  # file extension
  66     r"(\?[a-z\d_+%-=]*)?"  # query string
  67     r"(#\S*)?"  # hash
  68 )
  69
  70 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
  71
  72 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
  73
  74 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
  75
  76 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
  77
  78 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
  79
  80 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  81
# Whole-string camel case test: letters only up to and including at
# least one lower->upper or upper->lower transition, then letters/digits.
CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

# Matches a single lowercase letter or a run of uppercase letters that is
# immediately followed by an uppercase letter (lookahead, not consumed).
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Whole-string snake case test using "_" as the separator (case-insensitive).
SNAKE_CASE_TEST_RE = re.compile(r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE)

# Same test with "-" as the separator.  NOTE: there is no leading "^", so
# the pattern is only start-anchored when used with .match().
SNAKE_CASE_TEST_DASH_RE = re.compile(r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE)

# A "_" separator (group 1) followed by a lowercase letter or digit (group 2).
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

# A "-" separator (group 1) followed by a lowercase letter or digit (group 2).
SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
  93
  94 CREDIT_CARDS = {
  95     "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
  96     "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
  97     "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
  98     "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
  99     "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
 100     "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
 101 }
 102
# Cheap pre-check for JSON text: optional whitespace, an opening "[" or
# "{", any content (group 1), a closing "}" or "]", optional whitespace.
# It does not verify that the opening and closing brackets are the same kind.
JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)

# Canonical UUID text: hex groups of 8-4-4-4-12 separated by dashes
# (case-insensitive).
UUID_RE = re.compile(r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE)

# Same groups, but every dash is optional, so the bare 32-digit hex form
# is accepted as well.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)
 111
 112 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
 113
 114 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
 115
 116 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
 117
 118 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
 119
 120 MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
 121
 122 ANYWHERE_MAC_ADDRESS_RE = re.compile(r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE)
 123
# One "word": a run of letters/digits (underscore excluded) together with
# any non-word characters around it.  words_count() counts the matches.
WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)

# An opening tag (optionally namespaced) plus, when present, everything up
# to a closing tag; or an HTML comment; or a doctype declaration.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Tags only (opening, closing, comment, doctype) -- the text between an
# opening and a closing tag is not part of the match.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Any single whitespace character.
SPACES_RE = re.compile(r"\s")

# Runs of characters that are neither letters nor digits, or runs of
# underscores.
NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Leading horizontal whitespace (whitespace other than CR/LF) at the
# start of the string.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 141
 142 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 143
 144 NUM_SUFFIXES = {
 145     "Pb": (1024**5),
 146     "P": (1024**5),
 147     "Tb": (1024**4),
 148     "T": (1024**4),
 149     "Gb": (1024**3),
 150     "G": (1024**3),
 151     "Mb": (1024**2),
 152     "M": (1024**2),
 153     "Kb": (1024**1),
 154     "K": (1024**1),
 155 }
 156
 157
 158 def is_none_or_empty(in_str: Optional[str]) -> bool:
 159     """
 160     Returns true if the input string is either None or an empty string.
 161
 162     >>> is_none_or_empty("")
 163     True
 164     >>> is_none_or_empty(None)
 165     True
 166     >>> is_none_or_empty("   \t   ")
 167     True
 168     >>> is_none_or_empty('Test')
 169     False
 170     """
 171     return in_str is None or len(in_str.strip()) == 0
 172
 173
def is_string(obj: Any) -> bool:
    """
    Checks if an object is a string.

    Args:
        obj: the object to inspect; any type is accepted.

    Returns:
        True if obj is an instance of str, False otherwise.

    >>> is_string('test')
    True
    >>> is_string(123)
    False
    >>> is_string(100.3)
    False
    >>> is_string([1, 2, 3])
    False
    """
    return isinstance(obj, str)
 188
 189
def is_empty_string(in_str: Any) -> bool:
    """
    Alias for is_empty(): the argument is passed straight through and the
    result is returned unchanged.
    """
    return is_empty(in_str)
 192
 193
 194 def is_empty(in_str: Any) -> bool:
 195     """
 196     Checks if input is a string and empty or only whitespace.
 197
 198     >>> is_empty('')
 199     True
 200     >>> is_empty('    \t\t    ')
 201     True
 202     >>> is_empty('test')
 203     False
 204     >>> is_empty(100.88)
 205     False
 206     >>> is_empty([1, 2, 3])
 207     False
 208     """
 209     return is_string(in_str) and in_str.strip() == ""
 210
 211
 212 def is_full_string(in_str: Any) -> bool:
 213     """
 214     Checks that input is a string and is not empty ('') or only whitespace.
 215
 216     >>> is_full_string('test!')
 217     True
 218     >>> is_full_string('')
 219     False
 220     >>> is_full_string('      ')
 221     False
 222     >>> is_full_string(100.999)
 223     False
 224     >>> is_full_string({"a": 1, "b": 2})
 225     False
 226     """
 227     return is_string(in_str) and in_str.strip() != ""
 228
 229
 230 def is_number(in_str: str) -> bool:
 231     """
 232     Checks if a string is a valid number.
 233
 234     >>> is_number(100.5)
 235     Traceback (most recent call last):
 236     ...
 237     ValueError: 100.5
 238     >>> is_number("100.5")
 239     True
 240     >>> is_number("test")
 241     False
 242     >>> is_number("99")
 243     True
 244     >>> is_number([1, 2, 3])
 245     Traceback (most recent call last):
 246     ...
 247     ValueError: [1, 2, 3]
 248     """
 249     if not is_string(in_str):
 250         raise ValueError(in_str)
 251     return NUMBER_RE.match(in_str) is not None
 252
 253
 254 def is_integer_number(in_str: str) -> bool:
 255     """
 256     Checks whether the given string represents an integer or not.
 257
 258     An integer may be signed or unsigned or use a "scientific notation".
 259
 260     >>> is_integer_number('42')
 261     True
 262     >>> is_integer_number('42.0')
 263     False
 264     """
 265     return (
 266         (is_number(in_str) and "." not in in_str)
 267         or is_hexidecimal_integer_number(in_str)
 268         or is_octal_integer_number(in_str)
 269         or is_binary_integer_number(in_str)
 270     )
 271
 272
 273 def is_hexidecimal_integer_number(in_str: str) -> bool:
 274     """
 275     Checks whether a string is a hex integer number.
 276
 277     >>> is_hexidecimal_integer_number('0x12345')
 278     True
 279     >>> is_hexidecimal_integer_number('0x1A3E')
 280     True
 281     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 282     False
 283     >>> is_hexidecimal_integer_number('-0xff')
 284     True
 285     >>> is_hexidecimal_integer_number('test')
 286     False
 287     >>> is_hexidecimal_integer_number(12345)  # Not a string
 288     Traceback (most recent call last):
 289     ...
 290     ValueError: 12345
 291     >>> is_hexidecimal_integer_number(101.4)
 292     Traceback (most recent call last):
 293     ...
 294     ValueError: 101.4
 295     >>> is_hexidecimal_integer_number(0x1A3E)
 296     Traceback (most recent call last):
 297     ...
 298     ValueError: 6718
 299     """
 300     if not is_string(in_str):
 301         raise ValueError(in_str)
 302     return HEX_NUMBER_RE.match(in_str) is not None
 303
 304
 305 def is_octal_integer_number(in_str: str) -> bool:
 306     """
 307     Checks whether a string is an octal number.
 308
 309     >>> is_octal_integer_number('0o777')
 310     True
 311     >>> is_octal_integer_number('-0O115')
 312     True
 313     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 314     False
 315     >>> is_octal_integer_number('7777')  # Needs 0o
 316     False
 317     >>> is_octal_integer_number('test')
 318     False
 319     """
 320     if not is_string(in_str):
 321         raise ValueError(in_str)
 322     return OCT_NUMBER_RE.match(in_str) is not None
 323
 324
 325 def is_binary_integer_number(in_str: str) -> bool:
 326     """
 327     Returns whether a string contains a binary number.
 328
 329     >>> is_binary_integer_number('0b10111')
 330     True
 331     >>> is_binary_integer_number('-0b111')
 332     True
 333     >>> is_binary_integer_number('0B10101')
 334     True
 335     >>> is_binary_integer_number('0b10102')
 336     False
 337     >>> is_binary_integer_number('0xFFF')
 338     False
 339     >>> is_binary_integer_number('test')
 340     False
 341     """
 342     if not is_string(in_str):
 343         raise ValueError(in_str)
 344     return BIN_NUMBER_RE.match(in_str) is not None
 345
 346
 347 def to_int(in_str: str) -> int:
 348     """Returns the integral value of the string or raises on error.
 349
 350     >>> to_int('1234')
 351     1234
 352     >>> to_int('test')
 353     Traceback (most recent call last):
 354     ...
 355     ValueError: invalid literal for int() with base 10: 'test'
 356     """
 357     if not is_string(in_str):
 358         raise ValueError(in_str)
 359     if is_binary_integer_number(in_str):
 360         return int(in_str, 2)
 361     if is_octal_integer_number(in_str):
 362         return int(in_str, 8)
 363     if is_hexidecimal_integer_number(in_str):
 364         return int(in_str, 16)
 365     return int(in_str)
 366
 367
 368 def is_decimal_number(in_str: str) -> bool:
 369     """
 370     Checks whether the given string represents a decimal or not.
 371
 372     A decimal may be signed or unsigned or use a "scientific notation".
 373
 374     >>> is_decimal_number('42.0')
 375     True
 376     >>> is_decimal_number('42')
 377     False
 378     """
 379     return is_number(in_str) and "." in in_str
 380
 381
 382 def strip_escape_sequences(in_str: str) -> str:
 383     """
 384     Remove escape sequences in the input string.
 385
 386     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 387     'this is a test!'
 388     """
 389     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 390     return in_str
 391
 392
 393 def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 394     """
 395     Add thousands separator to a numeric string.  Also handles numbers.
 396
 397     >>> add_thousands_separator('12345678')
 398     '12,345,678'
 399     >>> add_thousands_separator(12345678)
 400     '12,345,678'
 401     >>> add_thousands_separator(12345678.99)
 402     '12,345,678.99'
 403     >>> add_thousands_separator('test')
 404     Traceback (most recent call last):
 405     ...
 406     ValueError: test
 407
 408     """
 409     if isinstance(in_str, numbers.Number):
 410         in_str = f'{in_str}'
 411     if is_number(in_str):
 412         return _add_thousands_separator(in_str, separator_char=separator_char, places=places)
 413     raise ValueError(in_str)
 414
 415
 416 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 417     decimal_part = ""
 418     if '.' in in_str:
 419         (in_str, decimal_part) = in_str.split('.')
 420     tmp = [iter(in_str[::-1])] * places
 421     ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 422     if len(decimal_part) > 0:
 423         ret += '.'
 424         ret += decimal_part
 425     return ret
 426
 427
 428 # Full url example:
 429 # scheme://username:[email protected]:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 430 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 431     """
 432     Check if a string is a valid url.
 433
 434     >>> is_url('http://www.mysite.com')
 435     True
 436     >>> is_url('https://mysite.com')
 437     True
 438     >>> is_url('.mysite.com')
 439     False
 440     """
 441     if not is_full_string(in_str):
 442         return False
 443
 444     valid = URL_RE.match(in_str) is not None
 445
 446     if allowed_schemes:
 447         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 448     return valid
 449
 450
 451 def is_email(in_str: Any) -> bool:
 452     """
 453     Check if a string is a valid email.
 454
 455     Reference: https://tools.ietf.org/html/rfc3696#section-3
 456
 457     >>> is_email('[email protected]')
 458     True
 459     >>> is_email('@gmail.com')
 460     False
 461     """
 462     if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
 463         return False
 464
 465     try:
 466         # we expect 2 tokens, one before "@" and one after, otherwise
 467         # we have an exception and the email is not valid.
 468         head, tail = in_str.split("@")
 469
 470         # head's size must be <= 64, tail <= 255, head must not start
 471         # with a dot or contain multiple consecutive dots.
 472         if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
 473             return False
 474
 475         # removes escaped spaces, so that later on the test regex will
 476         # accept the string.
 477         head = head.replace("\\ ", "")
 478         if head.startswith('"') and head.endswith('"'):
 479             head = head.replace(" ", "")[1:-1]
 480         return EMAIL_RE.match(head + "@" + tail) is not None
 481
 482     except ValueError:
 483         # borderline case in which we have multiple "@" signs but the
 484         # head part is correctly escaped.
 485         if ESCAPED_AT_SIGN.search(in_str) is not None:
 486             # replace "@" with "a" in the head
 487             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 488         return False
 489
 490
 491 def suffix_string_to_number(in_str: str) -> Optional[int]:
 492     """Take a string like "33Gb" and convert it into a number (of bytes)
 493     like 34603008.  Return None if the input string is not valid.
 494
 495     >>> suffix_string_to_number('1Mb')
 496     1048576
 497     >>> suffix_string_to_number('13.1Gb')
 498     14066017894
 499     """
 500
 501     def suffix_capitalize(s: str) -> str:
 502         if len(s) == 1:
 503             return s.upper()
 504         elif len(s) == 2:
 505             return f"{s[0].upper()}{s[1].lower()}"
 506         return suffix_capitalize(s[0:1])
 507
 508     if is_string(in_str):
 509         if is_integer_number(in_str):
 510             return to_int(in_str)
 511         suffixes = [in_str[-2:], in_str[-1:]]
 512         rest = [in_str[:-2], in_str[:-1]]
 513         for x in range(len(suffixes)):
 514             s = suffixes[x]
 515             s = suffix_capitalize(s)
 516             multiplier = NUM_SUFFIXES.get(s, None)
 517             if multiplier is not None:
 518                 r = rest[x]
 519                 if is_integer_number(r):
 520                     return to_int(r) * multiplier
 521                 if is_decimal_number(r):
 522                     return int(float(r) * multiplier)
 523     return None
 524
 525
 526 def number_to_suffix_string(num: int) -> Optional[str]:
 527     """Take a number (of bytes) and returns a string like "43.8Gb".
 528     Returns none if the input is invalid.
 529
 530     >>> number_to_suffix_string(14066017894)
 531     '13.1Gb'
 532     >>> number_to_suffix_string(1024 * 1024)
 533     '1.0Mb'
 534
 535     """
 536     d = 0.0
 537     suffix = None
 538     for (sfx, size) in NUM_SUFFIXES.items():
 539         if num >= size:
 540             d = num / size
 541             suffix = sfx
 542             break
 543     if suffix is not None:
 544         return f"{d:.1f}{suffix}"
 545     else:
 546         return f'{num:d}'
 547
 548
 549 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 550     """
 551     Checks if a string is a valid credit card number.
 552     If card type is provided then it checks against that specific type only,
 553     otherwise any known credit card number will be accepted.
 554
 555     Supported card types are the following:
 556
 557     - VISA
 558     - MASTERCARD
 559     - AMERICAN_EXPRESS
 560     - DINERS_CLUB
 561     - DISCOVER
 562     - JCB
 563     """
 564     if not is_full_string(in_str):
 565         return False
 566
 567     if card_type is not None:
 568         if card_type not in CREDIT_CARDS:
 569             raise KeyError(
 570                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 571             )
 572         return CREDIT_CARDS[card_type].match(in_str) is not None
 573     for c in CREDIT_CARDS:
 574         if CREDIT_CARDS[c].match(in_str) is not None:
 575             return True
 576     return False
 577
 578
 579 def is_camel_case(in_str: Any) -> bool:
 580     """
 581     Checks if a string is formatted as camel case.
 582
 583     A string is considered camel case when:
 584
 585     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 586     - it contains both lowercase and uppercase letters
 587     - it does not start with a number
 588     """
 589     return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 590
 591
 592 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 593     """
 594     Checks if a string is formatted as "snake case".
 595
 596     A string is considered snake case when:
 597
 598     - it's composed only by lowercase/uppercase letters and digits
 599     - it contains at least one underscore (or provided separator)
 600     - it does not start with a number
 601
 602     >>> is_snake_case('this_is_a_test')
 603     True
 604     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 605     True
 606     >>> is_snake_case('this-is-a-test')
 607     False
 608     >>> is_snake_case('this-is-a-test', separator='-')
 609     True
 610
 611     """
 612     if is_full_string(in_str):
 613         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 614         re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 615         r = re_map.get(
 616             separator,
 617             re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
 618         )
 619         return r.match(in_str) is not None
 620     return False
 621
 622
 623 def is_json(in_str: Any) -> bool:
 624     """
 625     Check if a string is a valid json.
 626
 627     >>> is_json('{"name": "Peter"}')
 628     True
 629     >>> is_json('[1, 2, 3]')
 630     True
 631     >>> is_json('{nope}')
 632     False
 633     """
 634     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 635         try:
 636             return isinstance(json.loads(in_str), (dict, list))
 637         except (TypeError, ValueError, OverflowError):
 638             pass
 639     return False
 640
 641
 642 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 643     """
 644     Check if a string is a valid UUID.
 645
 646     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 647     True
 648     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 649     False
 650     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 651     True
 652     """
 653     # string casting is used to allow UUID itself as input data type
 654     s = str(in_str)
 655     if allow_hex:
 656         return UUID_HEX_OK_RE.match(s) is not None
 657     return UUID_RE.match(s) is not None
 658
 659
 660 def is_ip_v4(in_str: Any) -> bool:
 661     """
 662     Checks if a string is a valid ip v4.
 663
 664     >>> is_ip_v4('255.200.100.75')
 665     True
 666     >>> is_ip_v4('nope')
 667     False
 668     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 669     False
 670     """
 671     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 672         return False
 673
 674     # checks that each entry in the ip is in the valid range (0 to 255)
 675     for token in in_str.split("."):
 676         if not 0 <= int(token) <= 255:
 677             return False
 678     return True
 679
 680
 681 def extract_ip_v4(in_str: Any) -> Optional[str]:
 682     """
 683     Extracts the IPv4 chunk of a string or None.
 684
 685     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 686     '127.0.0.1'
 687     >>> extract_ip_v4('Your mom dresses you funny.')
 688     """
 689     if not is_full_string(in_str):
 690         return None
 691     m = ANYWHERE_IP_V4_RE.search(in_str)
 692     if m is not None:
 693         return m.group(0)
 694     return None
 695
 696
 697 def is_ip_v6(in_str: Any) -> bool:
 698     """
 699     Checks if a string is a valid ip v6.
 700
 701     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 702     True
 703     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 704     False
 705     """
 706     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 707
 708
 709 def extract_ip_v6(in_str: Any) -> Optional[str]:
 710     """
 711     Extract IPv6 chunk or None.
 712
 713     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 714     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 715     >>> extract_ip_v6("(and she's ugly too, btw)")
 716     """
 717     if not is_full_string(in_str):
 718         return None
 719     m = ANYWHERE_IP_V6_RE.search(in_str)
 720     if m is not None:
 721         return m.group(0)
 722     return None
 723
 724
 725 def is_ip(in_str: Any) -> bool:
 726     """
 727     Checks if a string is a valid ip (either v4 or v6).
 728
 729     >>> is_ip('255.200.100.75')
 730     True
 731     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
 732     True
 733     >>> is_ip('1.2.3')
 734     False
 735     >>> is_ip('1.2.3.999')
 736     False
 737     """
 738     return is_ip_v6(in_str) or is_ip_v4(in_str)
 739
 740
 741 def extract_ip(in_str: Any) -> Optional[str]:
 742     """
 743     Extract the IP address or None.
 744
 745     >>> extract_ip('Attacker: 255.200.100.75')
 746     '255.200.100.75'
 747     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 748     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 749     >>> extract_ip('1.2.3')
 750
 751     """
 752     ip = extract_ip_v4(in_str)
 753     if ip is None:
 754         ip = extract_ip_v6(in_str)
 755     return ip
 756
 757
 758 def is_mac_address(in_str: Any) -> bool:
 759     """Return True if in_str is a valid MAC address false otherwise.
 760
 761     >>> is_mac_address("34:29:8F:12:0D:2F")
 762     True
 763     >>> is_mac_address('34:29:8f:12:0d:2f')
 764     True
 765     >>> is_mac_address('34-29-8F-12-0D-2F')
 766     True
 767     >>> is_mac_address("test")
 768     False
 769     """
 770     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 771
 772
 773 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 774     """
 775     Extract the MAC address from in_str.
 776
 777     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
 778     '34:29:8F:12:0D:2F'
 779
 780     >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
 781     'd8:5d:e2:34:54:86'
 782
 783     """
 784     if not is_full_string(in_str):
 785         return None
 786     in_str.strip()
 787     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
 788     if m is not None:
 789         mac = m.group(0)
 790         mac.replace(":", separator)
 791         mac.replace("-", separator)
 792         return mac
 793     return None
 794
 795
 796 def is_slug(in_str: Any, separator: str = "-") -> bool:
 797     """
 798     Checks if a given string is a slug (as created by `slugify()`).
 799
 800     >>> is_slug('my-blog-post-title')
 801     True
 802     >>> is_slug('My blog post title')
 803     False
 804
 805     """
 806     if not is_full_string(in_str):
 807         return False
 808     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 809     return re.match(rex, in_str) is not None
 810
 811
 812 def contains_html(in_str: str) -> bool:
 813     """
 814     Checks if the given string contains HTML/XML tags.
 815
 816     By design, this function matches ANY type of tag, so don't expect to use it
 817     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 818
 819     >>> contains_html('my string is <strong>bold</strong>')
 820     True
 821     >>> contains_html('my string is not bold')
 822     False
 823
 824     """
 825     if not is_string(in_str):
 826         raise ValueError(in_str)
 827     return HTML_RE.search(in_str) is not None
 828
 829
 830 def words_count(in_str: str) -> int:
 831     """
 832     Returns the number of words contained into the given string.
 833
 834     This method is smart, it does consider only sequence of one or more letter and/or numbers
 835     as "words", so a string like this: "! @ # % ... []" will return zero!
 836     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 837     will be 4 not 1 (even if there are no spaces in the string).
 838
 839     >>> words_count('hello world')
 840     2
 841     >>> words_count('one,two,three.stop')
 842     4
 843
 844     """
 845     if not is_string(in_str):
 846         raise ValueError(in_str)
 847     return len(WORDS_COUNT_RE.findall(in_str))
 848
 849
 850 def generate_uuid(omit_dashes: bool = False) -> str:
 851     """
 852     Generated an UUID string (using `uuid.uuid4()`).
 853
 854     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 855     generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 856
 857     """
 858     uid = uuid4()
 859     if omit_dashes:
 860         return uid.hex
 861     return str(uid)
 862
 863
 864 def generate_random_alphanumeric_string(size: int) -> str:
 865     """
 866     Returns a string of the specified size containing random
 867     characters (uppercase/lowercase ascii letters and digits).
 868
 869     random_string(9) # possible output: "cx3QQbzYg"
 870
 871     """
 872     if size < 1:
 873         raise ValueError("size must be >= 1")
 874     chars = string.ascii_letters + string.digits
 875     buffer = [random.choice(chars) for _ in range(size)]
 876     return from_char_list(buffer)
 877
 878
 879 def reverse(in_str: str) -> str:
 880     """
 881     Returns the string with its chars reversed.
 882
 883     >>> reverse('test')
 884     'tset'
 885
 886     """
 887     if not is_string(in_str):
 888         raise ValueError(in_str)
 889     return in_str[::-1]
 890
 891
 892 def camel_case_to_snake_case(in_str, *, separator="_"):
 893     """
 894     Convert a camel case string into a snake case one.
 895     (The original string is returned if is not a valid camel case string)
 896
 897     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
 898     'mac_address_extractor_factory'
 899     >>> camel_case_to_snake_case('Luke Skywalker')
 900     'Luke Skywalker'
 901     """
 902     if not is_string(in_str):
 903         raise ValueError(in_str)
 904     if not is_camel_case(in_str):
 905         return in_str
 906     return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower()
 907
 908
 909 def snake_case_to_camel_case(
 910     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 911 ) -> str:
 912     """
 913     Convert a snake case string into a camel case one.
 914     (The original string is returned if is not a valid snake case string)
 915
 916     >>> snake_case_to_camel_case('this_is_a_test')
 917     'ThisIsATest'
 918     >>> snake_case_to_camel_case('Han Solo')
 919     'Han Solo'
 920     """
 921     if not is_string(in_str):
 922         raise ValueError(in_str)
 923     if not is_snake_case(in_str, separator=separator):
 924         return in_str
 925     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 926     if not upper_case_first:
 927         tokens[0] = tokens[0].lower()
 928     return from_char_list(tokens)
 929
 930
 931 def to_char_list(in_str: str) -> List[str]:
 932     """Convert a string into a list of chars.
 933
 934     >>> to_char_list('test')
 935     ['t', 'e', 's', 't']
 936     """
 937     if not is_string(in_str):
 938         return []
 939     return list(in_str)
 940
 941
def from_char_list(in_list: List[str]) -> str:
    """Convert a char list into a string.

    Args:
        in_list: the strings to concatenate, in order.

    Returns:
        All elements joined together with no separator.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    return "".join(in_list)
 949
 950
 951 def shuffle(in_str: str) -> str:
 952     """Return a new string containing same chars of the given one but in
 953     a randomized order.
 954     """
 955     if not is_string(in_str):
 956         raise ValueError(in_str)
 957
 958     # turn the string into a list of chars
 959     chars = to_char_list(in_str)
 960     random.shuffle(chars)
 961     return from_char_list(chars)
 962
 963
 964 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 965     """
 966     Remove html code contained into the given string.
 967
 968     >>> strip_html('test: <a href="foo/bar">click here</a>')
 969     'test: '
 970     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
 971     'test: click here'
 972     """
 973     if not is_string(in_str):
 974         raise ValueError(in_str)
 975     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
 976     return r.sub("", in_str)
 977
 978
 979 def asciify(in_str: str) -> str:
 980     """
 981     Force string content to be ascii-only by translating all non-ascii
 982     chars into the closest possible representation (eg: ó -> o, Ë ->
 983     E, ç -> c...).
 984
 985     N.B. Some chars may be lost if impossible to translate.
 986
 987     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
 988     'eeuuooaaeynAAACIINOE'
 989     """
 990     if not is_string(in_str):
 991         raise ValueError(in_str)
 992
 993     # "NFKD" is the algorithm which is able to successfully translate
 994     # the most of non-ascii chars.
 995     normalized = unicodedata.normalize("NFKD", in_str)
 996
 997     # encode string forcing ascii and ignore any errors
 998     # (unrepresentable chars will be stripped out)
 999     ascii_bytes = normalized.encode("ascii", "ignore")
1000
1001     # turns encoded bytes into an utf-8 string
1002     return ascii_bytes.decode("utf-8")
1003
1004
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Turn a string into a "slug" whose words are joined by separator.

    The resulting string:

    - contains no spaces
    - is entirely lower case
    - has punctuation and other non alphanumeric characters removed
    - uses the given separator between words
    - is ascii-only (courtesy of `asciify()`)
    - is safe to embed in a URL

    Raises ValueError when in_str is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Everything that is not a letter or a digit becomes a space, then
    # the ends are trimmed.
    lowered = in_str.lower()
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", lowered).strip()

    # Spaces turn into the separator...
    joined = SPACES_RE.sub(separator, spaced)

    # ...and runs of consecutive separators collapse into a single one.
    separator_run = re.escape(separator) + r"+"
    return asciify(re.sub(separator_run, separator, joined))
1034
1035
def to_bool(in_str: str) -> bool:
    """
    Interpret a string as a boolean (CASE INSENSITIVE).

    True is returned when the lowercased string is exactly one of:

    - "true"
    - "t"
    - "1"
    - "yes"
    - "y"
    - "on"

    Any other string yields False.  A ValueError is raised when in_str
    is not a string.

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    truthy_words = ("true", "1", "yes", "y", "t", "on")
    return in_str.lower() in truthy_words
1072
1073
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Parse a date string with dateparse_utils.DateParser (see its docs for
    the accepted formats) and return the parser's get_date() result.

    When the parser raises ParseException a warning is logged and None
    is returned instead.
    """
    import dateparse.dateparse_utils as du

    try:
        parser = du.DateParser()  # type: ignore
        parser.parse(in_str)
        return parser.get_date()
    except du.ParseException:  # type: ignore
        logger.warning(f'Unable to parse date {in_str}.')
        return None
1088
1089
def valid_date(in_str: str) -> bool:
    """
    Report whether in_str can be parsed as a date.

    The string is handed to dateparse_utils.DateParser.parse(); True is
    returned when that call completes.  When it raises ParseException a
    warning is logged and False is returned.
    """
    # Importing the submodule is sufficient; a separate bare
    # "import dateparse" bound a name that was never used.
    import dateparse.dateparse_utils as dp

    try:
        parser = dp.DateParser()  # type: ignore
        parser.parse(in_str)
    except dp.ParseException:  # type: ignore
        msg = f'Unable to parse date {in_str}.'
        logger.warning(msg)
        return False
    return True
1105
1106
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Parse a datetime string with dateparse_utils.DateParser (see its docs
    for the accepted formats).

    Returns the parsed datetime.datetime, or None when the parser fails
    or hands back something that is not a datetime.  A parse failure is
    logged as a warning.
    """
    import dateparse.dateparse_utils as dp

    try:
        parser = dp.DateParser()  # type: ignore
        parsed = parser.parse(in_str)
    except (ValueError, dp.ParseException):  # type: ignore
        # to_date() and valid_date() in this module treat
        # dp.ParseException as the parser's failure signal for this very
        # same parse() call; catch it here too (alongside the ValueError
        # that was always handled) so unparseable input yields None
        # rather than an exception escaping to the caller.
        msg = f'Unable to parse datetime {in_str}.'
        logger.warning(msg)
        return None

    if isinstance(parsed, datetime.datetime):
        return parsed
    return None
1122
1123
def valid_datetime(in_str: str) -> bool:
    """
    Report whether in_str can be parsed as a datetime, i.e. whether
    to_datetime() returns something other than None for it.

    A warning is logged when it cannot.
    """
    parsed = to_datetime(in_str)
    if parsed is None:
        msg = f'Unable to parse datetime {in_str}.'
        logger.warning(msg)
        return False
    return True
1134
1135
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Squeeze runs of more than one character_to_squeeze into one.

    character_to_squeeze may be a multi-character string; it is always
    treated literally, both when searching and when substituting.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    """
    run_of_targets = r'(' + re.escape(character_to_squeeze) + r')+'

    # Use a callable replacement: a plain replacement *string* is run
    # through re.sub's template processing, so a backslash inside
    # character_to_squeeze would raise re.error or be expanded as an
    # escape.  The value returned by a callable is inserted verbatim.
    return re.sub(run_of_targets, lambda _match: character_to_squeeze, in_str)
1152
1153
def dedent(in_str: str) -> str:
    """
    Remove the leading margin (whatever MARGIN_RE matches) from every
    line of a multi line string (inspired by the analogous Scala function).

    Raises ValueError when in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    trimmed_lines = []
    for raw_line in in_str.split(newline):
        trimmed_lines.append(MARGIN_RE.sub('', raw_line))
    return newline.join(trimmed_lines)
1163
1164
def indent(in_str: str, amount: int) -> str:
    """
    Indent a string by putting amount spaces in front of each of its
    newline-separated lines.

    Raises ValueError when in_str is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    padding = " " * amount
    return '\n'.join(padding + line for line in in_str.split('\n'))
1178
1179
def sprintf(*args, **kwargs) -> str:
    """Build the text that print() would emit, but return it as a string.

    Positional arguments are converted with str() and joined with sep;
    end is appended afterwards.

    Keyword arguments:
        sep: string placed between arguments (None or omitted means " ").
        end: string appended after the last argument (None or omitted
            means a newline).

    Raises:
        TypeError: if sep or end is neither None nor a string, or if any
            other keyword argument is supplied.

    >>> sprintf('a', 1, sep='-', end='')
    'a-1'
    >>> sprintf('hello', 'world')
    'hello world\\n'
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # Anything still left in kwargs was not a recognized option.
    if kwargs:
        raise TypeError("invalid keyword arguments to sprintf()")

    if sep is None:
        sep = " "
    if end is None:
        end = "\n"

    # A single join avoids rebuilding the result string for every arg.
    pieces = [arg if isinstance(arg, str) else str(arg) for arg in args]
    return sep.join(pieces) + end
1210
1211
class SprintfStdout(object):
    """
    Context manager that records whatever is written to stdout while the
    with-block runs.  The value bound by "as" is a zero-argument callable
    returning the text captured so far.

    with SprintfStdout() as buf:
        print("test")
    print(buf())

    'test\n'
    """

    def __init__(self) -> None:
        # In-memory buffer that receives the redirected output.
        self.destination = io.StringIO()
        # The active redirect; only assigned once __enter__ runs.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *args) -> None:
        # Undo the redirection, then rewind the buffer.
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Falling off the end returns None, so exceptions raised inside
        # the with-block are not suppressed.
1236
1237
def capitalize_first_letter(txt: str) -> str:
    """Capitalize the first letter of a string, leaving the rest as is.

    An empty string is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''

    """
    # Slicing (unlike txt[0]) does not raise IndexError on empty input.
    return txt[:1].upper() + txt[1:]
1248
1249
def it_they(n: int) -> str:
    """Pick the pronoun matching a count: 'it' for exactly one, 'they'
    for anything else.

    >>> it_they(1)
    'it'
    >>> it_they(100)
    'they'

    """
    return "it" if n == 1 else "they"
1262
1263
def is_are(n: int) -> str:
    """Pick the verb matching a count: 'is' for exactly one, 'are' for
    anything else.

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
1276
1277
def pluralize(n: int) -> str:
    """Return the plural suffix for a count: '' for exactly one, 's' for
    anything else.

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.

    """
    return "" if n == 1 else "s"
1294
1295
def make_contractions(txt: str) -> str:
    """Join adjacent words into their English contractions.

    Matching is case insensitive and the casing of the matched text is
    kept.

    >>> make_contractions('It is nice today.')
    "It's nice today."

    >>> make_contractions('I can    not even...')
    "I can't even..."

    >>> make_contractions('She could not see!')
    "She couldn't see!"

    >>> make_contractions('But she will not go.')
    "But she won't go."

    >>> make_contractions('Verily, I shall not.')
    "Verily, I shan't."

    >>> make_contractions('No you cannot.')
    "No you can't."

    >>> make_contractions('I said you can not go.')
    "I said you can't go."

    """
    # Irregular negations whose spelling changes: can't, shan't, won't.
    # These run first, before the regular rules below.
    irregular_rules = (
        (r'\b(can)\s*no(t)\b', r"\1'\2"),
        (r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3"),
        (r'\b(w)ill\s*(n)(o)(t)\b', r"\1\3\2'\4"),
    )
    for regex, template in irregular_rules:
        txt = re.sub(regex, template, txt, flags=re.IGNORECASE)

    # Verb + "not" -> verb + "n't".
    negatable_verbs = (
        'are',
        'could',
        'did',
        'has',
        'have',
        'is',
        'must',
        'should',
        'was',
        'were',
        'would',
    )
    for verb in negatable_verbs:
        txt = re.sub(
            fr'\b({verb})\s+(n)o(t)\b',
            r"\1\2'\3",
            txt,
            flags=re.IGNORECASE,
        )

    # Subject + auxiliary -> subject + "'" + the auxiliary's captured tail
    # (e.g. "it is" -> "it's", "we will" -> "we'll").
    subjects = (
        "I",
        "you",
        "he",
        "she",
        "it",
        "we",
        "they",
        "how",
        "why",
        "when",
        "where",
        "who",
        "there",
    )
    auxiliaries = ('woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)')
    for subject in subjects:
        for auxiliary in auxiliaries:
            # Disallow there're/where're.  They're valid English but
            # sound weird.
            if auxiliary == 'a(re)' and subject in ('there', 'where'):
                continue
            txt = re.sub(
                fr'\b({subject})\s+{auxiliary}\b',
                r"\1'\2",
                txt,
                flags=re.IGNORECASE,
            )

    return txt
1386
1387
def thify(n: int) -> str:
    """Return the proper ordinal suffix ('st', 'nd', 'rd' or 'th') for a
    number.

    Numbers ending in 11, 12 or 13 take 'th' (11th, 112th), not the
    suffix their last digit would otherwise suggest.

    An AssertionError is raised when str(n) is rejected by
    is_integer_number().

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'
    >>> thify(22)
    'nd'

    """
    digits = str(n)
    assert is_integer_number(digits)

    # The "teens" are irregular: 11th, 12th, 13th (and 111th, 212th...).
    if digits[-2:] in ("11", "12", "13"):
        return "th"

    suffix_by_last_digit = {"1": "st", "2": "nd", "3": "rd"}
    return suffix_by_last_digit.get(digits[-1:], "th")
1410
1411
def ngrams(txt: str, n: int):
    """Generate the word n-grams of a string.

    The text is split on whitespace and each group of words produced by
    ngrams_presplit() is yielded as one space-joined string.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    """
    tokens = txt.split()
    for gram in ngrams_presplit(tokens, n):
        yield ' '.join(gram)
1425
1426
def ngrams_presplit(words: Sequence[str], n: int):
    """Compute n-grams over a sequence of words that has already been
    split, by delegating to list_utils.ngrams(words, n).

    NOTE(review): presumably each result is a run of n consecutive
    words -- confirm against list_utils.ngrams.
    """
    return list_utils.ngrams(words, n)
1429
1430
def bigrams(txt: str):
    """Return the 2-grams of txt; shorthand for ngrams(txt, 2)."""
    return ngrams(txt, 2)
1433
1434
def trigrams(txt: str):
    """Return the 3-grams of txt; shorthand for ngrams(txt, 3)."""
    return ngrams(txt, 3)
1437
1438
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.  The column_specs argument is an iterable collection of
    numeric sequences that indicate one or more column numbers to
    copy.  The columns named by each spec are joined with delim to form
    one output element; the column text itself is never altered.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']

    """
    # Column specs map input lines' columns into outputs.
    # [col1, col2...]
    #
    # str.join places delim only *between* columns, so nothing has to be
    # stripped afterwards.  Stripping delim characters off the finished
    # chunk would also eat characters that belong to the data (e.g. the
    # leading '-' of '-rwxr-xr-x' when delim is '-').
    return [delim.join(input_lines[n] for n in spec) for spec in column_specs]
1467
1468
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim='',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.  Each spec is a (key, column numbers) pair; the named
    columns are joined with delim and stored under key.  The column
    text itself is never altered.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}

    """
    # Column specs map input lines' columns into outputs.
    # "key", [col1, col2...]
    #
    # str.join places delim only *between* columns, so nothing has to be
    # stripped afterwards.  Stripping delim characters off the finished
    # chunk would also eat characters that belong to the data (e.g. the
    # leading '-' of '-rwxr-xr-x' when delim is '-').
    return {
        spec[0]: delim.join(input_lines[n] for n in spec[1])
        for spec in column_specs
    }
1497
1498
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the {name} placeholders of txt (str.format syntax) with the
    corresponding entries of values.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'

    """
    filled_in = txt.format(**values)
    return filled_in
1508
1509
def to_ascii(x: str) -> bytes:
    """Encode as ascii bytes string.

    A str (or str subclass) is encoded with the 'ascii' codec; bytes (or
    a bytes subclass) are returned unchanged.

    Raises:
        TypeError: if x is neither str nor bytes.
        UnicodeEncodeError: if a str contains non-ASCII characters.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'

    """
    # isinstance (rather than an exact type() comparison) lets subclasses
    # of str / bytes through as well.
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    # TypeError is a subclass of Exception, so existing handlers that
    # catch Exception still work.
    raise TypeError('to_ascii works with strings and bytes')
1525
1526
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode txt to bytes (using encoding / errors) and then base64
    encode those bytes with base64.encodebytes.  This is compatible with
    uudecode.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
1536
1537
def is_base64(txt: str) -> bool:
    """Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet which is the same as what uuencode uses).

    Accepts str or bytes.  Surrounding whitespace and embedded line
    breaks are ignored, and up to two trailing '=' padding characters
    are allowed, so the output of base64.encodebytes is recognized.
    Text containing non-ASCII characters is simply not base64 (False).

    Raises:
        TypeError: if txt is neither str nor bytes.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # So is padding.
    True

    """
    stripped = txt.strip()
    if isinstance(stripped, str):
        try:
            raw = stripped.encode('ascii')
        except UnicodeEncodeError:
            # Non-ASCII characters can never be part of base64 text.
            return False
    elif isinstance(stripped, bytes):
        raw = stripped
    else:
        raise TypeError('is_base64 works with strings and bytes')

    # Encoders such as base64.encodebytes break long output into lines;
    # those line breaks carry no data.
    raw = raw.translate(None, b'\r\n')

    # '=' is only legal as trailing padding: at most two of them, and
    # only after some actual payload.
    payload = raw.rstrip(b'=')
    padding = len(raw) - len(payload)
    if padding > 2 or (padding and not payload):
        return False

    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = frozenset(a.encode('ascii'))
    return all(byte in alphabet for byte in payload)
1558
1559
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Decode base64 encoded bytes (via base64.decodebytes) and then
    decode the result into a normal string using encoding / errors.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    """
    raw = base64.decodebytes(b64)
    return raw.decode(encoding, errors)
1568
1569
def chunk(txt: str, chunk_size: int):
    """Chunk up a string: yield consecutive slices of txt, each
    chunk_size characters long.

    When len(txt) is not a multiple of chunk_size the final chunk is
    shorter, and a warning is both logged and issued through the
    warnings module.  This is a generator, so that check happens when
    iteration starts, not when chunk() is called.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    """
    if len(txt) % chunk_size != 0:
        msg = f'String to chunk\'s length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for start in range(0, len(txt), chunk_size):
        yield txt[start : start + chunk_size]
1583
1584
def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
    """Encode txt and then chop it into bytes.  Note: only bitstrings
    with delimiter='' are interpretable by from_bitstring.

    A str is encoded using encoding / errors; bytes are used as given.
    Every byte becomes exactly eight binary digits (so leading zero
    bytes are kept), and the bytes are joined with delimiter.

    Raises:
        TypeError: if txt is neither str nor bytes.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    """
    if isinstance(txt, str):
        # Honor the encoding / errors arguments instead of insisting on
        # pure ASCII input.
        raw = txt.encode(encoding, errors)
    elif isinstance(txt, bytes):
        raw = txt
    else:
        raise TypeError('to_bitstring works with strings and bytes')

    # Empty input has always been rendered as a single zero byte
    # ('00000000'); keep that for backward compatibility.
    return delimiter.join(f'{byte:08b}' for byte in (raw or b'\0'))
1603
1604
def is_bitstring(txt: str) -> bool:
    """Is this a bitstring?  Decided by prefixing txt with '0b' and asking
    is_binary_integer_number() about the result.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    """
    prefixed = f'0b{txt}'
    return is_binary_integer_number(prefixed)
1616
1617
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    Leading groups of eight zero bits are preserved as NUL bytes rather
    than being dropped.  A bitstring whose value is zero and which is
    shorter than eight digits decodes to a single NUL character.

    Raises:
        ValueError: if bits is not a valid base-2 integer literal.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    """
    n = int(bits, 2)

    # int() discards leading zeros, so the integer's bit_length alone
    # under-counts the bytes whenever the text starts with zero bytes.
    # Count the binary digits actually supplied and use whichever byte
    # count is larger.  Floor division keeps a '0b' prefix (whose '0' is
    # counted as a digit) from adding a spurious byte.
    if isinstance(bits, (bytes, bytearray)):
        text = bits.decode('ascii', 'replace')
    else:
        text = bits
    digit_count = text.count('0') + text.count('1')
    byte_count = max((n.bit_length() + 7) // 8, digit_count // 8)

    return n.to_bytes(byte_count, 'big').decode(encoding, errors) or '\0'
1627
1628
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple for sorting purposes.

    Returns a tuple holding the integer value of each dot-separated
    field, or None (after logging a warning) when is_ip_v4() rejects
    txt.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']

    """
    if not is_ip_v4(txt):
        # Report through the module logger, like the other helpers in
        # this file, rather than writing to stdout from library code.
        logger.warning(f"not IP: {txt}")
        return None
    return tuple(int(octet) for octet in txt.split('.'))
1644
1645
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Break a file path into a tuple of its non-empty '/'-separated
    components, so that parent/ancestor paths sort before their
    children/descendants.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']

    """
    components = volume.split('/')
    # Leading, trailing or doubled slashes produce empty pieces; drop them.
    return tuple(part for part in components if part)
1659
1660
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Run one str.replace per character of replace_set, in order, each
    time substituting replacement, and return the final string.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'

    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
1672
1673
if __name__ == '__main__':
    # When this file is executed directly as a script, run every doctest
    # embedded in the module's docstrings.
    import doctest

    doctest.testmod()