string_utils.py

   1 #!/usr/bin/env python3
   2 # -*- coding: utf-8 -*-
   3
   4 """The MIT License (MIT)
   5
   6 Copyright (c) 2016-2020 Davide Zanotti
   7 Modifications Copyright (c) 2021-2022 Scott Gasch
   8
   9 Permission is hereby granted, free of charge, to any person obtaining a copy
  10 of this software and associated documentation files (the "Software"), to deal
  11 in the Software without restriction, including without limitation the rights
  12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  13 copies of the Software, and to permit persons to whom the Software is
  14 furnished to do so, subject to the following conditions:
  15
  16 The above copyright notice and this permission notice shall be included in all
  17 copies or substantial portions of the Software.
  18
  19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  25 SOFTWARE.
  26
  27 This class is based on: https://github.com/daveoncode/python-string-utils.
  28 """
  29
  30 import base64
  31 import contextlib  # type: ignore
  32 import datetime
  33 import io
  34 import json
  35 import logging
  36 import numbers
  37 import random
  38 import re
  39 import string
  40 import unicodedata
  41 import warnings
  42 from itertools import zip_longest
  43 from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple
  44 from uuid import uuid4
  45
  46 import list_utils
  47
  48 logger = logging.getLogger(__name__)
  49
  50 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  51
  52 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  53
  54 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  55
  56 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  57
# URL grammar, written as adjacent raw-string fragments that Python
# concatenates into a single pattern.  Every fragment is one capturing
# group, so group numbers follow the order of the fragments below.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    # NOTE(review): inside the class below "%-=" is parsed as a character
    # range from "%" to "=", which also covers "&", ".", "/" and ":" among
    # others.  Query strings with several parameters rely on "&" being
    # accepted, so confirm the intent before "fixing" the range.
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Anchored form: the whole string must be a single URL (case-insensitive).
URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

# Unanchored form wrapped in one extra outer group: matches a URL anywhere
# inside a larger text.
URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)

# Matches either a run of "@" that is followed later in the string by a
# double quote, or a backslash-escaped "@".  is_email() uses it to neutralise
# extra at-signs in the local part before re-validating.
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# Local part (letters, digits and a set of punctuation), "@", then a domain
# ending in a 2-4 letter suffix.
# NOTE(review): the domain classes are lowercase-only and the patterns below
# are compiled without re.IGNORECASE, so upper-case domains do not match.
EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"

# Anchored form: the whole string must be one e-mail address.
EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))

# Unanchored form with one outer group: matches an address inside other text.
EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  81
# Whole-string camel case test: optional letters, at least one
# lowercase/uppercase (or uppercase/lowercase) transition, then letters and
# digits only.
CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

# Matches a single lowercase letter, or a run of capitals, that is directly
# followed by a capital letter; camel_case_to_snake_case() appends the
# separator after each such match.
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Whole-string snake case test with "_" as separator (case-insensitive):
# either letters, optional digits, an underscore and anything alphanumeric /
# underscore after it, or leading underscores followed by alphanumerics.
SNAKE_CASE_TEST_RE = re.compile(r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE)

# Same test with "-" as separator.
# NOTE(review): unlike SNAKE_CASE_TEST_RE this pattern has no leading "^";
# it is only used through .match(), which anchors at the start anyway.
SNAKE_CASE_TEST_DASH_RE = re.compile(r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE)

# An underscore followed by one lowercase letter or digit.
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

# A dash followed by one lowercase letter or digit.
SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")

# Per-brand card number patterns (digits only, no spaces or dashes).  The
# keys are the values accepted by is_credit_card()'s card_type argument.
# These check prefix and length only; no checksum is computed here.
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),  # 4..., 13 or 16 digits
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),  # 51-55..., 16 digits
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),  # 34/37..., 15 digits
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),  # 300-305/36/38..., 14 digits
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),  # 6011/65..., 16 digits
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),  # 2131/1800 (15 digits) or 35... (16)
}
 102
# Cheap pre-check used before json.loads(): the text, ignoring surrounding
# whitespace, starts with "[" or "{" and ends with "}" or "]".  It does not
# verify that the opening and closing brackets are of the same kind.
JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)

# Canonical 8-4-4-4-12 UUID text form with mandatory dashes.
UUID_RE = re.compile(r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE)

# Same layout but every dash is optional, so the 32-character hex form is
# accepted as well.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# Shape-only IPv4 test (four dot-separated groups of 1-3 digits); the 0-255
# range check is done separately in is_ip_v4().
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

# Unanchored variant for locating an IPv4-shaped token inside other text.
ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# Eight colon-separated groups of 0-4 characters (exactly seven ":").
# NOTE(review): the groups use [a-z\d], i.e. any letter rather than only the
# hex digits a-f; confirm whether that leniency is intended.
IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

# Unanchored variant for locating an IPv6-shaped token inside other text.
ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)

# Six two-digit hex groups separated by ":" or "-" (separators may be mixed).
MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)

# Unanchored variant for locating a MAC address inside other text.
ANYWHERE_MAC_ADDRESS_RE = re.compile(r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE)
 123
# One "word": a run of characters that are neither non-word characters nor
# underscores, together with any non-word characters around it.  words_count()
# counts the non-overlapping matches.
WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)

# An opening tag (optionally namespaced) together with, when present, the
# content up to the first closing tag; or an HTML comment; or a doctype
# declaration.  Used to detect markup and to strip tags including content.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Tags only (opening, closing, comments, doctype) without the content between
# them; used to strip markup while keeping the enclosed text.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Any single whitespace character.
SPACES_RE = re.compile(r"\s")

# A run of characters that are not word characters, or a run of underscores.
NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Leading whitespace other than CR/LF.  Compiled without re.MULTILINE, so "^"
# only matches at the very start of the string.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 141
 142 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 143
# Size suffix -> multiplier in bytes, using powers of 1024.  Each magnitude
# has a two-letter and a one-letter spelling.
# The entries are listed from largest to smallest on purpose:
# number_to_suffix_string() walks this dict in insertion order and stops at
# the first multiplier that fits, so reordering would change its output.
NUM_SUFFIXES = {
    "Pb": (1024**5),
    "P": (1024**5),
    "Tb": (1024**4),
    "T": (1024**4),
    "Gb": (1024**3),
    "G": (1024**3),
    "Mb": (1024**2),
    "M": (1024**2),
    "Kb": (1024**1),
    "K": (1024**1),
}
 156
 157
 158 def is_none_or_empty(in_str: Optional[str]) -> bool:
 159     """
 160     Returns true if the input string is either None or an empty string.
 161
 162     >>> is_none_or_empty("")
 163     True
 164     >>> is_none_or_empty(None)
 165     True
 166     >>> is_none_or_empty("   \t   ")
 167     True
 168     >>> is_none_or_empty('Test')
 169     False
 170     """
 171     return in_str is None or len(in_str.strip()) == 0
 172
 173
 174 def is_string(obj: Any) -> bool:
 175     """
 176     Checks if an object is a string.
 177
 178     >>> is_string('test')
 179     True
 180     >>> is_string(123)
 181     False
 182     >>> is_string(100.3)
 183     False
 184     >>> is_string([1, 2, 3])
 185     False
 186     """
 187     return isinstance(obj, str)
 188
 189
 190 def is_empty_string(in_str: Any) -> bool:
 191     return is_empty(in_str)
 192
 193
 194 def is_empty(in_str: Any) -> bool:
 195     """
 196     Checks if input is a string and empty or only whitespace.
 197
 198     >>> is_empty('')
 199     True
 200     >>> is_empty('    \t\t    ')
 201     True
 202     >>> is_empty('test')
 203     False
 204     >>> is_empty(100.88)
 205     False
 206     >>> is_empty([1, 2, 3])
 207     False
 208     """
 209     return is_string(in_str) and in_str.strip() == ""
 210
 211
 212 def is_full_string(in_str: Any) -> bool:
 213     """
 214     Checks that input is a string and is not empty ('') or only whitespace.
 215
 216     >>> is_full_string('test!')
 217     True
 218     >>> is_full_string('')
 219     False
 220     >>> is_full_string('      ')
 221     False
 222     >>> is_full_string(100.999)
 223     False
 224     >>> is_full_string({"a": 1, "b": 2})
 225     False
 226     """
 227     return is_string(in_str) and in_str.strip() != ""
 228
 229
 230 def is_number(in_str: str) -> bool:
 231     """
 232     Checks if a string is a valid number.
 233
 234     >>> is_number(100.5)
 235     Traceback (most recent call last):
 236     ...
 237     ValueError: 100.5
 238     >>> is_number("100.5")
 239     True
 240     >>> is_number("test")
 241     False
 242     >>> is_number("99")
 243     True
 244     >>> is_number([1, 2, 3])
 245     Traceback (most recent call last):
 246     ...
 247     ValueError: [1, 2, 3]
 248     """
 249     if not is_string(in_str):
 250         raise ValueError(in_str)
 251     return NUMBER_RE.match(in_str) is not None
 252
 253
 254 def is_integer_number(in_str: str) -> bool:
 255     """
 256     Checks whether the given string represents an integer or not.
 257
 258     An integer may be signed or unsigned or use a "scientific notation".
 259
 260     >>> is_integer_number('42')
 261     True
 262     >>> is_integer_number('42.0')
 263     False
 264     """
 265     return (
 266         (is_number(in_str) and "." not in in_str)
 267         or is_hexidecimal_integer_number(in_str)
 268         or is_octal_integer_number(in_str)
 269         or is_binary_integer_number(in_str)
 270     )
 271
 272
 273 def is_hexidecimal_integer_number(in_str: str) -> bool:
 274     """
 275     Checks whether a string is a hex integer number.
 276
 277     >>> is_hexidecimal_integer_number('0x12345')
 278     True
 279     >>> is_hexidecimal_integer_number('0x1A3E')
 280     True
 281     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 282     False
 283     >>> is_hexidecimal_integer_number('-0xff')
 284     True
 285     >>> is_hexidecimal_integer_number('test')
 286     False
 287     >>> is_hexidecimal_integer_number(12345)  # Not a string
 288     Traceback (most recent call last):
 289     ...
 290     ValueError: 12345
 291     >>> is_hexidecimal_integer_number(101.4)
 292     Traceback (most recent call last):
 293     ...
 294     ValueError: 101.4
 295     >>> is_hexidecimal_integer_number(0x1A3E)
 296     Traceback (most recent call last):
 297     ...
 298     ValueError: 6718
 299     """
 300     if not is_string(in_str):
 301         raise ValueError(in_str)
 302     return HEX_NUMBER_RE.match(in_str) is not None
 303
 304
 305 def is_octal_integer_number(in_str: str) -> bool:
 306     """
 307     Checks whether a string is an octal number.
 308
 309     >>> is_octal_integer_number('0o777')
 310     True
 311     >>> is_octal_integer_number('-0O115')
 312     True
 313     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 314     False
 315     >>> is_octal_integer_number('7777')  # Needs 0o
 316     False
 317     >>> is_octal_integer_number('test')
 318     False
 319     """
 320     if not is_string(in_str):
 321         raise ValueError(in_str)
 322     return OCT_NUMBER_RE.match(in_str) is not None
 323
 324
 325 def is_binary_integer_number(in_str: str) -> bool:
 326     """
 327     Returns whether a string contains a binary number.
 328
 329     >>> is_binary_integer_number('0b10111')
 330     True
 331     >>> is_binary_integer_number('-0b111')
 332     True
 333     >>> is_binary_integer_number('0B10101')
 334     True
 335     >>> is_binary_integer_number('0b10102')
 336     False
 337     >>> is_binary_integer_number('0xFFF')
 338     False
 339     >>> is_binary_integer_number('test')
 340     False
 341     """
 342     if not is_string(in_str):
 343         raise ValueError(in_str)
 344     return BIN_NUMBER_RE.match(in_str) is not None
 345
 346
 347 def to_int(in_str: str) -> int:
 348     """Returns the integral value of the string or raises on error.
 349
 350     >>> to_int('1234')
 351     1234
 352     >>> to_int('test')
 353     Traceback (most recent call last):
 354     ...
 355     ValueError: invalid literal for int() with base 10: 'test'
 356     """
 357     if not is_string(in_str):
 358         raise ValueError(in_str)
 359     if is_binary_integer_number(in_str):
 360         return int(in_str, 2)
 361     if is_octal_integer_number(in_str):
 362         return int(in_str, 8)
 363     if is_hexidecimal_integer_number(in_str):
 364         return int(in_str, 16)
 365     return int(in_str)
 366
 367
 368 def is_decimal_number(in_str: str) -> bool:
 369     """
 370     Checks whether the given string represents a decimal or not.
 371
 372     A decimal may be signed or unsigned or use a "scientific notation".
 373
 374     >>> is_decimal_number('42.0')
 375     True
 376     >>> is_decimal_number('42')
 377     False
 378     """
 379     return is_number(in_str) and "." in in_str
 380
 381
 382 def strip_escape_sequences(in_str: str) -> str:
 383     """
 384     Remove escape sequences in the input string.
 385
 386     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 387     'this is a test!'
 388     """
 389     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 390     return in_str
 391
 392
 393 def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 394     """
 395     Add thousands separator to a numeric string.  Also handles numbers.
 396
 397     >>> add_thousands_separator('12345678')
 398     '12,345,678'
 399     >>> add_thousands_separator(12345678)
 400     '12,345,678'
 401     >>> add_thousands_separator(12345678.99)
 402     '12,345,678.99'
 403     >>> add_thousands_separator('test')
 404     Traceback (most recent call last):
 405     ...
 406     ValueError: test
 407
 408     """
 409     if isinstance(in_str, numbers.Number):
 410         in_str = f'{in_str}'
 411     if is_number(in_str):
 412         return _add_thousands_separator(in_str, separator_char=separator_char, places=places)
 413     raise ValueError(in_str)
 414
 415
 416 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 417     decimal_part = ""
 418     if '.' in in_str:
 419         (in_str, decimal_part) = in_str.split('.')
 420     tmp = [iter(in_str[::-1])] * places
 421     ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 422     if len(decimal_part) > 0:
 423         ret += '.'
 424         ret += decimal_part
 425     return ret
 426
 427
 428 # Full url example:
 429 # scheme://username:[email protected]:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 430 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 431     """
 432     Check if a string is a valid url.
 433
 434     >>> is_url('http://www.mysite.com')
 435     True
 436     >>> is_url('https://mysite.com')
 437     True
 438     >>> is_url('.mysite.com')
 439     False
 440     """
 441     if not is_full_string(in_str):
 442         return False
 443
 444     valid = URL_RE.match(in_str) is not None
 445
 446     if allowed_schemes:
 447         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 448     return valid
 449
 450
 451 def is_email(in_str: Any) -> bool:
 452     """
 453     Check if a string is a valid email.
 454
 455     Reference: https://tools.ietf.org/html/rfc3696#section-3
 456
 457     >>> is_email('[email protected]')
 458     True
 459     >>> is_email('@gmail.com')
 460     False
 461     """
 462     if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
 463         return False
 464
 465     try:
 466         # we expect 2 tokens, one before "@" and one after, otherwise
 467         # we have an exception and the email is not valid.
 468         head, tail = in_str.split("@")
 469
 470         # head's size must be <= 64, tail <= 255, head must not start
 471         # with a dot or contain multiple consecutive dots.
 472         if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
 473             return False
 474
 475         # removes escaped spaces, so that later on the test regex will
 476         # accept the string.
 477         head = head.replace("\\ ", "")
 478         if head.startswith('"') and head.endswith('"'):
 479             head = head.replace(" ", "")[1:-1]
 480         return EMAIL_RE.match(head + "@" + tail) is not None
 481
 482     except ValueError:
 483         # borderline case in which we have multiple "@" signs but the
 484         # head part is correctly escaped.
 485         if ESCAPED_AT_SIGN.search(in_str) is not None:
 486             # replace "@" with "a" in the head
 487             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 488         return False
 489
 490
 491 def suffix_string_to_number(in_str: str) -> Optional[int]:
 492     """Take a string like "33Gb" and convert it into a number (of bytes)
 493     like 34603008.  Return None if the input string is not valid.
 494
 495     >>> suffix_string_to_number('1Mb')
 496     1048576
 497     >>> suffix_string_to_number('13.1Gb')
 498     14066017894
 499     """
 500
 501     def suffix_capitalize(s: str) -> str:
 502         if len(s) == 1:
 503             return s.upper()
 504         elif len(s) == 2:
 505             return f"{s[0].upper()}{s[1].lower()}"
 506         return suffix_capitalize(s[0:1])
 507
 508     if is_string(in_str):
 509         if is_integer_number(in_str):
 510             return to_int(in_str)
 511         suffixes = [in_str[-2:], in_str[-1:]]
 512         rest = [in_str[:-2], in_str[:-1]]
 513         for x in range(len(suffixes)):
 514             s = suffixes[x]
 515             s = suffix_capitalize(s)
 516             multiplier = NUM_SUFFIXES.get(s, None)
 517             if multiplier is not None:
 518                 r = rest[x]
 519                 if is_integer_number(r):
 520                     return to_int(r) * multiplier
 521                 if is_decimal_number(r):
 522                     return int(float(r) * multiplier)
 523     return None
 524
 525
 526 def number_to_suffix_string(num: int) -> Optional[str]:
 527     """Take a number (of bytes) and returns a string like "43.8Gb".
 528     Returns none if the input is invalid.
 529
 530     >>> number_to_suffix_string(14066017894)
 531     '13.1Gb'
 532     >>> number_to_suffix_string(1024 * 1024)
 533     '1.0Mb'
 534
 535     """
 536     d = 0.0
 537     suffix = None
 538     for (sfx, size) in NUM_SUFFIXES.items():
 539         if num >= size:
 540             d = num / size
 541             suffix = sfx
 542             break
 543     if suffix is not None:
 544         return f"{d:.1f}{suffix}"
 545     else:
 546         return f'{num:d}'
 547
 548
 549 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 550     """
 551     Checks if a string is a valid credit card number.
 552     If card type is provided then it checks against that specific type only,
 553     otherwise any known credit card number will be accepted.
 554
 555     Supported card types are the following:
 556
 557     - VISA
 558     - MASTERCARD
 559     - AMERICAN_EXPRESS
 560     - DINERS_CLUB
 561     - DISCOVER
 562     - JCB
 563     """
 564     if not is_full_string(in_str):
 565         return False
 566
 567     if card_type is not None:
 568         if card_type not in CREDIT_CARDS:
 569             raise KeyError(
 570                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 571             )
 572         return CREDIT_CARDS[card_type].match(in_str) is not None
 573     for c in CREDIT_CARDS:
 574         if CREDIT_CARDS[c].match(in_str) is not None:
 575             return True
 576     return False
 577
 578
 579 def is_camel_case(in_str: Any) -> bool:
 580     """
 581     Checks if a string is formatted as camel case.
 582
 583     A string is considered camel case when:
 584
 585     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 586     - it contains both lowercase and uppercase letters
 587     - it does not start with a number
 588     """
 589     return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 590
 591
 592 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 593     """
 594     Checks if a string is formatted as "snake case".
 595
 596     A string is considered snake case when:
 597
 598     - it's composed only by lowercase/uppercase letters and digits
 599     - it contains at least one underscore (or provided separator)
 600     - it does not start with a number
 601
 602     >>> is_snake_case('this_is_a_test')
 603     True
 604     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 605     True
 606     >>> is_snake_case('this-is-a-test')
 607     False
 608     >>> is_snake_case('this-is-a-test', separator='-')
 609     True
 610
 611     """
 612     if is_full_string(in_str):
 613         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 614         re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 615         r = re_map.get(
 616             separator,
 617             re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
 618         )
 619         return r.match(in_str) is not None
 620     return False
 621
 622
 623 def is_json(in_str: Any) -> bool:
 624     """
 625     Check if a string is a valid json.
 626
 627     >>> is_json('{"name": "Peter"}')
 628     True
 629     >>> is_json('[1, 2, 3]')
 630     True
 631     >>> is_json('{nope}')
 632     False
 633     """
 634     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 635         try:
 636             return isinstance(json.loads(in_str), (dict, list))
 637         except (TypeError, ValueError, OverflowError):
 638             pass
 639     return False
 640
 641
 642 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 643     """
 644     Check if a string is a valid UUID.
 645
 646     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 647     True
 648     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 649     False
 650     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 651     True
 652     """
 653     # string casting is used to allow UUID itself as input data type
 654     s = str(in_str)
 655     if allow_hex:
 656         return UUID_HEX_OK_RE.match(s) is not None
 657     return UUID_RE.match(s) is not None
 658
 659
 660 def is_ip_v4(in_str: Any) -> bool:
 661     """
 662     Checks if a string is a valid ip v4.
 663
 664     >>> is_ip_v4('255.200.100.75')
 665     True
 666     >>> is_ip_v4('nope')
 667     False
 668     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 669     False
 670     """
 671     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 672         return False
 673
 674     # checks that each entry in the ip is in the valid range (0 to 255)
 675     for token in in_str.split("."):
 676         if not 0 <= int(token) <= 255:
 677             return False
 678     return True
 679
 680
 681 def extract_ip_v4(in_str: Any) -> Optional[str]:
 682     """
 683     Extracts the IPv4 chunk of a string or None.
 684
 685     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 686     '127.0.0.1'
 687     >>> extract_ip_v4('Your mom dresses you funny.')
 688     """
 689     if not is_full_string(in_str):
 690         return None
 691     m = ANYWHERE_IP_V4_RE.search(in_str)
 692     if m is not None:
 693         return m.group(0)
 694     return None
 695
 696
 697 def is_ip_v6(in_str: Any) -> bool:
 698     """
 699     Checks if a string is a valid ip v6.
 700
 701     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 702     True
 703     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 704     False
 705     """
 706     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 707
 708
 709 def extract_ip_v6(in_str: Any) -> Optional[str]:
 710     """
 711     Extract IPv6 chunk or None.
 712
 713     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 714     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 715     >>> extract_ip_v6("(and she's ugly too, btw)")
 716     """
 717     if not is_full_string(in_str):
 718         return None
 719     m = ANYWHERE_IP_V6_RE.search(in_str)
 720     if m is not None:
 721         return m.group(0)
 722     return None
 723
 724
 725 def is_ip(in_str: Any) -> bool:
 726     """
 727     Checks if a string is a valid ip (either v4 or v6).
 728
 729     >>> is_ip('255.200.100.75')
 730     True
 731     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
 732     True
 733     >>> is_ip('1.2.3')
 734     False
 735     >>> is_ip('1.2.3.999')
 736     False
 737     """
 738     return is_ip_v6(in_str) or is_ip_v4(in_str)
 739
 740
 741 def extract_ip(in_str: Any) -> Optional[str]:
 742     """
 743     Extract the IP address or None.
 744
 745     >>> extract_ip('Attacker: 255.200.100.75')
 746     '255.200.100.75'
 747     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 748     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 749     >>> extract_ip('1.2.3')
 750
 751     """
 752     ip = extract_ip_v4(in_str)
 753     if ip is None:
 754         ip = extract_ip_v6(in_str)
 755     return ip
 756
 757
 758 def is_mac_address(in_str: Any) -> bool:
 759     """Return True if in_str is a valid MAC address false otherwise.
 760
 761     >>> is_mac_address("34:29:8F:12:0D:2F")
 762     True
 763     >>> is_mac_address('34:29:8f:12:0d:2f')
 764     True
 765     >>> is_mac_address('34-29-8F-12-0D-2F')
 766     True
 767     >>> is_mac_address("test")
 768     False
 769     """
 770     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 771
 772
 773 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 774     """
 775     Extract the MAC address from in_str.
 776
 777     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
 778     '34:29:8F:12:0D:2F'
 779
 780     >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
 781     'd8:5d:e2:34:54:86'
 782
 783     """
 784     if not is_full_string(in_str):
 785         return None
 786     in_str.strip()
 787     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
 788     if m is not None:
 789         mac = m.group(0)
 790         mac.replace(":", separator)
 791         mac.replace("-", separator)
 792         return mac
 793     return None
 794
 795
 796 def is_slug(in_str: Any, separator: str = "-") -> bool:
 797     """
 798     Checks if a given string is a slug (as created by `slugify()`).
 799
 800     >>> is_slug('my-blog-post-title')
 801     True
 802     >>> is_slug('My blog post title')
 803     False
 804
 805     """
 806     if not is_full_string(in_str):
 807         return False
 808     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 809     return re.match(rex, in_str) is not None
 810
 811
 812 def contains_html(in_str: str) -> bool:
 813     """
 814     Checks if the given string contains HTML/XML tags.
 815
 816     By design, this function matches ANY type of tag, so don't expect to use it
 817     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 818
 819     >>> contains_html('my string is <strong>bold</strong>')
 820     True
 821     >>> contains_html('my string is not bold')
 822     False
 823
 824     """
 825     if not is_string(in_str):
 826         raise ValueError(in_str)
 827     return HTML_RE.search(in_str) is not None
 828
 829
 830 def words_count(in_str: str) -> int:
 831     """
 832     Returns the number of words contained into the given string.
 833
 834     This method is smart, it does consider only sequence of one or more letter and/or numbers
 835     as "words", so a string like this: "! @ # % ... []" will return zero!
 836     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 837     will be 4 not 1 (even if there are no spaces in the string).
 838
 839     >>> words_count('hello world')
 840     2
 841     >>> words_count('one,two,three.stop')
 842     4
 843
 844     """
 845     if not is_string(in_str):
 846         raise ValueError(in_str)
 847     return len(WORDS_COUNT_RE.findall(in_str))
 848
 849
 850 def generate_uuid(omit_dashes: bool = False) -> str:
 851     """
 852     Generated an UUID string (using `uuid.uuid4()`).
 853
 854     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 855     generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 856
 857     """
 858     uid = uuid4()
 859     if omit_dashes:
 860         return uid.hex
 861     return str(uid)
 862
 863
 864 def generate_random_alphanumeric_string(size: int) -> str:
 865     """
 866     Returns a string of the specified size containing random
 867     characters (uppercase/lowercase ascii letters and digits).
 868
 869     random_string(9) # possible output: "cx3QQbzYg"
 870
 871     """
 872     if size < 1:
 873         raise ValueError("size must be >= 1")
 874     chars = string.ascii_letters + string.digits
 875     buffer = [random.choice(chars) for _ in range(size)]
 876     return from_char_list(buffer)
 877
 878
 879 def reverse(in_str: str) -> str:
 880     """
 881     Returns the string with its chars reversed.
 882
 883     >>> reverse('test')
 884     'tset'
 885
 886     """
 887     if not is_string(in_str):
 888         raise ValueError(in_str)
 889     return in_str[::-1]
 890
 891
 892 def camel_case_to_snake_case(in_str, *, separator="_"):
 893     """
 894     Convert a camel case string into a snake case one.
 895     (The original string is returned if is not a valid camel case string)
 896
 897     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
 898     'mac_address_extractor_factory'
 899     >>> camel_case_to_snake_case('Luke Skywalker')
 900     'Luke Skywalker'
 901     """
 902     if not is_string(in_str):
 903         raise ValueError(in_str)
 904     if not is_camel_case(in_str):
 905         return in_str
 906     return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower()
 907
 908
 909 def snake_case_to_camel_case(
 910     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 911 ) -> str:
 912     """
 913     Convert a snake case string into a camel case one.
 914     (The original string is returned if is not a valid snake case string)
 915
 916     >>> snake_case_to_camel_case('this_is_a_test')
 917     'ThisIsATest'
 918     >>> snake_case_to_camel_case('Han Solo')
 919     'Han Solo'
 920     """
 921     if not is_string(in_str):
 922         raise ValueError(in_str)
 923     if not is_snake_case(in_str, separator=separator):
 924         return in_str
 925     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 926     if not upper_case_first:
 927         tokens[0] = tokens[0].lower()
 928     return from_char_list(tokens)
 929
 930
 931 def to_char_list(in_str: str) -> List[str]:
 932     """Convert a string into a list of chars.
 933
 934     >>> to_char_list('test')
 935     ['t', 'e', 's', 't']
 936     """
 937     if not is_string(in_str):
 938         return []
 939     return list(in_str)
 940
 941
 942 def from_char_list(in_list: List[str]) -> str:
 943     """Convert a char list into a string.
 944
 945     >>> from_char_list(['t', 'e', 's', 't'])
 946     'test'
 947     """
 948     return "".join(in_list)
 949
 950
 951 def shuffle(in_str: str) -> str:
 952     """Return a new string containing same chars of the given one but in
 953     a randomized order.
 954     """
 955     if not is_string(in_str):
 956         raise ValueError(in_str)
 957
 958     # turn the string into a list of chars
 959     chars = to_char_list(in_str)
 960     random.shuffle(chars)
 961     return from_char_list(chars)
 962
 963
 964 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 965     """
 966     Remove html code contained into the given string.
 967
 968     >>> strip_html('test: <a href="foo/bar">click here</a>')
 969     'test: '
 970     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
 971     'test: click here'
 972     """
 973     if not is_string(in_str):
 974         raise ValueError(in_str)
 975     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
 976     return r.sub("", in_str)
 977
 978
 979 def asciify(in_str: str) -> str:
 980     """
 981     Force string content to be ascii-only by translating all non-ascii
 982     chars into the closest possible representation (eg: ó -> o, Ë ->
 983     E, ç -> c...).
 984
 985     N.B. Some chars may be lost if impossible to translate.
 986
 987     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
 988     'eeuuooaaeynAAACIINOE'
 989     """
 990     if not is_string(in_str):
 991         raise ValueError(in_str)
 992
 993     # "NFKD" is the algorithm which is able to successfully translate
 994     # the most of non-ascii chars.
 995     normalized = unicodedata.normalize("NFKD", in_str)
 996
 997     # encode string forcing ascii and ignore any errors
 998     # (unrepresentable chars will be stripped out)
 999     ascii_bytes = normalized.encode("ascii", "ignore")
1000
1001     # turns encoded bytes into an utf-8 string
1002     return ascii_bytes.decode("utf-8")
1003
1004
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Converts a string into a "slug" using provided separator.
    The returned string has the following properties:

    - it has no spaces
    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using provided separator
    - all chars are encoded as ascii (by using `asciify()`)
    - is safe for URL

    Args:
        in_str: the text to convert.
        separator: string placed between words (keyword-only).  It is
            always inserted literally; multi-character and empty
            separators are supported.

    Raises:
        ValueError: if in_str is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # Replace spaces with the join sign.  A callable replacement is used
    # so that a separator containing backslashes is inserted verbatim
    # rather than being interpreted as a regex replacement template.
    out = SPACES_RE.sub(lambda _match: separator, out)

    # Force ascii before normalizing joins: characters dropped by
    # asciify() can leave two separators side by side.
    out = asciify(out)

    # Normalize joins (remove duplicates).  The separator is grouped as a
    # unit so that the repetition applies to the whole separator and not
    # just to its last character; an empty separator has nothing to
    # collapse (and would be an invalid pattern).
    if separator:
        out = re.sub(f"(?:{re.escape(separator)})+", lambda _match: separator, out)
    return out
1034
1035
def to_bool(in_str: str) -> bool:
    """
    Interpret a string as a boolean (CASE INSENSITIVE).

    True is returned when the lowercased string is exactly one of:

    - "true"
    - "t"
    - "1"
    - "yes"
    - "y"
    - "on"

    Any other string yields False.  Raises ValueError if in_str is not
    a string.

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    truthy_spellings = ("true", "1", "yes", "y", "t", "on")
    normalized = in_str.lower()
    return normalized in truthy_spellings
1072
1073
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Parse a date expression with DateParser and return the resulting
    date.  See DateParser docs for the accepted syntax.

    Returns None (after logging a warning) when the parser raises
    ParseException.
    """
    import dateparse.dateparse_utils as du

    try:
        parser = du.DateParser()  # type: ignore
        parser.parse(in_str)
        parsed_date = parser.get_date()
    except du.ParseException:  # type: ignore
        logger.warning(f'Unable to parse date {in_str}.')
        return None
    return parsed_date
1088
1089
def valid_date(in_str: str) -> bool:
    """
    Report whether DateParser can parse the string.

    Returns True when parsing succeeds; logs a warning and returns
    False when the parser raises ParseException.
    """
    import dateparse.dateparse_utils as dp

    try:
        parser = dp.DateParser()  # type: ignore
        parser.parse(in_str)
    except dp.ParseException:  # type: ignore
        logger.warning(f'Unable to parse date {in_str}.')
        return False
    return True
1104
1105
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Parses a datetime string.  See DateParser docs for more info.

    Returns:
        The parsed datetime, or None when the string cannot be parsed
        (a warning is logged) or when the parser's result is not a
        datetime.datetime instance.
    """
    import dateparse.dateparse_utils as dp

    try:
        d = dp.DateParser()  # type: ignore
        dt = d.parse(in_str)
    # The sibling helpers (to_date / valid_date) treat dp.ParseException
    # as the parser's failure signal, so it is handled here as well;
    # ValueError is still caught for backward compatibility.
    # NOTE(review): ParseException's base class is not visible from this
    # file -- if it already derives from ValueError the tuple is merely
    # redundant.
    except (ValueError, dp.ParseException):  # type: ignore
        msg = f'Unable to parse datetime {in_str}.'
        logger.warning(msg)
        return None
    if isinstance(dt, datetime.datetime):
        return dt
    return None
1121
1122
def valid_datetime(in_str: str) -> bool:
    """
    Report whether to_datetime() can turn the string into a datetime.

    Logs a warning and returns False when it cannot.
    """
    parsed = to_datetime(in_str)
    if parsed is None:
        logger.warning(f'Unable to parse datetime {in_str}.')
        return False
    return True
1133
1134
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Squeeze runs of more than one character_to_squeeze into one.

    Args:
        in_str: the text to process.
        character_to_squeeze: the character (or multi-character token)
            whose consecutive repetitions are collapsed.  It is always
            treated literally, both when matching and when replacing.

    Returns:
        in_str with every run of character_to_squeeze reduced to a
        single occurrence.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    """
    # Nothing can repeat when the token is empty.
    if not character_to_squeeze:
        return in_str

    # A callable replacement inserts the token verbatim; passing it as a
    # replacement string would have re.sub interpret any backslash in it
    # as a template escape (and fail for a lone backslash).
    return re.sub(
        f'(?:{re.escape(character_to_squeeze)})+',
        lambda _match: character_to_squeeze,
        in_str,
    )
1151
1152
def dedent(in_str: str) -> str:
    """
    Strip the leading margin (whatever MARGIN_RE matches) from every
    line of a multi line string (inspired by analogous Scala function).

    Raises ValueError if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    return newline.join(MARGIN_RE.sub('', row) for row in in_str.split(newline))
1162
1163
def indent(in_str: str, amount: int) -> str:
    """
    Prefix every line of the string with amount spaces.

    Lines are delimited by newline characters; each one, including
    empty ones, receives the padding.  Raises ValueError if in_str is
    not a string.

    >>> indent('This is a test', 4)
    '    This is a test'

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    padding = " " * amount
    return '\n'.join(padding + row for row in in_str.split('\n'))
1177
1178
def sprintf(*args, **kwargs) -> str:
    """Format arguments the way the built-in print() would, but return
    the resulting text instead of writing it anywhere.

    Args:
        *args: values to render; non-strings are converted with str().
        sep: keyword-only; string placed between values (default one
            space).
        end: keyword-only; string appended after the last value
            (default a newline).

    Returns:
        The rendered string.

    Raises:
        TypeError: if sep or end is neither None nor a string, or if any
            other keyword argument is supplied.

    >>> sprintf('a', 1, sep='-', end='')
    'a-1'
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # Anything left over is an unsupported option.
    if kwargs:
        raise TypeError("invalid keyword arguments to sprintf()")

    if sep is None:
        sep = " "
    if end is None:
        end = "\n"

    # Strings are used as-is; everything else goes through str().
    pieces = (arg if isinstance(arg, str) else str(arg) for arg in args)
    return sep.join(pieces) + end
1209
1210
class SprintfStdout(contextlib.AbstractContextManager):
    """
    Context manager that diverts everything written to stdout into an
    in-memory buffer.  Entering it yields a zero-argument callable that
    returns the text captured so far.

    with SprintfStdout() as buf:
        print("test")
    print(buf())

    'test\\n'
    """

    def __init__(self) -> None:
        # In-memory sink that receives the redirected output.
        self.destination = io.StringIO()
        # Created on __enter__; holds the active stdout redirection.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        redirect = contextlib.redirect_stdout(self.destination)
        self.recorder = redirect
        redirect.__enter__()

        def read_captured() -> str:
            return self.destination.getvalue()

        return read_captured

    def __exit__(self, *args) -> Literal[False]:
        # Undo the redirection, rewind the buffer, and let any exception
        # raised inside the with-block propagate.
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        return False
1235
1236
def capitalize_first_letter(txt: str) -> str:
    """Capitalize the first letter of a string, leaving the rest of
    the string untouched.

    An empty string is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''

    """
    # Slicing (rather than indexing) keeps the empty string from raising
    # IndexError.
    return txt[:1].upper() + txt[1:]
1247
1248
def it_they(n: int) -> str:
    """Pick the pronoun that agrees with a count: 'it' for exactly
    one, 'they' for anything else.

    >>> it_they(1)
    'it'
    >>> it_they(100)
    'they'

    """
    return "it" if n == 1 else "they"
1261
1262
def is_are(n: int) -> str:
    """Pick the verb form that agrees with a count: 'is' for exactly
    one, 'are' for anything else.

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
1275
1276
def pluralize(n: int) -> str:
    """Return the plural suffix for a count: an empty string for
    exactly one, 's' for anything else.

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.

    """
    return "" if n == 1 else "s"
1293
1294
def make_contractions(txt: str) -> str:
    """Rewrite common two-word phrases as their English contractions.

    Matching is case insensitive and the casing of the retained letters
    is preserved.  Three families of substitutions run in this order:
    the irregular negations (can't / shan't / won't), verb + "not", and
    finally subject + auxiliary verb.

    >>> make_contractions('It is nice today.')
    "It's nice today."

    >>> make_contractions('I can    not even...')
    "I can't even..."

    >>> make_contractions('She could not see!')
    "She couldn't see!"

    >>> make_contractions('But she will not go.')
    "But she won't go."

    >>> make_contractions('Verily, I shall not.')
    "Verily, I shan't."

    >>> make_contractions('No you cannot.')
    "No you can't."

    >>> make_contractions('I said you can not go.')
    "I said you can't go."

    """
    # Verbs that contract with a following "not" (e.g. is not -> isn't).
    negatable_verbs = [
        'are',
        'could',
        'did',
        'has',
        'have',
        'is',
        'must',
        'should',
        'was',
        'were',
        'would',
    ]

    # Words that contract with a following auxiliary (e.g. she will -> she'll).
    subjects = [
        "I",
        "you",
        "he",
        "she",
        "it",
        "we",
        "they",
        "how",
        "why",
        "when",
        "where",
        "who",
        "there",
    ]

    # Auxiliaries; the parenthesized group is the part that survives
    # after the apostrophe.
    auxiliaries = ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)']

    # Special cases: can't, shan't and won't.
    irregular_negations = (
        (r'\b(can)\s*no(t)\b', r"\1'\2"),
        (r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3"),
        (r'\b(w)ill\s*(n)(o)(t)\b', r"\1\3\2'\4"),
    )
    for pattern, replacement in irregular_negations:
        txt = re.sub(pattern, replacement, txt, flags=re.IGNORECASE)

    for verb in negatable_verbs:
        txt = re.sub(fr'\b({verb})\s+(n)o(t)\b', r"\1\2'\3", txt, flags=re.IGNORECASE)

    for subject in subjects:
        for auxiliary in auxiliaries:
            # Disallow there're/where're.  They're valid English
            # but sound weird.
            if auxiliary == 'a(re)' and subject in ('there', 'where'):
                continue
            txt = re.sub(
                fr'\b({subject})\s+{auxiliary}\b', r"\1'\2", txt, flags=re.IGNORECASE
            )

    return txt
1385
1386
def thify(n: int) -> str:
    """Return the proper ordinal suffix for a number ('st', 'nd', 'rd'
    or 'th'), so that f'{n}{thify(n)}' reads 1st, 2nd, 3rd, 4th...

    Numbers ending in 11, 12 or 13 take 'th' (11th, 112th), not the
    suffix their last digit alone would suggest.

    The string form of n must satisfy is_integer_number (checked with
    an assert).

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'

    """
    digits = str(n)
    assert is_integer_number(digits)

    # The "teens" are irregular: eleventh, twelfth, thirteenth.
    if digits[-2:] in ("11", "12", "13"):
        return "th"

    last_digit = digits[-1:]
    if last_digit == "1":
        return "st"
    if last_digit == "2":
        return "nd"
    if last_digit == "3":
        return "rd"
    return "th"
1409
1410
def ngrams(txt: str, n: int):
    """Lazily yield each n-gram of a string's whitespace-separated
    words, with the words of every n-gram joined by single spaces.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    """
    tokens = txt.split()
    for gram in ngrams_presplit(tokens, n):
        yield ' '.join(gram)
1424
1425
def ngrams_presplit(words: Sequence[str], n: int):
    """Return the n-grams of an already-tokenized sequence of words.

    Thin wrapper: words and n are handed straight to list_utils.ngrams
    and whatever it produces is returned unchanged.
    """
    return list_utils.ngrams(words, n)
1428
1429
def bigrams(txt: str):
    """Yield the 2-grams (adjacent word pairs) of a string; shorthand
    for ngrams(txt, 2).

    >>> list(bigrams('This is a test'))
    ['This is', 'is a', 'a test']

    """
    return ngrams(txt, n=2)
1432
1433
def trigrams(txt: str):
    """Yield the 3-grams of a string; shorthand for ngrams(txt, 3)."""
    return ngrams(txt, n=3)
1436
1437
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.  The column_specs argument is an iterable collection of
    numeric sequences that indicate one or more column numbers to
    copy.

    Args:
        input_lines: the columns, indexable by position.
        column_specs: one iterable of column indexes per output entry.
        delim: string placed between the columns merged into one entry.
            Column contents are never altered, even when they begin or
            end with characters that appear in delim.

    Returns:
        A list with one string per spec, in spec order.

    Raises:
        IndexError: if a spec names a column that does not exist.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']

    """
    # Column specs map input lines' columns into outputs.
    # [col1, col2...]
    #
    # join() places delim strictly between columns, so there is no
    # leading delimiter to trim and no risk of trimming real data.
    return [delim.join(input_lines[n] for n in spec) for spec in column_specs]
1466
1467
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim='',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: the columns, indexable by position.
        column_specs: (key, column indexes) pairs; each pair produces
            one dict entry.
        delim: string placed between the columns merged into one entry.
            Column contents are never altered, even when they begin or
            end with characters that appear in delim.

    Returns:
        A dict mapping each key to its merged columns.  A key that
        appears in more than one spec keeps the value of the last one.

    Raises:
        IndexError: if a spec names a column that does not exist.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}

    """
    out = {}

    # Column specs map input lines' columns into outputs.
    # "key", [col1, col2...]
    for spec in column_specs:
        # join() places delim strictly between columns, so there is no
        # leading delimiter to trim and no risk of trimming real data.
        out[spec[0]] = delim.join(input_lines[n] for n in spec[1])
    return out
1496
1497
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the {name} placeholders of a str.format-style template
    with the matching entries of a dict.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'

    """
    rendered = txt.format(**values)
    # end='' keeps sprintf from appending its default trailing newline.
    return sprintf(rendered, end='')
1507
1508
def to_ascii(x: str) -> bytes:
    """Encode as ascii bytes string.

    Args:
        x: a str to encode, or a bytes object, which is returned
            unchanged.

    Returns:
        The ASCII-encoded bytes.

    Raises:
        TypeError: if x is neither str nor bytes.
        UnicodeEncodeError: if x is a str containing non-ASCII
            characters.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'

    """
    if isinstance(x, bytes):
        return x
    if isinstance(x, str):
        return x.encode('ascii')
    # TypeError is still an Exception, so callers that caught the
    # previous generic Exception keep working.
    raise TypeError('to_ascii works with strings and bytes')
1524
1525
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Turn text into bytes using the given encoding / error policy and
    then base64-encode those bytes with base64.encodebytes (64-character
    alphabet, newline-terminated output).  This is compatible with
    uudecode.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
1535
1536
def is_base64(txt: str) -> bool:
    """Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet which is the same as what uuencode uses).

    Accepts str or bytes.  Surrounding whitespace, the line breaks that
    base64.encodebytes inserts into long output, and up to two trailing
    '=' padding characters are tolerated.  Text containing non-ASCII
    characters is reported as not base64.  Only the characters are
    checked; the length is not required to be a multiple of four.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # So is padding.
    True

    """
    stripped = txt.strip()
    if isinstance(stripped, str):
        try:
            data = stripped.encode('ascii')
        except UnicodeEncodeError:
            # Non-ASCII characters can never be part of the alphabet.
            return False
    else:
        data = bytes(stripped)

    # Drop the line breaks that MIME-style encoders place between lines.
    data = data.translate(None, b'\r\n')

    # Padding may only appear at the very end, and at most twice.
    unpadded = data.rstrip(b'=')
    if len(data) - len(unpadded) > 2:
        return False

    alphabet = frozenset((string.ascii_letters + string.digits + '+/').encode('ascii'))
    return all(byte in alphabet for byte in unpadded)
1557
1558
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Undo base64 encoding with base64.decodebytes, then decode the
    recovered bytes into a str using the given encoding / error policy.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    """
    raw = base64.decodebytes(b64)
    return raw.decode(encoding, errors)
1567
1568
def chunk(txt: str, chunk_size: int):
    """Chunk up a string.

    Generator that yields consecutive slices of txt, each chunk_size
    characters long.  When the length of txt is not a multiple of
    chunk_size the final slice is shorter, and a warning is both logged
    and issued through the warnings module when iteration begins.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    """
    total = len(txt)
    if total % chunk_size != 0:
        msg = f'String to chunk\'s length ({total}) is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for start in range(0, total, chunk_size):
        yield txt[start : start + chunk_size]
1582
1583
def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
    """Encode txt and then chop it into bytes.  Note: only bitstrings
    with delimiter='' are interpretable by from_bitstring.

    Args:
        txt: the text to convert; a bytes object is also accepted and
            used as-is.
        delimiter: string placed between the 8-bit groups.
        encoding: codec used to turn a str into bytes.
        errors: error policy handed to the codec.

    Returns:
        Eight '0'/'1' characters per byte, most significant bit first.
        Every byte is rendered, including leading NUL bytes.  For
        backward compatibility an empty input yields '00000000'.

    Raises:
        TypeError: if txt is neither str nor bytes.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    """
    if isinstance(txt, bytes):
        raw = txt
    elif isinstance(txt, str):
        raw = txt.encode(encoding, errors)
    else:
        raise TypeError('to_bitstring works with strings and bytes')

    # Historical behavior: nothing to encode still yields one zero byte.
    if not raw:
        return '00000000'

    # Formatting byte by byte (instead of via one big integer) keeps
    # leading zero bytes from disappearing.
    return delimiter.join(f'{byte:08b}' for byte in raw)
1602
1603
def is_bitstring(txt: str) -> bool:
    """Report whether txt is a bitstring, i.e. whether prefixing it
    with '0b' produces something is_binary_integer_number accepts.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    """
    candidate = f'0b{txt}'
    return is_binary_integer_number(candidate)
1615
1616
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    Args:
        bits: a string of '0' / '1' characters, as produced by
            to_bitstring with delimiter=''.
        encoding: codec used to decode the recovered bytes.
        errors: error policy handed to the codec.

    Returns:
        The decoded text.  The number of bytes is derived from the
        number of binary digits supplied, so leading zero bytes are
        preserved (sixteen zeros decode to two NUL characters).

    Raises:
        ValueError: if bits is not a valid base-2 literal.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    """
    if isinstance(bits, (bytes, bytearray)):
        bits = bits.decode('ascii')
    n = int(bits, 2)

    # Count only the binary digits: int() also tolerates surrounding
    # whitespace, a sign, a 0b prefix and underscores.
    digits = bits.strip().lstrip('+-')
    if digits[:2] in ('0b', '0B'):
        digits = digits[2:]
    digit_count = len(digits.replace('_', ''))

    # Sizing by digit count (not by n.bit_length()) is what keeps
    # leading zero bytes.
    byte_count = max((digit_count + 7) // 8, (n.bit_length() + 7) // 8)
    return n.to_bytes(byte_count, 'big').decode(encoding, errors)
1626
1627
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple for sorting purposes.

    Returns:
        A tuple with one int per dot-separated component, or None (after
        logging a warning) when txt is not an IPv4 address according to
        is_ip_v4.  Note that None cannot be ordered against tuples, so
        sorting a collection that contains invalid addresses with this
        key raises TypeError.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']

    """
    if not is_ip_v4(txt):
        # Report through the module logger, like the other helpers in
        # this file, instead of writing to stdout.
        msg = f"not IP: {txt}"
        logger.warning(msg)
        return None
    return tuple(int(octet) for octet in txt.split('.'))
1643
1644
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Split a file path into a tuple of its non-empty '/'-separated
    components.  Used as a sort key, this makes parent/ancestor paths
    sort before their children/descendants.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']

    """
    return tuple(part for part in volume.split('/') if part)
1658
1659
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Swap every occurrence of each character in replace_set for
    replacement.

    The replacements are applied one after another in the order the
    characters appear in replace_set, each pass working on the result
    of the previous one.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'

    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
1671
1672
if __name__ == '__main__':
    # When this module is executed as a script, run every doctest
    # embedded in its docstrings.
    import doctest

    doctest.testmod()