string_utils.py

   1 #!/usr/bin/env python3
   2
   3 """The MIT License (MIT)
   4
   5 Copyright (c) 2016-2020 Davide Zanotti
   6 Modifications Copyright (c) 2021-2022 Scott Gasch
   7
   8 Permission is hereby granted, free of charge, to any person obtaining a copy
   9 of this software and associated documentation files (the "Software"), to deal
  10 in the Software without restriction, including without limitation the rights
  11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12 copies of the Software, and to permit persons to whom the Software is
  13 furnished to do so, subject to the following conditions:
  14
  15 The above copyright notice and this permission notice shall be included in all
  16 copies or substantial portions of the Software.
  17
  18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  24 SOFTWARE.
  25
  26 This class is based on: https://github.com/daveoncode/python-string-utils.
  27 """
  28
  29 import base64
  30 import contextlib  # type: ignore
  31 import datetime
  32 import io
  33 import json
  34 import logging
  35 import numbers
  36 import random
  37 import re
  38 import string
  39 import unicodedata
  40 import warnings
  41 from itertools import zip_longest
  42 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
  43 from uuid import uuid4
  44
  45 import list_utils
  46
  47 logger = logging.getLogger(__name__)
  48
  49 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  50
  51 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  52
  53 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  54
  55 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  56
# Pattern fragments for a URL, concatenated into a single raw string; the
# trailing comment on each fragment names the URL component it matches.
# NOTE(review): in the query-string class "%-=" is a character range (from
# "%" to "="), not three literal characters -- confirm this is intended.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Anchored form: the whole string must be one URL (used by is_url()).
URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

# Unanchored form: finds URLs embedded in longer text.
URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)

# Matches a run of "@" that is followed, later in the string, by a double
# quote, or a backslash-escaped "@".  is_email() substitutes these away when
# an address contains more than one "@".
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# Local part, "@", then a domain.
# NOTE(review): the domain classes are lowercase-only and EMAIL_RE/EMAILS_RE
# are compiled without re.IGNORECASE, so upper-case domains do not match.
EMAILS_RAW_STRING = (
    r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
)

# Anchored form: the whole string must be one address.
EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))

# Unanchored form: finds addresses embedded in longer text.
EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))

# Letters and digits only, with at least one lower/upper case transition.
CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

# Captures the character(s) just before an upper-case letter so that a
# separator can be inserted after them (see camel_case_to_snake_case()).
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Snake case with "_" as the separator.
SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

# Same test with "-" as the separator.  There is no leading "^" here, so the
# start is only anchored when the pattern is applied with .match().
SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

# A separator immediately followed by a lower-case letter or digit.
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")

# Card number patterns keyed by card type name; is_credit_card() accepts
# these keys as its card_type argument.
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}

# Cheap pre-check used by is_json(): the text must be wrapped in [] or {}.
JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)

# Canonical dashed 8-4-4-4-12 hexadecimal UUID.
UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# Same layout, but every dash is optional (accepts the 32-digit hex form).
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# Shape check only (four dotted groups of 1-3 digits); is_ip_v4() performs
# the 0..255 range check on each group separately.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# Eight colon-separated groups of up to four characters.
# NOTE(review): the class [a-z\d] admits letters beyond a-f, and addresses
# abbreviated to fewer than seven colons (e.g. "::1") do not match.
IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)

# Six hexadecimal byte pairs separated by ":" or "-".
MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)

ANYWHERE_MAC_ADDRESS_RE = re.compile(
    r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)

# One match per word: a run of letters/digits (underscore excluded) together
# with any surrounding non-word characters.
WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)

# An opening tag plus (optionally) its content and closing tag, or a comment,
# or a doctype declaration.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Tags, comments and doctype declarations only; the text between an opening
# and a closing tag is not part of the match.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Any single whitespace character.
SPACES_RE = re.compile(r"\s")

# Runs of characters that are neither letters nor digits, or runs of "_".
NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Leading whitespace other than CR/LF.  Compiled without re.MULTILINE, so "^"
# anchors at the start of the string only.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 150
 151 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 152
# Size suffix -> multiplier in bytes (powers of 1024).  Each magnitude has a
# two-character and a one-character spelling.  Entries are listed from the
# largest to the smallest: number_to_suffix_string() iterates this dict in
# order and stops at the first size that fits, so the order is significant.
NUM_SUFFIXES = {
    "Pb": (1024 ** 5),
    "P": (1024 ** 5),
    "Tb": (1024 ** 4),
    "T": (1024 ** 4),
    "Gb": (1024 ** 3),
    "G": (1024 ** 3),
    "Mb": (1024 ** 2),
    "M": (1024 ** 2),
    "Kb": (1024 ** 1),
    "K": (1024 ** 1),
}
 165
 166
 167 def is_none_or_empty(in_str: Optional[str]) -> bool:
 168     """
 169     Returns true if the input string is either None or an empty string.
 170
 171     >>> is_none_or_empty("")
 172     True
 173     >>> is_none_or_empty(None)
 174     True
 175     >>> is_none_or_empty("   \t   ")
 176     True
 177     >>> is_none_or_empty('Test')
 178     False
 179     """
 180     return in_str is None or len(in_str.strip()) == 0
 181
 182
 183 def is_string(obj: Any) -> bool:
 184     """
 185     Checks if an object is a string.
 186
 187     >>> is_string('test')
 188     True
 189     >>> is_string(123)
 190     False
 191     >>> is_string(100.3)
 192     False
 193     >>> is_string([1, 2, 3])
 194     False
 195     """
 196     return isinstance(obj, str)
 197
 198
 199 def is_empty_string(in_str: Any) -> bool:
 200     return is_empty(in_str)
 201
 202
 203 def is_empty(in_str: Any) -> bool:
 204     """
 205     Checks if input is a string and empty or only whitespace.
 206
 207     >>> is_empty('')
 208     True
 209     >>> is_empty('    \t\t    ')
 210     True
 211     >>> is_empty('test')
 212     False
 213     >>> is_empty(100.88)
 214     False
 215     >>> is_empty([1, 2, 3])
 216     False
 217     """
 218     return is_string(in_str) and in_str.strip() == ""
 219
 220
 221 def is_full_string(in_str: Any) -> bool:
 222     """
 223     Checks that input is a string and is not empty ('') or only whitespace.
 224
 225     >>> is_full_string('test!')
 226     True
 227     >>> is_full_string('')
 228     False
 229     >>> is_full_string('      ')
 230     False
 231     >>> is_full_string(100.999)
 232     False
 233     >>> is_full_string({"a": 1, "b": 2})
 234     False
 235     """
 236     return is_string(in_str) and in_str.strip() != ""
 237
 238
 239 def is_number(in_str: str) -> bool:
 240     """
 241     Checks if a string is a valid number.
 242
 243     >>> is_number(100.5)
 244     Traceback (most recent call last):
 245     ...
 246     ValueError: 100.5
 247     >>> is_number("100.5")
 248     True
 249     >>> is_number("test")
 250     False
 251     >>> is_number("99")
 252     True
 253     >>> is_number([1, 2, 3])
 254     Traceback (most recent call last):
 255     ...
 256     ValueError: [1, 2, 3]
 257     """
 258     if not is_string(in_str):
 259         raise ValueError(in_str)
 260     return NUMBER_RE.match(in_str) is not None
 261
 262
 263 def is_integer_number(in_str: str) -> bool:
 264     """
 265     Checks whether the given string represents an integer or not.
 266
 267     An integer may be signed or unsigned or use a "scientific notation".
 268
 269     >>> is_integer_number('42')
 270     True
 271     >>> is_integer_number('42.0')
 272     False
 273     """
 274     return (
 275         (is_number(in_str) and "." not in in_str)
 276         or is_hexidecimal_integer_number(in_str)
 277         or is_octal_integer_number(in_str)
 278         or is_binary_integer_number(in_str)
 279     )
 280
 281
 282 def is_hexidecimal_integer_number(in_str: str) -> bool:
 283     """
 284     Checks whether a string is a hex integer number.
 285
 286     >>> is_hexidecimal_integer_number('0x12345')
 287     True
 288     >>> is_hexidecimal_integer_number('0x1A3E')
 289     True
 290     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 291     False
 292     >>> is_hexidecimal_integer_number('-0xff')
 293     True
 294     >>> is_hexidecimal_integer_number('test')
 295     False
 296     >>> is_hexidecimal_integer_number(12345)  # Not a string
 297     Traceback (most recent call last):
 298     ...
 299     ValueError: 12345
 300     >>> is_hexidecimal_integer_number(101.4)
 301     Traceback (most recent call last):
 302     ...
 303     ValueError: 101.4
 304     >>> is_hexidecimal_integer_number(0x1A3E)
 305     Traceback (most recent call last):
 306     ...
 307     ValueError: 6718
 308     """
 309     if not is_string(in_str):
 310         raise ValueError(in_str)
 311     return HEX_NUMBER_RE.match(in_str) is not None
 312
 313
 314 def is_octal_integer_number(in_str: str) -> bool:
 315     """
 316     Checks whether a string is an octal number.
 317
 318     >>> is_octal_integer_number('0o777')
 319     True
 320     >>> is_octal_integer_number('-0O115')
 321     True
 322     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 323     False
 324     >>> is_octal_integer_number('7777')  # Needs 0o
 325     False
 326     >>> is_octal_integer_number('test')
 327     False
 328     """
 329     if not is_string(in_str):
 330         raise ValueError(in_str)
 331     return OCT_NUMBER_RE.match(in_str) is not None
 332
 333
 334 def is_binary_integer_number(in_str: str) -> bool:
 335     """
 336     Returns whether a string contains a binary number.
 337
 338     >>> is_binary_integer_number('0b10111')
 339     True
 340     >>> is_binary_integer_number('-0b111')
 341     True
 342     >>> is_binary_integer_number('0B10101')
 343     True
 344     >>> is_binary_integer_number('0b10102')
 345     False
 346     >>> is_binary_integer_number('0xFFF')
 347     False
 348     >>> is_binary_integer_number('test')
 349     False
 350     """
 351     if not is_string(in_str):
 352         raise ValueError(in_str)
 353     return BIN_NUMBER_RE.match(in_str) is not None
 354
 355
 356 def to_int(in_str: str) -> int:
 357     """Returns the integral value of the string or raises on error.
 358
 359     >>> to_int('1234')
 360     1234
 361     >>> to_int('test')
 362     Traceback (most recent call last):
 363     ...
 364     ValueError: invalid literal for int() with base 10: 'test'
 365     """
 366     if not is_string(in_str):
 367         raise ValueError(in_str)
 368     if is_binary_integer_number(in_str):
 369         return int(in_str, 2)
 370     if is_octal_integer_number(in_str):
 371         return int(in_str, 8)
 372     if is_hexidecimal_integer_number(in_str):
 373         return int(in_str, 16)
 374     return int(in_str)
 375
 376
 377 def is_decimal_number(in_str: str) -> bool:
 378     """
 379     Checks whether the given string represents a decimal or not.
 380
 381     A decimal may be signed or unsigned or use a "scientific notation".
 382
 383     >>> is_decimal_number('42.0')
 384     True
 385     >>> is_decimal_number('42')
 386     False
 387     """
 388     return is_number(in_str) and "." in in_str
 389
 390
 391 def strip_escape_sequences(in_str: str) -> str:
 392     """
 393     Remove escape sequences in the input string.
 394
 395     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 396     'this is a test!'
 397     """
 398     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 399     return in_str
 400
 401
 402 def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 403     """
 404     Add thousands separator to a numeric string.  Also handles numbers.
 405
 406     >>> add_thousands_separator('12345678')
 407     '12,345,678'
 408     >>> add_thousands_separator(12345678)
 409     '12,345,678'
 410     >>> add_thousands_separator(12345678.99)
 411     '12,345,678.99'
 412     >>> add_thousands_separator('test')
 413     Traceback (most recent call last):
 414     ...
 415     ValueError: test
 416
 417     """
 418     if isinstance(in_str, numbers.Number):
 419         in_str = f'{in_str}'
 420     if is_number(in_str):
 421         return _add_thousands_separator(
 422             in_str, separator_char=separator_char, places=places
 423         )
 424     raise ValueError(in_str)
 425
 426
 427 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 428     decimal_part = ""
 429     if '.' in in_str:
 430         (in_str, decimal_part) = in_str.split('.')
 431     tmp = [iter(in_str[::-1])] * places
 432     ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 433     if len(decimal_part) > 0:
 434         ret += '.'
 435         ret += decimal_part
 436     return ret
 437
 438
 439 # Full url example:
 440 # scheme://username:[email protected]:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 441 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 442     """
 443     Check if a string is a valid url.
 444
 445     >>> is_url('http://www.mysite.com')
 446     True
 447     >>> is_url('https://mysite.com')
 448     True
 449     >>> is_url('.mysite.com')
 450     False
 451     """
 452     if not is_full_string(in_str):
 453         return False
 454
 455     valid = URL_RE.match(in_str) is not None
 456
 457     if allowed_schemes:
 458         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 459     return valid
 460
 461
def is_email(in_str: Any) -> bool:
    """
    Check if a string is a valid email.

    Reference: https://tools.ietf.org/html/rfc3696#section-3

    Anything that is not a non-blank string is rejected, as are strings
    longer than 320 characters or starting with a dot.  The part before
    the "@" is limited to 64 characters and the part after it to 255.

    >>> is_email('my.email@the-provider.com')
    True
    >>> is_email('@gmail.com')
    False
    """
    if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
        return False

    try:
        # we expect 2 tokens, one before "@" and one after, otherwise
        # we have an exception and the email is not valid.
        head, tail = in_str.split("@")

        # head's size must be <= 64, tail <= 255, head must not end
        # with a dot or contain multiple consecutive dots (a leading
        # dot was already rejected above).
        if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
            return False

        # removes escaped spaces, so that later on the test regex will
        # accept the string.
        head = head.replace("\\ ", "")
        # a head wrapped in double quotes may contain plain spaces:
        # drop them together with the surrounding quotes.
        if head.startswith('"') and head.endswith('"'):
            head = head.replace(" ", "")[1:-1]
        return EMAIL_RE.match(head + "@" + tail) is not None

    except ValueError:
        # raised by the tuple unpacking above when the string does not
        # split into exactly two parts.  Borderline case in which we
        # have multiple "@" signs but the head part is correctly escaped.
        if ESCAPED_AT_SIGN.search(in_str) is not None:
            # replace "@" with "a" in the head and validate the result
            return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
        return False
 500
 501
 502 def suffix_string_to_number(in_str: str) -> Optional[int]:
 503     """Take a string like "33Gb" and convert it into a number (of bytes)
 504     like 34603008.  Return None if the input string is not valid.
 505
 506     >>> suffix_string_to_number('1Mb')
 507     1048576
 508     >>> suffix_string_to_number('13.1Gb')
 509     14066017894
 510     """
 511
 512     def suffix_capitalize(s: str) -> str:
 513         if len(s) == 1:
 514             return s.upper()
 515         elif len(s) == 2:
 516             return f"{s[0].upper()}{s[1].lower()}"
 517         return suffix_capitalize(s[0:1])
 518
 519     if is_string(in_str):
 520         if is_integer_number(in_str):
 521             return to_int(in_str)
 522         suffixes = [in_str[-2:], in_str[-1:]]
 523         rest = [in_str[:-2], in_str[:-1]]
 524         for x in range(len(suffixes)):
 525             s = suffixes[x]
 526             s = suffix_capitalize(s)
 527             multiplier = NUM_SUFFIXES.get(s, None)
 528             if multiplier is not None:
 529                 r = rest[x]
 530                 if is_integer_number(r):
 531                     return to_int(r) * multiplier
 532                 if is_decimal_number(r):
 533                     return int(float(r) * multiplier)
 534     return None
 535
 536
 537 def number_to_suffix_string(num: int) -> Optional[str]:
 538     """Take a number (of bytes) and returns a string like "43.8Gb".
 539     Returns none if the input is invalid.
 540
 541     >>> number_to_suffix_string(14066017894)
 542     '13.1Gb'
 543     >>> number_to_suffix_string(1024 * 1024)
 544     '1.0Mb'
 545
 546     """
 547     d = 0.0
 548     suffix = None
 549     for (sfx, size) in NUM_SUFFIXES.items():
 550         if num >= size:
 551             d = num / size
 552             suffix = sfx
 553             break
 554     if suffix is not None:
 555         return f"{d:.1f}{suffix}"
 556     else:
 557         return f'{num:d}'
 558
 559
 560 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 561     """
 562     Checks if a string is a valid credit card number.
 563     If card type is provided then it checks against that specific type only,
 564     otherwise any known credit card number will be accepted.
 565
 566     Supported card types are the following:
 567
 568     - VISA
 569     - MASTERCARD
 570     - AMERICAN_EXPRESS
 571     - DINERS_CLUB
 572     - DISCOVER
 573     - JCB
 574     """
 575     if not is_full_string(in_str):
 576         return False
 577
 578     if card_type is not None:
 579         if card_type not in CREDIT_CARDS:
 580             raise KeyError(
 581                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 582             )
 583         return CREDIT_CARDS[card_type].match(in_str) is not None
 584     for c in CREDIT_CARDS:
 585         if CREDIT_CARDS[c].match(in_str) is not None:
 586             return True
 587     return False
 588
 589
 590 def is_camel_case(in_str: Any) -> bool:
 591     """
 592     Checks if a string is formatted as camel case.
 593
 594     A string is considered camel case when:
 595
 596     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 597     - it contains both lowercase and uppercase letters
 598     - it does not start with a number
 599     """
 600     return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 601
 602
 603 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 604     """
 605     Checks if a string is formatted as "snake case".
 606
 607     A string is considered snake case when:
 608
 609     - it's composed only by lowercase/uppercase letters and digits
 610     - it contains at least one underscore (or provided separator)
 611     - it does not start with a number
 612
 613     >>> is_snake_case('this_is_a_test')
 614     True
 615     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 616     True
 617     >>> is_snake_case('this-is-a-test')
 618     False
 619     >>> is_snake_case('this-is-a-test', separator='-')
 620     True
 621
 622     """
 623     if is_full_string(in_str):
 624         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 625         re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 626         r = re_map.get(
 627             separator,
 628             re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
 629         )
 630         return r.match(in_str) is not None
 631     return False
 632
 633
 634 def is_json(in_str: Any) -> bool:
 635     """
 636     Check if a string is a valid json.
 637
 638     >>> is_json('{"name": "Peter"}')
 639     True
 640     >>> is_json('[1, 2, 3]')
 641     True
 642     >>> is_json('{nope}')
 643     False
 644     """
 645     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 646         try:
 647             return isinstance(json.loads(in_str), (dict, list))
 648         except (TypeError, ValueError, OverflowError):
 649             pass
 650     return False
 651
 652
 653 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 654     """
 655     Check if a string is a valid UUID.
 656
 657     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 658     True
 659     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 660     False
 661     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 662     True
 663     """
 664     # string casting is used to allow UUID itself as input data type
 665     s = str(in_str)
 666     if allow_hex:
 667         return UUID_HEX_OK_RE.match(s) is not None
 668     return UUID_RE.match(s) is not None
 669
 670
 671 def is_ip_v4(in_str: Any) -> bool:
 672     """
 673     Checks if a string is a valid ip v4.
 674
 675     >>> is_ip_v4('255.200.100.75')
 676     True
 677     >>> is_ip_v4('nope')
 678     False
 679     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 680     False
 681     """
 682     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 683         return False
 684
 685     # checks that each entry in the ip is in the valid range (0 to 255)
 686     for token in in_str.split("."):
 687         if not 0 <= int(token) <= 255:
 688             return False
 689     return True
 690
 691
 692 def extract_ip_v4(in_str: Any) -> Optional[str]:
 693     """
 694     Extracts the IPv4 chunk of a string or None.
 695
 696     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 697     '127.0.0.1'
 698     >>> extract_ip_v4('Your mom dresses you funny.')
 699     """
 700     if not is_full_string(in_str):
 701         return None
 702     m = ANYWHERE_IP_V4_RE.search(in_str)
 703     if m is not None:
 704         return m.group(0)
 705     return None
 706
 707
 708 def is_ip_v6(in_str: Any) -> bool:
 709     """
 710     Checks if a string is a valid ip v6.
 711
 712     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 713     True
 714     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 715     False
 716     """
 717     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 718
 719
 720 def extract_ip_v6(in_str: Any) -> Optional[str]:
 721     """
 722     Extract IPv6 chunk or None.
 723
 724     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 725     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 726     >>> extract_ip_v6("(and she's ugly too, btw)")
 727     """
 728     if not is_full_string(in_str):
 729         return None
 730     m = ANYWHERE_IP_V6_RE.search(in_str)
 731     if m is not None:
 732         return m.group(0)
 733     return None
 734
 735
 736 def is_ip(in_str: Any) -> bool:
 737     """
 738     Checks if a string is a valid ip (either v4 or v6).
 739
 740     >>> is_ip('255.200.100.75')
 741     True
 742     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
 743     True
 744     >>> is_ip('1.2.3')
 745     False
 746     >>> is_ip('1.2.3.999')
 747     False
 748     """
 749     return is_ip_v6(in_str) or is_ip_v4(in_str)
 750
 751
 752 def extract_ip(in_str: Any) -> Optional[str]:
 753     """
 754     Extract the IP address or None.
 755
 756     >>> extract_ip('Attacker: 255.200.100.75')
 757     '255.200.100.75'
 758     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 759     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 760     >>> extract_ip('1.2.3')
 761
 762     """
 763     ip = extract_ip_v4(in_str)
 764     if ip is None:
 765         ip = extract_ip_v6(in_str)
 766     return ip
 767
 768
 769 def is_mac_address(in_str: Any) -> bool:
 770     """Return True if in_str is a valid MAC address false otherwise.
 771
 772     >>> is_mac_address("34:29:8F:12:0D:2F")
 773     True
 774     >>> is_mac_address('34:29:8f:12:0d:2f')
 775     True
 776     >>> is_mac_address('34-29-8F-12-0D-2F')
 777     True
 778     >>> is_mac_address("test")
 779     False
 780     """
 781     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 782
 783
 784 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 785     """
 786     Extract the MAC address from in_str.
 787
 788     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
 789     '34:29:8F:12:0D:2F'
 790
 791     >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
 792     'd8:5d:e2:34:54:86'
 793
 794     """
 795     if not is_full_string(in_str):
 796         return None
 797     in_str.strip()
 798     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
 799     if m is not None:
 800         mac = m.group(0)
 801         mac.replace(":", separator)
 802         mac.replace("-", separator)
 803         return mac
 804     return None
 805
 806
 807 def is_slug(in_str: Any, separator: str = "-") -> bool:
 808     """
 809     Checks if a given string is a slug (as created by `slugify()`).
 810
 811     >>> is_slug('my-blog-post-title')
 812     True
 813     >>> is_slug('My blog post title')
 814     False
 815
 816     """
 817     if not is_full_string(in_str):
 818         return False
 819     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 820     return re.match(rex, in_str) is not None
 821
 822
 823 def contains_html(in_str: str) -> bool:
 824     """
 825     Checks if the given string contains HTML/XML tags.
 826
 827     By design, this function matches ANY type of tag, so don't expect to use it
 828     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 829
 830     >>> contains_html('my string is <strong>bold</strong>')
 831     True
 832     >>> contains_html('my string is not bold')
 833     False
 834
 835     """
 836     if not is_string(in_str):
 837         raise ValueError(in_str)
 838     return HTML_RE.search(in_str) is not None
 839
 840
 841 def words_count(in_str: str) -> int:
 842     """
 843     Returns the number of words contained into the given string.
 844
 845     This method is smart, it does consider only sequence of one or more letter and/or numbers
 846     as "words", so a string like this: "! @ # % ... []" will return zero!
 847     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 848     will be 4 not 1 (even if there are no spaces in the string).
 849
 850     >>> words_count('hello world')
 851     2
 852     >>> words_count('one,two,three.stop')
 853     4
 854
 855     """
 856     if not is_string(in_str):
 857         raise ValueError(in_str)
 858     return len(WORDS_COUNT_RE.findall(in_str))
 859
 860
 861 def generate_uuid(omit_dashes: bool = False) -> str:
 862     """
 863     Generated an UUID string (using `uuid.uuid4()`).
 864
 865     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 866     generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 867
 868     """
 869     uid = uuid4()
 870     if omit_dashes:
 871         return uid.hex
 872     return str(uid)
 873
 874
 875 def generate_random_alphanumeric_string(size: int) -> str:
 876     """
 877     Returns a string of the specified size containing random
 878     characters (uppercase/lowercase ascii letters and digits).
 879
 880     random_string(9) # possible output: "cx3QQbzYg"
 881
 882     """
 883     if size < 1:
 884         raise ValueError("size must be >= 1")
 885     chars = string.ascii_letters + string.digits
 886     buffer = [random.choice(chars) for _ in range(size)]
 887     return from_char_list(buffer)
 888
 889
 890 def reverse(in_str: str) -> str:
 891     """
 892     Returns the string with its chars reversed.
 893
 894     >>> reverse('test')
 895     'tset'
 896
 897     """
 898     if not is_string(in_str):
 899         raise ValueError(in_str)
 900     return in_str[::-1]
 901
 902
 903 def camel_case_to_snake_case(in_str, *, separator="_"):
 904     """
 905     Convert a camel case string into a snake case one.
 906     (The original string is returned if is not a valid camel case string)
 907
 908     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
 909     'mac_address_extractor_factory'
 910     >>> camel_case_to_snake_case('Luke Skywalker')
 911     'Luke Skywalker'
 912     """
 913     if not is_string(in_str):
 914         raise ValueError(in_str)
 915     if not is_camel_case(in_str):
 916         return in_str
 917     return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower()
 918
 919
 920 def snake_case_to_camel_case(
 921     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 922 ) -> str:
 923     """
 924     Convert a snake case string into a camel case one.
 925     (The original string is returned if is not a valid snake case string)
 926
 927     >>> snake_case_to_camel_case('this_is_a_test')
 928     'ThisIsATest'
 929     >>> snake_case_to_camel_case('Han Solo')
 930     'Han Solo'
 931     """
 932     if not is_string(in_str):
 933         raise ValueError(in_str)
 934     if not is_snake_case(in_str, separator=separator):
 935         return in_str
 936     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 937     if not upper_case_first:
 938         tokens[0] = tokens[0].lower()
 939     return from_char_list(tokens)
 940
 941
 942 def to_char_list(in_str: str) -> List[str]:
 943     """Convert a string into a list of chars.
 944
 945     >>> to_char_list('test')
 946     ['t', 'e', 's', 't']
 947     """
 948     if not is_string(in_str):
 949         return []
 950     return list(in_str)
 951
 952
 953 def from_char_list(in_list: List[str]) -> str:
 954     """Convert a char list into a string.
 955
 956     >>> from_char_list(['t', 'e', 's', 't'])
 957     'test'
 958     """
 959     return "".join(in_list)
 960
 961
 962 def shuffle(in_str: str) -> str:
 963     """Return a new string containing same chars of the given one but in
 964     a randomized order.
 965     """
 966     if not is_string(in_str):
 967         raise ValueError(in_str)
 968
 969     # turn the string into a list of chars
 970     chars = to_char_list(in_str)
 971     random.shuffle(chars)
 972     return from_char_list(chars)
 973
 974
 975 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 976     """
 977     Remove html code contained into the given string.
 978
 979     >>> strip_html('test: <a href="foo/bar">click here</a>')
 980     'test: '
 981     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
 982     'test: click here'
 983     """
 984     if not is_string(in_str):
 985         raise ValueError(in_str)
 986     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
 987     return r.sub("", in_str)
 988
 989
 990 def asciify(in_str: str) -> str:
 991     """
 992     Force string content to be ascii-only by translating all non-ascii
 993     chars into the closest possible representation (eg: ó -> o, Ë ->
 994     E, ç -> c...).
 995
 996     N.B. Some chars may be lost if impossible to translate.
 997
 998     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
 999     'eeuuooaaeynAAACIINOE'
1000     """
1001     if not is_string(in_str):
1002         raise ValueError(in_str)
1003
1004     # "NFKD" is the algorithm which is able to successfully translate
1005     # the most of non-ascii chars.
1006     normalized = unicodedata.normalize("NFKD", in_str)
1007
1008     # encode string forcing ascii and ignore any errors
1009     # (unrepresentable chars will be stripped out)
1010     ascii_bytes = normalized.encode("ascii", "ignore")
1011
1012     # turns encoded bytes into an utf-8 string
1013     return ascii_bytes.decode("utf-8")
1014
1015
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Turn a string into a "slug" whose words are joined by separator.

    Processing steps, in order:

    - the input is lower-cased
    - every match of NO_LETTERS_OR_NUMBERS_RE becomes a space and the
      result is stripped
    - every match of SPACES_RE becomes the separator
    - runs of the separator are collapsed into a single one
    - the result is passed through `asciify()`

    Raises ValueError if in_str is rejected by is_string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Blank out everything the "no letters or numbers" pattern matches.
    lowered = in_str.lower()
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", lowered).strip()

    # Whitespace becomes the join sign.
    joined = SPACES_RE.sub(separator, spaced)

    # Collapse consecutive join signs into one.
    repeated_separator = re.compile(re.escape(separator) + r"+")
    deduped = repeated_separator.sub(separator, joined)
    return asciify(deduped)
1045
1046
def to_bool(in_str: str) -> bool:
    """
    Interpret a string as a boolean (CASE INSENSITIVE).

    True is returned when the lower-cased string is one of:

    - "true"
    - "1"
    - "yes"
    - "y"
    - "t"
    - "on"

    Every other string yields False.  Raises ValueError if in_str is
    rejected by is_string.

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    truthy_spellings = ("true", "1", "yes", "y", "t", "on")
    return in_str.lower() in truthy_spellings
1083
1084
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Parse a date string with DateParser (see its docs for the accepted
    formats).

    Returns whatever the parser's get_date() reports, or None (after
    logging a warning) when the parser raises ParseException.
    """
    import dateparse.dateparse_utils as dp  # type: ignore

    try:
        parser = dp.DateParser()
        parser.parse(in_str)
        return parser.get_date()
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        return None
1099
1100
def valid_date(in_str: str) -> bool:
    """
    True if DateParser can parse the string, False (after logging a
    warning) if it raises ParseException.
    """
    import dateparse.dateparse_utils as dp

    try:
        parser = dp.DateParser()
        parser.parse(in_str)
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        return False
    return True
1115
1116
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Parses a datetime string.  See DateParser docs for more info.

    Returns the datetime produced by DateParser.parse(), or None when
    the string cannot be parsed or the parser hands back something that
    is not a datetime.datetime.  Parse failures are logged as warnings
    rather than raised.
    """
    import dateparse.dateparse_utils as dp

    try:
        parser = dp.DateParser()
        parsed = parser.parse(in_str)
        if isinstance(parsed, datetime.datetime):
            return parsed
    except (ValueError, dp.ParseException):
        # ParseException is the failure that to_date / valid_date handle
        # for this same parser; ValueError is still caught so existing
        # behavior is unchanged.
        msg = f'Unable to parse datetime {in_str}.'
        logger.warning(msg)
    return None
1132
1133
def valid_datetime(in_str: str) -> bool:
    """
    True if to_datetime() can turn the string into a datetime; otherwise
    a warning is logged and False is returned.
    """
    parsed = to_datetime(in_str)
    if parsed is None:
        msg = f'Unable to parse datetime {in_str}.'
        logger.warning(msg)
        return False
    return True
1144
1145
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Squeeze runs of more than one character_to_squeeze into one.

    character_to_squeeze may be longer than one character, in which
    case consecutive repetitions of the whole sequence are collapsed.
    It is always treated literally, both when matching and when
    substituting, so it may safely contain regex metacharacters or
    backslashes.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    """
    run_of_targets = re.compile(r'(' + re.escape(character_to_squeeze) + r')+')

    # A callable replacement is inserted verbatim.  A plain string would
    # be processed as a replacement template, so a backslash in
    # character_to_squeeze would raise re.error or be turned into an
    # escape sequence.
    return run_of_targets.sub(lambda _match: character_to_squeeze, in_str)
1162
1163
def dedent(in_str: str) -> str:
    """
    Strip the margin from every line of a multi line string (inspired by
    the analogous Scala function).

    The input is split on newlines, whatever MARGIN_RE matches is
    deleted from each line, and the lines are joined back with newlines.
    Raises ValueError if in_str is rejected by is_string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    return newline.join(MARGIN_RE.sub('', row) for row in in_str.split(newline))
1173
1174
def indent(in_str: str, amount: int) -> str:
    """
    Prefix every line of the string with amount spaces.

    Raises ValueError if in_str is rejected by is_string.

    >>> indent('This is a test', 4)
    '    This is a test'

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    padding = " " * amount
    return '\n'.join(padding + row for row in in_str.split('\n'))
1188
1189
def sprintf(*args, **kwargs) -> str:
    """Build the text that print() would emit and return it as a string.

    Positional arguments are converted with str() and joined with sep
    (default: one space); end (default: newline) is appended.  sep and
    end are the only keyword arguments accepted; anything else, or a
    non-string sep/end, raises TypeError.
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # Whatever is left over is an unsupported keyword.
    if kwargs:
        raise TypeError("invalid keyword arguments to sprint()")

    sep = " " if sep is None else sep
    end = "\n" if end is None else end
    pieces = [arg if isinstance(arg, str) else str(arg) for arg in args]
    return sep.join(pieces) + end
1220
1221
class SprintfStdout(object):
    """
    Context manager that records everything written to stdout while it
    is active.  Entering it yields a zero-argument callable returning
    the text captured so far.

    with SprintfStdout() as buf:
        print("test")
    print(buf())

    'test\n'
    """

    def __init__(self) -> None:
        # In-memory buffer that receives the redirected output.
        self.destination = io.StringIO()
        # Assigned in __enter__; only declared here.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        redirect = contextlib.redirect_stdout(self.destination)
        self.recorder = redirect
        redirect.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *args) -> None:
        # Restore stdout first, then rewind the buffer.
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Returning None means exceptions from the with-body propagate.
        return None
1246
1247
def capitalize_first_letter(txt: str) -> str:
    """Capitalize the first letter of a string.

    Only the first character is touched; the rest of the string is
    returned as-is.  An empty string is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''

    """
    # Slicing (rather than txt[0]) keeps the empty string from raising
    # IndexError.
    return txt[:1].upper() + txt[1:]
1258
1259
def it_they(n: int) -> str:
    """Pick the pronoun for a count: 'it' for exactly one, else 'they'.

    >>> it_they(1)
    'it'
    >>> it_they(100)
    'they'

    """
    return "it" if n == 1 else "they"
1272
1273
def is_are(n: int) -> str:
    """Pick the verb for a count: 'is' for exactly one, else 'are'.

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
1286
1287
def pluralize(n: int) -> str:
    """Return the plural suffix for a count: '' for exactly one, else 's'.

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.

    """
    return "" if n == 1 else "s"
1304
1305
def make_contractions(txt: str) -> str:
    """Join adjacent words into their contracted form.

    All matching is case insensitive and keeps the casing of the
    letters that survive.

    >>> make_contractions('It is nice today.')
    "It's nice today."

    >>> make_contractions('I can    not even...')
    "I can't even..."

    >>> make_contractions('She could not see!')
    "She couldn't see!"

    >>> make_contractions('But she will not go.')
    "But she won't go."

    >>> make_contractions('Verily, I shall not.')
    "Verily, I shan't."

    >>> make_contractions('No you cannot.')
    "No you can't."

    >>> make_contractions('I said you can not go.')
    "I said you can't go."

    """

    # Words that contract with a following "not" (are not -> aren't).
    negatable = (
        'are',
        'could',
        'did',
        'has',
        'have',
        'is',
        'must',
        'should',
        'was',
        'were',
        'would',
    )

    # Words that absorb a following auxiliary (she will -> she'll).
    leaders = (
        "I",
        "you",
        "he",
        "she",
        "it",
        "we",
        "they",
        "how",
        "why",
        "when",
        "where",
        "who",
        "there",
    )

    # The parenthesized part of each auxiliary is what survives after
    # the apostrophe.
    auxiliaries = ('woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)')

    # Irregular forms go first: can't, shan't and won't.
    irregular = (
        (r'\b(can)\s*no(t)\b', r"\1'\2"),
        (r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3"),
        (r'\b(w)ill\s*(n)(o)(t)\b', r"\1\3\2'\4"),
    )
    for pattern, glued in irregular:
        txt = re.sub(pattern, glued, txt, flags=re.IGNORECASE)

    # (first words, second words, replacement template), applied in
    # this order.
    rules = (
        (negatable, ('(n)o(t)',), r"\1\2'\3"),
        (leaders, auxiliaries, r"\1'\2"),
    )
    for heads, tails, glued in rules:
        for head in heads:
            for tail in tails:
                # there're / where're are valid English but sound
                # weird, so they are never produced.
                if tail == 'a(re)' and head in ('there', 'where'):
                    continue
                txt = re.sub(
                    fr'\b({head})\s+{tail}\b', glued, txt, flags=re.IGNORECASE
                )

    return txt
1394
1395
def thify(n: int) -> str:
    """Return the proper ordinal suffix for a number.

    Numbers whose last two digits are 11, 12 or 13 take 'th'
    (11th, 112th); otherwise the last digit decides: 1 -> 'st',
    2 -> 'nd', 3 -> 'rd', anything else -> 'th'.

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'
    >>> thify(21)
    'st'

    """
    digits = str(n)
    assert is_integer_number(digits)

    # The "teens" are irregular: eleventh, twelfth, thirteenth.
    if digits[-2:] in ("11", "12", "13"):
        return "th"
    return {"1": "st", "2": "nd", "3": "rd"}.get(digits[-1:], "th")
1418
1419
def ngrams(txt: str, n: int):
    """Generate the word ngrams of a string, each as one space-joined string.

    The text is split on whitespace and the word groups come from
    ngrams_presplit().

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    """
    for group in ngrams_presplit(txt.split(), n):
        yield ' '.join(f'{word}' for word in group).strip()
1433
1434
def ngrams_presplit(words: Sequence[str], n: int):
    """Return list_utils.ngrams(words, n) for an already-split word sequence."""
    groups = list_utils.ngrams(words, n)
    return groups
1437
1438
def bigrams(txt: str):
    """Return the 2-word ngrams of txt (shorthand for ngrams(txt, 2))."""
    pair_size = 2
    return ngrams(txt, pair_size)
1441
1442
def trigrams(txt: str):
    """Return the 3-word ngrams of txt (shorthand for ngrams(txt, 3))."""
    triple_size = 3
    return ngrams(txt, triple_size)
1445
1446
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
) -> Iterable[str]:
    """Rearrange / merge columnar data and return the results as a list.

    Each entry of column_specs is a sequence of indexes into
    input_lines.  The selected columns are concatenated, each one
    preceded by delim, and any characters of delim are then stripped
    from both ends of the result.  One output string is produced per
    spec, in spec order.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']

    """
    return [
        ''.join(delim + input_lines[col] for col in spec).strip(delim)
        for spec in column_specs
    ]
1475
1476
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim='',
) -> Dict[str, str]:
    """Rearrange / merge columnar data and return the results as a dict.

    Each entry of column_specs is a (key, column indexes) pair.  The
    selected columns of input_lines are concatenated, each one preceded
    by delim, any characters of delim are stripped from both ends, and
    the result is stored under key.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}

    """
    return {
        spec[0]: ''.join(delim + input_lines[col] for col in spec[1]).strip(delim)
        for spec in column_specs
    }
1505
1506
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the {placeholders} of a string from the entries of a dict.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'

    """
    rendered = txt.format(**values)
    # end='' suppresses the newline sprintf would otherwise append.
    return sprintf(rendered, end='')
1516
1517
def to_ascii(x: str):
    """Return x as an ascii bytes string.

    A str is encoded as ascii; a bytes object is handed back untouched.
    Any other type (subclasses included) raises Exception.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'

    """
    kind = type(x)
    if kind is bytes:
        return x
    if kind is str:
        return x.encode('ascii')
    raise Exception('to_ascii works with strings and bytes')
1533
1534
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode txt to bytes using encoding / errors, then base64 encode
    those bytes with base64.encodebytes (note the trailing newline in
    the output below).

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
1544
1545
def is_base64(txt: str) -> bool:
    """Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet which is the same as what uuencode uses).

    This is an alphabet check only; nothing is decoded.  Surrounding
    whitespace and the newlines base64.encodebytes inserts between
    lines are ignored, and up to two trailing '=' padding characters
    are accepted.  Text containing non-ASCII characters is never
    base64, so it returns False.  Both str and bytes are accepted.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # Padding is ok too.
    True

    """
    stripped = txt.strip()
    if isinstance(stripped, str):
        try:
            raw = stripped.encode('ascii')
        except UnicodeEncodeError:
            # Every symbol of the base64 alphabet is ASCII.
            return False
    else:
        raw = bytes(stripped)

    # Line breaks added by base64.encodebytes carry no data.
    raw = raw.replace(b'\n', b'')

    # '=' may only appear as (at most two) padding characters at the end.
    body = raw.rstrip(b'=')
    if len(raw) - len(body) > 2:
        return False

    symbols = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = frozenset(symbols.encode('ascii'))
    return all(byte in alphabet for byte in body)
1566
1567
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Decode base64 bytes and turn the resulting bytes back into a str
    using encoding / errors.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    """
    raw = base64.decodebytes(b64)
    return raw.decode(encoding, errors)
1576
1577
def chunk(txt: str, chunk_size):
    """Generate consecutive slices of txt, each chunk_size long.

    If the length of txt is not a multiple of chunk_size a warning is
    logged and issued via warnings.warn, and the final slice is shorter
    than the rest.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    """
    total = len(txt)
    if total % chunk_size != 0:
        msg = f"String to chunk's length ({total} is not an even multiple of chunk_size ({chunk_size})"
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    yield from (
        txt[start : start + chunk_size] for start in range(0, total, chunk_size)
    )
1591
1592
def to_bitstring(
    txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass'
) -> str:
    """Encode txt and then chop it into bytes.  Note: only bitstrings
    with delimiter='' are interpretable by from_bitstring.

    A str is encoded using encoding / errors; bytes are used as they
    are.  Every byte becomes exactly eight bits, so leading NUL bytes
    are preserved.  Empty input yields a single all-zero byte
    ('00000000').  Raises TypeError for anything that is neither str
    nor bytes.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    """
    if isinstance(txt, str):
        raw = txt.encode(encoding, errors)
    elif isinstance(txt, (bytes, bytearray)):
        raw = bytes(txt)
    else:
        raise TypeError('to_bitstring works with strings and bytes')

    if not raw:
        # Historical result for empty input; kept because the empty
        # string is not something int(bits, 2) in from_bitstring can parse.
        return '00000000'

    # Formatting byte by byte (instead of via one big int) keeps the
    # zero bits of leading NUL bytes.
    return delimiter.join(f'{byte:08b}' for byte in raw)
1613
1614
def is_bitstring(txt: str) -> bool:
    """Report whether txt is a bitstring, i.e. whether '0b' + txt is
    accepted by is_binary_integer_number.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    """
    candidate = f'0b{txt}'
    return is_binary_integer_number(candidate)
1626
1627
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    The number of bytes comes from the number of binary digits supplied
    (rounded up to a whole byte), so leading all-zero octets decode to
    NUL characters instead of being dropped.  Anything int(bits, 2)
    accepts is allowed, including surrounding whitespace, a '0b'
    prefix and '_' digit separators.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    """
    n = int(bits, 2)

    # Count only the binary digits: drop whitespace, an optional sign,
    # the optional '0b' prefix and '_' separators.
    text = bits.decode('ascii') if isinstance(bits, (bytes, bytearray)) else bits
    digits = text.strip().replace('_', '')
    if digits[:1] in ('+', '-'):
        digits = digits[1:]
    if digits[:2].lower() == '0b':
        digits = digits[2:]

    width = (len(digits) + 7) // 8
    return n.to_bytes(width, 'big').decode(encoding, errors)
1637
1638
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Convert an IPv4 address into a tuple of ints usable as a sort key.

    Returns None (after printing a message to stdout) when is_ip_v4
    rejects the text.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']

    """
    if is_ip_v4(txt):
        return tuple(int(octet) for octet in txt.split('.'))
    # NOTE(review): this reports via print() while the rest of the
    # module uses logger -- confirm whether stdout output is intended.
    print(f"not IP: {txt}")
    return None
1654
1655
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Break a file path into its non-empty '/'-separated components so
    that, used as a sort key, parent/ancestor paths come before their
    children/descendants.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']

    """
    # Empty components (leading, trailing or doubled slashes) are dropped.
    return tuple(part for part in volume.split('/') if part)
1669
1670
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Replace every character of replace_set with replacement.

    The replacements run one after another, in the order the characters
    appear in replace_set, each on the result of the previous one.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'

    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
1682
1683
# When executed as a script (rather than imported), run the doctests
# embedded in this module's docstrings.
if __name__ == '__main__':
    import doctest

    doctest.testmod()