string_utils.py

   1 #!/usr/bin/env python3
   2 # -*- coding: utf-8 -*-
   3
   4 """The MIT License (MIT)
   5
   6 Copyright (c) 2016-2020 Davide Zanotti
   7 Modifications Copyright (c) 2021-2022 Scott Gasch
   8
   9 Permission is hereby granted, free of charge, to any person obtaining a copy
  10 of this software and associated documentation files (the "Software"), to deal
  11 in the Software without restriction, including without limitation the rights
  12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  13 copies of the Software, and to permit persons to whom the Software is
  14 furnished to do so, subject to the following conditions:
  15
  16 The above copyright notice and this permission notice shall be included in all
  17 copies or substantial portions of the Software.
  18
  19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  25 SOFTWARE.
  26
  27 This class is based on: https://github.com/daveoncode/python-string-utils.
  28 """
  29
  30 import base64
  31 import contextlib  # type: ignore
  32 import datetime
  33 import io
  34 import json
  35 import logging
  36 import numbers
  37 import random
  38 import re
  39 import string
  40 import unicodedata
  41 import warnings
  42 from itertools import zip_longest
  43 from typing import (
  44     Any,
  45     Callable,
  46     Dict,
  47     Iterable,
  48     List,
  49     Literal,
  50     Optional,
  51     Sequence,
  52     Tuple,
  53 )
  54 from uuid import uuid4
  55
  56 import list_utils
  57
  58 logger = logging.getLogger(__name__)
  59
  60 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  61
  62 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  63
  64 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  65
  66 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  67
  68 URLS_RAW_STRING = (
  69     r"([a-z-]+://)"  # scheme
  70     r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
  71     r"(www\.)?"  # www.
  72     r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
  73     r"(:\d{2,})?"  # port number
  74     r"(/[a-z\d_%+-]*)*"  # folders
  75     r"(\.[a-z\d_%+-]+)*"  # file extension
  76     r"(\?[a-z\d_+%-=]*)?"  # query string
  77     r"(#\S*)?"  # hash
  78 )
  79
  80 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
  81
  82 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
  83
  84 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
  85
  86 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
  87
  88 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
  89
  90 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  91
  92 CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")
  93
  94 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
  95
  96 SNAKE_CASE_TEST_RE = re.compile(r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE)
  97
  98 SNAKE_CASE_TEST_DASH_RE = re.compile(r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE)
  99
 100 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
 101
 102 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
 103
 104 CREDIT_CARDS = {
 105     "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
 106     "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
 107     "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
 108     "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
 109     "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
 110     "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
 111 }
 112
 113 JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)
 114
 115 UUID_RE = re.compile(r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE)
 116
 117 UUID_HEX_OK_RE = re.compile(
 118     r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
 119     re.IGNORECASE,
 120 )
 121
 122 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
 123
 124 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
 125
 126 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
 127
 128 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
 129
 130 MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
 131
 132 ANYWHERE_MAC_ADDRESS_RE = re.compile(r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE)
 133
 134 WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)
 135
 136 HTML_RE = re.compile(
 137     r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
 138     re.IGNORECASE | re.MULTILINE | re.DOTALL,
 139 )
 140
 141 HTML_TAG_ONLY_RE = re.compile(
 142     r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
 143     re.IGNORECASE | re.MULTILINE | re.DOTALL,
 144 )
 145
 146 SPACES_RE = re.compile(r"\s")
 147
 148 NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)
 149
 150 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 151
 152 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 153
 154 NUM_SUFFIXES = {
 155     "Pb": (1024**5),
 156     "P": (1024**5),
 157     "Tb": (1024**4),
 158     "T": (1024**4),
 159     "Gb": (1024**3),
 160     "G": (1024**3),
 161     "Mb": (1024**2),
 162     "M": (1024**2),
 163     "Kb": (1024**1),
 164     "K": (1024**1),
 165 }
 166
 167
 168 def is_none_or_empty(in_str: Optional[str]) -> bool:
 169     """
 170     Returns true if the input string is either None or an empty string.
 171
 172     >>> is_none_or_empty("")
 173     True
 174     >>> is_none_or_empty(None)
 175     True
 176     >>> is_none_or_empty("   \t   ")
 177     True
 178     >>> is_none_or_empty('Test')
 179     False
 180     """
 181     return in_str is None or len(in_str.strip()) == 0
 182
 183
 184 def is_string(obj: Any) -> bool:
 185     """
 186     Checks if an object is a string.
 187
 188     >>> is_string('test')
 189     True
 190     >>> is_string(123)
 191     False
 192     >>> is_string(100.3)
 193     False
 194     >>> is_string([1, 2, 3])
 195     False
 196     """
 197     return isinstance(obj, str)
 198
 199
 200 def is_empty_string(in_str: Any) -> bool:
 201     return is_empty(in_str)
 202
 203
 204 def is_empty(in_str: Any) -> bool:
 205     """
 206     Checks if input is a string and empty or only whitespace.
 207
 208     >>> is_empty('')
 209     True
 210     >>> is_empty('    \t\t    ')
 211     True
 212     >>> is_empty('test')
 213     False
 214     >>> is_empty(100.88)
 215     False
 216     >>> is_empty([1, 2, 3])
 217     False
 218     """
 219     return is_string(in_str) and in_str.strip() == ""
 220
 221
 222 def is_full_string(in_str: Any) -> bool:
 223     """
 224     Checks that input is a string and is not empty ('') or only whitespace.
 225
 226     >>> is_full_string('test!')
 227     True
 228     >>> is_full_string('')
 229     False
 230     >>> is_full_string('      ')
 231     False
 232     >>> is_full_string(100.999)
 233     False
 234     >>> is_full_string({"a": 1, "b": 2})
 235     False
 236     """
 237     return is_string(in_str) and in_str.strip() != ""
 238
 239
 240 def is_number(in_str: str) -> bool:
 241     """
 242     Checks if a string is a valid number.
 243
 244     >>> is_number(100.5)
 245     Traceback (most recent call last):
 246     ...
 247     ValueError: 100.5
 248     >>> is_number("100.5")
 249     True
 250     >>> is_number("test")
 251     False
 252     >>> is_number("99")
 253     True
 254     >>> is_number([1, 2, 3])
 255     Traceback (most recent call last):
 256     ...
 257     ValueError: [1, 2, 3]
 258     """
 259     if not is_string(in_str):
 260         raise ValueError(in_str)
 261     return NUMBER_RE.match(in_str) is not None
 262
 263
 264 def is_integer_number(in_str: str) -> bool:
 265     """
 266     Checks whether the given string represents an integer or not.
 267
 268     An integer may be signed or unsigned or use a "scientific notation".
 269
 270     >>> is_integer_number('42')
 271     True
 272     >>> is_integer_number('42.0')
 273     False
 274     """
 275     return (
 276         (is_number(in_str) and "." not in in_str)
 277         or is_hexidecimal_integer_number(in_str)
 278         or is_octal_integer_number(in_str)
 279         or is_binary_integer_number(in_str)
 280     )
 281
 282
 283 def is_hexidecimal_integer_number(in_str: str) -> bool:
 284     """
 285     Checks whether a string is a hex integer number.
 286
 287     >>> is_hexidecimal_integer_number('0x12345')
 288     True
 289     >>> is_hexidecimal_integer_number('0x1A3E')
 290     True
 291     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 292     False
 293     >>> is_hexidecimal_integer_number('-0xff')
 294     True
 295     >>> is_hexidecimal_integer_number('test')
 296     False
 297     >>> is_hexidecimal_integer_number(12345)  # Not a string
 298     Traceback (most recent call last):
 299     ...
 300     ValueError: 12345
 301     >>> is_hexidecimal_integer_number(101.4)
 302     Traceback (most recent call last):
 303     ...
 304     ValueError: 101.4
 305     >>> is_hexidecimal_integer_number(0x1A3E)
 306     Traceback (most recent call last):
 307     ...
 308     ValueError: 6718
 309     """
 310     if not is_string(in_str):
 311         raise ValueError(in_str)
 312     return HEX_NUMBER_RE.match(in_str) is not None
 313
 314
 315 def is_octal_integer_number(in_str: str) -> bool:
 316     """
 317     Checks whether a string is an octal number.
 318
 319     >>> is_octal_integer_number('0o777')
 320     True
 321     >>> is_octal_integer_number('-0O115')
 322     True
 323     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 324     False
 325     >>> is_octal_integer_number('7777')  # Needs 0o
 326     False
 327     >>> is_octal_integer_number('test')
 328     False
 329     """
 330     if not is_string(in_str):
 331         raise ValueError(in_str)
 332     return OCT_NUMBER_RE.match(in_str) is not None
 333
 334
 335 def is_binary_integer_number(in_str: str) -> bool:
 336     """
 337     Returns whether a string contains a binary number.
 338
 339     >>> is_binary_integer_number('0b10111')
 340     True
 341     >>> is_binary_integer_number('-0b111')
 342     True
 343     >>> is_binary_integer_number('0B10101')
 344     True
 345     >>> is_binary_integer_number('0b10102')
 346     False
 347     >>> is_binary_integer_number('0xFFF')
 348     False
 349     >>> is_binary_integer_number('test')
 350     False
 351     """
 352     if not is_string(in_str):
 353         raise ValueError(in_str)
 354     return BIN_NUMBER_RE.match(in_str) is not None
 355
 356
 357 def to_int(in_str: str) -> int:
 358     """Returns the integral value of the string or raises on error.
 359
 360     >>> to_int('1234')
 361     1234
 362     >>> to_int('test')
 363     Traceback (most recent call last):
 364     ...
 365     ValueError: invalid literal for int() with base 10: 'test'
 366     """
 367     if not is_string(in_str):
 368         raise ValueError(in_str)
 369     if is_binary_integer_number(in_str):
 370         return int(in_str, 2)
 371     if is_octal_integer_number(in_str):
 372         return int(in_str, 8)
 373     if is_hexidecimal_integer_number(in_str):
 374         return int(in_str, 16)
 375     return int(in_str)
 376
 377
 378 def is_decimal_number(in_str: str) -> bool:
 379     """
 380     Checks whether the given string represents a decimal or not.
 381
 382     A decimal may be signed or unsigned or use a "scientific notation".
 383
 384     >>> is_decimal_number('42.0')
 385     True
 386     >>> is_decimal_number('42')
 387     False
 388     """
 389     return is_number(in_str) and "." in in_str
 390
 391
 392 def strip_escape_sequences(in_str: str) -> str:
 393     """
 394     Remove escape sequences in the input string.
 395
 396     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 397     'this is a test!'
 398     """
 399     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 400     return in_str
 401
 402
 403 def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 404     """
 405     Add thousands separator to a numeric string.  Also handles numbers.
 406
 407     >>> add_thousands_separator('12345678')
 408     '12,345,678'
 409     >>> add_thousands_separator(12345678)
 410     '12,345,678'
 411     >>> add_thousands_separator(12345678.99)
 412     '12,345,678.99'
 413     >>> add_thousands_separator('test')
 414     Traceback (most recent call last):
 415     ...
 416     ValueError: test
 417
 418     """
 419     if isinstance(in_str, numbers.Number):
 420         in_str = f'{in_str}'
 421     if is_number(in_str):
 422         return _add_thousands_separator(in_str, separator_char=separator_char, places=places)
 423     raise ValueError(in_str)
 424
 425
 426 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
 427     decimal_part = ""
 428     if '.' in in_str:
 429         (in_str, decimal_part) = in_str.split('.')
 430     tmp = [iter(in_str[::-1])] * places
 431     ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 432     if len(decimal_part) > 0:
 433         ret += '.'
 434         ret += decimal_part
 435     return ret
 436
 437
# Full url example:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 440 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 441     """
 442     Check if a string is a valid url.
 443
 444     >>> is_url('http://www.mysite.com')
 445     True
 446     >>> is_url('https://mysite.com')
 447     True
 448     >>> is_url('.mysite.com')
 449     False
 450     """
 451     if not is_full_string(in_str):
 452         return False
 453
 454     valid = URL_RE.match(in_str) is not None
 455
 456     if allowed_schemes:
 457         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 458     return valid
 459
 460
 461 def is_email(in_str: Any) -> bool:
 462     """
 463     Check if a string is a valid email.
 464
 465     Reference: https://tools.ietf.org/html/rfc3696#section-3
 466
 467     >>> is_email('[email protected]')
 468     True
 469     >>> is_email('@gmail.com')
 470     False
 471     """
 472     if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
 473         return False
 474
 475     try:
 476         # we expect 2 tokens, one before "@" and one after, otherwise
 477         # we have an exception and the email is not valid.
 478         head, tail = in_str.split("@")
 479
 480         # head's size must be <= 64, tail <= 255, head must not start
 481         # with a dot or contain multiple consecutive dots.
 482         if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
 483             return False
 484
 485         # removes escaped spaces, so that later on the test regex will
 486         # accept the string.
 487         head = head.replace("\\ ", "")
 488         if head.startswith('"') and head.endswith('"'):
 489             head = head.replace(" ", "")[1:-1]
 490         return EMAIL_RE.match(head + "@" + tail) is not None
 491
 492     except ValueError:
 493         # borderline case in which we have multiple "@" signs but the
 494         # head part is correctly escaped.
 495         if ESCAPED_AT_SIGN.search(in_str) is not None:
 496             # replace "@" with "a" in the head
 497             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 498         return False
 499
 500
 501 def suffix_string_to_number(in_str: str) -> Optional[int]:
 502     """Take a string like "33Gb" and convert it into a number (of bytes)
 503     like 34603008.  Return None if the input string is not valid.
 504
 505     >>> suffix_string_to_number('1Mb')
 506     1048576
 507     >>> suffix_string_to_number('13.1Gb')
 508     14066017894
 509     """
 510
 511     def suffix_capitalize(s: str) -> str:
 512         if len(s) == 1:
 513             return s.upper()
 514         elif len(s) == 2:
 515             return f"{s[0].upper()}{s[1].lower()}"
 516         return suffix_capitalize(s[0:1])
 517
 518     if is_string(in_str):
 519         if is_integer_number(in_str):
 520             return to_int(in_str)
 521         suffixes = [in_str[-2:], in_str[-1:]]
 522         rest = [in_str[:-2], in_str[:-1]]
 523         for x in range(len(suffixes)):
 524             s = suffixes[x]
 525             s = suffix_capitalize(s)
 526             multiplier = NUM_SUFFIXES.get(s, None)
 527             if multiplier is not None:
 528                 r = rest[x]
 529                 if is_integer_number(r):
 530                     return to_int(r) * multiplier
 531                 if is_decimal_number(r):
 532                     return int(float(r) * multiplier)
 533     return None
 534
 535
 536 def number_to_suffix_string(num: int) -> Optional[str]:
 537     """Take a number (of bytes) and returns a string like "43.8Gb".
 538     Returns none if the input is invalid.
 539
 540     >>> number_to_suffix_string(14066017894)
 541     '13.1Gb'
 542     >>> number_to_suffix_string(1024 * 1024)
 543     '1.0Mb'
 544
 545     """
 546     d = 0.0
 547     suffix = None
 548     for (sfx, size) in NUM_SUFFIXES.items():
 549         if num >= size:
 550             d = num / size
 551             suffix = sfx
 552             break
 553     if suffix is not None:
 554         return f"{d:.1f}{suffix}"
 555     else:
 556         return f'{num:d}'
 557
 558
 559 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 560     """
 561     Checks if a string is a valid credit card number.
 562     If card type is provided then it checks against that specific type only,
 563     otherwise any known credit card number will be accepted.
 564
 565     Supported card types are the following:
 566
 567     - VISA
 568     - MASTERCARD
 569     - AMERICAN_EXPRESS
 570     - DINERS_CLUB
 571     - DISCOVER
 572     - JCB
 573     """
 574     if not is_full_string(in_str):
 575         return False
 576
 577     if card_type is not None:
 578         if card_type not in CREDIT_CARDS:
 579             raise KeyError(
 580                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 581             )
 582         return CREDIT_CARDS[card_type].match(in_str) is not None
 583     for c in CREDIT_CARDS:
 584         if CREDIT_CARDS[c].match(in_str) is not None:
 585             return True
 586     return False
 587
 588
 589 def is_camel_case(in_str: Any) -> bool:
 590     """
 591     Checks if a string is formatted as camel case.
 592
 593     A string is considered camel case when:
 594
 595     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 596     - it contains both lowercase and uppercase letters
 597     - it does not start with a number
 598     """
 599     return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 600
 601
 602 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 603     """
 604     Checks if a string is formatted as "snake case".
 605
 606     A string is considered snake case when:
 607
 608     - it's composed only by lowercase/uppercase letters and digits
 609     - it contains at least one underscore (or provided separator)
 610     - it does not start with a number
 611
 612     >>> is_snake_case('this_is_a_test')
 613     True
 614     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 615     True
 616     >>> is_snake_case('this-is-a-test')
 617     False
 618     >>> is_snake_case('this-is-a-test', separator='-')
 619     True
 620
 621     """
 622     if is_full_string(in_str):
 623         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 624         re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 625         r = re_map.get(
 626             separator,
 627             re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
 628         )
 629         return r.match(in_str) is not None
 630     return False
 631
 632
 633 def is_json(in_str: Any) -> bool:
 634     """
 635     Check if a string is a valid json.
 636
 637     >>> is_json('{"name": "Peter"}')
 638     True
 639     >>> is_json('[1, 2, 3]')
 640     True
 641     >>> is_json('{nope}')
 642     False
 643     """
 644     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 645         try:
 646             return isinstance(json.loads(in_str), (dict, list))
 647         except (TypeError, ValueError, OverflowError):
 648             pass
 649     return False
 650
 651
 652 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 653     """
 654     Check if a string is a valid UUID.
 655
 656     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 657     True
 658     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 659     False
 660     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 661     True
 662     """
 663     # string casting is used to allow UUID itself as input data type
 664     s = str(in_str)
 665     if allow_hex:
 666         return UUID_HEX_OK_RE.match(s) is not None
 667     return UUID_RE.match(s) is not None
 668
 669
 670 def is_ip_v4(in_str: Any) -> bool:
 671     """
 672     Checks if a string is a valid ip v4.
 673
 674     >>> is_ip_v4('255.200.100.75')
 675     True
 676     >>> is_ip_v4('nope')
 677     False
 678     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 679     False
 680     """
 681     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 682         return False
 683
 684     # checks that each entry in the ip is in the valid range (0 to 255)
 685     for token in in_str.split("."):
 686         if not 0 <= int(token) <= 255:
 687             return False
 688     return True
 689
 690
 691 def extract_ip_v4(in_str: Any) -> Optional[str]:
 692     """
 693     Extracts the IPv4 chunk of a string or None.
 694
 695     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 696     '127.0.0.1'
 697     >>> extract_ip_v4('Your mom dresses you funny.')
 698     """
 699     if not is_full_string(in_str):
 700         return None
 701     m = ANYWHERE_IP_V4_RE.search(in_str)
 702     if m is not None:
 703         return m.group(0)
 704     return None
 705
 706
 707 def is_ip_v6(in_str: Any) -> bool:
 708     """
 709     Checks if a string is a valid ip v6.
 710
 711     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 712     True
 713     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 714     False
 715     """
 716     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 717
 718
 719 def extract_ip_v6(in_str: Any) -> Optional[str]:
 720     """
 721     Extract IPv6 chunk or None.
 722
 723     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 724     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 725     >>> extract_ip_v6("(and she's ugly too, btw)")
 726     """
 727     if not is_full_string(in_str):
 728         return None
 729     m = ANYWHERE_IP_V6_RE.search(in_str)
 730     if m is not None:
 731         return m.group(0)
 732     return None
 733
 734
 735 def is_ip(in_str: Any) -> bool:
 736     """
 737     Checks if a string is a valid ip (either v4 or v6).
 738
 739     >>> is_ip('255.200.100.75')
 740     True
 741     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
 742     True
 743     >>> is_ip('1.2.3')
 744     False
 745     >>> is_ip('1.2.3.999')
 746     False
 747     """
 748     return is_ip_v6(in_str) or is_ip_v4(in_str)
 749
 750
 751 def extract_ip(in_str: Any) -> Optional[str]:
 752     """
 753     Extract the IP address or None.
 754
 755     >>> extract_ip('Attacker: 255.200.100.75')
 756     '255.200.100.75'
 757     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 758     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 759     >>> extract_ip('1.2.3')
 760
 761     """
 762     ip = extract_ip_v4(in_str)
 763     if ip is None:
 764         ip = extract_ip_v6(in_str)
 765     return ip
 766
 767
 768 def is_mac_address(in_str: Any) -> bool:
 769     """Return True if in_str is a valid MAC address false otherwise.
 770
 771     >>> is_mac_address("34:29:8F:12:0D:2F")
 772     True
 773     >>> is_mac_address('34:29:8f:12:0d:2f')
 774     True
 775     >>> is_mac_address('34-29-8F-12-0D-2F')
 776     True
 777     >>> is_mac_address("test")
 778     False
 779     """
 780     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 781
 782
 783 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 784     """
 785     Extract the MAC address from in_str.
 786
 787     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
 788     '34:29:8F:12:0D:2F'
 789
 790     >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
 791     'd8:5d:e2:34:54:86'
 792
 793     """
 794     if not is_full_string(in_str):
 795         return None
 796     in_str.strip()
 797     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
 798     if m is not None:
 799         mac = m.group(0)
 800         mac.replace(":", separator)
 801         mac.replace("-", separator)
 802         return mac
 803     return None
 804
 805
 806 def is_slug(in_str: Any, separator: str = "-") -> bool:
 807     """
 808     Checks if a given string is a slug (as created by `slugify()`).
 809
 810     >>> is_slug('my-blog-post-title')
 811     True
 812     >>> is_slug('My blog post title')
 813     False
 814
 815     """
 816     if not is_full_string(in_str):
 817         return False
 818     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 819     return re.match(rex, in_str) is not None
 820
 821
 822 def contains_html(in_str: str) -> bool:
 823     """
 824     Checks if the given string contains HTML/XML tags.
 825
 826     By design, this function matches ANY type of tag, so don't expect to use it
 827     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 828
 829     >>> contains_html('my string is <strong>bold</strong>')
 830     True
 831     >>> contains_html('my string is not bold')
 832     False
 833
 834     """
 835     if not is_string(in_str):
 836         raise ValueError(in_str)
 837     return HTML_RE.search(in_str) is not None
 838
 839
 840 def words_count(in_str: str) -> int:
 841     """
 842     Returns the number of words contained into the given string.
 843
 844     This method is smart, it does consider only sequence of one or more letter and/or numbers
 845     as "words", so a string like this: "! @ # % ... []" will return zero!
 846     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 847     will be 4 not 1 (even if there are no spaces in the string).
 848
 849     >>> words_count('hello world')
 850     2
 851     >>> words_count('one,two,three.stop')
 852     4
 853
 854     """
 855     if not is_string(in_str):
 856         raise ValueError(in_str)
 857     return len(WORDS_COUNT_RE.findall(in_str))
 858
 859
 860 def word_count(in_str: str) -> int:
 861     return words_count(in_str)
 862
 863
 864 def generate_uuid(omit_dashes: bool = False) -> str:
 865     """
 866     Generated an UUID string (using `uuid.uuid4()`).
 867
 868     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 869     generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 870
 871     """
 872     uid = uuid4()
 873     if omit_dashes:
 874         return uid.hex
 875     return str(uid)
 876
 877
 878 def generate_random_alphanumeric_string(size: int) -> str:
 879     """
 880     Returns a string of the specified size containing random
 881     characters (uppercase/lowercase ascii letters and digits).
 882
 883     >>> random.seed(22)
 884     >>> generate_random_alphanumeric_string(9)
 885     '96ipbNClS'
 886
 887     """
 888     if size < 1:
 889         raise ValueError("size must be >= 1")
 890     chars = string.ascii_letters + string.digits
 891     buffer = [random.choice(chars) for _ in range(size)]
 892     return from_char_list(buffer)
 893
 894
 895 def reverse(in_str: str) -> str:
 896     """
 897     Returns the string with its chars reversed.
 898
 899     >>> reverse('test')
 900     'tset'
 901
 902     """
 903     if not is_string(in_str):
 904         raise ValueError(in_str)
 905     return in_str[::-1]
 906
 907
 908 def camel_case_to_snake_case(in_str, *, separator="_"):
 909     """
 910     Convert a camel case string into a snake case one.
 911     (The original string is returned if is not a valid camel case string)
 912
 913     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
 914     'mac_address_extractor_factory'
 915     >>> camel_case_to_snake_case('Luke Skywalker')
 916     'Luke Skywalker'
 917     """
 918     if not is_string(in_str):
 919         raise ValueError(in_str)
 920     if not is_camel_case(in_str):
 921         return in_str
 922     return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower()
 923
 924
 925 def snake_case_to_camel_case(
 926     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 927 ) -> str:
 928     """
 929     Convert a snake case string into a camel case one.
 930     (The original string is returned if is not a valid snake case string)
 931
 932     >>> snake_case_to_camel_case('this_is_a_test')
 933     'ThisIsATest'
 934     >>> snake_case_to_camel_case('Han Solo')
 935     'Han Solo'
 936     """
 937     if not is_string(in_str):
 938         raise ValueError(in_str)
 939     if not is_snake_case(in_str, separator=separator):
 940         return in_str
 941     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 942     if not upper_case_first:
 943         tokens[0] = tokens[0].lower()
 944     return from_char_list(tokens)
 945
 946
 947 def to_char_list(in_str: str) -> List[str]:
 948     """Convert a string into a list of chars.
 949
 950     >>> to_char_list('test')
 951     ['t', 'e', 's', 't']
 952     """
 953     if not is_string(in_str):
 954         return []
 955     return list(in_str)
 956
 957
 958 def from_char_list(in_list: List[str]) -> str:
 959     """Convert a char list into a string.
 960
 961     >>> from_char_list(['t', 'e', 's', 't'])
 962     'test'
 963     """
 964     return "".join(in_list)
 965
 966
 967 def shuffle(in_str: str) -> str:
 968     """Return a new string containing same chars of the given one but in
 969     a randomized order.
 970     """
 971     if not is_string(in_str):
 972         raise ValueError(in_str)
 973
 974     # turn the string into a list of chars
 975     chars = to_char_list(in_str)
 976     random.shuffle(chars)
 977     return from_char_list(chars)
 978
 979
 980 def scramble(in_str: str) -> str:
 981     return shuffle(in_str)
 982
 983
 984 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 985     """
 986     Remove html code contained into the given string.
 987
 988     >>> strip_html('test: <a href="foo/bar">click here</a>')
 989     'test: '
 990     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
 991     'test: click here'
 992     """
 993     if not is_string(in_str):
 994         raise ValueError(in_str)
 995     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
 996     return r.sub("", in_str)
 997
 998
 999 def asciify(in_str: str) -> str:
1000     """
1001     Force string content to be ascii-only by translating all non-ascii
1002     chars into the closest possible representation (eg: ó -> o, Ë ->
1003     E, ç -> c...).
1004
1005     N.B. Some chars may be lost if impossible to translate.
1006
1007     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
1008     'eeuuooaaeynAAACIINOE'
1009     """
1010     if not is_string(in_str):
1011         raise ValueError(in_str)
1012
1013     # "NFKD" is the algorithm which is able to successfully translate
1014     # the most of non-ascii chars.
1015     normalized = unicodedata.normalize("NFKD", in_str)
1016
1017     # encode string forcing ascii and ignore any errors
1018     # (unrepresentable chars will be stripped out)
1019     ascii_bytes = normalized.encode("ascii", "ignore")
1020
1021     # turns encoded bytes into an utf-8 string
1022     return ascii_bytes.decode("utf-8")
1023
1024
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Build a "slug" from a string using the given separator.

    Processing steps, in order:

    - the input is lower-cased
    - everything matched by NO_LETTERS_OR_NUMBERS_RE becomes a space and
      the result is stripped
    - everything matched by SPACES_RE becomes the separator
    - repeated separators are collapsed
    - the result is passed through `asciify()`

    Raises:
        ValueError: if in_str is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    lowered = in_str.lower()

    # Punctuation and other non-alphanumerics turn into spaces first...
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", lowered).strip()

    # ...then the spaces turn into the join sign...
    joined = SPACES_RE.sub(separator, spaced)

    # ...and finally duplicated join signs are squashed.
    collapse_runs = re.escape(separator) + r"+"
    return asciify(re.sub(collapse_runs, separator, joined))
1054
1055
def to_bool(in_str: str) -> bool:
    """
    Interpret a string as a boolean (CASE INSENSITIVE).

    True is returned when the lower-cased string is one of:

    - "true"
    - "1"
    - "yes"
    - "y"
    - "t"
    - "on"

    Every other string yields False.

    Raises:
        ValueError: if in_str is not a string.

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    truthy = ("true", "1", "yes", "y", "t", "on")
    return in_str.lower() in truthy
1092
1093
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Parse a date string with dateparse's DateParser (see its docs for
    the accepted formats).

    Returns:
        The parser's get_date() result, or None (after logging a
        warning) when the parser raises ParseException.
    """
    import dateparse.dateparse_utils as du

    try:
        parser = du.DateParser()  # type: ignore
        parser.parse(in_str)
        parsed = parser.get_date()
    except du.ParseException:  # type: ignore
        logger.warning(f'Unable to parse date {in_str}.')
        return None
    return parsed
1108
1109
def valid_date(in_str: str) -> bool:
    """
    Report whether dateparse's DateParser can parse the string.

    Returns:
        True when parsing succeeds; False (after logging a warning)
        when the parser raises ParseException.
    """
    import dateparse.dateparse_utils as dp

    try:
        dp.DateParser().parse(in_str)  # type: ignore
    except dp.ParseException:  # type: ignore
        logger.warning(f'Unable to parse date {in_str}.')
        return False
    return True
1124
1125
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Parse a datetime string with dateparse's DateParser (see its docs
    for the accepted formats).

    Returns:
        The parsed value when the parser yields a datetime.datetime.
        None when the parser yields anything else, or (after logging a
        warning) when parsing fails.
    """
    import dateparse.dateparse_utils as dp

    try:
        parser = dp.DateParser()  # type: ignore
        dt = parser.parse(in_str)
    # The sibling date helpers treat ParseException as the parser's
    # failure signal; catch it here too so bad input yields None instead
    # of an escaping exception.  ValueError is kept for compatibility.
    except (ValueError, dp.ParseException):  # type: ignore
        msg = f'Unable to parse datetime {in_str}.'
        logger.warning(msg)
        return None
    return dt if isinstance(dt, datetime.datetime) else None
1141
1142
def valid_datetime(in_str: str) -> bool:
    """
    Report whether to_datetime can turn the string into a datetime.

    Returns:
        True when to_datetime returns a value; otherwise False, after
        logging a warning.
    """
    parsed = to_datetime(in_str)
    if parsed is None:
        msg = f'Unable to parse datetime {in_str}.'
        logger.warning(msg)
        return False
    return True
1153
1154
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Collapse each run of consecutive character_to_squeeze occurrences
    into a single occurrence.

    Args:
        in_str: the text to process.
        character_to_squeeze: the substring (one or more chars) whose
            repeats are collapsed.  It is always treated literally, both
            when matching and when substituting.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    """
    runs = re.compile(r'(' + re.escape(character_to_squeeze) + r')+')
    # A callable replacement is inserted verbatim.  A plain replacement
    # string would be parsed as a template, so a backslash in
    # character_to_squeeze would raise re.error or be read as an escape.
    return runs.sub(lambda _match: character_to_squeeze, in_str)
1171
1172
def dedent(in_str: str) -> str:
    """
    Strip the leading margin from every line of a multi line string
    (inspired by the analogous Scala function).  Whatever MARGIN_RE
    matches on a line is removed; lines are split and re-joined on newline.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    return newline.join(MARGIN_RE.sub('', row) for row in in_str.split(newline))
1182
1183
def indent(in_str: str, amount: int) -> str:
    """
    Prefix every newline-separated line of in_str with amount spaces.

    Raises:
        ValueError: if in_str is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    padding = " " * amount
    return '\n'.join(padding + row for row in in_str.split('\n'))
1197
1198
def sprintf(*args, **kwargs) -> str:
    """Format like the builtin print(), but return the text as a string.

    Args:
        *args: the values to render; non-strings are converted with str().
        sep: keyword-only; string placed between values (default " ").
        end: keyword-only; string appended after the last value
            (default a newline).

    Returns:
        The values joined by sep, followed by end.

    Raises:
        TypeError: if sep or end is neither None nor a string, or if any
            other keyword argument is supplied.
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # Anything still left in kwargs is an unsupported option.
    if kwargs:
        raise TypeError("invalid keyword arguments to sprintf()")

    if sep is None:
        sep = " "
    if end is None:
        end = "\n"
    pieces = [arg if isinstance(arg, str) else str(arg) for arg in args]
    return sep.join(pieces) + end
1229
1230
def strip_ansi_sequences(in_str: str) -> str:
    """Strips ANSI CSI escape sequences (colors, cursor movement, screen
    clearing, private modes...) out of strings.

    A sequence is ESC [ followed by optional parameter chars, optional
    intermediate chars and one final char.  Final chars may be upper
    case (e.g. ESC[2J, ESC[H) and parameters may include '?'
    (e.g. ESC[?25l); both forms are removed.

    >>> import ansi as a
    >>> s = a.fg('blue') + 'blue!' + a.reset()
    >>> len(s)   # '\x1b[38;5;21mblue!\x1b[m'
    18
    >>> len(strip_ansi_sequences(s))
    5
    >>> strip_ansi_sequences(s)
    'blue!'

    """
    # Parameter class: digits plus ':;<=>?'.  '+' is kept there only so
    # every string the previous, narrower pattern matched still matches.
    # Then intermediate bytes (space through '/') and one final byte
    # ('@' through '~').
    return re.sub(r'\x1b\[[\d:;<=>?+]*[ -/]*[@-~]', '', in_str)
1245
1246
class SprintfStdout(contextlib.AbstractContextManager):
    """
    Context manager that captures whatever is written to stdout while
    its body runs.  Entering yields a zero-argument callable returning
    the text captured so far.

    with SprintfStdout() as buf:
        print("test")
    print(buf())

    'test\n'
    """

    def __init__(self) -> None:
        # In-memory sink that receives the redirected output.
        self.destination = io.StringIO()
        # Active redirect_stdout helper; assigned in __enter__.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        """Start redirecting stdout and return an accessor for the capture."""
        redirect = contextlib.redirect_stdout(self.destination)
        redirect.__enter__()
        self.recorder = redirect

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *args) -> Literal[False]:
        """Stop redirecting, rewind the buffer; never suppresses exceptions."""
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        return False
1271
1272
def capitalize_first_letter(txt: str) -> str:
    """Capitalize the first letter of a string, leaving the rest untouched.

    An empty string is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''

    """
    # Slicing (rather than indexing) keeps the empty string from raising.
    return txt[:1].upper() + txt[1:]
1283
1284
def it_they(n: int) -> str:
    """Pick the pronoun for a count: 'it' for exactly 1, else 'they'.

    >>> it_they(1)
    'it'
    >>> it_they(100)
    'they'

    """
    return "it" if n == 1 else "they"
1297
1298
def is_are(n: int) -> str:
    """Pick the verb for a count: 'is' for exactly 1, else 'are'.

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
1311
1312
def pluralize(n: int) -> str:
    """Return the plural suffix for a count: '' for exactly 1, else 's'.

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.

    """
    return "" if n == 1 else "s"
1329
1330
def make_contractions(txt: str) -> str:
    """Rewrite common two-word phrases as English contractions.

    Matching is case insensitive.  The irregular forms (can't, shan't,
    won't) are handled first, then "<verb> not" pairs, then
    "<pronoun/question word> <auxiliary>" pairs.

    >>> make_contractions('It is nice today.')
    "It's nice today."

    >>> make_contractions('I can    not even...')
    "I can't even..."

    >>> make_contractions('She could not see!')
    "She couldn't see!"

    >>> make_contractions('But she will not go.')
    "But she won't go."

    >>> make_contractions('Verily, I shall not.')
    "Verily, I shan't."

    >>> make_contractions('No you cannot.')
    "No you can't."

    >>> make_contractions('I said you can not go.')
    "I said you can't go."

    """

    def contract(pattern: str, replacement: str, text: str) -> str:
        return re.sub(pattern, replacement, text, count=0, flags=re.IGNORECASE)

    negation = '(n)o(t)'

    # Verbs that absorb a following "not" (is not -> isn't).
    negatable_verbs = [
        'are',
        'could',
        'did',
        'has',
        'have',
        'is',
        'must',
        'should',
        'was',
        'were',
        'would',
    ]

    # Words that absorb a following auxiliary (she will -> she'll).
    leaders = [
        "I",
        "you",
        "he",
        "she",
        "it",
        "we",
        "they",
        "how",
        "why",
        "when",
        "where",
        "who",
        "there",
    ]
    # The parenthesized part of each auxiliary is what survives after
    # the apostrophe.
    auxiliaries = ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)']

    # Irregular forms go first so the generic rules never see them.
    txt = contract(r'\b(can)\s*no(t)\b', r"\1'\2", txt)
    txt = contract(r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt)
    txt = contract(r'\b(w)ill\s*(n)(o)(t)\b', r"\1\3\2'\4", txt)

    for firsts, seconds in ((negatable_verbs, [negation]), (leaders, auxiliaries)):
        for first in firsts:
            for second in seconds:
                # there're / where're are valid English but sound
                # weird, so they are never produced.
                if second == 'a(re)' and first in ('there', 'where'):
                    continue

                # "not" keeps its leading n in front of the apostrophe.
                replacement = r"\1\2'\3" if second == negation else r"\1'\2"
                txt = contract(fr'\b({first})\s+{second}\b', replacement, txt)

    return txt
1421
1422
def thify(n: int) -> str:
    """Return the proper ordinal suffix ('st', 'nd', 'rd' or 'th') for a
    number.

    Numbers ending in 11, 12 or 13 take 'th' (11th, 112th); otherwise
    the last digit decides.

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'

    """
    digits = str(n)
    assert is_integer_number(digits)
    # The "teens" are irregular: eleventh, twelfth, thirteenth.
    if digits[-2:] in ("11", "12", "13"):
        return "th"
    return {"1": "st", "2": "nd", "3": "rd"}.get(digits[-1:], "th")
1445
1446
def ngrams(txt: str, n: int):
    """Generate the word ngrams of a string, each as a space-joined string.

    The text is split on whitespace and the grouping into ngrams is
    delegated to ngrams_presplit.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    """
    for gram in ngrams_presplit(txt.split(), n):
        yield ''.join(f'{token} ' for token in gram).strip()
1460
1461
def ngrams_presplit(words: Sequence[str], n: int):
    """Return list_utils.ngrams(words, n) for an already-split word sequence."""
    grams = list_utils.ngrams(words, n)
    return grams
1464
1465
def bigrams(txt: str):
    """Shorthand for ngrams(txt, 2)."""
    pair_size = 2
    return ngrams(txt, pair_size)
1468
1469
def trigrams(txt: str):
    """Shorthand for ngrams(txt, 3)."""
    triple_size = 3
    return ngrams(txt, triple_size)
1472
1473
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
) -> Iterable[str]:
    """Helper to rearrange / combine columnar data into a list.

    Each entry of column_specs is a collection of column indexes into
    input_lines; the selected columns are concatenated, each preceded
    by delim, and the chars of delim are then stripped from both ends.
    One output string is produced per spec, in spec order.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']

    """
    results = []
    for columns in column_specs:
        glued = ''.join(delim + input_lines[idx] for idx in columns)
        results.append(glued.strip(delim))
    return results
1502
1503
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim='',
) -> Dict[str, str]:
    """Helper to rearrange / combine columnar data into a dict.

    Each entry of column_specs is a (key, column indexes) pair; the
    selected columns of input_lines are concatenated, each preceded by
    delim, the chars of delim are stripped from both ends, and the
    result is stored under key.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}

    """
    results = {}
    for entry in column_specs:
        glued = ''.join(delim + input_lines[idx] for idx in entry[1])
        results[entry[0]] = glued.strip(delim)
    return results
1532
1533
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the {name} placeholders of txt (str.format syntax) from a dict.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'

    """
    rendered = txt.format(**values)
    return sprintf(rendered, end='')
1543
1544
def to_ascii(x: str):
    """Return x as an ascii bytes string.

    A str is encoded with the ascii codec; bytes are passed through
    untouched.

    Raises:
        Exception: if x is neither a str nor bytes.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'

    """
    if isinstance(x, bytes):
        return x
    if isinstance(x, str):
        return x.encode('ascii')
    raise Exception('to_ascii works with strings and bytes')
1560
1561
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode txt to bytes using encoding / errors, then base64 encode
    those bytes with base64.encodebytes (the output ends with a newline).

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
1571
1572
def is_base64(txt: str) -> bool:
    """Determine whether a string looks base64 encoded, i.e. consists
    solely of chars from Python's standard base64 alphabet.

    Besides A-Z, a-z, 0-9, '+' and '/', the '=' padding char is accepted,
    as are line breaks inside the text, because base64.encodebytes emits
    both.  Surrounding whitespace is ignored.  This is an alphabet check
    only; length and padding placement are not validated.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64(b'aGk=\\n')    # Padding is ok.
    True

    """
    legal = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/='
    alphabet = set(legal.encode('ascii'))
    # Long encodings are wrapped, so tolerate embedded line breaks.
    line_breaks = set(b'\r\n')
    return all(
        byte in alphabet
        for byte in to_ascii(txt.strip())
        if byte not in line_breaks
    )
1593
1594
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Decode base64 bytes with base64.decodebytes, then decode the
    resulting bytes into a str using encoding / errors.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    """
    raw = base64.decodebytes(b64)
    return raw.decode(encoding, errors)
1603
1604
def chunk(txt: str, chunk_size):
    """Chunk up a string: lazily yield consecutive slices of txt, each
    chunk_size chars long.

    When len(txt) is not a multiple of chunk_size a warning is logged
    and issued via warnings.warn, and the final slice is shorter.
    Because this is a generator, that check runs when iteration starts.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    """
    total = len(txt)
    if total % chunk_size != 0:
        msg = f'String to chunk\'s length ({total}) is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for start in range(0, total, chunk_size):
        yield txt[start : start + chunk_size]
1618
1619
def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
    """Encode txt and then render it as a string of bits, eight per
    byte.  Note: only bitstrings with delimiter='' are interpretable by
    from_bitstring.

    Args:
        txt: the text to convert; bytes are also accepted and used as-is.
        delimiter: string placed between consecutive 8-bit groups.
        encoding: codec used to turn a str into bytes.
        errors: error handler passed to the codec.

    Returns:
        Eight bits per input byte, including leading zero bytes.  Empty
        input yields a single all-zero byte ('00000000').

    Raises:
        TypeError: if txt is neither a str nor bytes.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    """
    if isinstance(txt, str):
        raw = txt.encode(encoding, errors)
    elif isinstance(txt, bytes):
        raw = txt
    else:
        raise TypeError('to_bitstring works with strings and bytes')

    # Pad to the full byte count so leading zero bytes are not lost;
    # max(..., 1) keeps the historical '00000000' result for empty input.
    width = 8 * max(len(raw), 1)
    bits = format(int.from_bytes(raw, 'big'), f'0{width}b')
    return delimiter.join(bits[i : i + 8] for i in range(0, width, 8))
1638
1639
def is_bitstring(txt: str) -> bool:
    """Is this a bitstring?  The text is prefixed with '0b' and checked
    with is_binary_integer_number.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    """
    candidate = f'0b{txt}'
    return is_binary_integer_number(candidate)
1651
1652
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Interpret bits as a base-2 integer, convert it to big-endian
    bytes and decode those bytes into a str using encoding / errors.

    An all-zero bitstring decodes to the single char '\\0'.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    """
    value = int(bits, 2)
    byte_count = (value.bit_length() + 7) // 8
    decoded = value.to_bytes(byte_count, 'big').decode(encoding, errors)
    # Zero needs no bytes, so decoding yields ''; report a NUL instead.
    return decoded if decoded else '\0'
1662
1663
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Convert an IPv4 address into a tuple of ints usable as a sort key.

    Returns:
        The dot-separated parts as ints, or None (after printing a
        message) when is_ip_v4 rejects the text.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']

    """
    if is_ip_v4(txt):
        return tuple(int(octet) for octet in txt.split('.'))
    print(f"not IP: {txt}")
    return None
1679
1680
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Split a file path on '/' into a tuple of its non-empty components
    so that parent/ancestor paths sort before children/descendant paths.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']

    """
    # filter(None, ...) drops the empty strings produced by leading,
    # trailing or doubled slashes.
    return tuple(filter(None, volume.split('/')))
1694
1695
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Replace every char found in replace_set with replacement.

    The replacements run one char of replace_set at a time, in order,
    each on the output of the previous one.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'

    """
    result = in_str
    for victim in replace_set:
        result = result.replace(victim, replacement)
    return result
1707
1708
def replace_nth(in_str: str, source: str, target: str, nth: int) -> str:
    """Replaces the nth occurrence of a substring within a string.

    Args:
        in_str: the text to search.
        source: the literal substring to look for (never treated as a
            regular expression).
        target: the text that replaces the selected occurrence.
        nth: 1-based index of the occurrence to replace.

    Raises:
        IndexError: if in_str holds fewer than nth occurrences of source.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'

    >>> replace_nth('a.b.c', '.', '-', 2)
    'a.b-c'

    """
    # Escape source so regex metachars ('.', '*', '(' ...) are located
    # literally, consistent with the literal str.replace used below.
    where = [m.start() for m in re.finditer(re.escape(source), in_str)][nth - 1]
    before = in_str[:where]
    after = in_str[where:].replace(source, target, 1)
    return before + after
1721
1722
if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module's
    # docstrings.
    from doctest import testmod

    testmod()