string_utils.py

   1 #!/usr/bin/env python3
   2
   3 import contextlib
   4 import datetime
   5 import io
   6 from itertools import zip_longest
   7 import json
   8 import logging
   9 import numbers
  10 import random
  11 import re
  12 import string
  13 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
  14 import unicodedata
  15 from uuid import uuid4
  16
  17 logger = logging.getLogger(__name__)
  18
  19 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  20
  21 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  22
  23 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  24
  25 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  26
  27 URLS_RAW_STRING = (
  28     r"([a-z-]+://)"  # scheme
  29     r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
  30     r"(www\.)?"  # www.
  31     r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
  32     r"(:\d{2,})?"  # port number
  33     r"(/[a-z\d_%+-]*)*"  # folders
  34     r"(\.[a-z\d_%+-]+)*"  # file extension
  35     r"(\?[a-z\d_+%-=]*)?"  # query string
  36     r"(#\S*)?"  # hash
  37 )
  38
  39 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
  40
  41 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
  42
  43 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
  44
  45 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
  46
  47 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
  48
  49 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  50
  51 CAMEL_CASE_TEST_RE = re.compile(
  52     r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
  53 )
  54
  55 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
  56
  57 SNAKE_CASE_TEST_RE = re.compile(
  58     r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
  59 )
  60
  61 SNAKE_CASE_TEST_DASH_RE = re.compile(
  62     r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
  63 )
  64
  65 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
  66
  67 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
  68
  69 CREDIT_CARDS = {
  70     "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
  71     "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
  72     "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
  73     "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
  74     "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
  75     "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
  76 }
  77
  78 JSON_WRAPPER_RE = re.compile(
  79     r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
  80 )
  81
  82 UUID_RE = re.compile(
  83     r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
  84 )
  85
  86 UUID_HEX_OK_RE = re.compile(
  87     r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
  88     re.IGNORECASE,
  89 )
  90
  91 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
  92
  93 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
  94
  95 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
  96
  97 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
  98
  99 MAC_ADDRESS_RE = re.compile(
 100     r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE
 101 )
 102
 103 ANYWHERE_MAC_ADDRESS_RE = re.compile(
 104     r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
 105 )
 106
 107 WORDS_COUNT_RE = re.compile(
 108     r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
 109 )
 110
 111 HTML_RE = re.compile(
 112     r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
 113     re.IGNORECASE | re.MULTILINE | re.DOTALL,
 114 )
 115
 116 HTML_TAG_ONLY_RE = re.compile(
 117     r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
 118     re.IGNORECASE | re.MULTILINE | re.DOTALL,
 119 )
 120
 121 SPACES_RE = re.compile(r"\s")
 122
 123 NO_LETTERS_OR_NUMBERS_RE = re.compile(
 124     r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
 125 )
 126
 127 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 128
 129 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 130
 131 NUM_SUFFIXES = {
 132     "Pb": (1024 ** 5),
 133     "P": (1024 ** 5),
 134     "Tb": (1024 ** 4),
 135     "T": (1024 ** 4),
 136     "Gb": (1024 ** 3),
 137     "G": (1024 ** 3),
 138     "Mb": (1024 ** 2),
 139     "M": (1024 ** 2),
 140     "Kb": (1024 ** 1),
 141     "K": (1024 ** 1),
 142 }
 143
 144
 145 def is_none_or_empty(in_str: Optional[str]) -> bool:
 146     """
 147     Returns true if the input string is either None or an empty string.
 148
 149     >>> is_none_or_empty("")
 150     True
 151     >>> is_none_or_empty(None)
 152     True
 153     >>> is_none_or_empty("   \t   ")
 154     True
 155     >>> is_none_or_empty('Test')
 156     False
 157     """
 158     return in_str is None or len(in_str.strip()) == 0
 159
 160
 161 def is_string(obj: Any) -> bool:
 162     """
 163     Checks if an object is a string.
 164
 165     >>> is_string('test')
 166     True
 167     >>> is_string(123)
 168     False
 169     >>> is_string(100.3)
 170     False
 171     >>> is_string([1, 2, 3])
 172     False
 173     """
 174     return isinstance(obj, str)
 175
 176
def is_empty_string(in_str: Any) -> bool:
    """
    Alias of is_empty(): the argument is passed through unchanged and the
    result of is_empty() is returned.
    """
    return is_empty(in_str)
 179
 180
 181 def is_empty(in_str: Any) -> bool:
 182     """
 183     Checks if input is a string and empty or only whitespace.
 184
 185     >>> is_empty('')
 186     True
 187     >>> is_empty('    \t\t    ')
 188     True
 189     >>> is_empty('test')
 190     False
 191     >>> is_empty(100.88)
 192     False
 193     >>> is_empty([1, 2, 3])
 194     False
 195     """
 196     return is_string(in_str) and in_str.strip() == ""
 197
 198
 199 def is_full_string(in_str: Any) -> bool:
 200     """
 201     Checks that input is a string and is not empty ('') or only whitespace.
 202
 203     >>> is_full_string('test!')
 204     True
 205     >>> is_full_string('')
 206     False
 207     >>> is_full_string('      ')
 208     False
 209     >>> is_full_string(100.999)
 210     False
 211     >>> is_full_string({"a": 1, "b": 2})
 212     False
 213     """
 214     return is_string(in_str) and in_str.strip() != ""
 215
 216
 217 def is_number(in_str: str) -> bool:
 218     """
 219     Checks if a string is a valid number.
 220
 221     >>> is_number(100.5)
 222     Traceback (most recent call last):
 223     ...
 224     ValueError: 100.5
 225     >>> is_number("100.5")
 226     True
 227     >>> is_number("test")
 228     False
 229     >>> is_number("99")
 230     True
 231     >>> is_number([1, 2, 3])
 232     Traceback (most recent call last):
 233     ...
 234     ValueError: [1, 2, 3]
 235     """
 236     if not is_string(in_str):
 237         raise ValueError(in_str)
 238     return NUMBER_RE.match(in_str) is not None
 239
 240
 241 def is_integer_number(in_str: str) -> bool:
 242     """
 243     Checks whether the given string represents an integer or not.
 244
 245     An integer may be signed or unsigned or use a "scientific notation".
 246
 247     >>> is_integer_number('42')
 248     True
 249     >>> is_integer_number('42.0')
 250     False
 251     """
 252     return (
 253         (is_number(in_str) and "." not in in_str) or
 254         is_hexidecimal_integer_number(in_str) or
 255         is_octal_integer_number(in_str) or
 256         is_binary_integer_number(in_str)
 257     )
 258
 259
 260 def is_hexidecimal_integer_number(in_str: str) -> bool:
 261     """
 262     Checks whether a string is a hex integer number.
 263
 264     >>> is_hexidecimal_integer_number('0x12345')
 265     True
 266     >>> is_hexidecimal_integer_number('0x1A3E')
 267     True
 268     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 269     False
 270     >>> is_hexidecimal_integer_number('-0xff')
 271     True
 272     >>> is_hexidecimal_integer_number('test')
 273     False
 274     >>> is_hexidecimal_integer_number(12345)  # Not a string
 275     Traceback (most recent call last):
 276     ...
 277     ValueError: 12345
 278     >>> is_hexidecimal_integer_number(101.4)
 279     Traceback (most recent call last):
 280     ...
 281     ValueError: 101.4
 282     >>> is_hexidecimal_integer_number(0x1A3E)
 283     Traceback (most recent call last):
 284     ...
 285     ValueError: 6718
 286     """
 287     if not is_string(in_str):
 288         raise ValueError(in_str)
 289     return HEX_NUMBER_RE.match(in_str) is not None
 290
 291
 292 def is_octal_integer_number(in_str: str) -> bool:
 293     """
 294     Checks whether a string is an octal number.
 295
 296     >>> is_octal_integer_number('0o777')
 297     True
 298     >>> is_octal_integer_number('-0O115')
 299     True
 300     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 301     False
 302     >>> is_octal_integer_number('7777')  # Needs 0o
 303     False
 304     >>> is_octal_integer_number('test')
 305     False
 306     """
 307     if not is_string(in_str):
 308         raise ValueError(in_str)
 309     return OCT_NUMBER_RE.match(in_str) is not None
 310
 311
 312 def is_binary_integer_number(in_str: str) -> bool:
 313     """
 314     Returns whether a string contains a binary number.
 315
 316     >>> is_binary_integer_number('0b10111')
 317     True
 318     >>> is_binary_integer_number('-0b111')
 319     True
 320     >>> is_binary_integer_number('0B10101')
 321     True
 322     >>> is_binary_integer_number('0b10102')
 323     False
 324     >>> is_binary_integer_number('0xFFF')
 325     False
 326     >>> is_binary_integer_number('test')
 327     False
 328     """
 329     if not is_string(in_str):
 330         raise ValueError(in_str)
 331     return BIN_NUMBER_RE.match(in_str) is not None
 332
 333
 334 def to_int(in_str: str) -> int:
 335     """Returns the integral value of the string or raises on error.
 336
 337     >>> to_int('1234')
 338     1234
 339     >>> to_int('test')
 340     Traceback (most recent call last):
 341     ...
 342     ValueError: invalid literal for int() with base 10: 'test'
 343     """
 344     if not is_string(in_str):
 345         raise ValueError(in_str)
 346     if is_binary_integer_number(in_str):
 347         return int(in_str, 2)
 348     if is_octal_integer_number(in_str):
 349         return int(in_str, 8)
 350     if is_hexidecimal_integer_number(in_str):
 351         return int(in_str, 16)
 352     return int(in_str)
 353
 354
 355 def is_decimal_number(in_str: str) -> bool:
 356     """
 357     Checks whether the given string represents a decimal or not.
 358
 359     A decimal may be signed or unsigned or use a "scientific notation".
 360
 361     >>> is_decimal_number('42.0')
 362     True
 363     >>> is_decimal_number('42')
 364     False
 365     """
 366     return is_number(in_str) and "." in in_str
 367
 368
 369 def strip_escape_sequences(in_str: str) -> str:
 370     """
 371     Remove escape sequences in the input string.
 372
 373     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 374     'this is a test!'
 375     """
 376     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 377     return in_str
 378
 379
 380 def add_thousands_separator(
 381         in_str: str,
 382         *,
 383         separator_char = ',',
 384         places = 3
 385 ) -> str:
 386     """
 387     Add thousands separator to a numeric string.  Also handles numbers.
 388
 389     >>> add_thousands_separator('12345678')
 390     '12,345,678'
 391     >>> add_thousands_separator(12345678)
 392     '12,345,678'
 393     >>> add_thousands_separator(12345678.99)
 394     '12,345,678.99'
 395     >>> add_thousands_separator('test')
 396     Traceback (most recent call last):
 397     ...
 398     ValueError: test
 399
 400     """
 401     if isinstance(in_str, numbers.Number):
 402         in_str = f'{in_str}'
 403     if is_number(in_str):
 404         return _add_thousands_separator(
 405             in_str,
 406             separator_char = separator_char,
 407             places = places
 408         )
 409     raise ValueError(in_str)
 410
 411
 412 def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
 413     decimal_part = ""
 414     if '.' in in_str:
 415         (in_str, decimal_part) = in_str.split('.')
 416     tmp = [iter(in_str[::-1])] * places
 417     ret = separator_char.join(
 418         "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 419     if len(decimal_part) > 0:
 420         ret += '.'
 421         ret += decimal_part
 422     return ret
 423
 424
 425 # Full url example:
 426 # scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 427 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 428     """
 429     Check if a string is a valid url.
 430
 431     >>> is_url('http://www.mysite.com')
 432     True
 433     >>> is_url('https://mysite.com')
 434     True
 435     >>> is_url('.mysite.com')
 436     False
 437     """
 438     if not is_full_string(in_str):
 439         return False
 440
 441     valid = URL_RE.match(in_str) is not None
 442
 443     if allowed_schemes:
 444         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 445     return valid
 446
 447
def is_email(in_str: Any) -> bool:
    """
    Check if a string is a valid email.

    The input must be a non-blank string of at most 320 characters that does
    not start with a dot.  It is split on "@" into a head (at most 64
    characters) and a tail (at most 255 characters); the head must not end
    with a dot nor contain two consecutive dots.  The head, cleaned of
    escaped spaces and of surrounding double quotes, is then joined back to
    the tail and matched against EMAIL_RE.

    When the split does not yield exactly two parts but ESCAPED_AT_SIGN
    finds a match, each match is replaced with "a" and the result is
    validated again.

    Reference: https://tools.ietf.org/html/rfc3696#section-3

    >>> is_email('my.email@the-provider.com')
    True
    >>> is_email('@gmail.com')
    False
    """
    if (
        not is_full_string(in_str)
        or len(in_str) > 320
        or in_str.startswith(".")
    ):
        return False

    try:
        # we expect 2 tokens, one before "@" and one after, otherwise
        # the unpacking raises ValueError, handled below.
        head, tail = in_str.split("@")

        # head's size must be <= 64, tail <= 255, head must not end
        # with a dot or contain multiple consecutive dots (a leading
        # dot was already rejected above).
        if (
            len(head) > 64
            or len(tail) > 255
            or head.endswith(".")
            or (".." in head)
        ):
            return False

        # removes escaped spaces, so that later on the test regex will
        # accept the string.
        head = head.replace("\\ ", "")
        if head.startswith('"') and head.endswith('"'):
            # quoted head: drop plain spaces and the surrounding quotes
            head = head.replace(" ", "")[1:-1]
        return EMAIL_RE.match(head + "@" + tail) is not None

    except ValueError:
        # borderline case in which we have multiple "@" signs but the
        # head part is correctly escaped.
        if ESCAPED_AT_SIGN.search(in_str) is not None:
            # replace each escaped/quoted "@" with "a" and validate again
            return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
        return False
 495
 496
 497 def suffix_string_to_number(in_str: str) -> Optional[int]:
 498     """Take a string like "33Gb" and convert it into a number (of bytes)
 499     like 34603008.  Return None if the input string is not valid.
 500
 501     >>> suffix_string_to_number('1Mb')
 502     1048576
 503     >>> suffix_string_to_number('13.1Gb')
 504     14066017894
 505     """
 506     def suffix_capitalize(s: str) -> str:
 507         if len(s) == 1:
 508             return s.upper()
 509         elif len(s) == 2:
 510             return f"{s[0].upper()}{s[1].lower()}"
 511         return suffix_capitalize(s[0:1])
 512
 513     if is_string(in_str):
 514         if is_integer_number(in_str):
 515             return to_int(in_str)
 516         suffixes = [in_str[-2:], in_str[-1:]]
 517         rest = [in_str[:-2], in_str[:-1]]
 518         for x in range(len(suffixes)):
 519             s = suffixes[x]
 520             s = suffix_capitalize(s)
 521             multiplier = NUM_SUFFIXES.get(s, None)
 522             if multiplier is not None:
 523                 r = rest[x]
 524                 if is_integer_number(r):
 525                     return to_int(r) * multiplier
 526                 if is_decimal_number(r):
 527                     return int(float(r) * multiplier)
 528     return None
 529
 530
 531 def number_to_suffix_string(num: int) -> Optional[str]:
 532     """Take a number (of bytes) and returns a string like "43.8Gb".
 533     Returns none if the input is invalid.
 534
 535     >>> number_to_suffix_string(14066017894)
 536     '13.1Gb'
 537     >>> number_to_suffix_string(1024 * 1024)
 538     '1.0Mb'
 539
 540     """
 541     d = 0.0
 542     suffix = None
 543     for (sfx, size) in NUM_SUFFIXES.items():
 544         if num >= size:
 545             d = num / size
 546             suffix = sfx
 547             break
 548     if suffix is not None:
 549         return f"{d:.1f}{suffix}"
 550     else:
 551         return f'{num:d}'
 552
 553
 554 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 555     """
 556     Checks if a string is a valid credit card number.
 557     If card type is provided then it checks against that specific type only,
 558     otherwise any known credit card number will be accepted.
 559
 560     Supported card types are the following:
 561
 562     - VISA
 563     - MASTERCARD
 564     - AMERICAN_EXPRESS
 565     - DINERS_CLUB
 566     - DISCOVER
 567     - JCB
 568     """
 569     if not is_full_string(in_str):
 570         return False
 571
 572     if card_type is not None:
 573         if card_type not in CREDIT_CARDS:
 574             raise KeyError(
 575                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 576             )
 577         return CREDIT_CARDS[card_type].match(in_str) is not None
 578     for c in CREDIT_CARDS:
 579         if CREDIT_CARDS[c].match(in_str) is not None:
 580             return True
 581     return False
 582
 583
 584 def is_camel_case(in_str: Any) -> bool:
 585     """
 586     Checks if a string is formatted as camel case.
 587
 588     A string is considered camel case when:
 589
 590     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 591     - it contains both lowercase and uppercase letters
 592     - it does not start with a number
 593     """
 594     return (
 595         is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 596     )
 597
 598
 599 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 600     """
 601     Checks if a string is formatted as "snake case".
 602
 603     A string is considered snake case when:
 604
 605     - it's composed only by lowercase/uppercase letters and digits
 606     - it contains at least one underscore (or provided separator)
 607     - it does not start with a number
 608
 609     >>> is_snake_case('this_is_a_test')
 610     True
 611     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 612     True
 613     >>> is_snake_case('this-is-a-test')
 614     False
 615     >>> is_snake_case('this-is-a-test', separator='-')
 616     True
 617
 618     """
 619     if is_full_string(in_str):
 620         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 621         re_template = (
 622             r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 623         )
 624         r = re_map.get(
 625             separator,
 626             re.compile(
 627                 re_template.format(sign=re.escape(separator)), re.IGNORECASE
 628             ),
 629         )
 630         return r.match(in_str) is not None
 631     return False
 632
 633
 634 def is_json(in_str: Any) -> bool:
 635     """
 636     Check if a string is a valid json.
 637
 638     >>> is_json('{"name": "Peter"}')
 639     True
 640     >>> is_json('[1, 2, 3]')
 641     True
 642     >>> is_json('{nope}')
 643     False
 644     """
 645     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 646         try:
 647             return isinstance(json.loads(in_str), (dict, list))
 648         except (TypeError, ValueError, OverflowError):
 649             pass
 650     return False
 651
 652
 653 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 654     """
 655     Check if a string is a valid UUID.
 656
 657     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 658     True
 659     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 660     False
 661     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 662     True
 663     """
 664     # string casting is used to allow UUID itself as input data type
 665     s = str(in_str)
 666     if allow_hex:
 667         return UUID_HEX_OK_RE.match(s) is not None
 668     return UUID_RE.match(s) is not None
 669
 670
 671 def is_ip_v4(in_str: Any) -> bool:
 672     """
 673     Checks if a string is a valid ip v4.
 674
 675     >>> is_ip_v4('255.200.100.75')
 676     True
 677     >>> is_ip_v4('nope')
 678     False
 679     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 680     False
 681     """
 682     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 683         return False
 684
 685     # checks that each entry in the ip is in the valid range (0 to 255)
 686     for token in in_str.split("."):
 687         if not 0 <= int(token) <= 255:
 688             return False
 689     return True
 690
 691
 692 def extract_ip_v4(in_str: Any) -> Optional[str]:
 693     """
 694     Extracts the IPv4 chunk of a string or None.
 695
 696     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 697     '127.0.0.1'
 698     >>> extract_ip_v4('Your mom dresses you funny.')
 699     """
 700     if not is_full_string(in_str):
 701         return None
 702     m = ANYWHERE_IP_V4_RE.search(in_str)
 703     if m is not None:
 704         return m.group(0)
 705     return None
 706
 707
 708 def is_ip_v6(in_str: Any) -> bool:
 709     """
 710     Checks if a string is a valid ip v6.
 711
 712     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 713     True
 714     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 715     False
 716     """
 717     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 718
 719
 720 def extract_ip_v6(in_str: Any) -> Optional[str]:
 721     """
 722     Extract IPv6 chunk or None.
 723
 724     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 725     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 726     >>> extract_ip_v6("(and she's ugly too, btw)")
 727     """
 728     if not is_full_string(in_str):
 729         return None
 730     m = ANYWHERE_IP_V6_RE.search(in_str)
 731     if m is not None:
 732         return m.group(0)
 733     return None
 734
 735
 736 def is_ip(in_str: Any) -> bool:
 737     """
 738     Checks if a string is a valid ip (either v4 or v6).
 739
 740     >>> is_ip('255.200.100.75')
 741     True
 742     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
 743     True
 744     >>> is_ip('1.2.3')
 745     False
 746     >>> is_ip('1.2.3.999')
 747     False
 748     """
 749     return is_ip_v6(in_str) or is_ip_v4(in_str)
 750
 751
 752 def extract_ip(in_str: Any) -> Optional[str]:
 753     """
 754     Extract the IP address or None.
 755
 756     >>> extract_ip('Attacker: 255.200.100.75')
 757     '255.200.100.75'
 758     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 759     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 760     >>> extract_ip('1.2.3')
 761
 762     """
 763     ip = extract_ip_v4(in_str)
 764     if ip is None:
 765         ip = extract_ip_v6(in_str)
 766     return ip
 767
 768
 769 def is_mac_address(in_str: Any) -> bool:
 770     """Return True if in_str is a valid MAC address false otherwise.
 771
 772     >>> is_mac_address("34:29:8F:12:0D:2F")
 773     True
 774     >>> is_mac_address('34:29:8f:12:0d:2f')
 775     True
 776     >>> is_mac_address('34-29-8F-12-0D-2F')
 777     True
 778     >>> is_mac_address("test")
 779     False
 780     """
 781     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 782
 783
 784 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 785     """
 786     Extract the MAC address from in_str.
 787
 788     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
 789     '34:29:8F:12:0D:2F'
 790
 791     """
 792     if not is_full_string(in_str):
 793         return None
 794     in_str.strip()
 795     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
 796     if m is not None:
 797         mac = m.group(0)
 798         mac.replace(":", separator)
 799         mac.replace("-", separator)
 800         return mac
 801     return None
 802
 803
 804 def is_slug(in_str: Any, separator: str = "-") -> bool:
 805     """
 806     Checks if a given string is a slug (as created by `slugify()`).
 807
 808     >>> is_slug('my-blog-post-title')
 809     True
 810     >>> is_slug('My blog post title')
 811     False
 812
 813     """
 814     if not is_full_string(in_str):
 815         return False
 816     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 817     return re.match(rex, in_str) is not None
 818
 819
 820 def contains_html(in_str: str) -> bool:
 821     """
 822     Checks if the given string contains HTML/XML tags.
 823
 824     By design, this function matches ANY type of tag, so don't expect to use it
 825     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 826
 827     >>> contains_html('my string is <strong>bold</strong>')
 828     True
 829     >>> contains_html('my string is not bold')
 830     False
 831
 832     """
 833     if not is_string(in_str):
 834         raise ValueError(in_str)
 835     return HTML_RE.search(in_str) is not None
 836
 837
 838 def words_count(in_str: str) -> int:
 839     """
 840     Returns the number of words contained into the given string.
 841
 842     This method is smart, it does consider only sequence of one or more letter and/or numbers
 843     as "words", so a string like this: "! @ # % ... []" will return zero!
 844     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 845     will be 4 not 1 (even if there are no spaces in the string).
 846
 847     >>> words_count('hello world')
 848     2
 849     >>> words_count('one,two,three.stop')
 850     4
 851
 852     """
 853     if not is_string(in_str):
 854         raise ValueError(in_str)
 855     return len(WORDS_COUNT_RE.findall(in_str))
 856
 857
 858 def generate_uuid(as_hex: bool = False) -> str:
 859     """
 860     Generated an UUID string (using `uuid.uuid4()`).
 861
 862     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 863     generate_uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 864
 865     """
 866     uid = uuid4()
 867     if as_hex:
 868         return uid.hex
 869     return str(uid)
 870
 871
 872 def generate_random_alphanumeric_string(size: int) -> str:
 873     """
 874     Returns a string of the specified size containing random
 875     characters (uppercase/lowercase ascii letters and digits).
 876
 877     random_string(9) # possible output: "cx3QQbzYg"
 878
 879     """
 880     if size < 1:
 881         raise ValueError("size must be >= 1")
 882     chars = string.ascii_letters + string.digits
 883     buffer = [random.choice(chars) for _ in range(size)]
 884     return from_char_list(buffer)
 885
 886
 887 def reverse(in_str: str) -> str:
 888     """
 889     Returns the string with its chars reversed.
 890
 891     >>> reverse('test')
 892     'tset'
 893
 894     """
 895     if not is_string(in_str):
 896         raise ValueError(in_str)
 897     return in_str[::-1]
 898
 899
 900 def camel_case_to_snake_case(in_str, *, separator="_"):
 901     """
 902     Convert a camel case string into a snake case one.
 903     (The original string is returned if is not a valid camel case string)
 904
 905     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
 906     'mac_address_extractor_factory'
 907     >>> camel_case_to_snake_case('Luke Skywalker')
 908     'Luke Skywalker'
 909     """
 910     if not is_string(in_str):
 911         raise ValueError(in_str)
 912     if not is_camel_case(in_str):
 913         return in_str
 914     return CAMEL_CASE_REPLACE_RE.sub(
 915         lambda m: m.group(1) + separator, in_str
 916     ).lower()
 917
 918
 919 def snake_case_to_camel_case(
 920     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 921 ) -> str:
 922     """
 923     Convert a snake case string into a camel case one.
 924     (The original string is returned if is not a valid snake case string)
 925
 926     >>> snake_case_to_camel_case('this_is_a_test')
 927     'ThisIsATest'
 928     >>> snake_case_to_camel_case('Han Solo')
 929     'Han Solo'
 930     """
 931     if not is_string(in_str):
 932         raise ValueError(in_str)
 933     if not is_snake_case(in_str, separator=separator):
 934         return in_str
 935     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 936     if not upper_case_first:
 937         tokens[0] = tokens[0].lower()
 938     return from_char_list(tokens)
 939
 940
 941 def to_char_list(in_str: str) -> List[str]:
 942     """Convert a string into a list of chars.
 943
 944     >>> to_char_list('test')
 945     ['t', 'e', 's', 't']
 946     """
 947     if not is_string(in_str):
 948         return []
 949     return list(in_str)
 950
 951
 952 def from_char_list(in_list: List[str]) -> str:
 953     """Convert a char list into a string.
 954
 955     >>> from_char_list(['t', 'e', 's', 't'])
 956     'test'
 957     """
 958     return "".join(in_list)
 959
 960
 961 def shuffle(in_str: str) -> str:
 962     """Return a new string containing same chars of the given one but in
 963     a randomized order.
 964     """
 965     if not is_string(in_str):
 966         raise ValueError(in_str)
 967
 968     # turn the string into a list of chars
 969     chars = to_char_list(in_str)
 970     random.shuffle(chars)
 971     return from_char_list(chars)
 972
 973
 974 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 975     """
 976     Remove html code contained into the given string.
 977
 978     >>> strip_html('test: <a href="foo/bar">click here</a>')
 979     'test: '
 980     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
 981     'test: click here'
 982     """
 983     if not is_string(in_str):
 984         raise ValueError(in_str)
 985     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
 986     return r.sub("", in_str)
 987
 988
 989 def asciify(in_str: str) -> str:
 990     """
 991     Force string content to be ascii-only by translating all non-ascii
 992     chars into the closest possible representation (eg: ó -> o, Ë ->
 993     E, ç -> c...).
 994
 995     N.B. Some chars may be lost if impossible to translate.
 996
 997     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
 998     'eeuuooaaeynAAACIINOE'
 999     """
1000     if not is_string(in_str):
1001         raise ValueError(in_str)
1002
1003     # "NFKD" is the algorithm which is able to successfully translate
1004     # the most of non-ascii chars.
1005     normalized = unicodedata.normalize("NFKD", in_str)
1006
1007     # encode string forcing ascii and ignore any errors
1008     # (unrepresentable chars will be stripped out)
1009     ascii_bytes = normalized.encode("ascii", "ignore")
1010
1011     # turns encoded bytes into an utf-8 string
1012     return ascii_bytes.decode("utf-8")
1013
1014
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Turn a string into a "slug" built with the given separator.
    The result:

    - contains no spaces
    - is all lower case
    - has punctuation and other non alphanumeric characters removed
    - has its words divided by the separator
    - is ascii-only (through `asciify()`)
    - is safe to use in a URL

    Raises ValueError when the argument is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # 1. everything that is not a letter or a digit becomes a space
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # 2. every whitespace character becomes the separator
    joined = SPACES_RE.sub(separator, spaced)

    # 3. runs of the separator are collapsed into a single one
    collapsed = re.sub(re.escape(separator) + r"+", separator, joined)
    return asciify(collapsed)
1044
1045
def to_bool(in_str: str) -> bool:
    """
    Interprets the content of a string as a boolean (CASE INSENSITIVE).

    True is returned when the lowercased string is one of:

    - "true"
    - "1"
    - "yes"
    - "y"
    - "t"
    - "on"

    Any other string yields False.  Raises ValueError when in_str does
    not pass the is_string() check.

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True

    """
    if is_string(in_str):
        return in_str.lower() in ("true", "1", "yes", "y", "t", "on")
    raise ValueError(in_str)
1082
1083
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Parses a date string with DateParser and returns what the parser's
    get_date() reports.  When the parser raises ParseException a warning
    is logged and None is returned.  See DateParser docs for details.
    """
    from dateparse.dateparse_utils import DateParser, ParseException
    try:
        parser = DateParser()
        parser.parse(in_str)
        return parser.get_date()
    except ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        return None
1096
1097
def valid_date(in_str: str) -> bool:
    """
    True if DateParser can parse the string; False (after logging a
    warning) if it raises ParseException.
    """
    from dateparse.dateparse_utils import DateParser, ParseException
    try:
        # Only success or failure matters here; the parsed value is
        # discarded.
        DateParser().parse(in_str)
    except ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        return False
    return True
1110
1111
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Parses a datetime string.  See DateParser docs for more info.

    Returns the value produced by DateParser.parse() when it is a
    datetime.datetime.  Returns None when the parser produced anything
    else, or when parsing failed (a warning is logged in that case).
    """
    import dateparse.dateparse_utils as dp
    try:
        dt = dp.DateParser().parse(in_str)
    except (ValueError, dp.ParseException):
        # dp.ParseException is what to_date() and valid_date() handle
        # for this same parse() call; it is handled here too so that an
        # unparseable string yields None instead of an exception.
        # ValueError is still handled, as before.
        logger.warning(f'Unable to parse datetime {in_str}.')
        return None
    if isinstance(dt, datetime.datetime):
        return dt
    return None
1125
1126
def valid_datetime(in_str: str) -> bool:
    """
    True if to_datetime() is able to turn the string into a datetime;
    otherwise a warning is logged and False is returned.
    """
    parsed = to_datetime(in_str)
    if parsed is None:
        logger.warning(f'Unable to parse datetime {in_str}.')
        return False
    return True
1136
1137
def dedent(in_str: str) -> str:
    """
    Removes the indentation margin (whatever MARGIN_RE matches) from
    every line of a multi line string (inspired by analogous Scala
    function).

    Raises ValueError when in_str does not pass the is_string() check.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # Lines are split and re-joined on "\n" only.
    return '\n'.join(MARGIN_RE.sub('', row) for row in in_str.split('\n'))
1147
1148
def indent(in_str: str, amount: int) -> str:
    """
    Prepends amount spaces to every line of the string (lines being
    separated by a newline character).

    Raises ValueError when in_str does not pass the is_string() check.

    >>> indent('This is a test', 4)
    '    This is a test'

    """
    if not is_string(in_str):
        raise ValueError(in_str)
    padding = " " * amount
    return '\n'.join(padding + row for row in in_str.split('\n'))
1162
1163
def sprintf(*args, **kwargs) -> str:
    """Format like the builtin print(), but return the text instead of
    writing it anywhere.

    Positional arguments are converted with str() and joined with sep;
    end is appended after the last one.  Only the keyword arguments
    "sep" (default: one space) and "end" (default: a newline) are
    accepted.  Passing None for either selects its default.

    Raises TypeError if sep or end is neither None nor a string, or if
    any other keyword argument is given.

    >>> sprintf('a', 1, 2.5, sep='-', end='')
    'a-1-2.5'
    >>> sprintf('no', 'newline', end='.')
    'no newline.'
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # Anything still left in kwargs is not an option we know about.
    if kwargs:
        raise TypeError("invalid keyword arguments to sprintf()")

    if sep is None:
        sep = " "
    if end is None:
        end = "\n"

    # Strings are used as they are; everything else goes through str().
    pieces = (arg if isinstance(arg, str) else str(arg) for arg in args)
    return sep.join(pieces) + end
1194
1195
class SprintfStdout:
    """
    A context manager that captures what is written to stdout while
    its body runs.  Entering it yields a callable; calling that
    returns everything captured so far as a string.

    with SprintfStdout() as buf:
        print("test")
    print(buf())

    This prints the captured text: "test" followed by a newline.
    """
    def __init__(self) -> None:
        # In-memory buffer that receives everything written to stdout.
        self.destination = io.StringIO()
        # The active contextlib.redirect_stdout; set by __enter__.
        self.recorder = None

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()

        def read_captured() -> str:
            return self.destination.getvalue()

        return read_captured

    def __exit__(self, *exc_info) -> None:
        # Undo the redirection first, then rewind the buffer.
        self.recorder.__exit__(*exc_info)
        self.destination.seek(0)
        # Falling off the end returns None, so exceptions raised inside
        # the with-block are not suppressed.
1219
1220
def is_are(n: int) -> str:
    """Pick the verb form agreeing with a count: 'is' for exactly 1,
    'are' for anything else.

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
1233
1234
def pluralize(n: int) -> str:
    """Pick the plural suffix agreeing with a count: '' for exactly 1,
    's' for anything else.

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.

    """
    return "" if n == 1 else "s"
1251
1252
def thify(n: int) -> str:
    """Return the proper ordinal suffix for a number ('st', 'nd', 'rd'
    or 'th'), so that e.g. f'{n}{thify(n)}' reads 1st, 2nd, 3rd, 4th.

    Numbers ending in 11, 12 or 13 take 'th' (11th, 112th), unlike
    other numbers ending in 1, 2 or 3.

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'
    >>> thify(21)
    'st'

    """
    text = str(n)
    assert is_integer_number(text)
    # The "teens" are the exception to the last-digit rule.
    if text[-2:] in ("11", "12", "13"):
        return "th"
    by_last_digit = {"1": "st", "2": "nd", "3": "rd"}
    return by_last_digit.get(text[-1:], "th")
1275
1276
def ngrams(txt: str, n: int):
    """Split a string on whitespace and return its ngrams, i.e. every
    run of n consecutive words, as produced by ngrams_presplit().

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    """
    return ngrams_presplit(txt.split(), n)
1286
1287
def ngrams_presplit(words: Iterable[str], n: int):
    """Yield the ngrams of an already split text: every run of n
    consecutive words, joined by single spaces.

    words may be any iterable (list, tuple, generator...).  Nothing is
    yielded when there are fewer than n words or when n is not
    positive.

    >>> list(ngrams_presplit(['a', 'b', 'c'], 2))
    ['a b', 'b c']
    >>> list(ngrams_presplit(iter(['a', 'b', 'c']), 3))
    ['a b c']

    """
    # The shifted views below need slicing, which plain iterators and
    # generators do not support; materialize the words first.
    tokens = list(words)
    # zip stops at the shortest view, which yields exactly the windows
    # of n consecutive words.
    for ngram in zip(*(tokens[offset:] for offset in range(n))):
        yield ' '.join(ngram)
1291
1292
def bigrams(txt: str):
    """Return the 2-grams (pairs of consecutive words) of a string;
    shorthand for ngrams(txt, 2)."""
    return ngrams(txt, n=2)
1295
1296
def trigrams(txt: str):
    """Return the 3-grams (triples of consecutive words) of a string;
    shorthand for ngrams(txt, 3)."""
    return ngrams(txt, n=3)
1299
1300
def shuffle_columns_into_list(
        input_lines: Iterable[str],
        column_specs: Iterable[Iterable[int]],
        delim=''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.  The column_specs argument is an iterable collection of
    numeric sequences that indicate one or more column numbers to
    copy.

    Each spec produces one output item: the selected columns joined by
    delim.  Column contents are copied unchanged, including any
    characters that also appear in delim.  input_lines is accessed by
    index, so it must support indexing (e.g. a list or a tuple).

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']
    >>> shuffle_columns_into_list(cols, [ [0] ], delim='-')
    ['-rwxr-xr-x']

    """
    # Column specs map input lines' columns into outputs.
    # [col1, col2...]
    # join() puts delim only between columns, so nothing has to be
    # stripped from the ends afterwards (stripping would also eat
    # delimiter characters that belong to the data).
    return [
        delim.join(input_lines[n] for n in spec)
        for spec in column_specs
    ]
1331
1332
def shuffle_columns_into_dict(
        input_lines: Iterable[str],
        column_specs: Iterable[Tuple[str, Iterable[int]]],
        delim=''
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Each spec is a (key, column numbers) pair and produces one entry:
    key mapped to the selected columns joined by delim.  Column
    contents are copied unchanged, including any characters that also
    appear in delim.  input_lines is accessed by index, so it must
    support indexing (e.g. a list or a tuple).

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}
    >>> shuffle_columns_into_dict(cols, [ ('mode', [0]) ], delim='-')
    {'mode': '-rwxr-xr-x'}

    """
    # Column specs map input lines' columns into outputs.
    # "key", [col1, col2...]
    # join() puts delim only between columns, so nothing has to be
    # stripped from the ends afterwards (stripping would also eat
    # delimiter characters that belong to the data).
    return {
        spec[0]: delim.join(input_lines[n] for n in spec[1])
        for spec in column_specs
    }
1361
1362
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the {name} placeholders of a string with the values found
    under the same names in a dict.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'

    """
    rendered = txt.format(**values)
    # end='' keeps sprintf from appending its default newline.
    return sprintf(rendered, end='')
1372
1373
# When executed as a script, run the doctests embedded in this module's
# docstrings.
if __name__ == '__main__':
    from doctest import testmod
    testmod()