string_utils.py

   1 #!/usr/bin/env python3
   2
   3 import contextlib
   4 import datetime
   5 import io
   6 from itertools import zip_longest
   7 import json
   8 import logging
   9 import numbers
  10 import random
  11 import re
  12 import string
  13 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
  14 import unicodedata
  15 from uuid import uuid4
  16
  17 logger = logging.getLogger(__name__)
  18
  19 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  20
  21 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  22
  23 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  24
  25 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  26
  27 URLS_RAW_STRING = (
  28     r"([a-z-]+://)"  # scheme
  29     r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
  30     r"(www\.)?"  # www.
  31     r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
  32     r"(:\d{2,})?"  # port number
  33     r"(/[a-z\d_%+-]*)*"  # folders
  34     r"(\.[a-z\d_%+-]+)*"  # file extension
  35     r"(\?[a-z\d_+%-=]*)?"  # query string
  36     r"(#\S*)?"  # hash
  37 )
  38
  39 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
  40
  41 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
  42
  43 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
  44
  45 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
  46
  47 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
  48
  49 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  50
  51 CAMEL_CASE_TEST_RE = re.compile(
  52     r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
  53 )
  54
  55 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
  56
  57 SNAKE_CASE_TEST_RE = re.compile(
  58     r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
  59 )
  60
  61 SNAKE_CASE_TEST_DASH_RE = re.compile(
  62     r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
  63 )
  64
  65 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
  66
  67 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
  68
  69 CREDIT_CARDS = {
  70     "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
  71     "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
  72     "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
  73     "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
  74     "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
  75     "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
  76 }
  77
  78 JSON_WRAPPER_RE = re.compile(
  79     r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
  80 )
  81
  82 UUID_RE = re.compile(
  83     r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
  84 )
  85
  86 UUID_HEX_OK_RE = re.compile(
  87     r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
  88     re.IGNORECASE,
  89 )
  90
  91 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
  92
  93 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
  94
  95 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
  96
  97 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
  98
  99 MAC_ADDRESS_RE = re.compile(
 100     r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE
 101 )
 102
 103 ANYWHERE_MAC_ADDRESS_RE = re.compile(
 104     r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
 105 )
 106
 107 WORDS_COUNT_RE = re.compile(
 108     r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
 109 )
 110
 111 HTML_RE = re.compile(
 112     r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
 113     re.IGNORECASE | re.MULTILINE | re.DOTALL,
 114 )
 115
 116 HTML_TAG_ONLY_RE = re.compile(
 117     r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
 118     re.IGNORECASE | re.MULTILINE | re.DOTALL,
 119 )
 120
 121 SPACES_RE = re.compile(r"\s")
 122
 123 NO_LETTERS_OR_NUMBERS_RE = re.compile(
 124     r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
 125 )
 126
 127 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 128
 129 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 130
 131 NUM_SUFFIXES = {
 132     "Pb": (1024 ** 5),
 133     "P": (1024 ** 5),
 134     "Tb": (1024 ** 4),
 135     "T": (1024 ** 4),
 136     "Gb": (1024 ** 3),
 137     "G": (1024 ** 3),
 138     "Mb": (1024 ** 2),
 139     "M": (1024 ** 2),
 140     "Kb": (1024 ** 1),
 141     "K": (1024 ** 1),
 142 }
 143
 144
 145 def is_none_or_empty(in_str: Optional[str]) -> bool:
 146     """
 147     Returns true if the input string is either None or an empty string.
 148
 149     >>> is_none_or_empty("")
 150     True
 151     >>> is_none_or_empty(None)
 152     True
 153     >>> is_none_or_empty(" ")
 154     True
 155     >>> is_none_or_empty('Test')
 156     False
 157     """
 158     return in_str is None or len(in_str.strip()) == 0
 159
 160
 161 def is_string(obj: Any) -> bool:
 162     """
 163     Checks if an object is a string.
 164
 165     >>> is_string('test')
 166     True
 167     >>> is_string(123)
 168     False
 169     >>> is_string(100.3)
 170     False
 171     >>> is_string([1, 2, 3])
 172     False
 173     """
 174     return isinstance(obj, str)
 175
 176
 177 def is_empty_string(in_str: Any) -> bool:
 178     """
 179     Checks if input is a string and empty or only whitespace.
 180
 181     >>> is_empty_string('')
 182     True
 183     >>> is_empty_string('    \t\t    ')
 184     True
 185     >>> is_empty_string('test')
 186     False
 187     >>> is_empty_string(100.88)
 188     False
 189     >>> is_empty_string([1, 2, 3])
 190     False
 191     """
 192     return is_string(in_str) and in_str.strip() == ""
 193
 194
 195 def is_full_string(in_str: Any) -> bool:
 196     """
 197     Checks that input is a string and is not empty ('') or only whitespace.
 198
 199     >>> is_full_string('test!')
 200     True
 201     >>> is_full_string('')
 202     False
 203     >>> is_full_string('      ')
 204     False
 205     >>> is_full_string(100.999)
 206     False
 207     >>> is_full_string({"a": 1, "b": 2})
 208     False
 209     """
 210     return is_string(in_str) and in_str.strip() != ""
 211
 212
 213 def is_number(in_str: str) -> bool:
 214     """
 215     Checks if a string is a valid number.
 216
 217     >>> is_number(100.5)
 218     Traceback (most recent call last):
 219     ...
 220     ValueError: 100.5
 221     >>> is_number("100.5")
 222     True
 223     >>> is_number("test")
 224     False
 225     >>> is_number("99")
 226     True
 227     >>> is_number([1, 2, 3])
 228     Traceback (most recent call last):
 229     ...
 230     ValueError: [1, 2, 3]
 231     """
 232     if not is_string(in_str):
 233         raise ValueError(in_str)
 234     return NUMBER_RE.match(in_str) is not None
 235
 236
 237 def is_integer_number(in_str: str) -> bool:
 238     """
 239     Checks whether the given string represents an integer or not.
 240
 241     An integer may be signed or unsigned or use a "scientific notation".
 242
 243     >>> is_integer_number('42')
 244     True
 245     >>> is_integer_number('42.0')
 246     False
 247     """
 248     return (
 249         (is_number(in_str) and "." not in in_str) or
 250         is_hexidecimal_integer_number(in_str) or
 251         is_octal_integer_number(in_str) or
 252         is_binary_integer_number(in_str)
 253     )
 254
 255
 256 def is_hexidecimal_integer_number(in_str: str) -> bool:
 257     """
 258     Checks whether a string is a hex integer number.
 259
 260     >>> is_hexidecimal_integer_number('0x12345')
 261     True
 262     >>> is_hexidecimal_integer_number('0x1A3E')
 263     True
 264     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 265     False
 266     >>> is_hexidecimal_integer_number('-0xff')
 267     True
 268     >>> is_hexidecimal_integer_number('test')
 269     False
 270     >>> is_hexidecimal_integer_number(12345)  # Not a string
 271     Traceback (most recent call last):
 272     ...
 273     ValueError: 12345
 274     >>> is_hexidecimal_integer_number(101.4)
 275     Traceback (most recent call last):
 276     ...
 277     ValueError: 101.4
 278     >>> is_hexidecimal_integer_number(0x1A3E)
 279     Traceback (most recent call last):
 280     ...
 281     ValueError: 6718
 282     """
 283     if not is_string(in_str):
 284         raise ValueError(in_str)
 285     return HEX_NUMBER_RE.match(in_str) is not None
 286
 287
 288 def is_octal_integer_number(in_str: str) -> bool:
 289     """
 290     Checks whether a string is an octal number.
 291
 292     >>> is_octal_integer_number('0o777')
 293     True
 294     >>> is_octal_integer_number('-0O115')
 295     True
 296     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 297     False
 298     >>> is_octal_integer_number('7777')  # Needs 0o
 299     False
 300     >>> is_octal_integer_number('test')
 301     False
 302     """
 303     if not is_string(in_str):
 304         raise ValueError(in_str)
 305     return OCT_NUMBER_RE.match(in_str) is not None
 306
 307
 308 def is_binary_integer_number(in_str: str) -> bool:
 309     """
 310     Returns whether a string contains a binary number.
 311
 312     >>> is_binary_integer_number('0b10111')
 313     True
 314     >>> is_binary_integer_number('-0b111')
 315     True
 316     >>> is_binary_integer_number('0B10101')
 317     True
 318     >>> is_binary_integer_number('0b10102')
 319     False
 320     >>> is_binary_integer_number('0xFFF')
 321     False
 322     >>> is_binary_integer_number('test')
 323     False
 324     """
 325     if not is_string(in_str):
 326         raise ValueError(in_str)
 327     return BIN_NUMBER_RE.match(in_str) is not None
 328
 329
 330 def to_int(in_str: str) -> int:
 331     """Returns the integral value of the string or raises on error.
 332
 333     >>> to_int('1234')
 334     1234
 335     >>> to_int('test')
 336     Traceback (most recent call last):
 337     ...
 338     ValueError: invalid literal for int() with base 10: 'test'
 339     """
 340     if not is_string(in_str):
 341         raise ValueError(in_str)
 342     if is_binary_integer_number(in_str):
 343         return int(in_str, 2)
 344     if is_octal_integer_number(in_str):
 345         return int(in_str, 8)
 346     if is_hexidecimal_integer_number(in_str):
 347         return int(in_str, 16)
 348     return int(in_str)
 349
 350
 351 def is_decimal_number(in_str: str) -> bool:
 352     """
 353     Checks whether the given string represents a decimal or not.
 354
 355     A decimal may be signed or unsigned or use a "scientific notation".
 356
 357     >>> is_decimal_number('42.0')
 358     True
 359     >>> is_decimal_number('42')
 360     False
 361     """
 362     return is_number(in_str) and "." in in_str
 363
 364
 365 def strip_escape_sequences(in_str: str) -> str:
 366     """
 367     Remove escape sequences in the input string.
 368
 369     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 370     'this is a test!'
 371     """
 372     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 373     return in_str
 374
 375
 376 def add_thousands_separator(
 377         in_str: str,
 378         *,
 379         separator_char = ',',
 380         places = 3
 381 ) -> str:
 382     """
 383     Add thousands separator to a numeric string.  Also handles numbers.
 384
 385     >>> add_thousands_separator('12345678')
 386     '12,345,678'
 387     >>> add_thousands_separator(12345678)
 388     '12,345,678'
 389     >>> add_thousands_separator(12345678.99)
 390     '12,345,678.99'
 391     >>> add_thousands_separator('test')
 392     Traceback (most recent call last):
 393     ...
 394     ValueError: test
 395
 396     """
 397     if isinstance(in_str, numbers.Number):
 398         in_str = f'{in_str}'
 399     if is_number(in_str):
 400         return _add_thousands_separator(
 401             in_str,
 402             separator_char = separator_char,
 403             places = places
 404         )
 405     raise ValueError(in_str)
 406
 407
 408 def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
 409     decimal_part = ""
 410     if '.' in in_str:
 411         (in_str, decimal_part) = in_str.split('.')
 412     tmp = [iter(in_str[::-1])] * places
 413     ret = separator_char.join(
 414         "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 415     if len(decimal_part) > 0:
 416         ret += '.'
 417         ret += decimal_part
 418     return ret
 419
 420
# Full url example:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 423 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 424     """
 425     Check if a string is a valid url.
 426
 427     >>> is_url('http://www.mysite.com')
 428     True
 429     >>> is_url('https://mysite.com')
 430     True
 431     >>> is_url('.mysite.com')
 432     False
 433     """
 434     if not is_full_string(in_str):
 435         return False
 436
 437     valid = URL_RE.match(in_str) is not None
 438
 439     if allowed_schemes:
 440         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 441     return valid
 442
 443
 444 def is_email(in_str: Any) -> bool:
 445     """
 446     Check if a string is a valid email.
 447
 448     Reference: https://tools.ietf.org/html/rfc3696#section-3
 449
 450     >>> is_email('[email protected]')
 451     True
 452     >>> is_email('@gmail.com')
 453     False
 454     """
 455     if (
 456         not is_full_string(in_str)
 457         or len(in_str) > 320
 458         or in_str.startswith(".")
 459     ):
 460         return False
 461
 462     try:
 463         # we expect 2 tokens, one before "@" and one after, otherwise
 464         # we have an exception and the email is not valid.
 465         head, tail = in_str.split("@")
 466
 467         # head's size must be <= 64, tail <= 255, head must not start
 468         # with a dot or contain multiple consecutive dots.
 469         if (
 470             len(head) > 64
 471             or len(tail) > 255
 472             or head.endswith(".")
 473             or (".." in head)
 474         ):
 475             return False
 476
 477         # removes escaped spaces, so that later on the test regex will
 478         # accept the string.
 479         head = head.replace("\\ ", "")
 480         if head.startswith('"') and head.endswith('"'):
 481             head = head.replace(" ", "")[1:-1]
 482         return EMAIL_RE.match(head + "@" + tail) is not None
 483
 484     except ValueError:
 485         # borderline case in which we have multiple "@" signs but the
 486         # head part is correctly escaped.
 487         if ESCAPED_AT_SIGN.search(in_str) is not None:
 488             # replace "@" with "a" in the head
 489             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 490         return False
 491
 492
 493 def suffix_string_to_number(in_str: str) -> Optional[int]:
 494     """Take a string like "33Gb" and convert it into a number (of bytes)
 495     like 34603008.  Return None if the input string is not valid.
 496
 497     >>> suffix_string_to_number('1Mb')
 498     1048576
 499     >>> suffix_string_to_number('13.1Gb')
 500     14066017894
 501     """
 502     def suffix_capitalize(s: str) -> str:
 503         if len(s) == 1:
 504             return s.upper()
 505         elif len(s) == 2:
 506             return f"{s[0].upper()}{s[1].lower()}"
 507         return suffix_capitalize(s[0:1])
 508
 509     if is_string(in_str):
 510         if is_integer_number(in_str):
 511             return to_int(in_str)
 512         suffixes = [in_str[-2:], in_str[-1:]]
 513         rest = [in_str[:-2], in_str[:-1]]
 514         for x in range(len(suffixes)):
 515             s = suffixes[x]
 516             s = suffix_capitalize(s)
 517             multiplier = NUM_SUFFIXES.get(s, None)
 518             if multiplier is not None:
 519                 r = rest[x]
 520                 if is_integer_number(r):
 521                     return to_int(r) * multiplier
 522                 if is_decimal_number(r):
 523                     return int(float(r) * multiplier)
 524     return None
 525
 526
 527 def number_to_suffix_string(num: int) -> Optional[str]:
 528     """Take a number (of bytes) and returns a string like "43.8Gb".
 529     Returns none if the input is invalid.
 530
 531     >>> number_to_suffix_string(14066017894)
 532     '13.1Gb'
 533     >>> number_to_suffix_string(1024 * 1024)
 534     '1.0Mb'
 535
 536     """
 537     d = 0.0
 538     suffix = None
 539     for (sfx, size) in NUM_SUFFIXES.items():
 540         if num >= size:
 541             d = num / size
 542             suffix = sfx
 543             break
 544     if suffix is not None:
 545         return f"{d:.1f}{suffix}"
 546     else:
 547         return f'{num:d}'
 548
 549
 550 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 551     """
 552     Checks if a string is a valid credit card number.
 553     If card type is provided then it checks against that specific type only,
 554     otherwise any known credit card number will be accepted.
 555
 556     Supported card types are the following:
 557
 558     - VISA
 559     - MASTERCARD
 560     - AMERICAN_EXPRESS
 561     - DINERS_CLUB
 562     - DISCOVER
 563     - JCB
 564     """
 565     if not is_full_string(in_str):
 566         return False
 567
 568     if card_type is not None:
 569         if card_type not in CREDIT_CARDS:
 570             raise KeyError(
 571                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 572             )
 573         return CREDIT_CARDS[card_type].match(in_str) is not None
 574     for c in CREDIT_CARDS:
 575         if CREDIT_CARDS[c].match(in_str) is not None:
 576             return True
 577     return False
 578
 579
 580 def is_camel_case(in_str: Any) -> bool:
 581     """
 582     Checks if a string is formatted as camel case.
 583
 584     A string is considered camel case when:
 585
 586     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 587     - it contains both lowercase and uppercase letters
 588     - it does not start with a number
 589     """
 590     return (
 591         is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 592     )
 593
 594
 595 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 596     """
 597     Checks if a string is formatted as "snake case".
 598
 599     A string is considered snake case when:
 600
 601     - it's composed only by lowercase/uppercase letters and digits
 602     - it contains at least one underscore (or provided separator)
 603     - it does not start with a number
 604
 605     >>> is_snake_case('this_is_a_test')
 606     True
 607     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 608     True
 609     >>> is_snake_case('this-is-a-test')
 610     False
 611     >>> is_snake_case('this-is-a-test', separator='-')
 612     True
 613
 614     """
 615     if is_full_string(in_str):
 616         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 617         re_template = (
 618             r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 619         )
 620         r = re_map.get(
 621             separator,
 622             re.compile(
 623                 re_template.format(sign=re.escape(separator)), re.IGNORECASE
 624             ),
 625         )
 626         return r.match(in_str) is not None
 627     return False
 628
 629
 630 def is_json(in_str: Any) -> bool:
 631     """
 632     Check if a string is a valid json.
 633
 634     >>> is_json('{"name": "Peter"}')
 635     True
 636     >>> is_json('[1, 2, 3]')
 637     True
 638     >>> is_json('{nope}')
 639     False
 640     """
 641     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 642         try:
 643             return isinstance(json.loads(in_str), (dict, list))
 644         except (TypeError, ValueError, OverflowError):
 645             pass
 646     return False
 647
 648
 649 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 650     """
 651     Check if a string is a valid UUID.
 652
 653     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 654     True
 655     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 656     False
 657     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 658     True
 659     """
 660     # string casting is used to allow UUID itself as input data type
 661     s = str(in_str)
 662     if allow_hex:
 663         return UUID_HEX_OK_RE.match(s) is not None
 664     return UUID_RE.match(s) is not None
 665
 666
 667 def is_ip_v4(in_str: Any) -> bool:
 668     """
 669     Checks if a string is a valid ip v4.
 670
 671     >>> is_ip_v4('255.200.100.75')
 672     True
 673     >>> is_ip_v4('nope')
 674     False
 675     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 676     False
 677     """
 678     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 679         return False
 680
 681     # checks that each entry in the ip is in the valid range (0 to 255)
 682     for token in in_str.split("."):
 683         if not 0 <= int(token) <= 255:
 684             return False
 685     return True
 686
 687
 688 def extract_ip_v4(in_str: Any) -> Optional[str]:
 689     """
 690     Extracts the IPv4 chunk of a string or None.
 691
 692     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 693     '127.0.0.1'
 694     >>> extract_ip_v4('Your mom dresses you funny.')
 695     """
 696     if not is_full_string(in_str):
 697         return None
 698     m = ANYWHERE_IP_V4_RE.search(in_str)
 699     if m is not None:
 700         return m.group(0)
 701     return None
 702
 703
 704 def is_ip_v6(in_str: Any) -> bool:
 705     """
 706     Checks if a string is a valid ip v6.
 707
 708     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 709     True
 710     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 711     False
 712     """
 713     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 714
 715
 716 def extract_ip_v6(in_str: Any) -> Optional[str]:
 717     """
 718     Extract IPv6 chunk or None.
 719
 720     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 721     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 722     >>> extract_ip_v6("(and she's ugly too, btw)")
 723     """
 724     if not is_full_string(in_str):
 725         return None
 726     m = ANYWHERE_IP_V6_RE.search(in_str)
 727     if m is not None:
 728         return m.group(0)
 729     return None
 730
 731
 732 def is_ip(in_str: Any) -> bool:
 733     """
 734     Checks if a string is a valid ip (either v4 or v6).
 735
 736     *Examples:*
 737
 738     >>> is_ip('255.200.100.75')
 739     True
 740     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
 741     True
 742     >>> is_ip('1.2.3')
 743     False
 744     >>> is_ip('1.2.3.999')
 745     False
 746     """
 747     return is_ip_v6(in_str) or is_ip_v4(in_str)
 748
 749
 750 def extract_ip(in_str: Any) -> Optional[str]:
 751     """
 752     Extract the IP address or None.
 753
 754     >>> extract_ip('Attacker: 255.200.100.75')
 755     '255.200.100.75'
 756     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 757     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 758     >>> extract_ip('1.2.3')
 759
 760     """
 761     ip = extract_ip_v4(in_str)
 762     if ip is None:
 763         ip = extract_ip_v6(in_str)
 764     return ip
 765
 766
 767 def is_mac_address(in_str: Any) -> bool:
 768     """Return True if in_str is a valid MAC address false otherwise.
 769
 770     >>> is_mac_address("34:29:8F:12:0D:2F")
 771     True
 772     >>> is_mac_address('34:29:8f:12:0d:2f')
 773     True
 774     >>> is_mac_address('34-29-8F-12-0D-2F')
 775     True
 776     >>> is_mac_address("test")
 777     False
 778     """
 779     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 780
 781
 782 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 783     """
 784     Extract the MAC address from in_str.
 785
 786     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
 787     '34:29:8F:12:0D:2F'
 788
 789     """
 790     if not is_full_string(in_str):
 791         return None
 792     in_str.strip()
 793     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
 794     if m is not None:
 795         mac = m.group(0)
 796         mac.replace(":", separator)
 797         mac.replace("-", separator)
 798         return mac
 799     return None
 800
 801
 802 def is_slug(in_str: Any, separator: str = "-") -> bool:
 803     """
 804     Checks if a given string is a slug (as created by `slugify()`).
 805
 806     >>> is_slug('my-blog-post-title')
 807     True
 808     >>> is_slug('My blog post title')
 809     False
 810
 811     """
 812     if not is_full_string(in_str):
 813         return False
 814     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 815     return re.match(rex, in_str) is not None
 816
 817
 818 def contains_html(in_str: str) -> bool:
 819     """
 820     Checks if the given string contains HTML/XML tags.
 821
 822     By design, this function matches ANY type of tag, so don't expect to use it
 823     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 824
 825     >>> contains_html('my string is <strong>bold</strong>')
 826     True
 827     >>> contains_html('my string is not bold')
 828     False
 829
 830     """
 831     if not is_string(in_str):
 832         raise ValueError(in_str)
 833     return HTML_RE.search(in_str) is not None
 834
 835
 836 def words_count(in_str: str) -> int:
 837     """
 838     Returns the number of words contained into the given string.
 839
 840     This method is smart, it does consider only sequence of one or more letter and/or numbers
 841     as "words", so a string like this: "! @ # % ... []" will return zero!
 842     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 843     will be 4 not 1 (even if there are no spaces in the string).
 844
 845     >>> words_count('hello world')
 846     2
 847     >>> words_count('one,two,three.stop')
 848     4
 849
 850     """
 851     if not is_string(in_str):
 852         raise ValueError(in_str)
 853     return len(WORDS_COUNT_RE.findall(in_str))
 854
 855
 856 def generate_uuid(as_hex: bool = False) -> str:
 857     """
 858     Generated an UUID string (using `uuid.uuid4()`).
 859
 860     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 861     generate_uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 862
 863     """
 864     uid = uuid4()
 865     if as_hex:
 866         return uid.hex
 867     return str(uid)
 868
 869
 870 def generate_random_alphanumeric_string(size: int) -> str:
 871     """
 872     Returns a string of the specified size containing random
 873     characters (uppercase/lowercase ascii letters and digits).
 874
 875     random_string(9) # possible output: "cx3QQbzYg"
 876
 877     """
 878     if size < 1:
 879         raise ValueError("size must be >= 1")
 880     chars = string.ascii_letters + string.digits
 881     buffer = [random.choice(chars) for _ in range(size)]
 882     return from_char_list(buffer)
 883
 884
 885 def reverse(in_str: str) -> str:
 886     """
 887     Returns the string with its chars reversed.
 888
 889     >>> reverse('test')
 890     'tset'
 891
 892     """
 893     if not is_string(in_str):
 894         raise ValueError(in_str)
 895     return in_str[::-1]
 896
 897
 898 def camel_case_to_snake_case(in_str, *, separator="_"):
 899     """
 900     Convert a camel case string into a snake case one.
 901     (The original string is returned if is not a valid camel case string)
 902
 903     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
 904     'mac_address_extractor_factory'
 905     >>> camel_case_to_snake_case('Luke Skywalker')
 906     'Luke Skywalker'
 907     """
 908     if not is_string(in_str):
 909         raise ValueError(in_str)
 910     if not is_camel_case(in_str):
 911         return in_str
 912     return CAMEL_CASE_REPLACE_RE.sub(
 913         lambda m: m.group(1) + separator, in_str
 914     ).lower()
 915
 916
 917 def snake_case_to_camel_case(
 918     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 919 ) -> str:
 920     """
 921     Convert a snake case string into a camel case one.
 922     (The original string is returned if is not a valid snake case string)
 923
 924     >>> snake_case_to_camel_case('this_is_a_test')
 925     'ThisIsATest'
 926     >>> snake_case_to_camel_case('Han Solo')
 927     'Han Solo'
 928     """
 929     if not is_string(in_str):
 930         raise ValueError(in_str)
 931     if not is_snake_case(in_str, separator=separator):
 932         return in_str
 933     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 934     if not upper_case_first:
 935         tokens[0] = tokens[0].lower()
 936     return from_char_list(tokens)
 937
 938
 939 def to_char_list(in_str: str) -> List[str]:
 940     """Convert a string into a list of chars.
 941
 942     >>> to_char_list('test')
 943     ['t', 'e', 's', 't']
 944     """
 945     if not is_string(in_str):
 946         return []
 947     return list(in_str)
 948
 949
 950 def from_char_list(in_list: List[str]) -> str:
 951     """Convert a char list into a string.
 952
 953     >>> from_char_list(['t', 'e', 's', 't'])
 954     'test'
 955     """
 956     return "".join(in_list)
 957
 958
 959 def shuffle(in_str: str) -> str:
 960     """Return a new string containing same chars of the given one but in
 961     a randomized order.
 962     """
 963     if not is_string(in_str):
 964         raise ValueError(in_str)
 965
 966     # turn the string into a list of chars
 967     chars = to_char_list(in_str)
 968     random.shuffle(chars)
 969     return from_char_list(chars)
 970
 971
 972 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 973     """
 974     Remove html code contained into the given string.
 975
 976     >>> strip_html('test: <a href="foo/bar">click here</a>')
 977     'test: '
 978     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
 979     'test: click here'
 980     """
 981     if not is_string(in_str):
 982         raise ValueError(in_str)
 983     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
 984     return r.sub("", in_str)
 985
 986
 987 def asciify(in_str: str) -> str:
 988     """
 989     Force string content to be ascii-only by translating all non-ascii
 990     chars into the closest possible representation (eg: ó -> o, Ë ->
 991     E, ç -> c...).
 992
 993     N.B. Some chars may be lost if impossible to translate.
 994
 995     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
 996     'eeuuooaaeynAAACIINOE'
 997     """
 998     if not is_string(in_str):
 999         raise ValueError(in_str)
1000
1001     # "NFKD" is the algorithm which is able to successfully translate
1002     # the most of non-ascii chars.
1003     normalized = unicodedata.normalize("NFKD", in_str)
1004
1005     # encode string forcing ascii and ignore any errors
1006     # (unrepresentable chars will be stripped out)
1007     ascii_bytes = normalized.encode("ascii", "ignore")
1008
1009     # turns encoded bytes into an utf-8 string
1010     return ascii_bytes.decode("utf-8")
1011
1012
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Convert a string into a "slug" using the provided separator.
    The returned string has the following properties:

    - it has no spaces
    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using provided separator
    - all chars are encoded as ascii (by using `asciify()`)
    - is safe for URL

    Args:
        in_str: the text to convert.
        separator: text placed between words; used literally, whatever
            characters it contains.

    Returns:
        The slug.

    Raises:
        ValueError: if in_str is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Turn everything that is not a letter or a digit into spaces, then
    # split on whitespace: that yields the words with no empty entries,
    # however many separators sat between, before or after them.
    words = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).split()

    # A plain join keeps the separator literal (no regex replacement
    # escapes) and never produces doubled separators.
    return asciify(separator.join(words))
1042
1043
def to_bool(in_str: str) -> bool:
    """
    Interprets the content of a string as a boolean (CASE INSENSITIVE).

    True is returned when the lower-cased string is one of:

    - "true"
    - "t"
    - "1"
    - "yes"
    - "y"

    Any other content yields False.  Raises ValueError when is_string()
    rejects the argument.

    >>> to_bool('True')
    True
    >>> to_bool('1')
    True
    >>> to_bool('yes')
    True
    >>> to_bool('no')
    False
    >>> to_bool('huh?')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    affirmative = {"true", "1", "yes", "y", "t"}
    return in_str.lower() in affirmative
1072
1073
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Parses a date string with DateParser (see its docs for the accepted
    formats).

    Returns the parser's get_date() result, or None -- after logging a
    warning -- when the parser raises ParseException.
    """
    # Imported lazily, inside the function, as the rest of this module does.
    import dateparse.dateparse_utils as dp

    try:
        parser = dp.DateParser()
        parser.parse(in_str)
        return parser.get_date()
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        return None
1086
1087
def valid_date(in_str: str) -> bool:
    """
    Tells whether DateParser can parse the given string.

    Returns True when parsing completes; logs a warning and returns
    False when the parser raises ParseException.
    """
    import dateparse.dateparse_utils as dp

    try:
        # Only success/failure matters here; the parsed value is discarded.
        dp.DateParser().parse(in_str)
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        return False
    return True
1100
1101
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Parses a datetime string.  See DateParser docs for more info.

    Returns:
        The value produced by DateParser.parse() when it is a
        datetime.datetime (or a subclass of it); None otherwise.

    A parse failure is logged as a warning and reported as None rather
    than raised.
    """
    import dateparse.dateparse_utils as dp

    try:
        parser = dp.DateParser()
        parsed = parser.parse(in_str)
        if isinstance(parsed, datetime.datetime):
            return parsed
    # to_date() / valid_date() in this module handle dp.ParseException as
    # the parser's failure signal, so it is caught here as well;
    # ValueError is kept because it was already being handled.
    except (dp.ParseException, ValueError):
        logger.warning(f'Unable to parse datetime {in_str}.')
    return None
1115
1116
def valid_datetime(in_str: str) -> bool:
    """
    Tells whether the string represents a valid datetime, i.e. whether
    to_datetime() returns something other than None for it.

    A warning is logged before returning False.
    """
    if to_datetime(in_str) is None:
        logger.warning(f'Unable to parse datetime {in_str}.')
        return False
    return True
1126
1127
def dedent(in_str: str) -> str:
    """
    Strips the leading margin (whatever MARGIN_RE matches) from every
    line of a multi line string; inspired by the analogous Scala
    function.

    Lines are split and re-joined on newline characters.  Raises
    ValueError when is_string() rejects the argument.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    return '\n'.join(MARGIN_RE.sub('', row) for row in in_str.split('\n'))
1137
1138
def indent(in_str: str, amount: int) -> str:
    """
    Indents a string by prepending amount spaces to each of its lines
    (lines are delimited by newline characters).

    Raises ValueError when is_string() rejects in_str.

    >>> indent('This is a test', 4)
    '    This is a test'

    """
    if not is_string(in_str):
        raise ValueError(in_str)

    padding = " " * amount
    return '\n'.join(padding + row for row in in_str.split('\n'))
1152
1153
def sprintf(*args, **kwargs) -> str:
    """Build and return the text that print() would have written.

    Despite the name no C-style format string is involved: every
    positional argument is converted with str() (strings are used
    as-is), the pieces are joined with sep, and end is appended.

    Args:
        *args: the values to render, in order.
        sep: keyword-only; string inserted between values.  None or
            omitted means a single space.
        end: keyword-only; string appended after the last value.  None
            or omitted means a newline.

    Returns:
        The assembled string.

    Raises:
        TypeError: if sep or end is neither None nor a string, or if any
            other keyword argument is given.

    >>> sprintf('a', 1, 2.5)
    'a 1 2.5\\n'
    >>> sprintf('a', 'b', sep='-', end='')
    'a-b'
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # Anything still left in kwargs is not an option this function knows.
    if kwargs:
        raise TypeError("invalid keyword arguments to sprintf()")

    if sep is None:
        sep = " "
    if end is None:
        end = "\n"

    # A single join avoids growing the result with repeated "+=".
    pieces = (arg if isinstance(arg, str) else str(arg) for arg in args)
    return sep.join(pieces) + end
1184
1185
class SprintfStdout:
    """
    Context manager that captures whatever is written to stdout while
    its block runs.

    Entering the context yields a zero-argument callable; calling it
    returns the text captured so far:

        with SprintfStdout() as buf:
            print("test")
        print(repr(buf()))

        'test\\n'
    """

    def __init__(self) -> None:
        # In-memory buffer that receives the redirected output.
        self.destination = io.StringIO()
        # The active contextlib.redirect_stdout; set by __enter__.
        self.recorder = None

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()
        # Hand back a getter, not a snapshot, so callers can read the
        # buffer both during and after the with-block.
        return self.destination.getvalue

    def __exit__(self, *args) -> None:
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Falling off the end returns None, so exceptions raised inside
        # the with-block are not suppressed.
1209
1210
def is_are(n: int) -> str:
    """Pick the verb form agreeing with a count: 'is' for exactly one,
    'are' for anything else.

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
1223
1224
def pluralize(n: int) -> str:
    """Give the plural suffix for a count: '' for exactly one, 's' for
    anything else.

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.

    """
    return "" if n == 1 else "s"
1241
1242
def thify(n: int) -> str:
    """Return the proper English ordinal suffix for a number.

    The suffix follows the last digit ('st' for 1, 'nd' for 2, 'rd' for
    3, 'th' otherwise), except that numbers ending in 11, 12 or 13 take
    'th'.

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'
    >>> thify(22)
    'nd'

    """
    digits = str(n)
    assert is_integer_number(digits)

    # The "teens" are irregular: 11th, 12th, 13th (likewise 111th, 212th...)
    # even though their last digit alone would suggest st / nd / rd.
    if digits[-2:] in ("11", "12", "13"):
        return "th"

    suffix_by_last_digit = {"1": "st", "2": "nd", "3": "rd"}
    return suffix_by_last_digit.get(digits[-1:], "th")
1265
1266
def ngrams(txt: str, n: int):
    """Produce the word n-grams of a string.

    The text is split on whitespace and the resulting words are handed
    to ngrams_presplit(), whose result is returned unchanged.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    """
    return ngrams_presplit(txt.split(), n)
1276
1277
def ngrams_presplit(words: Iterable[str], n: int) -> Iterable[str]:
    """Yield every run of n consecutive words, joined by single spaces.

    Args:
        words: the already-split words; any iterable is accepted
            (lists, tuples, iterators, generators...).
        n: how many words make up each n-gram.

    Yields:
        One string per n-gram, in input order.  Nothing is yielded when
        n is not positive or when there are fewer than n words.

    >>> list(ngrams_presplit(['This', 'is', 'a', 'test'], 3))
    ['This is a', 'is a test']
    >>> list(ngrams_presplit(iter(['a', 'b', 'c']), 2))
    ['a b', 'b c']

    """
    if n <= 0:
        return

    # Materialize once: the windows below need indexing, which a plain
    # iterator or generator does not support.
    tokens = list(words)
    for start in range(len(tokens) - n + 1):
        yield ' '.join(tokens[start:start + n])
1281
1282
def bigrams(txt: str):
    """Produce the two-word n-grams of a string; shorthand for
    ngrams(txt, 2)."""
    return ngrams(txt, n=2)
1285
1286
def trigrams(txt: str):
    """Produce the three-word n-grams of a string; shorthand for
    ngrams(txt, 3)."""
    return ngrams(txt, n=3)
1289
1290
def shuffle_columns_into_list(
        input_lines: Iterable[str],
        column_specs: Iterable[Iterable[int]],
        delim=''
) -> Iterable[str]:
    """Helper to rearrange / combine columnar data, returning the
    results as a list.

    column_specs is an iterable of numeric sequences; each sequence
    names the column numbers (indexes into input_lines) to copy into
    one output item.  The selected columns are joined with delim, and
    any characters belonging to delim are then stripped from both ends
    of the item.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']

    """
    # One output item per spec: prefix every selected column with the
    # delimiter, concatenate, then trim delimiter characters off the ends.
    return [
        ''.join(delim + input_lines[col] for col in spec).strip(delim)
        for spec in column_specs
    ]
1321
1322
def shuffle_columns_into_dict(
        input_lines: Iterable[str],
        column_specs: Iterable[Tuple[str, Iterable[int]]],
        delim=''
) -> Dict[str, str]:
    """Helper to rearrange / combine columnar data, returning the
    results as a dict.

    Each item of column_specs is a (key, columns) pair: columns names
    the column numbers (indexes into input_lines) to copy, and key is
    the dict key under which the combined value is stored.  The
    selected columns are joined with delim, and any characters
    belonging to delim are then stripped from both ends of the value.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}

    """
    # spec[0] is the key, spec[1] the column numbers feeding its value.
    return {
        spec[0]: ''.join(delim + input_lines[col] for col in spec[1]).strip(delim)
        for spec in column_specs
    }
1351
1352
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the {name} replacement fields of a string with the values
    stored under the matching keys of a dict.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'

    """
    filled = txt.format(**values)
    # end='' keeps sprintf() from appending its default trailing newline.
    return sprintf(filled, end='')
1362
1363
if __name__ == '__main__':
    # Executed as a script: run the examples embedded in this module's
    # docstrings as tests.
    from doctest import testmod

    testmod()