string_utils.py

   1 #!/usr/bin/env python3
   2
   3 import contextlib
   4 import datetime
   5 import io
   6 from itertools import zip_longest
   7 import json
   8 import logging
   9 import numbers
  10 import random
  11 import re
  12 import string
  13 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
  14 import unicodedata
  15 from uuid import uuid4
  16
  17 import list_utils
  18
  19 logger = logging.getLogger(__name__)
  20
  21 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  22
  23 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  24
  25 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  26
  27 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  28
# Pieces of a URL pattern, concatenated into a single regex source string.
# Every fragment is its own capturing group.  Both compiled forms below use
# re.IGNORECASE, so the lowercase classes also match uppercase input.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain name, dotted quad or "localhost"
    r"(:\d{2,})?"  # port number (two or more digits)
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    # NOTE(review): inside this class "%-=" is parsed as a character range
    # ("%" through "="), which is what lets "&", "." and "/" through --
    # confirm that is intended before tidying it up.
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Anchored form: the whole string must be a URL.
URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

# Unanchored form wrapped in one outer group, for finding URLs inside text.
URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)

# Matches either a run of "@" that is followed, later on, by a double quote,
# or a backslash-escaped "@".  is_email() uses it to neutralise such signs
# when an address contains more than one "@".
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# Local part: letters, digits and a set of punctuation; domain part:
# lowercase letters, digits and dashes with a 2-4 letter final label.
# The compiled forms below do not use re.IGNORECASE.
EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"

# Anchored form: the whole string must be an email address.
EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))

# Unanchored form wrapped in a group, for finding addresses inside text.
EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  52
# Whole-string test used by is_camel_case(): ASCII letters and digits only,
# must start with a letter and contain a lower/upper (or upper/lower)
# transition somewhere.
CAMEL_CASE_TEST_RE = re.compile(
    r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
)

# Finds each position where a separator goes when converting to snake case:
# a lowercase letter or a run of capitals that is followed by a capital.
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Snake case test with "_" as separator: either letters, optional digits and
# an underscore followed by more word characters, or leading underscores
# followed by at least one letter/digit.  Case-insensitive.
SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

# Same test with "-" as separator.  There is no leading "^" here; the
# pattern is applied with .match() in is_snake_case(), which only matches at
# the start of the string.
SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

# A separator followed by a lowercase letter or digit; group 2 is that
# following character.  Not referenced by the code visible in this part of
# the file.
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
  70
# Card-type name -> anchored pattern for that issuer's number format (leading
# digits plus total length).  The patterns accept digits only -- no spaces or
# dashes -- and perform no checksum validation.  The keys are the values
# accepted by is_credit_card(card_type=...).
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}
  79
# Cheap pre-check used by is_json(): the text, ignoring surrounding
# whitespace, starts with "[" or "{" and ends with "]" or "}".  It does not
# check that the brackets pair up; is_json() does the real parsing.
JSON_WRAPPER_RE = re.compile(
    r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
)

# Canonical 8-4-4-4-12 dashed UUID layout, case-insensitive.
# NOTE(review): the class is [a-f\d], and \d on a str pattern also matches
# non-ASCII decimal digits -- confirm whether that matters to callers.
UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# Same layout with every dash optional, so the 32-character hex form is
# accepted too (used by is_uuid(..., allow_hex=True)).
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)
  92
# "Shallow" dotted-quad check: four groups of 1-3 digits.  It does not limit
# each group to 0-255; is_ip_v4() performs that range check afterwards.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

# Unanchored variant used by extract_ip_v4() to find a dotted quad in text.
ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# Eight colon-separated groups of 0-4 characters.
# NOTE(review): the class is [a-z\d], so letters beyond a-f are accepted, and
# exactly seven colons are required so "::" shorthand with fewer groups is
# rejected -- confirm this looseness is acceptable.
IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

# Unanchored variant used by extract_ip_v6().
ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)

# Six two-digit hex groups separated by ":" or "-" (separators may be mixed),
# case-insensitive.
MAC_ADDRESS_RE = re.compile(
    r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE
)

# Unanchored variant used by extract_mac_address().
ANYWHERE_MAC_ADDRESS_RE = re.compile(
    r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)
 108
# One "word" for words_count(): a run of letters/digits (word characters
# other than "_"), together with any surrounding non-word characters.
WORDS_COUNT_RE = re.compile(
    r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
)

# An opening tag (optionally namespaced) plus, when present, everything up to
# a following closing tag; or an HTML comment; or a doctype declaration.
# Used by contains_html() and by strip_html() when tag content is dropped.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Tags only (opening, closing, comments, doctype) without the text between
# them.  Used by strip_html(keep_tag_content=True).
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# A single whitespace character.
SPACES_RE = re.compile(r"\s")

# A run of characters that are neither letters nor digits, or a run of
# underscores.  slugify() replaces each match with a space.
NO_LETTERS_OR_NUMBERS_RE = re.compile(
    r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
)

# Leading horizontal whitespace (whitespace other than CR/LF) at the start of
# the string.  Not referenced by the code visible in this part of the file.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 130
 131 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 132
# Size suffix -> multiplier, in powers of 1024.  Each magnitude has a
# two-letter and a one-letter spelling.
#
# Insertion order matters: number_to_suffix_string() walks this dict in
# order and uses the first entry whose size is <= the number, so entries must
# stay sorted from largest to smallest, with the spelling to be printed
# listed first within each magnitude.
NUM_SUFFIXES = {
    "Pb": (1024 ** 5),
    "P": (1024 ** 5),
    "Tb": (1024 ** 4),
    "T": (1024 ** 4),
    "Gb": (1024 ** 3),
    "G": (1024 ** 3),
    "Mb": (1024 ** 2),
    "M": (1024 ** 2),
    "Kb": (1024 ** 1),
    "K": (1024 ** 1),
}
 145
 146
 147 def is_none_or_empty(in_str: Optional[str]) -> bool:
 148     """
 149     Returns true if the input string is either None or an empty string.
 150
 151     >>> is_none_or_empty("")
 152     True
 153     >>> is_none_or_empty(None)
 154     True
 155     >>> is_none_or_empty("   \t   ")
 156     True
 157     >>> is_none_or_empty('Test')
 158     False
 159     """
 160     return in_str is None or len(in_str.strip()) == 0
 161
 162
 163 def is_string(obj: Any) -> bool:
 164     """
 165     Checks if an object is a string.
 166
 167     >>> is_string('test')
 168     True
 169     >>> is_string(123)
 170     False
 171     >>> is_string(100.3)
 172     False
 173     >>> is_string([1, 2, 3])
 174     False
 175     """
 176     return isinstance(obj, str)
 177
 178
 179 def is_empty_string(in_str: Any) -> bool:
 180     return is_empty(in_str)
 181
 182
 183 def is_empty(in_str: Any) -> bool:
 184     """
 185     Checks if input is a string and empty or only whitespace.
 186
 187     >>> is_empty('')
 188     True
 189     >>> is_empty('    \t\t    ')
 190     True
 191     >>> is_empty('test')
 192     False
 193     >>> is_empty(100.88)
 194     False
 195     >>> is_empty([1, 2, 3])
 196     False
 197     """
 198     return is_string(in_str) and in_str.strip() == ""
 199
 200
 201 def is_full_string(in_str: Any) -> bool:
 202     """
 203     Checks that input is a string and is not empty ('') or only whitespace.
 204
 205     >>> is_full_string('test!')
 206     True
 207     >>> is_full_string('')
 208     False
 209     >>> is_full_string('      ')
 210     False
 211     >>> is_full_string(100.999)
 212     False
 213     >>> is_full_string({"a": 1, "b": 2})
 214     False
 215     """
 216     return is_string(in_str) and in_str.strip() != ""
 217
 218
 219 def is_number(in_str: str) -> bool:
 220     """
 221     Checks if a string is a valid number.
 222
 223     >>> is_number(100.5)
 224     Traceback (most recent call last):
 225     ...
 226     ValueError: 100.5
 227     >>> is_number("100.5")
 228     True
 229     >>> is_number("test")
 230     False
 231     >>> is_number("99")
 232     True
 233     >>> is_number([1, 2, 3])
 234     Traceback (most recent call last):
 235     ...
 236     ValueError: [1, 2, 3]
 237     """
 238     if not is_string(in_str):
 239         raise ValueError(in_str)
 240     return NUMBER_RE.match(in_str) is not None
 241
 242
 243 def is_integer_number(in_str: str) -> bool:
 244     """
 245     Checks whether the given string represents an integer or not.
 246
 247     An integer may be signed or unsigned or use a "scientific notation".
 248
 249     >>> is_integer_number('42')
 250     True
 251     >>> is_integer_number('42.0')
 252     False
 253     """
 254     return (
 255         (is_number(in_str) and "." not in in_str) or
 256         is_hexidecimal_integer_number(in_str) or
 257         is_octal_integer_number(in_str) or
 258         is_binary_integer_number(in_str)
 259     )
 260
 261
 262 def is_hexidecimal_integer_number(in_str: str) -> bool:
 263     """
 264     Checks whether a string is a hex integer number.
 265
 266     >>> is_hexidecimal_integer_number('0x12345')
 267     True
 268     >>> is_hexidecimal_integer_number('0x1A3E')
 269     True
 270     >>> is_hexidecimal_integer_number('1234')  # Needs 0x
 271     False
 272     >>> is_hexidecimal_integer_number('-0xff')
 273     True
 274     >>> is_hexidecimal_integer_number('test')
 275     False
 276     >>> is_hexidecimal_integer_number(12345)  # Not a string
 277     Traceback (most recent call last):
 278     ...
 279     ValueError: 12345
 280     >>> is_hexidecimal_integer_number(101.4)
 281     Traceback (most recent call last):
 282     ...
 283     ValueError: 101.4
 284     >>> is_hexidecimal_integer_number(0x1A3E)
 285     Traceback (most recent call last):
 286     ...
 287     ValueError: 6718
 288     """
 289     if not is_string(in_str):
 290         raise ValueError(in_str)
 291     return HEX_NUMBER_RE.match(in_str) is not None
 292
 293
 294 def is_octal_integer_number(in_str: str) -> bool:
 295     """
 296     Checks whether a string is an octal number.
 297
 298     >>> is_octal_integer_number('0o777')
 299     True
 300     >>> is_octal_integer_number('-0O115')
 301     True
 302     >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
 303     False
 304     >>> is_octal_integer_number('7777')  # Needs 0o
 305     False
 306     >>> is_octal_integer_number('test')
 307     False
 308     """
 309     if not is_string(in_str):
 310         raise ValueError(in_str)
 311     return OCT_NUMBER_RE.match(in_str) is not None
 312
 313
 314 def is_binary_integer_number(in_str: str) -> bool:
 315     """
 316     Returns whether a string contains a binary number.
 317
 318     >>> is_binary_integer_number('0b10111')
 319     True
 320     >>> is_binary_integer_number('-0b111')
 321     True
 322     >>> is_binary_integer_number('0B10101')
 323     True
 324     >>> is_binary_integer_number('0b10102')
 325     False
 326     >>> is_binary_integer_number('0xFFF')
 327     False
 328     >>> is_binary_integer_number('test')
 329     False
 330     """
 331     if not is_string(in_str):
 332         raise ValueError(in_str)
 333     return BIN_NUMBER_RE.match(in_str) is not None
 334
 335
 336 def to_int(in_str: str) -> int:
 337     """Returns the integral value of the string or raises on error.
 338
 339     >>> to_int('1234')
 340     1234
 341     >>> to_int('test')
 342     Traceback (most recent call last):
 343     ...
 344     ValueError: invalid literal for int() with base 10: 'test'
 345     """
 346     if not is_string(in_str):
 347         raise ValueError(in_str)
 348     if is_binary_integer_number(in_str):
 349         return int(in_str, 2)
 350     if is_octal_integer_number(in_str):
 351         return int(in_str, 8)
 352     if is_hexidecimal_integer_number(in_str):
 353         return int(in_str, 16)
 354     return int(in_str)
 355
 356
 357 def is_decimal_number(in_str: str) -> bool:
 358     """
 359     Checks whether the given string represents a decimal or not.
 360
 361     A decimal may be signed or unsigned or use a "scientific notation".
 362
 363     >>> is_decimal_number('42.0')
 364     True
 365     >>> is_decimal_number('42')
 366     False
 367     """
 368     return is_number(in_str) and "." in in_str
 369
 370
 371 def strip_escape_sequences(in_str: str) -> str:
 372     """
 373     Remove escape sequences in the input string.
 374
 375     >>> strip_escape_sequences('\e[12;11;22mthis is a test!')
 376     'this is a test!'
 377     """
 378     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 379     return in_str
 380
 381
 382 def add_thousands_separator(
 383         in_str: str,
 384         *,
 385         separator_char = ',',
 386         places = 3
 387 ) -> str:
 388     """
 389     Add thousands separator to a numeric string.  Also handles numbers.
 390
 391     >>> add_thousands_separator('12345678')
 392     '12,345,678'
 393     >>> add_thousands_separator(12345678)
 394     '12,345,678'
 395     >>> add_thousands_separator(12345678.99)
 396     '12,345,678.99'
 397     >>> add_thousands_separator('test')
 398     Traceback (most recent call last):
 399     ...
 400     ValueError: test
 401
 402     """
 403     if isinstance(in_str, numbers.Number):
 404         in_str = f'{in_str}'
 405     if is_number(in_str):
 406         return _add_thousands_separator(
 407             in_str,
 408             separator_char = separator_char,
 409             places = places
 410         )
 411     raise ValueError(in_str)
 412
 413
 414 def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
 415     decimal_part = ""
 416     if '.' in in_str:
 417         (in_str, decimal_part) = in_str.split('.')
 418     tmp = [iter(in_str[::-1])] * places
 419     ret = separator_char.join(
 420         "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 421     if len(decimal_part) > 0:
 422         ret += '.'
 423         ret += decimal_part
 424     return ret
 425
 426
# Full url example:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 429 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 430     """
 431     Check if a string is a valid url.
 432
 433     >>> is_url('http://www.mysite.com')
 434     True
 435     >>> is_url('https://mysite.com')
 436     True
 437     >>> is_url('.mysite.com')
 438     False
 439     """
 440     if not is_full_string(in_str):
 441         return False
 442
 443     valid = URL_RE.match(in_str) is not None
 444
 445     if allowed_schemes:
 446         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 447     return valid
 448
 449
 450 def is_email(in_str: Any) -> bool:
 451     """
 452     Check if a string is a valid email.
 453
 454     Reference: https://tools.ietf.org/html/rfc3696#section-3
 455
 456     >>> is_email('[email protected]')
 457     True
 458     >>> is_email('@gmail.com')
 459     False
 460     """
 461     if (
 462         not is_full_string(in_str)
 463         or len(in_str) > 320
 464         or in_str.startswith(".")
 465     ):
 466         return False
 467
 468     try:
 469         # we expect 2 tokens, one before "@" and one after, otherwise
 470         # we have an exception and the email is not valid.
 471         head, tail = in_str.split("@")
 472
 473         # head's size must be <= 64, tail <= 255, head must not start
 474         # with a dot or contain multiple consecutive dots.
 475         if (
 476             len(head) > 64
 477             or len(tail) > 255
 478             or head.endswith(".")
 479             or (".." in head)
 480         ):
 481             return False
 482
 483         # removes escaped spaces, so that later on the test regex will
 484         # accept the string.
 485         head = head.replace("\\ ", "")
 486         if head.startswith('"') and head.endswith('"'):
 487             head = head.replace(" ", "")[1:-1]
 488         return EMAIL_RE.match(head + "@" + tail) is not None
 489
 490     except ValueError:
 491         # borderline case in which we have multiple "@" signs but the
 492         # head part is correctly escaped.
 493         if ESCAPED_AT_SIGN.search(in_str) is not None:
 494             # replace "@" with "a" in the head
 495             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 496         return False
 497
 498
 499 def suffix_string_to_number(in_str: str) -> Optional[int]:
 500     """Take a string like "33Gb" and convert it into a number (of bytes)
 501     like 34603008.  Return None if the input string is not valid.
 502
 503     >>> suffix_string_to_number('1Mb')
 504     1048576
 505     >>> suffix_string_to_number('13.1Gb')
 506     14066017894
 507     """
 508     def suffix_capitalize(s: str) -> str:
 509         if len(s) == 1:
 510             return s.upper()
 511         elif len(s) == 2:
 512             return f"{s[0].upper()}{s[1].lower()}"
 513         return suffix_capitalize(s[0:1])
 514
 515     if is_string(in_str):
 516         if is_integer_number(in_str):
 517             return to_int(in_str)
 518         suffixes = [in_str[-2:], in_str[-1:]]
 519         rest = [in_str[:-2], in_str[:-1]]
 520         for x in range(len(suffixes)):
 521             s = suffixes[x]
 522             s = suffix_capitalize(s)
 523             multiplier = NUM_SUFFIXES.get(s, None)
 524             if multiplier is not None:
 525                 r = rest[x]
 526                 if is_integer_number(r):
 527                     return to_int(r) * multiplier
 528                 if is_decimal_number(r):
 529                     return int(float(r) * multiplier)
 530     return None
 531
 532
 533 def number_to_suffix_string(num: int) -> Optional[str]:
 534     """Take a number (of bytes) and returns a string like "43.8Gb".
 535     Returns none if the input is invalid.
 536
 537     >>> number_to_suffix_string(14066017894)
 538     '13.1Gb'
 539     >>> number_to_suffix_string(1024 * 1024)
 540     '1.0Mb'
 541
 542     """
 543     d = 0.0
 544     suffix = None
 545     for (sfx, size) in NUM_SUFFIXES.items():
 546         if num >= size:
 547             d = num / size
 548             suffix = sfx
 549             break
 550     if suffix is not None:
 551         return f"{d:.1f}{suffix}"
 552     else:
 553         return f'{num:d}'
 554
 555
 556 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 557     """
 558     Checks if a string is a valid credit card number.
 559     If card type is provided then it checks against that specific type only,
 560     otherwise any known credit card number will be accepted.
 561
 562     Supported card types are the following:
 563
 564     - VISA
 565     - MASTERCARD
 566     - AMERICAN_EXPRESS
 567     - DINERS_CLUB
 568     - DISCOVER
 569     - JCB
 570     """
 571     if not is_full_string(in_str):
 572         return False
 573
 574     if card_type is not None:
 575         if card_type not in CREDIT_CARDS:
 576             raise KeyError(
 577                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 578             )
 579         return CREDIT_CARDS[card_type].match(in_str) is not None
 580     for c in CREDIT_CARDS:
 581         if CREDIT_CARDS[c].match(in_str) is not None:
 582             return True
 583     return False
 584
 585
 586 def is_camel_case(in_str: Any) -> bool:
 587     """
 588     Checks if a string is formatted as camel case.
 589
 590     A string is considered camel case when:
 591
 592     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 593     - it contains both lowercase and uppercase letters
 594     - it does not start with a number
 595     """
 596     return (
 597         is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 598     )
 599
 600
 601 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 602     """
 603     Checks if a string is formatted as "snake case".
 604
 605     A string is considered snake case when:
 606
 607     - it's composed only by lowercase/uppercase letters and digits
 608     - it contains at least one underscore (or provided separator)
 609     - it does not start with a number
 610
 611     >>> is_snake_case('this_is_a_test')
 612     True
 613     >>> is_snake_case('___This_Is_A_Test_1_2_3___')
 614     True
 615     >>> is_snake_case('this-is-a-test')
 616     False
 617     >>> is_snake_case('this-is-a-test', separator='-')
 618     True
 619
 620     """
 621     if is_full_string(in_str):
 622         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 623         re_template = (
 624             r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 625         )
 626         r = re_map.get(
 627             separator,
 628             re.compile(
 629                 re_template.format(sign=re.escape(separator)), re.IGNORECASE
 630             ),
 631         )
 632         return r.match(in_str) is not None
 633     return False
 634
 635
 636 def is_json(in_str: Any) -> bool:
 637     """
 638     Check if a string is a valid json.
 639
 640     >>> is_json('{"name": "Peter"}')
 641     True
 642     >>> is_json('[1, 2, 3]')
 643     True
 644     >>> is_json('{nope}')
 645     False
 646     """
 647     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 648         try:
 649             return isinstance(json.loads(in_str), (dict, list))
 650         except (TypeError, ValueError, OverflowError):
 651             pass
 652     return False
 653
 654
 655 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 656     """
 657     Check if a string is a valid UUID.
 658
 659     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
 660     True
 661     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
 662     False
 663     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
 664     True
 665     """
 666     # string casting is used to allow UUID itself as input data type
 667     s = str(in_str)
 668     if allow_hex:
 669         return UUID_HEX_OK_RE.match(s) is not None
 670     return UUID_RE.match(s) is not None
 671
 672
 673 def is_ip_v4(in_str: Any) -> bool:
 674     """
 675     Checks if a string is a valid ip v4.
 676
 677     >>> is_ip_v4('255.200.100.75')
 678     True
 679     >>> is_ip_v4('nope')
 680     False
 681     >>> is_ip_v4('255.200.100.999')  # 999 out of range
 682     False
 683     """
 684     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 685         return False
 686
 687     # checks that each entry in the ip is in the valid range (0 to 255)
 688     for token in in_str.split("."):
 689         if not 0 <= int(token) <= 255:
 690             return False
 691     return True
 692
 693
 694 def extract_ip_v4(in_str: Any) -> Optional[str]:
 695     """
 696     Extracts the IPv4 chunk of a string or None.
 697
 698     >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
 699     '127.0.0.1'
 700     >>> extract_ip_v4('Your mom dresses you funny.')
 701     """
 702     if not is_full_string(in_str):
 703         return None
 704     m = ANYWHERE_IP_V4_RE.search(in_str)
 705     if m is not None:
 706         return m.group(0)
 707     return None
 708
 709
 710 def is_ip_v6(in_str: Any) -> bool:
 711     """
 712     Checks if a string is a valid ip v6.
 713
 714     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
 715     True
 716     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
 717     False
 718     """
 719     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 720
 721
 722 def extract_ip_v6(in_str: Any) -> Optional[str]:
 723     """
 724     Extract IPv6 chunk or None.
 725
 726     >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 727     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 728     >>> extract_ip_v6("(and she's ugly too, btw)")
 729     """
 730     if not is_full_string(in_str):
 731         return None
 732     m = ANYWHERE_IP_V6_RE.search(in_str)
 733     if m is not None:
 734         return m.group(0)
 735     return None
 736
 737
 738 def is_ip(in_str: Any) -> bool:
 739     """
 740     Checks if a string is a valid ip (either v4 or v6).
 741
 742     >>> is_ip('255.200.100.75')
 743     True
 744     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
 745     True
 746     >>> is_ip('1.2.3')
 747     False
 748     >>> is_ip('1.2.3.999')
 749     False
 750     """
 751     return is_ip_v6(in_str) or is_ip_v4(in_str)
 752
 753
 754 def extract_ip(in_str: Any) -> Optional[str]:
 755     """
 756     Extract the IP address or None.
 757
 758     >>> extract_ip('Attacker: 255.200.100.75')
 759     '255.200.100.75'
 760     >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
 761     '2001:db8:85a3:0000:0000:8a2e:370:7334'
 762     >>> extract_ip('1.2.3')
 763
 764     """
 765     ip = extract_ip_v4(in_str)
 766     if ip is None:
 767         ip = extract_ip_v6(in_str)
 768     return ip
 769
 770
 771 def is_mac_address(in_str: Any) -> bool:
 772     """Return True if in_str is a valid MAC address false otherwise.
 773
 774     >>> is_mac_address("34:29:8F:12:0D:2F")
 775     True
 776     >>> is_mac_address('34:29:8f:12:0d:2f')
 777     True
 778     >>> is_mac_address('34-29-8F-12-0D-2F')
 779     True
 780     >>> is_mac_address("test")
 781     False
 782     """
 783     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 784
 785
 786 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 787     """
 788     Extract the MAC address from in_str.
 789
 790     >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
 791     '34:29:8F:12:0D:2F'
 792
 793     """
 794     if not is_full_string(in_str):
 795         return None
 796     in_str.strip()
 797     m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
 798     if m is not None:
 799         mac = m.group(0)
 800         mac.replace(":", separator)
 801         mac.replace("-", separator)
 802         return mac
 803     return None
 804
 805
 806 def is_slug(in_str: Any, separator: str = "-") -> bool:
 807     """
 808     Checks if a given string is a slug (as created by `slugify()`).
 809
 810     >>> is_slug('my-blog-post-title')
 811     True
 812     >>> is_slug('My blog post title')
 813     False
 814
 815     """
 816     if not is_full_string(in_str):
 817         return False
 818     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 819     return re.match(rex, in_str) is not None
 820
 821
 822 def contains_html(in_str: str) -> bool:
 823     """
 824     Checks if the given string contains HTML/XML tags.
 825
 826     By design, this function matches ANY type of tag, so don't expect to use it
 827     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 828
 829     >>> contains_html('my string is <strong>bold</strong>')
 830     True
 831     >>> contains_html('my string is not bold')
 832     False
 833
 834     """
 835     if not is_string(in_str):
 836         raise ValueError(in_str)
 837     return HTML_RE.search(in_str) is not None
 838
 839
 840 def words_count(in_str: str) -> int:
 841     """
 842     Returns the number of words contained into the given string.
 843
 844     This method is smart, it does consider only sequence of one or more letter and/or numbers
 845     as "words", so a string like this: "! @ # % ... []" will return zero!
 846     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 847     will be 4 not 1 (even if there are no spaces in the string).
 848
 849     >>> words_count('hello world')
 850     2
 851     >>> words_count('one,two,three.stop')
 852     4
 853
 854     """
 855     if not is_string(in_str):
 856         raise ValueError(in_str)
 857     return len(WORDS_COUNT_RE.findall(in_str))
 858
 859
 860 def generate_uuid(as_hex: bool = False) -> str:
 861     """
 862     Generated an UUID string (using `uuid.uuid4()`).
 863
 864     generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 865     generate_uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 866
 867     """
 868     uid = uuid4()
 869     if as_hex:
 870         return uid.hex
 871     return str(uid)
 872
 873
 874 def generate_random_alphanumeric_string(size: int) -> str:
 875     """
 876     Returns a string of the specified size containing random
 877     characters (uppercase/lowercase ascii letters and digits).
 878
 879     random_string(9) # possible output: "cx3QQbzYg"
 880
 881     """
 882     if size < 1:
 883         raise ValueError("size must be >= 1")
 884     chars = string.ascii_letters + string.digits
 885     buffer = [random.choice(chars) for _ in range(size)]
 886     return from_char_list(buffer)
 887
 888
 889 def reverse(in_str: str) -> str:
 890     """
 891     Returns the string with its chars reversed.
 892
 893     >>> reverse('test')
 894     'tset'
 895
 896     """
 897     if not is_string(in_str):
 898         raise ValueError(in_str)
 899     return in_str[::-1]
 900
 901
 902 def camel_case_to_snake_case(in_str, *, separator="_"):
 903     """
 904     Convert a camel case string into a snake case one.
 905     (The original string is returned if is not a valid camel case string)
 906
 907     >>> camel_case_to_snake_case('MacAddressExtractorFactory')
 908     'mac_address_extractor_factory'
 909     >>> camel_case_to_snake_case('Luke Skywalker')
 910     'Luke Skywalker'
 911     """
 912     if not is_string(in_str):
 913         raise ValueError(in_str)
 914     if not is_camel_case(in_str):
 915         return in_str
 916     return CAMEL_CASE_REPLACE_RE.sub(
 917         lambda m: m.group(1) + separator, in_str
 918     ).lower()
 919
 920
 921 def snake_case_to_camel_case(
 922     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 923 ) -> str:
 924     """
 925     Convert a snake case string into a camel case one.
 926     (The original string is returned if is not a valid snake case string)
 927
 928     >>> snake_case_to_camel_case('this_is_a_test')
 929     'ThisIsATest'
 930     >>> snake_case_to_camel_case('Han Solo')
 931     'Han Solo'
 932     """
 933     if not is_string(in_str):
 934         raise ValueError(in_str)
 935     if not is_snake_case(in_str, separator=separator):
 936         return in_str
 937     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 938     if not upper_case_first:
 939         tokens[0] = tokens[0].lower()
 940     return from_char_list(tokens)
 941
 942
 943 def to_char_list(in_str: str) -> List[str]:
 944     """Convert a string into a list of chars.
 945
 946     >>> to_char_list('test')
 947     ['t', 'e', 's', 't']
 948     """
 949     if not is_string(in_str):
 950         return []
 951     return list(in_str)
 952
 953
 954 def from_char_list(in_list: List[str]) -> str:
 955     """Convert a char list into a string.
 956
 957     >>> from_char_list(['t', 'e', 's', 't'])
 958     'test'
 959     """
 960     return "".join(in_list)
 961
 962
 963 def shuffle(in_str: str) -> str:
 964     """Return a new string containing same chars of the given one but in
 965     a randomized order.
 966     """
 967     if not is_string(in_str):
 968         raise ValueError(in_str)
 969
 970     # turn the string into a list of chars
 971     chars = to_char_list(in_str)
 972     random.shuffle(chars)
 973     return from_char_list(chars)
 974
 975
 976 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 977     """
 978     Remove html code contained into the given string.
 979
 980     >>> strip_html('test: <a href="foo/bar">click here</a>')
 981     'test: '
 982     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
 983     'test: click here'
 984     """
 985     if not is_string(in_str):
 986         raise ValueError(in_str)
 987     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
 988     return r.sub("", in_str)
 989
 990
 991 def asciify(in_str: str) -> str:
 992     """
 993     Force string content to be ascii-only by translating all non-ascii
 994     chars into the closest possible representation (eg: ó -> o, Ë ->
 995     E, ç -> c...).
 996
 997     N.B. Some chars may be lost if impossible to translate.
 998
 999     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
1000     'eeuuooaaeynAAACIINOE'
1001     """
1002     if not is_string(in_str):
1003         raise ValueError(in_str)
1004
1005     # "NFKD" is the algorithm which is able to successfully translate
1006     # the most of non-ascii chars.
1007     normalized = unicodedata.normalize("NFKD", in_str)
1008
1009     # encode string forcing ascii and ignore any errors
1010     # (unrepresentable chars will be stripped out)
1011     ascii_bytes = normalized.encode("ascii", "ignore")
1012
1013     # turns encoded bytes into an utf-8 string
1014     return ascii_bytes.decode("utf-8")
1015
1016
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Converts a string into a "slug" using provided separator.
    The returned string has the following properties:

    - it has no spaces
    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using provided separator
    - all chars are encoded as ascii (by using `asciify()`)
    - is safe for URL

    Raises ValueError when the input is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    >>> slugify('日本 test')
    'test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower())

    # Translate to ascii BEFORE joining: asciify() drops characters it cannot
    # translate, and a word that vanishes entirely must not leave a dangling
    # or doubled separator behind.
    out = asciify(out)

    # split() discards leading/trailing whitespace and collapses runs of it,
    # so the remaining words are joined by exactly one separator each.
    return separator.join(out.split())
1046
1047
def to_bool(in_str: str) -> bool:
    """
    Turns a string into a boolean based on its content (CASE INSENSITIVE).

    True is returned when the lower-cased string is exactly one of:

    - "true"
    - "t"
    - "1"
    - "yes"
    - "y"
    - "on"

    Any other string yields False.  Raises ValueError if in_str does
    not pass is_string().

    >>> to_bool('True')
    True

    >>> to_bool('1')
    True

    >>> to_bool('yes')
    True

    >>> to_bool('no')
    False

    >>> to_bool('huh?')
    False

    >>> to_bool('on')
    True

    """
    if not is_string(in_str):
        raise ValueError(in_str)

    truthy_words = {"true", "1", "yes", "y", "t", "on"}
    return in_str.lower() in truthy_words
1084
1085
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Parses a date string with dateparse_utils.DateParser (see its docs
    for details) and returns the parser's date.

    Returns None, after logging a warning, when the parser raises
    ParseException.
    """
    import dateparse.dateparse_utils as dp

    try:
        parser = dp.DateParser()
        parser.parse(in_str)
        return parser.get_date()
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        return None
1098
1099
def valid_date(in_str: str) -> bool:
    """
    True if dateparse_utils.DateParser parses the string without
    raising ParseException; otherwise a warning is logged and False is
    returned.
    """
    import dateparse.dateparse_utils as dp

    try:
        # Only success or failure matters here; the parsed value is
        # discarded.
        dp.DateParser().parse(in_str)
    except dp.ParseException:
        logger.warning(f'Unable to parse date {in_str}.')
        return False
    return True
1112
1113
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Parses a datetime string.  See DateParser docs for more info.

    Returns the value produced by DateParser.parse() when it is a
    datetime.datetime (or a subclass of it).  Returns None when the
    parser produces anything else, or when parsing fails; in the
    failure case a warning is logged.
    """
    import dateparse.dateparse_utils as dp
    try:
        d = dp.DateParser()
        dt = d.parse(in_str)
        if isinstance(dt, datetime.datetime):
            return dt
    # to_date() and valid_date() treat dp.ParseException as the parse
    # failure signal, so it has to be handled here as well; ValueError
    # is still caught so existing behavior is kept.
    except (dp.ParseException, ValueError):
        logger.warning(f'Unable to parse datetime {in_str}.')
    return None
1127
1128
def valid_datetime(in_str: str) -> bool:
    """
    True if to_datetime() returns a value for the string.  When it
    returns None instead, a warning is logged and False is returned.
    """
    if to_datetime(in_str) is None:
        logger.warning(f'Unable to parse datetime {in_str}.')
        return False
    return True
1138
1139
def dedent(in_str: str) -> str:
    """
    Removes the leading margin from every line of a multi line string
    (inspired by analogous Scala function).

    The string is split on newlines, whatever MARGIN_RE matches is
    deleted from each line, and the lines are joined back with
    newlines.  Raises ValueError if in_str does not pass is_string().
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    newline = '\n'
    return newline.join(
        MARGIN_RE.sub('', row) for row in in_str.split(newline)
    )
1149
1150
def indent(in_str: str, amount: int) -> str:
    """
    Indents a string by prepending amount spaces to each of its
    newline-separated lines.

    Raises ValueError if in_str does not pass is_string().

    >>> indent('This is a test', 4)
    '    This is a test'

    """
    if not is_string(in_str):
        raise ValueError(in_str)

    padding = " " * amount
    return '\n'.join(padding + row for row in in_str.split('\n'))
1164
1165
def sprintf(*args, **kwargs) -> str:
    """Build the text that print() would emit and return it as a string.

    Positional arguments are converted with str() (strings are used
    as-is), joined with sep and terminated with end.

    Keyword arguments:
        sep: string placed between arguments; None (the default) means
            a single space.
        end: string appended after the last argument; None (the
            default) means a newline.

    Raises TypeError if sep or end is neither None nor a string, or if
    any other keyword argument is passed.

    >>> sprintf('a', 1, 2.5)
    'a 1 2.5\\n'
    >>> sprintf('a', 'b', sep=', ', end='')
    'a, b'
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # Anything still left in kwargs is not a recognized option.
    if kwargs:
        raise TypeError("invalid keyword arguments to sprintf()")

    if sep is None:
        sep = " "
    if end is None:
        end = "\n"

    # One join instead of repeated += keeps this linear in the output size.
    pieces = (arg if isinstance(arg, str) else str(arg) for arg in args)
    return sep.join(pieces) + end
1196
1197
class SprintfStdout:
    """
    A context manager that captures whatever is written to stdout while
    its block runs.  The value bound by `as` is a zero-argument callable
    returning the text captured so far.

    with SprintfStdout() as buf:
        print("test")
    print(buf())

    'test\n'
    """
    def __init__(self) -> None:
        # In-memory buffer that receives the redirected output.
        self.destination = io.StringIO()
        # The active contextlib.redirect_stdout; set in __enter__.
        self.recorder: Optional[contextlib.redirect_stdout] = None

    def __enter__(self) -> Callable[[], str]:
        redirect = contextlib.redirect_stdout(self.destination)
        self.recorder = redirect
        redirect.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *args) -> None:
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Falling off the end returns None, so an exception raised in
        # the with-body is not suppressed.
1221
1222
def is_are(n: int) -> str:
    """Pick the verb form that agrees with a count: 'is' for exactly
    one, 'are' for anything else.

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'

    """
    return "is" if n == 1 else "are"
1235
1236
def pluralize(n: int) -> str:
    """Return the plural suffix for a count: '' for exactly one, 's'
    for anything else.

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.

    """
    return "" if n == 1 else "s"
1253
1254
def thify(n: int) -> str:
    """Return the proper ordinal suffix for a number.

    Numbers ending in 1, 2 and 3 take 'st', 'nd' and 'rd', except when
    they end in 11, 12 or 13, which take 'th' like everything else.

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'
    >>> thify(21)
    'st'

    """
    digits = str(n)
    assert is_integer_number(digits)

    # The "teens" are irregular: 11th, 12th, 13th (also 111th, 212th...),
    # so the last two digits have to be checked before the last one.
    if digits[-2:] in ("11", "12", "13"):
        return "th"

    suffix_by_last_digit = {"1": "st", "2": "nd", "3": "rd"}
    return suffix_by_last_digit.get(digits[-1:], "th")
1277
1278
def ngrams(txt: str, n: int) -> Iterable[str]:
    """Generate the ngrams from a string.

    The text is split on whitespace and handed to ngrams_presplit();
    each group of words it produces is yielded as a single
    space-joined string.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    """
    words = txt.split()
    for ngram in ngrams_presplit(words, n):
        # yield, not return: a return here would stop at the first
        # ngram and hand back one plain string instead of the sequence.
        yield ' '.join(ngram)
1289
1290
def ngrams_presplit(words: Iterable[str], n: int):
    """Return the ngrams of an already tokenized sequence of words.

    This simply delegates to list_utils.ngrams(words, n) and returns
    its result unchanged.

    NOTE(review): each item is presumably a sequence of n consecutive
    words (ngrams() joins the items with spaces) -- confirm against
    list_utils.ngrams.
    """
    return list_utils.ngrams(words, n)
1293
1294
def bigrams(txt: str):
    """Shorthand for ngrams(txt, 2); returns whatever ngrams() returns."""
    return ngrams(txt, 2)
1297
1298
def trigrams(txt: str):
    """Shorthand for ngrams(txt, 3); returns whatever ngrams() returns."""
    return ngrams(txt, 3)
1301
1302
def shuffle_columns_into_list(
        input_lines: Iterable[str],
        column_specs: Iterable[Iterable[int]],
        delim=''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.

    Each entry of column_specs is a sequence of column numbers; the
    columns it names are looked up by index in input_lines and
    concatenated, every one preceded by delim, into one output item.
    Characters belonging to delim are then stripped from both ends of
    that item (str.strip semantics).

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']

    """
    # One output item per spec; a spec is [col1, col2...].
    return [
        ''.join(delim + input_lines[col] for col in spec).strip(delim)
        for spec in column_specs
    ]
1333
1334
def shuffle_columns_into_dict(
        input_lines: Iterable[str],
        column_specs: Iterable[Tuple[str, Iterable[int]]],
        delim=''
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Each entry of column_specs is a (key, column numbers) pair; the
    named columns are looked up by index in input_lines and
    concatenated, every one preceded by delim, and stored under key.
    Characters belonging to delim are then stripped from both ends of
    the stored value (str.strip semantics).

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}

    """
    # One dict entry per spec; a spec is "key", [col1, col2...].
    return {
        spec[0]: ''.join(delim + input_lines[col] for col in spec[1]).strip(delim)
        for spec in column_specs
    }
1363
1364
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the {name} placeholders of a string with values looked up
    by name in a dict.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'

    """
    filled_in = txt.format(**values)
    # end='' keeps sprintf from appending its default trailing newline.
    return sprintf(filled_in, end='')
1374
1375
if __name__ == '__main__':
    # When this file is run as a script, execute the examples embedded
    # in this module's docstrings as doctests.
    import doctest
    doctest.testmod()