string_utils.py

   1 #!/usr/bin/env python3
   2
   3 import datetime
   4 from itertools import zip_longest
   5 import json
   6 import logging
   7 import random
   8 import re
   9 import string
  10 from typing import Any, List, Optional
  11 import unicodedata
  12 from uuid import uuid4
  13
  14 import dateparse.dateparse_utils as dp
  15
  16
  17 logger = logging.getLogger(__name__)
  18
  19 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  20
  21 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  22
  23 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  24
  25 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  26
  27 URLS_RAW_STRING = (
  28     r"([a-z-]+://)"  # scheme
  29     r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
  30     r"(www\.)?"  # www.
  31     r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
  32     r"(:\d{2,})?"  # port number
  33     r"(/[a-z\d_%+-]*)*"  # folders
  34     r"(\.[a-z\d_%+-]+)*"  # file extension
  35     r"(\?[a-z\d_+%-=]*)?"  # query string
  36     r"(#\S*)?"  # hash
  37 )
  38
  39 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
  40
  41 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
  42
  43 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
  44
  45 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
  46
  47 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
  48
  49 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  50
  51 CAMEL_CASE_TEST_RE = re.compile(
  52     r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
  53 )
  54
  55 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
  56
  57 SNAKE_CASE_TEST_RE = re.compile(
  58     r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
  59 )
  60
  61 SNAKE_CASE_TEST_DASH_RE = re.compile(
  62     r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
  63 )
  64
  65 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
  66
  67 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
  68
  69 CREDIT_CARDS = {
  70     "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
  71     "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
  72     "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
  73     "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
  74     "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
  75     "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
  76 }
  77
  78 JSON_WRAPPER_RE = re.compile(
  79     r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
  80 )
  81
  82 UUID_RE = re.compile(
  83     r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
  84 )
  85
  86 UUID_HEX_OK_RE = re.compile(
  87     r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
  88     re.IGNORECASE,
  89 )
  90
  91 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
  92
  93 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
  94
  95 MAC_ADDRESS_RE = re.compile(
  96     r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
  97 )
  98
  99 WORDS_COUNT_RE = re.compile(
 100     r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
 101 )
 102
 103 HTML_RE = re.compile(
 104     r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
 105     re.IGNORECASE | re.MULTILINE | re.DOTALL,
 106 )
 107
 108 HTML_TAG_ONLY_RE = re.compile(
 109     r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
 110     re.IGNORECASE | re.MULTILINE | re.DOTALL,
 111 )
 112
 113 SPACES_RE = re.compile(r"\s")
 114
 115 NO_LETTERS_OR_NUMBERS_RE = re.compile(
 116     r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
 117 )
 118
 119 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 120
 121 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 122
 123 NUM_SUFFIXES = {
 124     "Pb": (1024 ** 5),
 125     "P": (1024 ** 5),
 126     "Tb": (1024 ** 4),
 127     "T": (1024 ** 4),
 128     "Gb": (1024 ** 3),
 129     "G": (1024 ** 3),
 130     "Mb": (1024 ** 2),
 131     "M": (1024 ** 2),
 132     "Kb": (1024 ** 1),
 133     "K": (1024 ** 1),
 134 }
 135
 136
 137 def is_none_or_empty(in_str: Optional[str]) -> bool:
 138     return in_str is None or len(in_str.strip()) == 0
 139
 140
 141 def is_string(obj: Any) -> bool:
 142     """
 143     Checks if an object is a string.
 144     """
 145     return isinstance(obj, str)
 146
 147
 148 def is_empty_string(in_str: Any) -> bool:
 149     return is_string(in_str) and in_str.strip() == ""
 150
 151
 152 def is_full_string(in_str: Any) -> bool:
 153     return is_string(in_str) and in_str.strip() != ""
 154
 155
 156 def is_number(in_str: str) -> bool:
 157     """
 158     Checks if a string is a valid number.
 159     """
 160     if not is_string(in_str):
 161         raise ValueError(in_str)
 162     return NUMBER_RE.match(in_str) is not None
 163
 164
 165 def is_integer_number(in_str: str) -> bool:
 166     """
 167     Checks whether the given string represents an integer or not.
 168
 169     An integer may be signed or unsigned or use a "scientific notation".
 170
 171     *Examples:*
 172
 173     >>> is_integer('42') # returns true
 174     >>> is_integer('42.0') # returns false
 175     """
 176     return (
 177         (is_number(in_str) and "." not in in_str) or
 178         is_hexidecimal_integer_number(in_str) or
 179         is_octal_integer_number(in_str) or
 180         is_binary_integer_number(in_str)
 181     )
 182
 183
 184 def is_hexidecimal_integer_number(in_str: str) -> bool:
 185     if not is_string(in_str):
 186         raise ValueError(in_str)
 187     return HEX_NUMBER_RE.match(in_str) is not None
 188
 189
 190 def is_octal_integer_number(in_str: str) -> bool:
 191     if not is_string(in_str):
 192         raise ValueError(in_str)
 193     return OCT_NUMBER_RE.match(in_str) is not None
 194
 195
 196 def is_binary_integer_number(in_str: str) -> bool:
 197     if not is_string(in_str):
 198         raise ValueError(in_str)
 199     return BIN_NUMBER_RE.match(in_str) is not None
 200
 201
 202 def to_int(in_str: str) -> int:
 203     if not is_string(in_str):
 204         raise ValueError(in_str)
 205     if is_binary_integer_number(in_str):
 206         return int(in_str, 2)
 207     if is_octal_integer_number(in_str):
 208         return int(in_str, 8)
 209     if is_hexidecimal_integer_number(in_str):
 210         return int(in_str, 16)
 211     return int(in_str)
 212
 213
 214 def is_decimal_number(in_str: str) -> bool:
 215     """
 216     Checks whether the given string represents a decimal or not.
 217
 218     A decimal may be signed or unsigned or use a "scientific notation".
 219
 220     >>> is_decimal('42.0') # returns true
 221     >>> is_decimal('42') # returns false
 222     """
 223     return is_number(in_str) and "." in in_str
 224
 225
 226 def strip_escape_sequences(in_str: str) -> str:
 227     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 228     return in_str
 229
 230
 231 def add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
 232     if isinstance(in_str, int):
 233         in_str = f'{in_str}'
 234
 235     if is_number(in_str):
 236         return _add_thousands_separator(
 237             in_str,
 238             separator_char = separator_char,
 239             places = places
 240         )
 241     raise ValueError(in_str)
 242
 243
 244 def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
 245     decimal_part = ""
 246     if '.' in in_str:
 247         (in_str, decimal_part) = in_str.split('.')
 248     tmp = [iter(in_str[::-1])] * places
 249     ret = separator_char.join(
 250         "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 251     if len(decimal_part) > 0:
 252         ret += '.'
 253         ret += decimal_part
 254     return ret
 255
 256
 257 # Full url example:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 259 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 260     """
 261     Check if a string is a valid url.
 262
 263     *Examples:*
 264
 265     >>> is_url('http://www.mysite.com') # returns true
 266     >>> is_url('https://mysite.com') # returns true
 267     >>> is_url('.mysite.com') # returns false
 268     """
 269     if not is_full_string(in_str):
 270         return False
 271
 272     valid = URL_RE.match(in_str) is not None
 273
 274     if allowed_schemes:
 275         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 276     return valid
 277
 278
 279 def is_email(in_str: Any) -> bool:
 280     """
 281     Check if a string is a valid email.
 282
 283     Reference: https://tools.ietf.org/html/rfc3696#section-3
 284
 285     *Examples:*
 286
 287     >>> is_email('[email protected]') # returns true
 288     >>> is_email('@gmail.com') # returns false
 289     """
 290     if (
 291         not is_full_string(in_str)
 292         or len(in_str) > 320
 293         or in_str.startswith(".")
 294     ):
 295         return False
 296
 297     try:
 298         # we expect 2 tokens, one before "@" and one after, otherwise
 299         # we have an exception and the email is not valid.
 300         head, tail = in_str.split("@")
 301
 302         # head's size must be <= 64, tail <= 255, head must not start
 303         # with a dot or contain multiple consecutive dots.
 304         if (
 305             len(head) > 64
 306             or len(tail) > 255
 307             or head.endswith(".")
 308             or (".." in head)
 309         ):
 310             return False
 311
 312         # removes escaped spaces, so that later on the test regex will
 313         # accept the string.
 314         head = head.replace("\\ ", "")
 315         if head.startswith('"') and head.endswith('"'):
 316             head = head.replace(" ", "")[1:-1]
 317         return EMAIL_RE.match(head + "@" + tail) is not None
 318
 319     except ValueError:
 320         # borderline case in which we have multiple "@" signs but the
 321         # head part is correctly escaped.
 322         if ESCAPED_AT_SIGN.search(in_str) is not None:
 323             # replace "@" with "a" in the head
 324             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 325         return False
 326
 327
 328 def suffix_string_to_number(in_str: str) -> Optional[int]:
 329     """Take a string like "33Gb" and convert it into a number (of bytes)
 330     like 34603008.  Return None if the input string is not valid.
 331     """
 332
 333     def suffix_capitalize(s: str) -> str:
 334         if len(s) == 1:
 335             return s.upper()
 336         elif len(s) == 2:
 337             return f"{s[0].upper()}{s[1].lower()}"
 338         return suffix_capitalize(s[0:1])
 339
 340     if is_string(in_str):
 341         if is_integer_number(in_str):
 342             return to_int(in_str)
 343         suffixes = [in_str[-2:], in_str[-1:]]
 344         rest = [in_str[:-2], in_str[:-1]]
 345         for x in range(len(suffixes)):
 346             s = suffixes[x]
 347             s = suffix_capitalize(s)
 348             multiplier = NUM_SUFFIXES.get(s, None)
 349             if multiplier is not None:
 350                 r = rest[x]
 351                 if is_integer_number(r):
 352                     return int(r) * multiplier
 353     return None
 354
 355
 356 def number_to_suffix_string(num: int) -> Optional[str]:
 357     """Take a number (of bytes) and returns a string like "43.8Gb".
 358     Returns none if the input is invalid.
 359     """
 360     d = 0.0
 361     suffix = None
 362     for (sfx, size) in NUM_SUFFIXES.items():
 363         if num >= size:
 364             d = num / size
 365             suffix = sfx
 366             break
 367     if suffix is not None:
 368         return f"{d:.1f}{suffix}"
 369     else:
 370         return f'{num:d}'
 371
 372
 373 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 374     """
 375     Checks if a string is a valid credit card number.
 376     If card type is provided then it checks against that specific type only,
 377     otherwise any known credit card number will be accepted.
 378
 379     Supported card types are the following:
 380
 381     - VISA
 382     - MASTERCARD
 383     - AMERICAN_EXPRESS
 384     - DINERS_CLUB
 385     - DISCOVER
 386     - JCB
 387     """
 388     if not is_full_string(in_str):
 389         return False
 390
 391     if card_type is not None:
 392         if card_type not in CREDIT_CARDS:
 393             raise KeyError(
 394                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 395             )
 396         return CREDIT_CARDS[card_type].match(in_str) is not None
 397     for c in CREDIT_CARDS:
 398         if CREDIT_CARDS[c].match(in_str) is not None:
 399             return True
 400     return False
 401
 402
 403 def is_camel_case(in_str: Any) -> bool:
 404     """
 405     Checks if a string is formatted as camel case.
 406
 407     A string is considered camel case when:
 408
 409     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 410     - it contains both lowercase and uppercase letters
 411     - it does not start with a number
 412     """
 413     return (
 414         is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 415     )
 416
 417
 418 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 419     """
 420     Checks if a string is formatted as "snake case".
 421
 422     A string is considered snake case when:
 423
 424     - it's composed only by lowercase/uppercase letters and digits
 425     - it contains at least one underscore (or provided separator)
 426     - it does not start with a number
 427     """
 428     if is_full_string(in_str):
 429         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 430         re_template = (
 431             r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 432         )
 433         r = re_map.get(
 434             separator,
 435             re.compile(
 436                 re_template.format(sign=re.escape(separator)), re.IGNORECASE
 437             ),
 438         )
 439         return r.match(in_str) is not None
 440     return False
 441
 442
 443 def is_json(in_str: Any) -> bool:
 444     """
 445     Check if a string is a valid json.
 446
 447     *Examples:*
 448
 449     >>> is_json('{"name": "Peter"}') # returns true
 450     >>> is_json('[1, 2, 3]') # returns true
 451     >>> is_json('{nope}') # returns false
 452     """
 453     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 454         try:
 455             return isinstance(json.loads(in_str), (dict, list))
 456         except (TypeError, ValueError, OverflowError):
 457             pass
 458     return False
 459
 460
 461 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 462     """
 463     Check if a string is a valid UUID.
 464
 465     *Example:*
 466
 467     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf') # returns true
 468     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf') # returns false
 469     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True) # returns true
 470     """
 471     # string casting is used to allow UUID itself as input data type
 472     s = str(in_str)
 473     if allow_hex:
 474         return UUID_HEX_OK_RE.match(s) is not None
 475     return UUID_RE.match(s) is not None
 476
 477
 478 def is_ip_v4(in_str: Any) -> bool:
 479     """
 480     Checks if a string is a valid ip v4.
 481
 482     *Examples:*
 483
 484     >>> is_ip_v4('255.200.100.75') # returns true
 485     >>> is_ip_v4('nope') # returns false (not an ip)
 486     >>> is_ip_v4('255.200.100.999') # returns false (999 is out of range)
 487     """
 488     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 489         return False
 490
 491     # checks that each entry in the ip is in the valid range (0 to 255)
 492     for token in in_str.split("."):
 493         if not 0 <= int(token) <= 255:
 494             return False
 495     return True
 496
 497
 498 def extract_ip_v4(in_str: Any) -> Optional[str]:
 499     """
 500     Extracts the IPv4 chunk of a string or None.
 501     """
 502     if not is_full_string(in_str):
 503         return None
 504     in_str.strip()
 505     m = SHALLOW_IP_V4_RE.match(in_str)
 506     if m is not None:
 507         return m.group(0)
 508     return None
 509
 510
 511 def is_ip_v6(in_str: Any) -> bool:
 512     """
 513     Checks if a string is a valid ip v6.
 514
 515     *Examples:*
 516
 517     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334') # returns true
 518     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?') # returns false (invalid "?")
 519     """
 520     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 521
 522
 523 def extract_ip_v6(in_str: Any) -> Optional[str]:
 524     """
 525     Extract IPv6 chunk or None.
 526     """
 527     if not is_full_string(in_str):
 528         return None
 529     in_str.strip()
 530     m = IP_V6_RE.match(in_str)
 531     if m is not None:
 532         return m.group(0)
 533     return None
 534
 535
 536 def is_ip(in_str: Any) -> bool:
 537     """
 538     Checks if a string is a valid ip (either v4 or v6).
 539
 540     *Examples:*
 541
 542     >>> is_ip('255.200.100.75') # returns true
 543     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334') # returns true
 544     >>> is_ip('1.2.3') # returns false
 545     """
 546     return is_ip_v6(in_str) or is_ip_v4(in_str)
 547
 548
 549 def extract_ip(in_str: Any) -> Optional[str]:
 550     """Extract the IP address or None."""
 551     ip = extract_ip_v4(in_str)
 552     if ip is None:
 553         ip = extract_ip_v6(in_str)
 554     return ip
 555
 556
 557 def is_mac_address(in_str: Any) -> bool:
 558     """Return True if in_str is a valid MAC address false otherwise."""
 559     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 560
 561
 562 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 563     """Extract the MAC address from in_str"""
 564     if not is_full_string(in_str):
 565         return None
 566     in_str.strip()
 567     m = MAC_ADDRESS_RE.match(in_str)
 568     if m is not None:
 569         mac = m.group(0)
 570         mac.replace(":", separator)
 571         mac.replace("-", separator)
 572         return mac
 573     return None
 574
 575
 576 def is_slug(in_str: Any, separator: str = "-") -> bool:
 577     """
 578     Checks if a given string is a slug (as created by `slugify()`).
 579
 580     *Examples:*
 581
 582     >>> is_slug('my-blog-post-title') # returns true
 583     >>> is_slug('My blog post title') # returns false
 584
 585     :param in_str: String to check.
 586     :type in_str: str
 587     :param separator: Join sign used by the slug.
 588     :type separator: str
 589     :return: True if slug, false otherwise.
 590     """
 591     if not is_full_string(in_str):
 592         return False
 593     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 594     return re.match(rex, in_str) is not None
 595
 596
 597 def contains_html(in_str: str) -> bool:
 598     """
 599     Checks if the given string contains HTML/XML tags.
 600
 601     By design, this function matches ANY type of tag, so don't expect to use it
 602     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 603
 604     *Examples:*
 605
 606     >>> contains_html('my string is <strong>bold</strong>') # returns true
 607     >>> contains_html('my string is not bold') # returns false
 608     """
 609     if not is_string(in_str):
 610         raise ValueError(in_str)
 611     return HTML_RE.search(in_str) is not None
 612
 613
 614 def words_count(in_str: str) -> int:
 615     """
 616     Returns the number of words contained into the given string.
 617
 618     This method is smart, it does consider only sequence of one or more letter and/or numbers
 619     as "words", so a string like this: "! @ # % ... []" will return zero!
 620     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 621     will be 4 not 1 (even if there are no spaces in the string).
 622
 623     *Examples:*
 624
 625     >>> words_count('hello world') # returns 2
 626     >>> words_count('one,two,three.stop') # returns 4
 627     """
 628     if not is_string(in_str):
 629         raise ValueError(in_str)
 630     return len(WORDS_COUNT_RE.findall(in_str))
 631
 632
 633 def generate_uuid(as_hex: bool = False) -> str:
 634     """
 635     Generated an UUID string (using `uuid.uuid4()`).
 636
 637     *Examples:*
 638
 639     >>> uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 640     >>> uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 641     """
 642     uid = uuid4()
 643     if as_hex:
 644         return uid.hex
 645     return str(uid)
 646
 647
 648 def generate_random_alphanumeric_string(size: int) -> str:
 649     """
 650     Returns a string of the specified size containing random
 651     characters (uppercase/lowercase ascii letters and digits).
 652
 653     *Example:*
 654
 655     >>> random_string(9) # possible output: "cx3QQbzYg"
 656     """
 657     if size < 1:
 658         raise ValueError("size must be >= 1")
 659     chars = string.ascii_letters + string.digits
 660     buffer = [random.choice(chars) for _ in range(size)]
 661     return from_char_list(buffer)
 662
 663
 664 def reverse(in_str: str) -> str:
 665     """
 666     Returns the string with its chars reversed.
 667     """
 668     if not is_string(in_str):
 669         raise ValueError(in_str)
 670     return in_str[::-1]
 671
 672
 673 def camel_case_to_snake_case(in_str, *, separator="_"):
 674     """
 675     Convert a camel case string into a snake case one.
 676     (The original string is returned if is not a valid camel case string)
 677     """
 678     if not is_string(in_str):
 679         raise ValueError(in_str)
 680     if not is_camel_case(in_str):
 681         return in_str
 682     return CAMEL_CASE_REPLACE_RE.sub(
 683         lambda m: m.group(1) + separator, in_str
 684     ).lower()
 685
 686
 687 def snake_case_to_camel_case(
 688     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 689 ) -> str:
 690     """
 691     Convert a snake case string into a camel case one.
 692     (The original string is returned if is not a valid snake case string)
 693     """
 694     if not is_string(in_str):
 695         raise ValueError(in_str)
 696     if not is_snake_case(in_str, separator=separator):
 697         return in_str
 698     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 699     if not upper_case_first:
 700         tokens[0] = tokens[0].lower()
 701     return from_char_list(tokens)
 702
 703
 704 def to_char_list(in_str: str) -> List[str]:
 705     if not is_string(in_str):
 706         return []
 707     return list(in_str)
 708
 709
 710 def from_char_list(in_list: List[str]) -> str:
 711     return "".join(in_list)
 712
 713
 714 def shuffle(in_str: str) -> str:
 715     """Return a new string containing same chars of the given one but in
 716     a randomized order.
 717     """
 718     if not is_string(in_str):
 719         raise ValueError(in_str)
 720
 721     # turn the string into a list of chars
 722     chars = to_char_list(in_str)
 723     random.shuffle(chars)
 724     return from_char_list(chars)
 725
 726
 727 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 728     """
 729     Remove html code contained into the given string.
 730
 731     *Examples:*
 732
 733     >>> strip_html('test: <a href="foo/bar">click here</a>') # returns 'test: '
 734     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True) # returns 'test: click here'
 735     """
 736     if not is_string(in_str):
 737         raise ValueError(in_str)
 738     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
 739     return r.sub("", in_str)
 740
 741
 742 def asciify(in_str: str) -> str:
 743     """
 744     Force string content to be ascii-only by translating all non-ascii chars into the closest possible representation
 745     (eg: ó -> o, Ë -> E, ç -> c...).
 746
 747     **Bear in mind**: Some chars may be lost if impossible to translate.
 748
 749     *Example:*
 750
 751     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË') # returns 'eeuuooaaeynAAACIINOE'
 752     """
 753     if not is_string(in_str):
 754         raise ValueError(in_str)
 755
 756     # "NFKD" is the algorithm which is able to successfully translate
 757     # the most of non-ascii chars.
 758     normalized = unicodedata.normalize("NFKD", in_str)
 759
 760     # encode string forcing ascii and ignore any errors
 761     # (unrepresentable chars will be stripped out)
 762     ascii_bytes = normalized.encode("ascii", "ignore")
 763
 764     # turns encoded bytes into an utf-8 string
 765     return ascii_bytes.decode("utf-8")
 766
 767
 768 def slugify(in_str: str, *, separator: str = "-") -> str:
 769     """
 770     Converts a string into a "slug" using provided separator.
 771     The returned string has the following properties:
 772
 773     - it has no spaces
 774     - all letters are in lower case
 775     - all punctuation signs and non alphanumeric chars are removed
 776     - words are divided using provided separator
 777     - all chars are encoded as ascii (by using `asciify()`)
 778     - is safe for URL
 779
 780     *Examples:*
 781
 782     >>> slugify('Top 10 Reasons To Love Dogs!!!') # returns: 'top-10-reasons-to-love-dogs'
 783     >>> slugify('Mönstér Mägnët') # returns 'monster-magnet'
 784     """
 785     if not is_string(in_str):
 786         raise ValueError(in_str)
 787
 788     # replace any character that is NOT letter or number with spaces
 789     out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()
 790
 791     # replace spaces with join sign
 792     out = SPACES_RE.sub(separator, out)
 793
 794     # normalize joins (remove duplicates)
 795     out = re.sub(re.escape(separator) + r"+", separator, out)
 796     return asciify(out)
 797
 798
 799 def to_bool(in_str: str) -> bool:
 800     """
 801     Turns a string into a boolean based on its content (CASE INSENSITIVE).
 802
 803     A positive boolean (True) is returned if the string value is one of the following:
 804
 805     - "true"
 806     - "1"
 807     - "yes"
 808     - "y"
 809
 810     Otherwise False is returned.
 811     """
 812     if not is_string(in_str):
 813         raise ValueError(in_str)
 814     return in_str.lower() in ("true", "1", "yes", "y", "t")
 815
 816
 817 def to_date(in_str: str) -> Optional[datetime.date]:
 818     try:
 819         d = dp.DateParser()
 820         d.parse(in_str)
 821         return d.get_date()
 822     except dp.ParseException:
 823         logger.warning(f'Unable to parse date {in_str}.')
 824     return None
 825
 826
 827 def valid_date(in_str: str) -> bool:
 828     try:
 829         d = dp.DateParser()
 830         _ = d.parse(in_str)
 831         return True
 832     except dp.ParseException:
 833         logger.warning(f'Unable to parse date {in_str}.')
 834     return False
 835
 836
 837 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
 838     try:
 839         d = dp.DateParser()
 840         dt = d.parse(in_str)
 841         if type(dt) == datetime.datetime:
 842             return dt
 843     except ValueError:
 844         logger.warning(f'Unable to parse datetime {in_str}.')
 845     return None
 846
 847
 848 def valid_datetime(in_str: str) -> bool:
 849     _ = to_datetime(in_str)
 850     if _ is not None:
 851         return True
 852     logger.warning(f'Unable to parse datetime {in_str}.')
 853     return False
 854
 855
 856 def dedent(in_str: str) -> str:
 857     """
 858     Removes tab indentation from multi line strings (inspired by analogous Scala function).
 859
 860     *Example:*
 861
 862     >>> strip_margin('''
 863     >>>                 line 1
 864     >>>                 line 2
 865     >>>                 line 3
 866     >>> ''')
 867     >>> # returns:
 868     >>> '''
 869     >>> line 1
 870     >>> line 2
 871     >>> line 3
 872     >>> '''
 873     """
 874     if not is_string(in_str):
 875         raise ValueError(in_str)
 876     line_separator = '\n'
 877     lines = [MARGIN_RE.sub('', line) for line in in_str.split(line_separator)]
 878     return line_separator.join(lines)
 879
 880
 881 def indent(in_str: str, amount: int) -> str:
 882     if not is_string(in_str):
 883         raise ValueError(in_str)
 884     line_separator = '\n'
 885     lines = [" " * amount + line for line in in_str.split(line_separator)]
 886     return line_separator.join(lines)
 887
 888
 889 def sprintf(*args, **kwargs) -> str:
 890     ret = ""
 891
 892     sep = kwargs.pop("sep", None)
 893     if sep is not None:
 894         if not isinstance(sep, str):
 895             raise TypeError("sep must be None or a string")
 896
 897     end = kwargs.pop("end", None)
 898     if end is not None:
 899         if not isinstance(end, str):
 900             raise TypeError("end must be None or a string")
 901
 902     if kwargs:
 903         raise TypeError("invalid keyword arguments to sprint()")
 904
 905     if sep is None:
 906         sep = " "
 907     if end is None:
 908         end = "\n"
 909     for i, arg in enumerate(args):
 910         if i:
 911             ret += sep
 912         if isinstance(arg, str):
 913             ret += arg
 914         else:
 915             ret += str(arg)
 916     ret += end
 917     return ret
 918
 919
 920 def is_are(n: int) -> str:
 921     if n == 1:
 922         return "is"
 923     return "are"
 924
 925
 926 def pluralize(n: int) -> str:
 927     if n == 1:
 928         return ""
 929     return "s"
 930
 931
 932 def thify(n: int) -> str:
 933     digit = str(n)
 934     assert is_integer_number(digit)
 935     digit = digit[-1:]
 936     if digit == "1":
 937         return "st"
 938     elif digit == "2":
 939         return "nd"
 940     elif digit == "3":
 941         return "rd"
 942     else:
 943         return "th"