string_utils.py

   1 #!/usr/bin/env python3
   2
   3 import datetime
   4 from itertools import zip_longest
   5 import json
   6 import logging
   7 import random
   8 import re
   9 import string
  10 from typing import Any, List, Optional
  11 import unicodedata
  12 from uuid import uuid4
  13
  14 logger = logging.getLogger(__name__)
  15
  16 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  17
  18 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  19
  20 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  21
  22 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  23
# Building blocks of the URL pattern, concatenated into one raw string.
# Every fragment is a capturing group; all but the scheme and the domain
# are optional.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme, e.g. "https://"
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # optional "user:password@"
    r"(www\.)?"  # optional "www." prefix
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain name, dotted quad, or "localhost"
    r"(:\d{2,})?"  # optional port number (two or more digits)
    r"(/[a-z\d_%+-]*)*"  # zero or more path segments
    r"(\.[a-z\d_%+-]+)*"  # zero or more ".extension" suffixes
    # NOTE(review): inside this class "%-=" is a character range (from "%"
    # to "="), which is what admits "&", ".", "/" and ":" in query strings.
    r"(\?[a-z\d_+%-=]*)?"  # optional query string
    r"(#\S*)?"  # optional fragment
)

# Matches a string that is, in its entirety, one URL (case-insensitive).
URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

# Unanchored variant of the same pattern, wrapped in one outer group.
URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)

# Finds "@" signs followed later by a double quote, or a backslash-escaped
# "@"; is_email() replaces these before re-validating the address.
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# local-part "@" domain.  NOTE(review): the domain classes are lowercase
# only and the patterns below are compiled without re.IGNORECASE.
EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"

# Matches a string that is, in its entirety, one e-mail address.
EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))

# Unanchored variant of the same pattern, wrapped in one outer group.
EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  47
# Whole-string test: ASCII letters and digits only, starting with a letter
# and containing at least one lower/upper (or upper/lower) transition.
CAMEL_CASE_TEST_RE = re.compile(
    r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
)

# Captures a lowercase letter, or a run of uppercase letters, that is
# immediately followed by an uppercase letter (a word boundary).
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Whole-string snake-case test using "_" as the separator.
SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

# Same test with "-" as the separator.  NOTE(review): unlike the "_"
# variant this pattern has no leading "^"; it is anchored at the start
# only because is_snake_case() applies it with match().
SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

# Captures a "_" separator (group 1) and the character after it (group 2).
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

# Captures a "-" separator (group 1) and the character after it (group 2).
SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
  65
# Card type name -> whole-string pattern of digits (issuer prefix plus
# length).  These patterns check format only; nothing here computes a
# checksum.  The keys are the values accepted by is_credit_card(card_type=).
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}
  74
# Cheap pre-check used before json.loads(): the text must open with "[" or
# "{" and close with "}" or "]", ignoring surrounding whitespace.
JSON_WRAPPER_RE = re.compile(
    r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
)

# Canonical 8-4-4-4-12 hexadecimal UUID layout; hyphens are required.
UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# Same layout with every hyphen optional, so the bare 32-digit hex form
# is accepted as well.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# "Shallow" because only the shape (four dot-separated groups of one to
# three digits) is checked here; is_ip_v4() validates the 0-255 range.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

# Eight colon-separated groups of up to four characters each.
# NOTE(review): the class is [a-z\d], so letters beyond "f" also match,
# and the "::" shorthand (fewer than seven colons) does not.
IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

# Six two-digit hex groups separated by ":" or "-".  There is no trailing
# "$", so with match() this recognises a MAC address at the start of a
# longer string too.
MAC_ADDRESS_RE = re.compile(
    r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)
  95
# One "word": a run of letters/digits (underscore excluded) together with
# any non-word characters around it.  words_count() counts the matches.
WORDS_COUNT_RE = re.compile(
    r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
)

# An opening tag (optionally namespaced) with, optionally, everything up
# to a closing tag; or an HTML comment; or a doctype declaration.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Tags, comments and doctypes only -- the text between an opening and a
# closing tag is not part of the match.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# A single whitespace character.
SPACES_RE = re.compile(r"\s")

# A run of characters that are neither letters nor digits, or a run of
# underscores (which \w would otherwise treat as word characters).
NO_LETTERS_OR_NUMBERS_RE = re.compile(
    r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
)

# Leading run of whitespace, excluding carriage returns and newlines.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 117
 118 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 119
# Size suffix -> multiplier, as powers of 1024.  Each magnitude has a
# two-letter and a one-letter spelling.  Entries are listed from largest
# to smallest; number_to_suffix_string() iterates in this order and stops
# at the first multiplier that fits, so the order is significant.
NUM_SUFFIXES = {
    "Pb": (1024 ** 5),
    "P": (1024 ** 5),
    "Tb": (1024 ** 4),
    "T": (1024 ** 4),
    "Gb": (1024 ** 3),
    "G": (1024 ** 3),
    "Mb": (1024 ** 2),
    "M": (1024 ** 2),
    "Kb": (1024 ** 1),
    "K": (1024 ** 1),
}
 132
 133
 134 def is_none_or_empty(in_str: Optional[str]) -> bool:
 135     return in_str is None or len(in_str.strip()) == 0
 136
 137
 138 def is_string(obj: Any) -> bool:
 139     """
 140     Checks if an object is a string.
 141     """
 142     return isinstance(obj, str)
 143
 144
 145 def is_empty_string(in_str: Any) -> bool:
 146     return is_string(in_str) and in_str.strip() == ""
 147
 148
 149 def is_full_string(in_str: Any) -> bool:
 150     return is_string(in_str) and in_str.strip() != ""
 151
 152
 153 def is_number(in_str: str) -> bool:
 154     """
 155     Checks if a string is a valid number.
 156     """
 157     if not is_string(in_str):
 158         raise ValueError(in_str)
 159     return NUMBER_RE.match(in_str) is not None
 160
 161
 162 def is_integer_number(in_str: str) -> bool:
 163     """
 164     Checks whether the given string represents an integer or not.
 165
 166     An integer may be signed or unsigned or use a "scientific notation".
 167
 168     *Examples:*
 169
 170     >>> is_integer('42') # returns true
 171     >>> is_integer('42.0') # returns false
 172     """
 173     return (
 174         (is_number(in_str) and "." not in in_str) or
 175         is_hexidecimal_integer_number(in_str) or
 176         is_octal_integer_number(in_str) or
 177         is_binary_integer_number(in_str)
 178     )
 179
 180
 181 def is_hexidecimal_integer_number(in_str: str) -> bool:
 182     if not is_string(in_str):
 183         raise ValueError(in_str)
 184     return HEX_NUMBER_RE.match(in_str) is not None
 185
 186
 187 def is_octal_integer_number(in_str: str) -> bool:
 188     if not is_string(in_str):
 189         raise ValueError(in_str)
 190     return OCT_NUMBER_RE.match(in_str) is not None
 191
 192
 193 def is_binary_integer_number(in_str: str) -> bool:
 194     if not is_string(in_str):
 195         raise ValueError(in_str)
 196     return BIN_NUMBER_RE.match(in_str) is not None
 197
 198
 199 def to_int(in_str: str) -> int:
 200     if not is_string(in_str):
 201         raise ValueError(in_str)
 202     if is_binary_integer_number(in_str):
 203         return int(in_str, 2)
 204     if is_octal_integer_number(in_str):
 205         return int(in_str, 8)
 206     if is_hexidecimal_integer_number(in_str):
 207         return int(in_str, 16)
 208     return int(in_str)
 209
 210
 211 def is_decimal_number(in_str: str) -> bool:
 212     """
 213     Checks whether the given string represents a decimal or not.
 214
 215     A decimal may be signed or unsigned or use a "scientific notation".
 216
 217     >>> is_decimal('42.0') # returns true
 218     >>> is_decimal('42') # returns false
 219     """
 220     return is_number(in_str) and "." in in_str
 221
 222
 223 def strip_escape_sequences(in_str: str) -> str:
 224     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 225     return in_str
 226
 227
 228 def add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
 229     if isinstance(in_str, int):
 230         in_str = f'{in_str}'
 231
 232     if is_number(in_str):
 233         return _add_thousands_separator(
 234             in_str,
 235             separator_char = separator_char,
 236             places = places
 237         )
 238     raise ValueError(in_str)
 239
 240
 241 def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
 242     decimal_part = ""
 243     if '.' in in_str:
 244         (in_str, decimal_part) = in_str.split('.')
 245     tmp = [iter(in_str[::-1])] * places
 246     ret = separator_char.join(
 247         "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 248     if len(decimal_part) > 0:
 249         ret += '.'
 250         ret += decimal_part
 251     return ret
 252
 253
 254 # Full url example:
 255 # scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 256 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 257     """
 258     Check if a string is a valid url.
 259
 260     *Examples:*
 261
 262     >>> is_url('http://www.mysite.com') # returns true
 263     >>> is_url('https://mysite.com') # returns true
 264     >>> is_url('.mysite.com') # returns false
 265     """
 266     if not is_full_string(in_str):
 267         return False
 268
 269     valid = URL_RE.match(in_str) is not None
 270
 271     if allowed_schemes:
 272         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 273     return valid
 274
 275
def is_email(in_str: Any) -> bool:
    """
    Check if a string is a valid email.

    The input must be a non-blank string of at most 320 characters that
    does not start with a dot; it is split on "@" and the two halves are
    length-checked before the whole is matched against EMAIL_RE.

    Reference: https://tools.ietf.org/html/rfc3696#section-3

    *Examples:*

    >>> is_email('my.email@the-provider.com') # returns true
    >>> is_email('@gmail.com') # returns false
    """
    if (
        not is_full_string(in_str)
        or len(in_str) > 320
        or in_str.startswith(".")
    ):
        return False

    try:
        # Exactly two tokens are expected, one on each side of "@".  Any
        # other count makes the tuple unpacking raise ValueError, which is
        # handled by the except clause below.
        head, tail = in_str.split("@")

        # head's size must be <= 64 and tail's <= 255; head must not end
        # with a dot or contain consecutive dots.  (A leading dot was
        # already rejected above.)
        if (
            len(head) > 64
            or len(tail) > 255
            or head.endswith(".")
            or (".." in head)
        ):
            return False

        # Remove backslash-escaped spaces so that the regex test below
        # accepts the string.
        head = head.replace("\\ ", "")
        # A head wrapped in double quotes may contain plain spaces: drop
        # them together with the surrounding quotes.
        if head.startswith('"') and head.endswith('"'):
            head = head.replace(" ", "")[1:-1]
        return EMAIL_RE.match(head + "@" + tail) is not None

    except ValueError:
        # Borderline case: the "@" count is not exactly one, but the
        # extra "@" signs may be quoted or backslash-escaped.
        if ESCAPED_AT_SIGN.search(in_str) is not None:
            # Replace every ESCAPED_AT_SIGN match with "a" and validate
            # the rewritten string from scratch.
            return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
        return False
 323
 324
 325 def suffix_string_to_number(in_str: str) -> Optional[int]:
 326     """Take a string like "33Gb" and convert it into a number (of bytes)
 327     like 34603008.  Return None if the input string is not valid.
 328     """
 329
 330     def suffix_capitalize(s: str) -> str:
 331         if len(s) == 1:
 332             return s.upper()
 333         elif len(s) == 2:
 334             return f"{s[0].upper()}{s[1].lower()}"
 335         return suffix_capitalize(s[0:1])
 336
 337     if is_string(in_str):
 338         if is_integer_number(in_str):
 339             return to_int(in_str)
 340         suffixes = [in_str[-2:], in_str[-1:]]
 341         rest = [in_str[:-2], in_str[:-1]]
 342         for x in range(len(suffixes)):
 343             s = suffixes[x]
 344             s = suffix_capitalize(s)
 345             multiplier = NUM_SUFFIXES.get(s, None)
 346             if multiplier is not None:
 347                 r = rest[x]
 348                 if is_integer_number(r):
 349                     return int(r) * multiplier
 350     return None
 351
 352
 353 def number_to_suffix_string(num: int) -> Optional[str]:
 354     """Take a number (of bytes) and returns a string like "43.8Gb".
 355     Returns none if the input is invalid.
 356     """
 357     d = 0.0
 358     suffix = None
 359     for (sfx, size) in NUM_SUFFIXES.items():
 360         if num >= size:
 361             d = num / size
 362             suffix = sfx
 363             break
 364     if suffix is not None:
 365         return f"{d:.1f}{suffix}"
 366     else:
 367         return f'{num:d}'
 368
 369
 370 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 371     """
 372     Checks if a string is a valid credit card number.
 373     If card type is provided then it checks against that specific type only,
 374     otherwise any known credit card number will be accepted.
 375
 376     Supported card types are the following:
 377
 378     - VISA
 379     - MASTERCARD
 380     - AMERICAN_EXPRESS
 381     - DINERS_CLUB
 382     - DISCOVER
 383     - JCB
 384     """
 385     if not is_full_string(in_str):
 386         return False
 387
 388     if card_type is not None:
 389         if card_type not in CREDIT_CARDS:
 390             raise KeyError(
 391                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 392             )
 393         return CREDIT_CARDS[card_type].match(in_str) is not None
 394     for c in CREDIT_CARDS:
 395         if CREDIT_CARDS[c].match(in_str) is not None:
 396             return True
 397     return False
 398
 399
 400 def is_camel_case(in_str: Any) -> bool:
 401     """
 402     Checks if a string is formatted as camel case.
 403
 404     A string is considered camel case when:
 405
 406     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 407     - it contains both lowercase and uppercase letters
 408     - it does not start with a number
 409     """
 410     return (
 411         is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 412     )
 413
 414
 415 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 416     """
 417     Checks if a string is formatted as "snake case".
 418
 419     A string is considered snake case when:
 420
 421     - it's composed only by lowercase/uppercase letters and digits
 422     - it contains at least one underscore (or provided separator)
 423     - it does not start with a number
 424     """
 425     if is_full_string(in_str):
 426         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 427         re_template = (
 428             r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 429         )
 430         r = re_map.get(
 431             separator,
 432             re.compile(
 433                 re_template.format(sign=re.escape(separator)), re.IGNORECASE
 434             ),
 435         )
 436         return r.match(in_str) is not None
 437     return False
 438
 439
 440 def is_json(in_str: Any) -> bool:
 441     """
 442     Check if a string is a valid json.
 443
 444     *Examples:*
 445
 446     >>> is_json('{"name": "Peter"}') # returns true
 447     >>> is_json('[1, 2, 3]') # returns true
 448     >>> is_json('{nope}') # returns false
 449     """
 450     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 451         try:
 452             return isinstance(json.loads(in_str), (dict, list))
 453         except (TypeError, ValueError, OverflowError):
 454             pass
 455     return False
 456
 457
 458 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 459     """
 460     Check if a string is a valid UUID.
 461
 462     *Example:*
 463
 464     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf') # returns true
 465     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf') # returns false
 466     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True) # returns true
 467     """
 468     # string casting is used to allow UUID itself as input data type
 469     s = str(in_str)
 470     if allow_hex:
 471         return UUID_HEX_OK_RE.match(s) is not None
 472     return UUID_RE.match(s) is not None
 473
 474
 475 def is_ip_v4(in_str: Any) -> bool:
 476     """
 477     Checks if a string is a valid ip v4.
 478
 479     *Examples:*
 480
 481     >>> is_ip_v4('255.200.100.75') # returns true
 482     >>> is_ip_v4('nope') # returns false (not an ip)
 483     >>> is_ip_v4('255.200.100.999') # returns false (999 is out of range)
 484     """
 485     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 486         return False
 487
 488     # checks that each entry in the ip is in the valid range (0 to 255)
 489     for token in in_str.split("."):
 490         if not 0 <= int(token) <= 255:
 491             return False
 492     return True
 493
 494
 495 def extract_ip_v4(in_str: Any) -> Optional[str]:
 496     """
 497     Extracts the IPv4 chunk of a string or None.
 498     """
 499     if not is_full_string(in_str):
 500         return None
 501     in_str.strip()
 502     m = SHALLOW_IP_V4_RE.match(in_str)
 503     if m is not None:
 504         return m.group(0)
 505     return None
 506
 507
 508 def is_ip_v6(in_str: Any) -> bool:
 509     """
 510     Checks if a string is a valid ip v6.
 511
 512     *Examples:*
 513
 514     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334') # returns true
 515     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?') # returns false (invalid "?")
 516     """
 517     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 518
 519
 520 def extract_ip_v6(in_str: Any) -> Optional[str]:
 521     """
 522     Extract IPv6 chunk or None.
 523     """
 524     if not is_full_string(in_str):
 525         return None
 526     in_str.strip()
 527     m = IP_V6_RE.match(in_str)
 528     if m is not None:
 529         return m.group(0)
 530     return None
 531
 532
 533 def is_ip(in_str: Any) -> bool:
 534     """
 535     Checks if a string is a valid ip (either v4 or v6).
 536
 537     *Examples:*
 538
 539     >>> is_ip('255.200.100.75') # returns true
 540     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334') # returns true
 541     >>> is_ip('1.2.3') # returns false
 542     """
 543     return is_ip_v6(in_str) or is_ip_v4(in_str)
 544
 545
 546 def extract_ip(in_str: Any) -> Optional[str]:
 547     """Extract the IP address or None."""
 548     ip = extract_ip_v4(in_str)
 549     if ip is None:
 550         ip = extract_ip_v6(in_str)
 551     return ip
 552
 553
 554 def is_mac_address(in_str: Any) -> bool:
 555     """Return True if in_str is a valid MAC address false otherwise."""
 556     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 557
 558
 559 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 560     """Extract the MAC address from in_str"""
 561     if not is_full_string(in_str):
 562         return None
 563     in_str.strip()
 564     m = MAC_ADDRESS_RE.match(in_str)
 565     if m is not None:
 566         mac = m.group(0)
 567         mac.replace(":", separator)
 568         mac.replace("-", separator)
 569         return mac
 570     return None
 571
 572
 573 def is_slug(in_str: Any, separator: str = "-") -> bool:
 574     """
 575     Checks if a given string is a slug (as created by `slugify()`).
 576
 577     *Examples:*
 578
 579     >>> is_slug('my-blog-post-title') # returns true
 580     >>> is_slug('My blog post title') # returns false
 581
 582     :param in_str: String to check.
 583     :type in_str: str
 584     :param separator: Join sign used by the slug.
 585     :type separator: str
 586     :return: True if slug, false otherwise.
 587     """
 588     if not is_full_string(in_str):
 589         return False
 590     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 591     return re.match(rex, in_str) is not None
 592
 593
 594 def contains_html(in_str: str) -> bool:
 595     """
 596     Checks if the given string contains HTML/XML tags.
 597
 598     By design, this function matches ANY type of tag, so don't expect to use it
 599     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 600
 601     *Examples:*
 602
 603     >>> contains_html('my string is <strong>bold</strong>') # returns true
 604     >>> contains_html('my string is not bold') # returns false
 605     """
 606     if not is_string(in_str):
 607         raise ValueError(in_str)
 608     return HTML_RE.search(in_str) is not None
 609
 610
 611 def words_count(in_str: str) -> int:
 612     """
 613     Returns the number of words contained into the given string.
 614
 615     This method is smart, it does consider only sequence of one or more letter and/or numbers
 616     as "words", so a string like this: "! @ # % ... []" will return zero!
 617     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 618     will be 4 not 1 (even if there are no spaces in the string).
 619
 620     *Examples:*
 621
 622     >>> words_count('hello world') # returns 2
 623     >>> words_count('one,two,three.stop') # returns 4
 624     """
 625     if not is_string(in_str):
 626         raise ValueError(in_str)
 627     return len(WORDS_COUNT_RE.findall(in_str))
 628
 629
 630 def generate_uuid(as_hex: bool = False) -> str:
 631     """
 632     Generated an UUID string (using `uuid.uuid4()`).
 633
 634     *Examples:*
 635
 636     >>> uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 637     >>> uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 638     """
 639     uid = uuid4()
 640     if as_hex:
 641         return uid.hex
 642     return str(uid)
 643
 644
 645 def generate_random_alphanumeric_string(size: int) -> str:
 646     """
 647     Returns a string of the specified size containing random
 648     characters (uppercase/lowercase ascii letters and digits).
 649
 650     *Example:*
 651
 652     >>> random_string(9) # possible output: "cx3QQbzYg"
 653     """
 654     if size < 1:
 655         raise ValueError("size must be >= 1")
 656     chars = string.ascii_letters + string.digits
 657     buffer = [random.choice(chars) for _ in range(size)]
 658     return from_char_list(buffer)
 659
 660
 661 def reverse(in_str: str) -> str:
 662     """
 663     Returns the string with its chars reversed.
 664     """
 665     if not is_string(in_str):
 666         raise ValueError(in_str)
 667     return in_str[::-1]
 668
 669
 670 def camel_case_to_snake_case(in_str, *, separator="_"):
 671     """
 672     Convert a camel case string into a snake case one.
 673     (The original string is returned if is not a valid camel case string)
 674     """
 675     if not is_string(in_str):
 676         raise ValueError(in_str)
 677     if not is_camel_case(in_str):
 678         return in_str
 679     return CAMEL_CASE_REPLACE_RE.sub(
 680         lambda m: m.group(1) + separator, in_str
 681     ).lower()
 682
 683
 684 def snake_case_to_camel_case(
 685     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 686 ) -> str:
 687     """
 688     Convert a snake case string into a camel case one.
 689     (The original string is returned if is not a valid snake case string)
 690     """
 691     if not is_string(in_str):
 692         raise ValueError(in_str)
 693     if not is_snake_case(in_str, separator=separator):
 694         return in_str
 695     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 696     if not upper_case_first:
 697         tokens[0] = tokens[0].lower()
 698     return from_char_list(tokens)
 699
 700
 701 def to_char_list(in_str: str) -> List[str]:
 702     if not is_string(in_str):
 703         return []
 704     return list(in_str)
 705
 706
 707 def from_char_list(in_list: List[str]) -> str:
 708     return "".join(in_list)
 709
 710
 711 def shuffle(in_str: str) -> str:
 712     """Return a new string containing same chars of the given one but in
 713     a randomized order.
 714     """
 715     if not is_string(in_str):
 716         raise ValueError(in_str)
 717
 718     # turn the string into a list of chars
 719     chars = to_char_list(in_str)
 720     random.shuffle(chars)
 721     return from_char_list(chars)
 722
 723
 724 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 725     """
 726     Remove html code contained into the given string.
 727
 728     *Examples:*
 729
 730     >>> strip_html('test: <a href="foo/bar">click here</a>') # returns 'test: '
 731     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True) # returns 'test: click here'
 732     """
 733     if not is_string(in_str):
 734         raise ValueError(in_str)
 735     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
 736     return r.sub("", in_str)
 737
 738
 739 def asciify(in_str: str) -> str:
 740     """
 741     Force string content to be ascii-only by translating all non-ascii chars into the closest possible representation
 742     (eg: ó -> o, Ë -> E, ç -> c...).
 743
 744     **Bear in mind**: Some chars may be lost if impossible to translate.
 745
 746     *Example:*
 747
 748     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË') # returns 'eeuuooaaeynAAACIINOE'
 749     """
 750     if not is_string(in_str):
 751         raise ValueError(in_str)
 752
 753     # "NFKD" is the algorithm which is able to successfully translate
 754     # the most of non-ascii chars.
 755     normalized = unicodedata.normalize("NFKD", in_str)
 756
 757     # encode string forcing ascii and ignore any errors
 758     # (unrepresentable chars will be stripped out)
 759     ascii_bytes = normalized.encode("ascii", "ignore")
 760
 761     # turns encoded bytes into an utf-8 string
 762     return ascii_bytes.decode("utf-8")
 763
 764
 765 def slugify(in_str: str, *, separator: str = "-") -> str:
 766     """
 767     Converts a string into a "slug" using provided separator.
 768     The returned string has the following properties:
 769
 770     - it has no spaces
 771     - all letters are in lower case
 772     - all punctuation signs and non alphanumeric chars are removed
 773     - words are divided using provided separator
 774     - all chars are encoded as ascii (by using `asciify()`)
 775     - is safe for URL
 776
 777     *Examples:*
 778
 779     >>> slugify('Top 10 Reasons To Love Dogs!!!') # returns: 'top-10-reasons-to-love-dogs'
 780     >>> slugify('Mönstér Mägnët') # returns 'monster-magnet'
 781     """
 782     if not is_string(in_str):
 783         raise ValueError(in_str)
 784
 785     # replace any character that is NOT letter or number with spaces
 786     out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()
 787
 788     # replace spaces with join sign
 789     out = SPACES_RE.sub(separator, out)
 790
 791     # normalize joins (remove duplicates)
 792     out = re.sub(re.escape(separator) + r"+", separator, out)
 793     return asciify(out)
 794
 795
 796 def to_bool(in_str: str) -> bool:
 797     """
 798     Turns a string into a boolean based on its content (CASE INSENSITIVE).
 799
 800     A positive boolean (True) is returned if the string value is one of the following:
 801
 802     - "true"
 803     - "1"
 804     - "yes"
 805     - "y"
 806
 807     Otherwise False is returned.
 808     """
 809     if not is_string(in_str):
 810         raise ValueError(in_str)
 811     return in_str.lower() in ("true", "1", "yes", "y", "t")
 812
 813
 814 def to_date(in_str: str) -> Optional[datetime.date]:
 815     import dateparse.dateparse_utils as dp
 816     try:
 817         d = dp.DateParser()
 818         d.parse(in_str)
 819         return d.get_date()
 820     except dp.ParseException:
 821         logger.warning(f'Unable to parse date {in_str}.')
 822     return None
 823
 824
 825 def valid_date(in_str: str) -> bool:
 826     import dateparse.dateparse_utils as dp
 827     try:
 828         d = dp.DateParser()
 829         _ = d.parse(in_str)
 830         return True
 831     except dp.ParseException:
 832         logger.warning(f'Unable to parse date {in_str}.')
 833     return False
 834
 835
 836 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
 837     import dateparse.dateparse_utils as dp
 838     try:
 839         d = dp.DateParser()
 840         dt = d.parse(in_str)
 841         if type(dt) == datetime.datetime:
 842             return dt
 843     except ValueError:
 844         logger.warning(f'Unable to parse datetime {in_str}.')
 845     return None
 846
 847
 848 def valid_datetime(in_str: str) -> bool:
 849     _ = to_datetime(in_str)
 850     if _ is not None:
 851         return True
 852     logger.warning(f'Unable to parse datetime {in_str}.')
 853     return False
 854
 855
 856 def dedent(in_str: str) -> str:
 857     """
 858     Removes tab indentation from multi line strings (inspired by analogous Scala function).
 859
 860     *Example:*
 861
 862     >>> strip_margin('''
 863     >>>                 line 1
 864     >>>                 line 2
 865     >>>                 line 3
 866     >>> ''')
 867     >>> # returns:
 868     >>> '''
 869     >>> line 1
 870     >>> line 2
 871     >>> line 3
 872     >>> '''
 873     """
 874     if not is_string(in_str):
 875         raise ValueError(in_str)
 876     line_separator = '\n'
 877     lines = [MARGIN_RE.sub('', line) for line in in_str.split(line_separator)]
 878     return line_separator.join(lines)
 879
 880
 881 def indent(in_str: str, amount: int) -> str:
 882     if not is_string(in_str):
 883         raise ValueError(in_str)
 884     line_separator = '\n'
 885     lines = [" " * amount + line for line in in_str.split(line_separator)]
 886     return line_separator.join(lines)
 887
 888
 889 def sprintf(*args, **kwargs) -> str:
 890     ret = ""
 891
 892     sep = kwargs.pop("sep", None)
 893     if sep is not None:
 894         if not isinstance(sep, str):
 895             raise TypeError("sep must be None or a string")
 896
 897     end = kwargs.pop("end", None)
 898     if end is not None:
 899         if not isinstance(end, str):
 900             raise TypeError("end must be None or a string")
 901
 902     if kwargs:
 903         raise TypeError("invalid keyword arguments to sprint()")
 904
 905     if sep is None:
 906         sep = " "
 907     if end is None:
 908         end = "\n"
 909     for i, arg in enumerate(args):
 910         if i:
 911             ret += sep
 912         if isinstance(arg, str):
 913             ret += arg
 914         else:
 915             ret += str(arg)
 916     ret += end
 917     return ret
 918
 919
 920 def is_are(n: int) -> str:
 921     if n == 1:
 922         return "is"
 923     return "are"
 924
 925
 926 def pluralize(n: int) -> str:
 927     if n == 1:
 928         return ""
 929     return "s"
 930
 931
 932 def thify(n: int) -> str:
 933     digit = str(n)
 934     assert is_integer_number(digit)
 935     digit = digit[-1:]
 936     if digit == "1":
 937         return "st"
 938     elif digit == "2":
 939         return "nd"
 940     elif digit == "3":
 941         return "rd"
 942     else:
 943         return "th"