string_utils.py

   1 #!/usr/bin/env python3
   2
   3 import contextlib
   4 import datetime
   5 import io
   6 from itertools import zip_longest
   7 import json
   8 import logging
   9 import random
  10 import re
  11 import string
  12 from typing import Any, Callable, List, Optional
  13 import unicodedata
  14 from uuid import uuid4
  15
  16 logger = logging.getLogger(__name__)
  17
  18 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
  19
  20 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
  21
  22 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
  23
  24 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
  25
# Pieces of the URL pattern, concatenated in the order they appear in a URL.
# Every piece except the scheme and the domain is optional.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Anchored variant: the whole string must be a single URL.
URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

# Unanchored variant: the URL pattern wrapped in one extra capturing group.
URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)

# A run of "@" that is followed later in the string by a double quote, or a
# backslash-escaped "@". is_email() substitutes these before re-validating.
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

# Local part, "@", then a domain made of lowercase letters, digits and dashes
# ending in a 2-4 letter suffix. The patterns built from this string are
# compiled without re.IGNORECASE, so the domain part is case-sensitive.
EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"

# Anchored variant: the whole string must be a single e-mail address.
EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))

# Unanchored variant: the e-mail pattern wrapped in one capturing group.
EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
  49
# Letters and digits only, starting with a letter, with at least one
# lowercase/uppercase boundary somewhere in the string.
CAMEL_CASE_TEST_RE = re.compile(
    r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
)

# Captures the lowercase letter or uppercase run that sits right before an
# uppercase letter; camel_case_to_snake_case() appends the separator to it.
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# Snake case with "_": letters/digits with at least one underscore, not
# starting with a digit (case-insensitive).
SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

# Same shape with "-" as the separator. There is no leading "^"; callers in
# this file use .match(), which already anchors at the start of the string.
SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

# A separator followed by one letter or digit, captured as two groups.
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
  67
# Card-number patterns keyed by the card type names accepted by
# is_credit_card(). Each pattern is anchored at both ends and checks the
# leading digits and the total length only; no checksum is computed here.
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}
  76
# Cheap pre-check used by is_json(): the text (ignoring surrounding
# whitespace) starts with "[" or "{" and ends with "]" or "}".
JSON_WRAPPER_RE = re.compile(
    r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
)

# Canonical dashed UUID layout (8-4-4-4-12 hex digits), case-insensitive.
UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# Same layout with every dash optional, so the bare 32-digit hex form
# is accepted as well.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)
  89
# "Shallow" IPv4 check: four dot-separated groups of 1-3 digits. The 0-255
# range of each group is not checked here; is_ip_v4() does that afterwards.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

# Eight colon-separated groups of 0-4 characters (exactly seven colons).
# NOTE(review): the groups accept any [a-z\d] character, not only hex
# digits, and most "::"-compressed addresses do not match - confirm that
# this is the intended strictness.
IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

# Six hex byte pairs separated by ":" or "-". The pattern is anchored at the
# start only, so .match() accepts any text that merely begins with a MAC
# address.
MAC_ADDRESS_RE = re.compile(
    r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)
  97
# One "word": a run of letters/digits (underscore excluded) together with
# any non-word characters around it. words_count() counts the matches.
WORDS_COUNT_RE = re.compile(
    r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
)

# An opening tag optionally followed by content up to a closing tag, or an
# HTML comment, or a doctype declaration.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Tags only (opening, closing, comment, doctype); the text between tags is
# not part of the match.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# A single whitespace character.
SPACES_RE = re.compile(r"\s")

# A run of characters that are neither letters nor digits, or a run of
# underscores (which \w would otherwise treat as a word character).
NO_LETTERS_OR_NUMBERS_RE = re.compile(
    r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
)

# Leading whitespace of a string, excluding carriage returns and newlines.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")
 119
 120 ESCAPE_SEQUENCE_RE = re.compile(r"\e\[[^A-Za-z]*[A-Za-z]")
 121
# Size suffix -> multiplier (powers of 1024). Keys are spelled the way
# suffix_string_to_number() normalizes user input: first letter uppercase,
# optional second letter lowercase.
# Entries are listed largest first on purpose: number_to_suffix_string()
# walks the dict in insertion order and uses the first size that fits.
NUM_SUFFIXES = {
    "Pb": (1024 ** 5),
    "P": (1024 ** 5),
    "Tb": (1024 ** 4),
    "T": (1024 ** 4),
    "Gb": (1024 ** 3),
    "G": (1024 ** 3),
    "Mb": (1024 ** 2),
    "M": (1024 ** 2),
    "Kb": (1024 ** 1),
    "K": (1024 ** 1),
}
 134
 135
 136 def is_none_or_empty(in_str: Optional[str]) -> bool:
 137     return in_str is None or len(in_str.strip()) == 0
 138
 139
 140 def is_string(obj: Any) -> bool:
 141     """
 142     Checks if an object is a string.
 143     """
 144     return isinstance(obj, str)
 145
 146
 147 def is_empty_string(in_str: Any) -> bool:
 148     return is_string(in_str) and in_str.strip() == ""
 149
 150
 151 def is_full_string(in_str: Any) -> bool:
 152     return is_string(in_str) and in_str.strip() != ""
 153
 154
 155 def is_number(in_str: str) -> bool:
 156     """
 157     Checks if a string is a valid number.
 158     """
 159     if not is_string(in_str):
 160         raise ValueError(in_str)
 161     return NUMBER_RE.match(in_str) is not None
 162
 163
 164 def is_integer_number(in_str: str) -> bool:
 165     """
 166     Checks whether the given string represents an integer or not.
 167
 168     An integer may be signed or unsigned or use a "scientific notation".
 169
 170     *Examples:*
 171
 172     >>> is_integer('42') # returns true
 173     >>> is_integer('42.0') # returns false
 174     """
 175     return (
 176         (is_number(in_str) and "." not in in_str) or
 177         is_hexidecimal_integer_number(in_str) or
 178         is_octal_integer_number(in_str) or
 179         is_binary_integer_number(in_str)
 180     )
 181
 182
 183 def is_hexidecimal_integer_number(in_str: str) -> bool:
 184     if not is_string(in_str):
 185         raise ValueError(in_str)
 186     return HEX_NUMBER_RE.match(in_str) is not None
 187
 188
 189 def is_octal_integer_number(in_str: str) -> bool:
 190     if not is_string(in_str):
 191         raise ValueError(in_str)
 192     return OCT_NUMBER_RE.match(in_str) is not None
 193
 194
 195 def is_binary_integer_number(in_str: str) -> bool:
 196     if not is_string(in_str):
 197         raise ValueError(in_str)
 198     return BIN_NUMBER_RE.match(in_str) is not None
 199
 200
 201 def to_int(in_str: str) -> int:
 202     if not is_string(in_str):
 203         raise ValueError(in_str)
 204     if is_binary_integer_number(in_str):
 205         return int(in_str, 2)
 206     if is_octal_integer_number(in_str):
 207         return int(in_str, 8)
 208     if is_hexidecimal_integer_number(in_str):
 209         return int(in_str, 16)
 210     return int(in_str)
 211
 212
 213 def is_decimal_number(in_str: str) -> bool:
 214     """
 215     Checks whether the given string represents a decimal or not.
 216
 217     A decimal may be signed or unsigned or use a "scientific notation".
 218
 219     >>> is_decimal('42.0') # returns true
 220     >>> is_decimal('42') # returns false
 221     """
 222     return is_number(in_str) and "." in in_str
 223
 224
 225 def strip_escape_sequences(in_str: str) -> str:
 226     in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
 227     return in_str
 228
 229
 230 def add_thousands_separator(
 231         in_str: str,
 232         *,
 233         separator_char = ',',
 234         places = 3
 235 ) -> str:
 236     if isinstance(in_str, int):
 237         in_str = f'{in_str}'
 238     if is_number(in_str):
 239         return _add_thousands_separator(
 240             in_str,
 241             separator_char = separator_char,
 242             places = places
 243         )
 244     raise ValueError(in_str)
 245
 246
 247 def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
 248     decimal_part = ""
 249     if '.' in in_str:
 250         (in_str, decimal_part) = in_str.split('.')
 251     tmp = [iter(in_str[::-1])] * places
 252     ret = separator_char.join(
 253         "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
 254     if len(decimal_part) > 0:
 255         ret += '.'
 256         ret += decimal_part
 257     return ret
 258
 259
 260 # Full url example:
 261 # scheme://username:[email protected]:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
 262 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
 263     """
 264     Check if a string is a valid url.
 265
 266     *Examples:*
 267
 268     >>> is_url('http://www.mysite.com') # returns true
 269     >>> is_url('https://mysite.com') # returns true
 270     >>> is_url('.mysite.com') # returns false
 271     """
 272     if not is_full_string(in_str):
 273         return False
 274
 275     valid = URL_RE.match(in_str) is not None
 276
 277     if allowed_schemes:
 278         return valid and any([in_str.startswith(s) for s in allowed_schemes])
 279     return valid
 280
 281
 282 def is_email(in_str: Any) -> bool:
 283     """
 284     Check if a string is a valid email.
 285
 286     Reference: https://tools.ietf.org/html/rfc3696#section-3
 287
 288     *Examples:*
 289
 290     >>> is_email('[email protected]') # returns true
 291     >>> is_email('@gmail.com') # returns false
 292     """
 293     if (
 294         not is_full_string(in_str)
 295         or len(in_str) > 320
 296         or in_str.startswith(".")
 297     ):
 298         return False
 299
 300     try:
 301         # we expect 2 tokens, one before "@" and one after, otherwise
 302         # we have an exception and the email is not valid.
 303         head, tail = in_str.split("@")
 304
 305         # head's size must be <= 64, tail <= 255, head must not start
 306         # with a dot or contain multiple consecutive dots.
 307         if (
 308             len(head) > 64
 309             or len(tail) > 255
 310             or head.endswith(".")
 311             or (".." in head)
 312         ):
 313             return False
 314
 315         # removes escaped spaces, so that later on the test regex will
 316         # accept the string.
 317         head = head.replace("\\ ", "")
 318         if head.startswith('"') and head.endswith('"'):
 319             head = head.replace(" ", "")[1:-1]
 320         return EMAIL_RE.match(head + "@" + tail) is not None
 321
 322     except ValueError:
 323         # borderline case in which we have multiple "@" signs but the
 324         # head part is correctly escaped.
 325         if ESCAPED_AT_SIGN.search(in_str) is not None:
 326             # replace "@" with "a" in the head
 327             return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
 328         return False
 329
 330
 331 def suffix_string_to_number(in_str: str) -> Optional[int]:
 332     """Take a string like "33Gb" and convert it into a number (of bytes)
 333     like 34603008.  Return None if the input string is not valid.
 334     """
 335
 336     def suffix_capitalize(s: str) -> str:
 337         if len(s) == 1:
 338             return s.upper()
 339         elif len(s) == 2:
 340             return f"{s[0].upper()}{s[1].lower()}"
 341         return suffix_capitalize(s[0:1])
 342
 343     if is_string(in_str):
 344         if is_integer_number(in_str):
 345             return to_int(in_str)
 346         suffixes = [in_str[-2:], in_str[-1:]]
 347         rest = [in_str[:-2], in_str[:-1]]
 348         for x in range(len(suffixes)):
 349             s = suffixes[x]
 350             s = suffix_capitalize(s)
 351             multiplier = NUM_SUFFIXES.get(s, None)
 352             if multiplier is not None:
 353                 r = rest[x]
 354                 if is_integer_number(r):
 355                     return int(r) * multiplier
 356     return None
 357
 358
 359 def number_to_suffix_string(num: int) -> Optional[str]:
 360     """Take a number (of bytes) and returns a string like "43.8Gb".
 361     Returns none if the input is invalid.
 362     """
 363     d = 0.0
 364     suffix = None
 365     for (sfx, size) in NUM_SUFFIXES.items():
 366         if num >= size:
 367             d = num / size
 368             suffix = sfx
 369             break
 370     if suffix is not None:
 371         return f"{d:.1f}{suffix}"
 372     else:
 373         return f'{num:d}'
 374
 375
 376 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
 377     """
 378     Checks if a string is a valid credit card number.
 379     If card type is provided then it checks against that specific type only,
 380     otherwise any known credit card number will be accepted.
 381
 382     Supported card types are the following:
 383
 384     - VISA
 385     - MASTERCARD
 386     - AMERICAN_EXPRESS
 387     - DINERS_CLUB
 388     - DISCOVER
 389     - JCB
 390     """
 391     if not is_full_string(in_str):
 392         return False
 393
 394     if card_type is not None:
 395         if card_type not in CREDIT_CARDS:
 396             raise KeyError(
 397                 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
 398             )
 399         return CREDIT_CARDS[card_type].match(in_str) is not None
 400     for c in CREDIT_CARDS:
 401         if CREDIT_CARDS[c].match(in_str) is not None:
 402             return True
 403     return False
 404
 405
 406 def is_camel_case(in_str: Any) -> bool:
 407     """
 408     Checks if a string is formatted as camel case.
 409
 410     A string is considered camel case when:
 411
 412     - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
 413     - it contains both lowercase and uppercase letters
 414     - it does not start with a number
 415     """
 416     return (
 417         is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
 418     )
 419
 420
 421 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
 422     """
 423     Checks if a string is formatted as "snake case".
 424
 425     A string is considered snake case when:
 426
 427     - it's composed only by lowercase/uppercase letters and digits
 428     - it contains at least one underscore (or provided separator)
 429     - it does not start with a number
 430     """
 431     if is_full_string(in_str):
 432         re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
 433         re_template = (
 434             r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
 435         )
 436         r = re_map.get(
 437             separator,
 438             re.compile(
 439                 re_template.format(sign=re.escape(separator)), re.IGNORECASE
 440             ),
 441         )
 442         return r.match(in_str) is not None
 443     return False
 444
 445
 446 def is_json(in_str: Any) -> bool:
 447     """
 448     Check if a string is a valid json.
 449
 450     *Examples:*
 451
 452     >>> is_json('{"name": "Peter"}') # returns true
 453     >>> is_json('[1, 2, 3]') # returns true
 454     >>> is_json('{nope}') # returns false
 455     """
 456     if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
 457         try:
 458             return isinstance(json.loads(in_str), (dict, list))
 459         except (TypeError, ValueError, OverflowError):
 460             pass
 461     return False
 462
 463
 464 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
 465     """
 466     Check if a string is a valid UUID.
 467
 468     *Example:*
 469
 470     >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf') # returns true
 471     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf') # returns false
 472     >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True) # returns true
 473     """
 474     # string casting is used to allow UUID itself as input data type
 475     s = str(in_str)
 476     if allow_hex:
 477         return UUID_HEX_OK_RE.match(s) is not None
 478     return UUID_RE.match(s) is not None
 479
 480
 481 def is_ip_v4(in_str: Any) -> bool:
 482     """
 483     Checks if a string is a valid ip v4.
 484
 485     *Examples:*
 486
 487     >>> is_ip_v4('255.200.100.75') # returns true
 488     >>> is_ip_v4('nope') # returns false (not an ip)
 489     >>> is_ip_v4('255.200.100.999') # returns false (999 is out of range)
 490     """
 491     if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
 492         return False
 493
 494     # checks that each entry in the ip is in the valid range (0 to 255)
 495     for token in in_str.split("."):
 496         if not 0 <= int(token) <= 255:
 497             return False
 498     return True
 499
 500
 501 def extract_ip_v4(in_str: Any) -> Optional[str]:
 502     """
 503     Extracts the IPv4 chunk of a string or None.
 504     """
 505     if not is_full_string(in_str):
 506         return None
 507     in_str.strip()
 508     m = SHALLOW_IP_V4_RE.match(in_str)
 509     if m is not None:
 510         return m.group(0)
 511     return None
 512
 513
 514 def is_ip_v6(in_str: Any) -> bool:
 515     """
 516     Checks if a string is a valid ip v6.
 517
 518     *Examples:*
 519
 520     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334') # returns true
 521     >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?') # returns false (invalid "?")
 522     """
 523     return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
 524
 525
 526 def extract_ip_v6(in_str: Any) -> Optional[str]:
 527     """
 528     Extract IPv6 chunk or None.
 529     """
 530     if not is_full_string(in_str):
 531         return None
 532     in_str.strip()
 533     m = IP_V6_RE.match(in_str)
 534     if m is not None:
 535         return m.group(0)
 536     return None
 537
 538
 539 def is_ip(in_str: Any) -> bool:
 540     """
 541     Checks if a string is a valid ip (either v4 or v6).
 542
 543     *Examples:*
 544
 545     >>> is_ip('255.200.100.75') # returns true
 546     >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334') # returns true
 547     >>> is_ip('1.2.3') # returns false
 548     """
 549     return is_ip_v6(in_str) or is_ip_v4(in_str)
 550
 551
 552 def extract_ip(in_str: Any) -> Optional[str]:
 553     """Extract the IP address or None."""
 554     ip = extract_ip_v4(in_str)
 555     if ip is None:
 556         ip = extract_ip_v6(in_str)
 557     return ip
 558
 559
 560 def is_mac_address(in_str: Any) -> bool:
 561     """Return True if in_str is a valid MAC address false otherwise."""
 562     return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
 563
 564
 565 def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
 566     """Extract the MAC address from in_str"""
 567     if not is_full_string(in_str):
 568         return None
 569     in_str.strip()
 570     m = MAC_ADDRESS_RE.match(in_str)
 571     if m is not None:
 572         mac = m.group(0)
 573         mac.replace(":", separator)
 574         mac.replace("-", separator)
 575         return mac
 576     return None
 577
 578
 579 def is_slug(in_str: Any, separator: str = "-") -> bool:
 580     """
 581     Checks if a given string is a slug (as created by `slugify()`).
 582
 583     *Examples:*
 584
 585     >>> is_slug('my-blog-post-title') # returns true
 586     >>> is_slug('My blog post title') # returns false
 587
 588     :param in_str: String to check.
 589     :type in_str: str
 590     :param separator: Join sign used by the slug.
 591     :type separator: str
 592     :return: True if slug, false otherwise.
 593     """
 594     if not is_full_string(in_str):
 595         return False
 596     rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
 597     return re.match(rex, in_str) is not None
 598
 599
 600 def contains_html(in_str: str) -> bool:
 601     """
 602     Checks if the given string contains HTML/XML tags.
 603
 604     By design, this function matches ANY type of tag, so don't expect to use it
 605     as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
 606
 607     *Examples:*
 608
 609     >>> contains_html('my string is <strong>bold</strong>') # returns true
 610     >>> contains_html('my string is not bold') # returns false
 611     """
 612     if not is_string(in_str):
 613         raise ValueError(in_str)
 614     return HTML_RE.search(in_str) is not None
 615
 616
 617 def words_count(in_str: str) -> int:
 618     """
 619     Returns the number of words contained into the given string.
 620
 621     This method is smart, it does consider only sequence of one or more letter and/or numbers
 622     as "words", so a string like this: "! @ # % ... []" will return zero!
 623     Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
 624     will be 4 not 1 (even if there are no spaces in the string).
 625
 626     *Examples:*
 627
 628     >>> words_count('hello world') # returns 2
 629     >>> words_count('one,two,three.stop') # returns 4
 630     """
 631     if not is_string(in_str):
 632         raise ValueError(in_str)
 633     return len(WORDS_COUNT_RE.findall(in_str))
 634
 635
 636 def generate_uuid(as_hex: bool = False) -> str:
 637     """
 638     Generated an UUID string (using `uuid.uuid4()`).
 639
 640     *Examples:*
 641
 642     >>> uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
 643     >>> uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
 644     """
 645     uid = uuid4()
 646     if as_hex:
 647         return uid.hex
 648     return str(uid)
 649
 650
 651 def generate_random_alphanumeric_string(size: int) -> str:
 652     """
 653     Returns a string of the specified size containing random
 654     characters (uppercase/lowercase ascii letters and digits).
 655
 656     *Example:*
 657
 658     >>> random_string(9) # possible output: "cx3QQbzYg"
 659     """
 660     if size < 1:
 661         raise ValueError("size must be >= 1")
 662     chars = string.ascii_letters + string.digits
 663     buffer = [random.choice(chars) for _ in range(size)]
 664     return from_char_list(buffer)
 665
 666
 667 def reverse(in_str: str) -> str:
 668     """
 669     Returns the string with its chars reversed.
 670     """
 671     if not is_string(in_str):
 672         raise ValueError(in_str)
 673     return in_str[::-1]
 674
 675
 676 def camel_case_to_snake_case(in_str, *, separator="_"):
 677     """
 678     Convert a camel case string into a snake case one.
 679     (The original string is returned if is not a valid camel case string)
 680     """
 681     if not is_string(in_str):
 682         raise ValueError(in_str)
 683     if not is_camel_case(in_str):
 684         return in_str
 685     return CAMEL_CASE_REPLACE_RE.sub(
 686         lambda m: m.group(1) + separator, in_str
 687     ).lower()
 688
 689
 690 def snake_case_to_camel_case(
 691     in_str: str, *, upper_case_first: bool = True, separator: str = "_"
 692 ) -> str:
 693     """
 694     Convert a snake case string into a camel case one.
 695     (The original string is returned if is not a valid snake case string)
 696     """
 697     if not is_string(in_str):
 698         raise ValueError(in_str)
 699     if not is_snake_case(in_str, separator=separator):
 700         return in_str
 701     tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
 702     if not upper_case_first:
 703         tokens[0] = tokens[0].lower()
 704     return from_char_list(tokens)
 705
 706
 707 def to_char_list(in_str: str) -> List[str]:
 708     if not is_string(in_str):
 709         return []
 710     return list(in_str)
 711
 712
 713 def from_char_list(in_list: List[str]) -> str:
 714     return "".join(in_list)
 715
 716
 717 def shuffle(in_str: str) -> str:
 718     """Return a new string containing same chars of the given one but in
 719     a randomized order.
 720     """
 721     if not is_string(in_str):
 722         raise ValueError(in_str)
 723
 724     # turn the string into a list of chars
 725     chars = to_char_list(in_str)
 726     random.shuffle(chars)
 727     return from_char_list(chars)
 728
 729
 730 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
 731     """
 732     Remove html code contained into the given string.
 733
 734     *Examples:*
 735
 736     >>> strip_html('test: <a href="foo/bar">click here</a>') # returns 'test: '
 737     >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True) # returns 'test: click here'
 738     """
 739     if not is_string(in_str):
 740         raise ValueError(in_str)
 741     r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
 742     return r.sub("", in_str)
 743
 744
 745 def asciify(in_str: str) -> str:
 746     """
 747     Force string content to be ascii-only by translating all non-ascii chars into the closest possible representation
 748     (eg: ó -> o, Ë -> E, ç -> c...).
 749
 750     **Bear in mind**: Some chars may be lost if impossible to translate.
 751
 752     *Example:*
 753
 754     >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË') # returns 'eeuuooaaeynAAACIINOE'
 755     """
 756     if not is_string(in_str):
 757         raise ValueError(in_str)
 758
 759     # "NFKD" is the algorithm which is able to successfully translate
 760     # the most of non-ascii chars.
 761     normalized = unicodedata.normalize("NFKD", in_str)
 762
 763     # encode string forcing ascii and ignore any errors
 764     # (unrepresentable chars will be stripped out)
 765     ascii_bytes = normalized.encode("ascii", "ignore")
 766
 767     # turns encoded bytes into an utf-8 string
 768     return ascii_bytes.decode("utf-8")
 769
 770
 771 def slugify(in_str: str, *, separator: str = "-") -> str:
 772     """
 773     Converts a string into a "slug" using provided separator.
 774     The returned string has the following properties:
 775
 776     - it has no spaces
 777     - all letters are in lower case
 778     - all punctuation signs and non alphanumeric chars are removed
 779     - words are divided using provided separator
 780     - all chars are encoded as ascii (by using `asciify()`)
 781     - is safe for URL
 782
 783     *Examples:*
 784
 785     >>> slugify('Top 10 Reasons To Love Dogs!!!') # returns: 'top-10-reasons-to-love-dogs'
 786     >>> slugify('Mönstér Mägnët') # returns 'monster-magnet'
 787     """
 788     if not is_string(in_str):
 789         raise ValueError(in_str)
 790
 791     # replace any character that is NOT letter or number with spaces
 792     out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()
 793
 794     # replace spaces with join sign
 795     out = SPACES_RE.sub(separator, out)
 796
 797     # normalize joins (remove duplicates)
 798     out = re.sub(re.escape(separator) + r"+", separator, out)
 799     return asciify(out)
 800
 801
 802 def to_bool(in_str: str) -> bool:
 803     """
 804     Turns a string into a boolean based on its content (CASE INSENSITIVE).
 805
 806     A positive boolean (True) is returned if the string value is one of the following:
 807
 808     - "true"
 809     - "1"
 810     - "yes"
 811     - "y"
 812
 813     Otherwise False is returned.
 814     """
 815     if not is_string(in_str):
 816         raise ValueError(in_str)
 817     return in_str.lower() in ("true", "1", "yes", "y", "t")
 818
 819
 820 def to_date(in_str: str) -> Optional[datetime.date]:
 821     import dateparse.dateparse_utils as dp
 822     try:
 823         d = dp.DateParser()
 824         d.parse(in_str)
 825         return d.get_date()
 826     except dp.ParseException:
 827         logger.warning(f'Unable to parse date {in_str}.')
 828     return None
 829
 830
 831 def valid_date(in_str: str) -> bool:
 832     import dateparse.dateparse_utils as dp
 833     try:
 834         d = dp.DateParser()
 835         _ = d.parse(in_str)
 836         return True
 837     except dp.ParseException:
 838         logger.warning(f'Unable to parse date {in_str}.')
 839     return False
 840
 841
 842 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
 843     import dateparse.dateparse_utils as dp
 844     try:
 845         d = dp.DateParser()
 846         dt = d.parse(in_str)
 847         if type(dt) == datetime.datetime:
 848             return dt
 849     except ValueError:
 850         logger.warning(f'Unable to parse datetime {in_str}.')
 851     return None
 852
 853
 854 def valid_datetime(in_str: str) -> bool:
 855     _ = to_datetime(in_str)
 856     if _ is not None:
 857         return True
 858     logger.warning(f'Unable to parse datetime {in_str}.')
 859     return False
 860
 861
 862 def dedent(in_str: str) -> str:
 863     """
 864     Removes tab indentation from multi line strings (inspired by analogous Scala function).
 865
 866     *Example:*
 867
 868     >>> strip_margin('''
 869     >>>                 line 1
 870     >>>                 line 2
 871     >>>                 line 3
 872     >>> ''')
 873     >>> # returns:
 874     >>> '''
 875     >>> line 1
 876     >>> line 2
 877     >>> line 3
 878     >>> '''
 879     """
 880     if not is_string(in_str):
 881         raise ValueError(in_str)
 882     line_separator = '\n'
 883     lines = [MARGIN_RE.sub('', line) for line in in_str.split(line_separator)]
 884     return line_separator.join(lines)
 885
 886
 887 def indent(in_str: str, amount: int) -> str:
 888     if not is_string(in_str):
 889         raise ValueError(in_str)
 890     line_separator = '\n'
 891     lines = [" " * amount + line for line in in_str.split(line_separator)]
 892     return line_separator.join(lines)
 893
 894
 895 def sprintf(*args, **kwargs) -> str:
 896     ret = ""
 897
 898     sep = kwargs.pop("sep", None)
 899     if sep is not None:
 900         if not isinstance(sep, str):
 901             raise TypeError("sep must be None or a string")
 902
 903     end = kwargs.pop("end", None)
 904     if end is not None:
 905         if not isinstance(end, str):
 906             raise TypeError("end must be None or a string")
 907
 908     if kwargs:
 909         raise TypeError("invalid keyword arguments to sprint()")
 910
 911     if sep is None:
 912         sep = " "
 913     if end is None:
 914         end = "\n"
 915     for i, arg in enumerate(args):
 916         if i:
 917             ret += sep
 918         if isinstance(arg, str):
 919             ret += arg
 920         else:
 921             ret += str(arg)
 922     ret += end
 923     return ret
 924
 925
 926 class SprintfStdout(object):
 927     def __init__(self) -> None:
 928         self.destination = io.StringIO()
 929         self.recorder = None
 930
 931     def __enter__(self) -> Callable[[], str]:
 932         self.recorder = contextlib.redirect_stdout(self.destination)
 933         self.recorder.__enter__()
 934         return lambda: self.destination.getvalue()
 935
 936     def __exit__(self, *args) -> None:
 937         self.recorder.__exit__(*args)
 938         self.destination.seek(0)
 939         return None  # don't suppress exceptions
 940
 941
 942 def is_are(n: int) -> str:
 943     if n == 1:
 944         return "is"
 945     return "are"
 946
 947
 948 def pluralize(n: int) -> str:
 949     if n == 1:
 950         return ""
 951     return "s"
 952
 953
 954 def thify(n: int) -> str:
 955     digit = str(n)
 956     assert is_integer_number(digit)
 957     digit = digit[-1:]
 958     if digit == "1":
 959         return "st"
 960     elif digit == "2":
 961         return "nd"
 962     elif digit == "3":
 963         return "rd"
 964     else:
 965         return "th"