7 from typing import Any, List, Optional
# Signed decimal number: digits with optional fraction and optional exponent,
# or a bare fraction such as ".5".
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Signed integer literals with a 0x / 0o / 0b prefix (either letter case).
# NOTE: inside a character class "|" is a literal pipe, not alternation, so
# the classes list only the characters that are really allowed.
HEX_NUMBER_RE = re.compile(r"^([+\-]?)0[xX]([0-9A-Fa-f]+)$")

OCT_NUMBER_RE = re.compile(r"^([+\-]?)0[oO]([0-7]+)$")

BIN_NUMBER_RE = re.compile(r"^([+\-]?)0[bB]([01]+)$")
20 r"([a-z-]+://)" # scheme
21 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
23 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
24 r"(:\d{2,})?" # port number
25 r"(/[a-z\d_%+-]*)*" # folders
26 r"(\.[a-z\d_%+-]+)*" # file extension
27 r"(\?[a-z\d_+%-=]*)?" # query string
31 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
33 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
35 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
37 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
39 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
41 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
43 CAMEL_CASE_TEST_RE = re.compile(
44 r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
47 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
49 SNAKE_CASE_TEST_RE = re.compile(
50 r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
53 SNAKE_CASE_TEST_DASH_RE = re.compile(
54 r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
57 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
59 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
62 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
63 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
64 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
65 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
66 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
67 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
70 JSON_WRAPPER_RE = re.compile(
71 r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
75 r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
78 UUID_HEX_OK_RE = re.compile(
79 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
83 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
# Eight colon-separated groups of up to four hexadecimal digits each.
# Only hex digits (0-9, a-f, case-insensitive) are valid inside a group.
IP_V6_RE = re.compile(r"^([a-f\d]{0,4}:){7}[a-f\d]{0,4}$", re.IGNORECASE)
87 MAC_ADDRESS_RE = re.compile(
88 r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
91 WORDS_COUNT_RE = re.compile(
92 r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
96 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
97 re.IGNORECASE | re.MULTILINE | re.DOTALL,
100 HTML_TAG_ONLY_RE = re.compile(
101 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
102 re.IGNORECASE | re.MULTILINE | re.DOTALL,
105 SPACES_RE = re.compile(r"\s")
107 NO_LETTERS_OR_NUMBERS_RE = re.compile(
108 r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
111 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
# Matches an escape sequence: the ESC character (0x1b), "[", any run of
# non-letters, and one terminating letter.  ESC is spelled "\x1b" because
# "\e" is not an escape the re module understands.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Tell whether the given value is None or a blank string.

    :param in_str: String to check (may be None).
    :return: True if in_str is None or contains only whitespace.
    """
    if in_str is None:
        return True
    # A string that strips down to nothing is considered empty.
    return not in_str.strip()
def is_string(obj: Any) -> bool:
    """
    Tell whether the given object is a string.

    :param obj: Object to test.
    :return: True if obj is an instance of str, False otherwise.
    """
    outcome = isinstance(obj, str)
    return outcome
def is_empty_string(in_str: Any) -> bool:
    """
    Tell whether the given object is a string made only of whitespace
    (or of nothing at all).

    :param in_str: Object to test.
    :return: True for a blank string, False for anything else
        (including non-strings).
    """
    if not is_string(in_str):
        return False
    return in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """
    Tell whether the given object is a string holding at least one
    non-whitespace character.

    :param in_str: Object to test.
    :return: True for a non-blank string, False for anything else
        (including non-strings).
    """
    if not is_string(in_str):
        return False
    return in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """
    Tell whether a string is a valid number according to NUMBER_RE.

    :param in_str: String to check.
    :return: True if the whole string matches NUMBER_RE.
    :raises ValueError: If in_str is not a string.
    """
    if is_string(in_str):
        return NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """
    Tell whether the given string represents an integer.

    A string qualifies when it is a number (see `is_number()`) without a
    decimal point, or when it is a hexadecimal, octal or binary integer
    literal.

    >>> is_integer_number('42') # returns true
    >>> is_integer_number('42.0') # returns false

    :param in_str: String to check.
    :return: True if in_str represents an integer, False otherwise.
    :raises ValueError: If in_str is not a string.
    """
    # Plain (decimal) notation first: a number with no "." in it.
    if is_number(in_str) and "." not in in_str:
        return True

    # Otherwise try the prefixed notations, in the same order as before.
    prefixed_checks = (
        is_hexidecimal_integer_number,
        is_octal_integer_number,
        is_binary_integer_number,
    )
    return any(check(in_str) for check in prefixed_checks)
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is a hexadecimal integer literal (HEX_NUMBER_RE).

    :param in_str: String to check.
    :return: True if in_str matches HEX_NUMBER_RE.
    :raises ValueError: If in_str is not a string.
    """
    if is_string(in_str):
        return HEX_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is an octal integer literal (OCT_NUMBER_RE).

    :param in_str: String to check.
    :return: True if in_str matches OCT_NUMBER_RE.
    :raises ValueError: If in_str is not a string.
    """
    if is_string(in_str):
        return OCT_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is a binary integer literal (BIN_NUMBER_RE).

    :param in_str: String to check.
    :return: True if in_str matches BIN_NUMBER_RE.
    :raises ValueError: If in_str is not a string.
    """
    if is_string(in_str):
        return BIN_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
194 def to_int(in_str: str) -> int:
195 if not is_string(in_str):
196 raise ValueError(in_str)
197 if is_binary_integer_number(in_str):
198 return int(in_str, 2)
199 if is_octal_integer_number(in_str):
200 return int(in_str, 8)
201 if is_hexidecimal_integer_number(in_str):
202 return int(in_str, 16)
def is_decimal_number(in_str: str) -> bool:
    """
    Tell whether the given string represents a decimal number, i.e. a
    number (see `is_number()`) that contains a decimal point.

    >>> is_decimal_number('42.0') # returns true
    >>> is_decimal_number('42') # returns false

    :param in_str: String to check.
    :return: True if in_str is a number containing ".", False otherwise.
    :raises ValueError: If in_str is not a string.
    """
    if not is_number(in_str):
        return False
    return "." in in_str
218 def strip_escape_sequences(in_str: str) -> str:
219 in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
225 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
227 Check if a string is a valid url.
231 >>> is_url('http://www.mysite.com') # returns true
232 >>> is_url('https://mysite.com') # returns true
233 >>> is_url('.mysite.com') # returns false
235 if not is_full_string(in_str):
238 valid = URL_RE.match(in_str) is not None
241 return valid and any([in_str.startswith(s) for s in allowed_schemes])
245 def is_email(in_str: Any) -> bool:
247 Check if a string is a valid email.
249 Reference: https://tools.ietf.org/html/rfc3696#section-3
254 >>> is_email('@gmail.com') # returns false
257 not is_full_string(in_str)
259 or in_str.startswith(".")
264 # we expect 2 tokens, one before "@" and one after, otherwise
265 # we have an exception and the email is not valid.
266 head, tail = in_str.split("@")
268 # head's size must be <= 64, tail <= 255, head must not start
269 # with a dot or contain multiple consecutive dots.
273 or head.endswith(".")
278 # removes escaped spaces, so that later on the test regex will
280 head = head.replace("\\ ", "")
281 if head.startswith('"') and head.endswith('"'):
282 head = head.replace(" ", "")[1:-1]
283 return EMAIL_RE.match(head + "@" + tail) is not None
286 # borderline case in which we have multiple "@" signs but the
287 # head part is correctly escaped.
288 if ESCAPED_AT_SIGN.search(in_str) is not None:
289 # replace "@" with "a" in the head
290 return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
294 def suffix_string_to_number(in_str: str) -> Optional[int]:
295 """Take a string like "33Gb" and convert it into a number (of bytes)
296 like 34603008. Return None if the input string is not valid.
299 def suffix_capitalize(s: str) -> str:
303 return f"{s[0].upper()}{s[1].lower()}"
304 return suffix_capitalize(s[0:1])
306 if is_string(in_str):
307 if is_integer_number(in_str):
308 return to_int(in_str)
309 suffixes = [in_str[-2:], in_str[-1:]]
310 rest = [in_str[:-2], in_str[:-1]]
311 for x in range(len(suffixes)):
313 s = suffix_capitalize(s)
314 multiplier = NUM_SUFFIXES.get(s, None)
315 if multiplier is not None:
317 if is_integer_number(r):
318 return int(r) * multiplier
322 def number_to_suffix_string(num: int) -> Optional[str]:
323 """Take a number (of bytes) and returns a string like "43.8Gb".
324 Returns none if the input is invalid.
328 for (sfx, size) in NUM_SUFFIXES.items():
333 if suffix is not None:
334 return f"{d:.1f}{suffix}"
338 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
340 Checks if a string is a valid credit card number.
341 If card type is provided then it checks against that specific type only,
342 otherwise any known credit card number will be accepted.
344 Supported card types are the following:
353 if not is_full_string(in_str):
356 if card_type is not None:
357 if card_type not in CREDIT_CARDS:
359 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
361 return CREDIT_CARDS[card_type].match(in_str) is not None
362 for c in CREDIT_CARDS:
363 if CREDIT_CARDS[c].match(in_str) is not None:
def is_camel_case(in_str: Any) -> bool:
    """
    Tell whether a string is formatted as camel case.

    A string is considered camel case when:

    - it is composed only of letters ([a-zA-Z]) and optionally digits ([0-9])
    - it contains both lowercase and uppercase letters
    - it does not start with a digit

    :param in_str: String to test.
    :return: True if in_str is a non-blank string matching
        CAMEL_CASE_TEST_RE, False otherwise.
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
383 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
385 Checks if a string is formatted as "snake case".
387 A string is considered snake case when:
389 - it's composed only by lowercase/uppercase letters and digits
390 - it contains at least one underscore (or provided separator)
391 - it does not start with a number
393 if is_full_string(in_str):
394 re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
396 r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
401 re_template.format(sign=re.escape(separator)), re.IGNORECASE
404 return r.match(in_str) is not None
408 def is_json(in_str: Any) -> bool:
410 Check if a string is a valid json.
414 >>> is_json('{"name": "Peter"}') # returns true
415 >>> is_json('[1, 2, 3]') # returns true
416 >>> is_json('{nope}') # returns false
418 if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
420 return isinstance(json.loads(in_str), (dict, list))
421 except (TypeError, ValueError, OverflowError):
426 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
428 Check if a string is a valid UUID.
432 >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf') # returns true
433 >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf') # returns false
434 >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True) # returns true
436 # string casting is used to allow UUID itself as input data type
439 return UUID_HEX_OK_RE.match(s) is not None
440 return UUID_RE.match(s) is not None
443 def is_ip_v4(in_str: Any) -> bool:
445 Checks if a string is a valid ip v4.
449 >>> is_ip_v4('255.200.100.75') # returns true
450 >>> is_ip_v4('nope') # returns false (not an ip)
451 >>> is_ip_v4('255.200.100.999') # returns false (999 is out of range)
453 if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
456 # checks that each entry in the ip is in the valid range (0 to 255)
457 for token in in_str.split("."):
458 if not 0 <= int(token) <= 255:
463 def extract_ip_v4(in_str: Any) -> Optional[str]:
465 Extracts the IPv4 chunk of a string or None.
467 if not is_full_string(in_str):
470 m = SHALLOW_IP_V4_RE.match(in_str)
def is_ip_v6(in_str: Any) -> bool:
    """
    Tell whether a string is a valid IPv6 address according to IP_V6_RE.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334') # returns true
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?') # returns false (invalid "?")

    :param in_str: String to check.
    :return: True if in_str is a non-blank string matching IP_V6_RE.
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
488 def extract_ip_v6(in_str: Any) -> Optional[str]:
490 Extract IPv6 chunk or None.
492 if not is_full_string(in_str):
495 m = IP_V6_RE.match(in_str)
def is_ip(in_str: Any) -> bool:
    """
    Tell whether a string is a valid IP address (either v4 or v6).

    >>> is_ip('255.200.100.75') # returns true
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334') # returns true
    >>> is_ip('1.2.3') # returns false

    :param in_str: String to check.
    :return: True if in_str passes `is_ip_v6()` or `is_ip_v4()`.
    """
    # The v6 check runs first, exactly as before; v4 is the fallback.
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
514 def extract_ip(in_str: Any) -> Optional[str]:
515 """Extract the IP address or None."""
516 ip = extract_ip_v4(in_str)
518 ip = extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """
    Return True if in_str is a valid MAC address, False otherwise.

    :param in_str: String to check.
    :return: True only when the *entire* string is a MAC address.
    """
    if not is_full_string(in_str):
        return False
    # MAC_ADDRESS_RE has no end anchor, so a plain match() would also accept
    # strings that merely start with a MAC address (e.g. with extra octets or
    # trailing garbage).  fullmatch() requires the whole string to match.
    return MAC_ADDRESS_RE.fullmatch(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the MAC address found at the start of in_str.

    :param in_str: String expected to begin with a MAC address.
    :param separator: Separator to use between octets in the returned value;
        both ":" and "-" in the matched text are replaced by it.
    :return: The MAC address rewritten with the requested separator, or None
        if in_str is not a non-blank string or does not start with one.
    """
    if not is_full_string(in_str):
        return None

    m = MAC_ADDRESS_RE.match(in_str)
    if m is None:
        return None

    mac = m.group(0)
    # str.replace() returns a new string and never changes "mac" in place, so
    # the result must be used.  translate() rewrites both separators in one
    # pass, so a custom separator is never itself re-replaced.
    return mac.translate({ord(":"): separator, ord("-"): separator})
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Checks if a given string is a slug (as created by `slugify()`).

    >>> is_slug('my-blog-post-title') # returns true
    >>> is_slug('My blog post title') # returns false

    :param in_str: String to check.
    :param separator: Join sign used by the slug.
    :return: True if slug, false otherwise.
    """
    if not is_full_string(in_str):
        return False

    token = r"[a-z\d]+"
    if separator:
        # One or more whole separators between alphanumeric tokens.  The
        # separator is grouped so a multi-character one is repeated as a unit,
        # and tokens are delimited by it, so the pattern cannot backtrack
        # exponentially the way nested "(x+sep*?)*" quantifiers do.
        joiner = "(?:" + re.escape(separator) + ")+"
        rex = "^" + token + "(?:" + joiner + token + ")*$"
    else:
        # No separator: a slug is just one lowercase alphanumeric token.
        rex = "^" + token + "$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Tell whether the given string contains HTML/XML tags.

    By design this matches ANY kind of tag, so it is not an HTML validator:
    its goal is to detect "malicious" or undesired tags in the text.

    >>> contains_html('my string is <strong>bold</strong>') # returns true
    >>> contains_html('my string is not bold') # returns false

    :param in_str: Text to inspect.
    :return: True if HTML_RE finds a tag anywhere in in_str.
    :raises ValueError: If in_str is not a string.
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
def words_count(in_str: str) -> int:
    """
    Return the number of words contained in the given string.

    Only sequences of one or more letters and/or digits count as "words",
    so a string like "! @ # % ... []" yields zero.  Punctuation also splits
    words, so "one,two,three.stop" counts as 4, not 1, even though it
    contains no spaces.

    >>> words_count('hello world') # returns 2
    >>> words_count('one,two,three.stop') # returns 4

    :param in_str: Text to analyse.
    :return: Number of matches of WORDS_COUNT_RE in in_str.
    :raises ValueError: If in_str is not a string.
    """
    if is_string(in_str):
        found = WORDS_COUNT_RE.findall(in_str)
        return len(found)
    raise ValueError(in_str)
598 def generate_uuid(as_hex: bool = False) -> str:
600 Generated an UUID string (using `uuid.uuid4()`).
604 >>> uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
605 >>> uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
613 def generate_random_alphanumeric_string(size: int) -> str:
615 Returns a string of the specified size containing random
616 characters (uppercase/lowercase ascii letters and digits).
620 >>> random_string(9) # possible output: "cx3QQbzYg"
623 raise ValueError("size must be >= 1")
624 chars = string.ascii_letters + string.digits
625 buffer = [random.choice(chars) for _ in range(size)]
626 return from_char_list(buffer)
629 def reverse(in_str: str) -> str:
631 Returns the string with its chars reversed.
633 if not is_string(in_str):
634 raise ValueError(in_str)
638 def camel_case_to_snake_case(in_str, *, separator="_"):
640 Convert a camel case string into a snake case one.
641 (The original string is returned if is not a valid camel case string)
643 if not is_string(in_str):
644 raise ValueError(in_str)
645 if not is_camel_case(in_str):
647 return CAMEL_CASE_REPLACE_RE.sub(
648 lambda m: m.group(1) + separator, in_str
652 def snake_case_to_camel_case(
653 in_str: str, *, upper_case_first: bool = True, separator: str = "_"
656 Convert a snake case string into a camel case one.
657 (The original string is returned if is not a valid snake case string)
659 if not is_string(in_str):
660 raise ValueError(in_str)
661 if not is_snake_case(in_str, separator=separator):
663 tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
664 if not upper_case_first:
665 tokens[0] = tokens[0].lower()
666 return from_char_list(tokens)
669 def to_char_list(in_str: str) -> List[str]:
670 if not is_string(in_str):
def from_char_list(in_list: List[str]) -> str:
    """
    Concatenate a list of strings (typically single characters) into one
    string.

    :param in_list: Strings to join, in order.
    :return: The concatenation of all items, with nothing in between.
    """
    glue = ""
    return glue.join(in_list)
def shuffle(in_str: str) -> str:
    """
    Return a new string with the same characters as the given one, in
    random order.

    :param in_str: String to shuffle.
    :return: The shuffled string.
    :raises ValueError: If in_str is not a string.
    """
    if is_string(in_str):
        # random.shuffle() works in place, so go through a mutable list.
        letters = to_char_list(in_str)
        random.shuffle(letters)
        return from_char_list(letters)
    raise ValueError(in_str)
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Remove HTML code contained in the given string.

    >>> strip_html('test: <a href="foo/bar">click here</a>') # returns 'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True) # returns 'test: click here'

    :param in_str: Text to clean.
    :param keep_tag_content: When True only the tags are removed
        (HTML_TAG_ONLY_RE); otherwise HTML_RE is used.
    :return: The string with every match of the chosen pattern removed.
    :raises ValueError: If in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    if keep_tag_content:
        pattern = HTML_TAG_ONLY_RE
    else:
        pattern = HTML_RE
    return pattern.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Force string content to be ascii-only by translating non-ascii chars
    into the closest possible representation (eg: ó -> o, Ë -> E, ç -> c...).

    **Bear in mind**: some chars may be lost if impossible to translate.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË') # returns 'eeuuooaaeynAAACIINOE'

    :param in_str: String to convert.
    :return: Ascii-only version of in_str.
    :raises ValueError: If in_str is not a string.
    """
    if is_string(in_str):
        # NFKD decomposition, then an ascii encode that silently drops
        # whatever cannot be represented, then back to str.
        return (
            unicodedata.normalize("NFKD", in_str)
            .encode("ascii", "ignore")
            .decode("utf-8")
        )
    raise ValueError(in_str)
733 def slugify(in_str: str, *, separator: str = "-") -> str:
735 Converts a string into a "slug" using provided separator.
736 The returned string has the following properties:
739 - all letters are in lower case
740 - all punctuation signs and non alphanumeric chars are removed
741 - words are divided using provided separator
742 - all chars are encoded as ascii (by using `asciify()`)
747 >>> slugify('Top 10 Reasons To Love Dogs!!!') # returns: 'top-10-reasons-to-love-dogs'
748 >>> slugify('Mönstér Mägnët') # returns 'monster-magnet'
750 if not is_string(in_str):
751 raise ValueError(in_str)
753 # replace any character that is NOT letter or number with spaces
754 out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()
756 # replace spaces with join sign
757 out = SPACES_RE.sub(separator, out)
759 # normalize joins (remove duplicates)
760 out = re.sub(re.escape(separator) + r"+", separator, out)
def to_bool(in_str: str) -> bool:
    """
    Turn a string into a boolean based on its content (CASE INSENSITIVE).

    True is returned when the lowercased string is one of:
    "true", "1", "yes", "y", "t".  Otherwise False is returned.

    :param in_str: String to convert.
    :return: True for one of the accepted values, False otherwise.
    :raises ValueError: If in_str is not a string.
    """
    if is_string(in_str):
        truthy_values = ("true", "1", "yes", "y", "t")
        lowered = in_str.lower()
        return lowered in truthy_values
    raise ValueError(in_str)
def dedent(in_str: str) -> str:
    """
    Remove the leading margin from every line of a multi line string
    (inspired by the analogous Scala function).

    Each "\\n"-separated line has whatever MARGIN_RE matches at its start
    removed; lines are then joined back with "\\n".

    :param in_str: Text to dedent.
    :return: The text with the margin stripped from each line.
    :raises ValueError: If in_str is not a string.
    """
    if is_string(in_str):
        return "\n".join(MARGIN_RE.sub("", row) for row in in_str.split("\n"))
    raise ValueError(in_str)
def indent(in_str: str, amount: int) -> str:
    """
    Prefix every line of a string with the given number of spaces.

    :param in_str: Text to indent; lines are split on "\\n".
    :param amount: Number of spaces to put in front of each line.
    :return: The indented text, lines joined back with "\\n".
    :raises ValueError: If in_str is not a string.
    """
    if is_string(in_str):
        padding = " " * amount
        return "\n".join(padding + row for row in in_str.split("\n"))
    raise ValueError(in_str)
815 def sprintf(*args, **kwargs) -> str:
818 sep = kwargs.pop("sep", None)
820 if not isinstance(sep, str):
821 raise TypeError("sep must be None or a string")
823 end = kwargs.pop("end", None)
825 if not isinstance(end, str):
826 raise TypeError("end must be None or a string")
829 raise TypeError("invalid keyword arguments to sprint()")
835 for i, arg in enumerate(args):
838 if isinstance(arg, str):