3 from itertools import zip_longest
8 from typing import Any, List, Optional
10 from uuid import uuid4
# Decimal literal: optional sign, then either digits with an optional
# fractional part and optional exponent, or a bare fraction such as ".5".
# NOTE: inside a character class "|" is a literal pipe, not alternation, so
# the classes below deliberately list only the characters that are allowed.
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Prefixed integer literals: optional sign, "0x"/"0o"/"0b" prefix (either
# case), then one or more digits valid for that base.
HEX_NUMBER_RE = re.compile(r"^([+\-]?)0[xX]([0-9A-Fa-f]+)$")

OCT_NUMBER_RE = re.compile(r"^([+\-]?)0[Oo]([0-7]+)$")

BIN_NUMBER_RE = re.compile(r"^([+\-]?)0[Bb]([01]+)$")
21 r"([a-z-]+://)" # scheme
22 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
24 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
25 r"(:\d{2,})?" # port number
26 r"(/[a-z\d_%+-]*)*" # folders
27 r"(\.[a-z\d_%+-]+)*" # file extension
28 r"(\?[a-z\d_+%-=]*)?" # query string
32 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
34 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
36 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
38 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
40 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
42 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
44 CAMEL_CASE_TEST_RE = re.compile(
45 r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
48 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
50 SNAKE_CASE_TEST_RE = re.compile(
51 r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
54 SNAKE_CASE_TEST_DASH_RE = re.compile(
55 r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
58 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
60 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
63 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
64 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
65 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
66 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
67 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
68 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
71 JSON_WRAPPER_RE = re.compile(
72 r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
76 r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
79 UUID_HEX_OK_RE = re.compile(
80 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
84 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
# Uncompressed IPv6 literal: eight colon-separated groups of at most four
# hexadecimal digits (matched case-insensitively). Only hex digits are
# accepted; the "::" shorthand is not specifically modelled by this pattern.
IP_V6_RE = re.compile(r"^([0-9a-f]{0,4}:){7}[0-9a-f]{0,4}$", re.IGNORECASE)
88 MAC_ADDRESS_RE = re.compile(
89 r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
92 WORDS_COUNT_RE = re.compile(
93 r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
97 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
98 re.IGNORECASE | re.MULTILINE | re.DOTALL,
101 HTML_TAG_ONLY_RE = re.compile(
102 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
103 re.IGNORECASE | re.MULTILINE | re.DOTALL,
106 SPACES_RE = re.compile(r"\s")
108 NO_LETTERS_OR_NUMBERS_RE = re.compile(
109 r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
112 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
114 ESCAPE_SEQUENCE_RE = re.compile(r"
\e\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Checks if a string is None or contains nothing but whitespace.

    :param in_str: String to check (may be None).
    :return: True if in_str is None or is empty once stripped, false otherwise.
    """
    if in_str is None:
        return True
    return not in_str.strip()
def is_string(obj: Any) -> bool:
    """
    Checks if an object is a string.

    :param obj: Object to test.
    :return: True if obj is an instance of str, false otherwise.
    """
    return isinstance(obj, str)
def is_empty_string(in_str: Any) -> bool:
    """
    Checks if the given object is a string that is empty or whitespace-only.

    :param in_str: Object to test.
    :return: True only for strings with no non-whitespace content; false for
        any other string and for every non-string object.
    """
    if not is_string(in_str):
        return False
    return not in_str.strip()
def is_full_string(in_str: Any) -> bool:
    """
    Checks if the given object is a string with some non-whitespace content.

    :param in_str: Object to test.
    :return: True only for strings that are non-empty once stripped; false
        for blank strings and for every non-string object.
    """
    if not is_string(in_str):
        return False
    return bool(in_str.strip())
def is_number(in_str: str) -> bool:
    """
    Checks if a string is a valid number.

    :param in_str: String to check.
    :return: True if the whole string matches NUMBER_RE, false otherwise.
    :raises ValueError: if in_str is not a string.
    """
    if is_string(in_str):
        return NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """
    Checks whether the given string represents an integer or not.

    An integer may be signed or unsigned or use a "scientific notation".
    Prefixed hexadecimal, octal and binary literals are accepted as well.

    *Examples:*

    >>> is_integer_number('42') # returns true
    >>> is_integer_number('42.0') # returns false

    :param in_str: String to check.
    :return: True if the string is an integer literal, false otherwise.
    :raises ValueError: if in_str is not a string.
    """
    # Plain decimal form first: any number without a fractional part.
    if is_number(in_str) and "." not in in_str:
        return True

    # Otherwise fall back to the prefixed (0x / 0o / 0b) notations.
    prefixed_checks = (
        is_hexidecimal_integer_number,
        is_octal_integer_number,
        is_binary_integer_number,
    )
    return any(check(in_str) for check in prefixed_checks)
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Checks whether the given string is a hexadecimal integer literal, i.e.
    one that matches HEX_NUMBER_RE (optional sign, "0x" prefix, hex digits).

    :param in_str: String to check.
    :return: True if the string matches, false otherwise.
    :raises ValueError: if in_str is not a string.
    """
    if is_string(in_str):
        return HEX_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """
    Checks whether the given string is an octal integer literal, i.e. one
    that matches OCT_NUMBER_RE (optional sign, "0o" prefix, octal digits).

    :param in_str: String to check.
    :return: True if the string matches, false otherwise.
    :raises ValueError: if in_str is not a string.
    """
    if is_string(in_str):
        return OCT_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """
    Checks whether the given string is a binary integer literal, i.e. one
    that matches BIN_NUMBER_RE (optional sign, "0b" prefix, binary digits).

    :param in_str: String to check.
    :return: True if the string matches, false otherwise.
    :raises ValueError: if in_str is not a string.
    """
    if is_string(in_str):
        return BIN_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
195 def to_int(in_str: str) -> int:
196 if not is_string(in_str):
197 raise ValueError(in_str)
198 if is_binary_integer_number(in_str):
199 return int(in_str, 2)
200 if is_octal_integer_number(in_str):
201 return int(in_str, 8)
202 if is_hexidecimal_integer_number(in_str):
203 return int(in_str, 16)
def is_decimal_number(in_str: str) -> bool:
    """
    Checks whether the given string represents a decimal or not.

    A decimal may be signed or unsigned or use a "scientific notation".

    *Examples:*

    >>> is_decimal_number('42.0') # returns true
    >>> is_decimal_number('42') # returns false

    :param in_str: String to check.
    :return: True if the string is a number containing a ".", false otherwise.
    :raises ValueError: if in_str is not a string.
    """
    if not is_number(in_str):
        return False
    return "." in in_str
219 def strip_escape_sequences(in_str: str) -> str:
220 in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
224 def add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
225 if isinstance(in_str, int):
228 if is_number(in_str):
229 return _add_thousands_separator(
231 separator_char = separator_char,
234 raise ValueError(in_str)
237 def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
240 (in_str, decimal_part) = in_str.split('.')
241 tmp = [iter(in_str[::-1])] * places
242 ret = separator_char.join(
243 "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
244 if len(decimal_part) > 0:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Check if a string is a valid url.

    *Examples:*

    >>> is_url('http://www.mysite.com') # returns true
    >>> is_url('https://mysite.com') # returns true
    >>> is_url('.mysite.com') # returns false

    :param in_str: String to check.
    :param allowed_schemes: Optional list of accepted schemes (e.g.
        ``['http', 'https']``; a trailing ``://`` is tolerated). When it is
        None or empty, any scheme matched by URL_RE is accepted.
    :return: True if url, false otherwise.
    """
    if not is_full_string(in_str):
        return False

    if URL_RE.match(in_str) is None:
        return False

    if not allowed_schemes:
        return True

    # Compare the whole scheme rather than a raw prefix: a prefix test would
    # let "https://..." through when only "http" is allowed. Schemes are
    # compared case-insensitively, in line with URL_RE being case-insensitive.
    scheme = in_str.partition("://")[0].lower()
    allowed = {s.lower().rstrip(":/") for s in allowed_schemes}
    return scheme in allowed
273 def is_email(in_str: Any) -> bool:
275 Check if a string is a valid email.
277 Reference: https://tools.ietf.org/html/rfc3696#section-3
282 >>> is_email('@gmail.com') # returns false
285 not is_full_string(in_str)
287 or in_str.startswith(".")
292 # we expect 2 tokens, one before "@" and one after, otherwise
293 # we have an exception and the email is not valid.
294 head, tail = in_str.split("@")
296 # head's size must be <= 64, tail <= 255, head must not start
297 # with a dot or contain multiple consecutive dots.
301 or head.endswith(".")
306 # removes escaped spaces, so that later on the test regex will
308 head = head.replace("\\ ", "")
309 if head.startswith('"') and head.endswith('"'):
310 head = head.replace(" ", "")[1:-1]
311 return EMAIL_RE.match(head + "@" + tail) is not None
314 # borderline case in which we have multiple "@" signs but the
315 # head part is correctly escaped.
316 if ESCAPED_AT_SIGN.search(in_str) is not None:
317 # replace "@" with "a" in the head
318 return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
322 def suffix_string_to_number(in_str: str) -> Optional[int]:
323 """Take a string like "33Gb" and convert it into a number (of bytes)
324 like 34603008. Return None if the input string is not valid.
327 def suffix_capitalize(s: str) -> str:
331 return f"{s[0].upper()}{s[1].lower()}"
332 return suffix_capitalize(s[0:1])
334 if is_string(in_str):
335 if is_integer_number(in_str):
336 return to_int(in_str)
337 suffixes = [in_str[-2:], in_str[-1:]]
338 rest = [in_str[:-2], in_str[:-1]]
339 for x in range(len(suffixes)):
341 s = suffix_capitalize(s)
342 multiplier = NUM_SUFFIXES.get(s, None)
343 if multiplier is not None:
345 if is_integer_number(r):
346 return int(r) * multiplier
350 def number_to_suffix_string(num: int) -> Optional[str]:
351 """Take a number (of bytes) and returns a string like "43.8Gb".
352 Returns none if the input is invalid.
356 for (sfx, size) in NUM_SUFFIXES.items():
361 if suffix is not None:
362 return f"{d:.1f}{suffix}"
366 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
368 Checks if a string is a valid credit card number.
369 If card type is provided then it checks against that specific type only,
370 otherwise any known credit card number will be accepted.
372 Supported card types are the following:
381 if not is_full_string(in_str):
384 if card_type is not None:
385 if card_type not in CREDIT_CARDS:
387 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
389 return CREDIT_CARDS[card_type].match(in_str) is not None
390 for c in CREDIT_CARDS:
391 if CREDIT_CARDS[c].match(in_str) is not None:
396 def is_camel_case(in_str: Any) -> bool:
398 Checks if a string is formatted as camel case.
400 A string is considered camel case when:
402 - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
403 - it contains both lowercase and uppercase letters
404 - it does not start with a number
407 is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
411 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
413 Checks if a string is formatted as "snake case".
415 A string is considered snake case when:
417 - it's composed only by lowercase/uppercase letters and digits
418 - it contains at least one underscore (or provided separator)
419 - it does not start with a number
421 if is_full_string(in_str):
422 re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
424 r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
429 re_template.format(sign=re.escape(separator)), re.IGNORECASE
432 return r.match(in_str) is not None
436 def is_json(in_str: Any) -> bool:
438 Check if a string is a valid json.
442 >>> is_json('{"name": "Peter"}') # returns true
443 >>> is_json('[1, 2, 3]') # returns true
444 >>> is_json('{nope}') # returns false
446 if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
448 return isinstance(json.loads(in_str), (dict, list))
449 except (TypeError, ValueError, OverflowError):
454 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
456 Check if a string is a valid UUID.
460 >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf') # returns true
461 >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf') # returns false
462 >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True) # returns true
464 # string casting is used to allow UUID itself as input data type
467 return UUID_HEX_OK_RE.match(s) is not None
468 return UUID_RE.match(s) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip v4.

    *Examples:*

    >>> is_ip_v4('255.200.100.75') # returns true
    >>> is_ip_v4('nope') # returns false (not an ip)
    >>> is_ip_v4('255.200.100.999') # returns false (999 is out of range)

    :param in_str: String to check.
    :return: True if an ip v4, false otherwise.
    """
    if not is_full_string(in_str):
        return False
    if SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # the shallow regex only checks the shape; every octet must also
    # fall in the valid range (0 to 255)
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
491 def extract_ip_v4(in_str: Any) -> Optional[str]:
493 Extracts the IPv4 chunk of a string or None.
495 if not is_full_string(in_str):
498 m = SHALLOW_IP_V4_RE.match(in_str)
def is_ip_v6(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip v6.

    *Examples:*

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334') # returns true
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?') # returns false (invalid "?")

    :param in_str: String to check.
    :return: True if the string matches IP_V6_RE, false otherwise.
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
516 def extract_ip_v6(in_str: Any) -> Optional[str]:
518 Extract IPv6 chunk or None.
520 if not is_full_string(in_str):
523 m = IP_V6_RE.match(in_str)
def is_ip(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip (either v4 or v6).

    *Examples:*

    >>> is_ip('255.200.100.75') # returns true
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334') # returns true
    >>> is_ip('1.2.3') # returns false

    :param in_str: String to check.
    :return: True if an ip, false otherwise.
    """
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
542 def extract_ip(in_str: Any) -> Optional[str]:
543 """Extract the IP address or None."""
544 ip = extract_ip_v4(in_str)
546 ip = extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """
    Return True if in_str is a valid MAC address, False otherwise.

    :param in_str: Object to check; non-strings and blank strings are
        rejected.
    :return: True only when the entire string is a MAC address.
    """
    # MAC_ADDRESS_RE is anchored at the start only, so a plain match() would
    # accept trailing garbage (e.g. "00:11:22:33:44:55xyz"); fullmatch()
    # requires the whole string to be the address.
    return is_full_string(in_str) and MAC_ADDRESS_RE.fullmatch(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the MAC address from in_str.

    :param in_str: String expected to start with a MAC address whose octets
        are joined by ":" or "-".
    :param separator: Separator to use between octets in the returned value.
    :return: The MAC address with its octets joined by separator, or None
        if in_str is not a non-blank string or does not start with a MAC
        address.
    """
    if not is_full_string(in_str):
        return None

    m = MAC_ADDRESS_RE.match(in_str)
    if m is None:
        return None

    # Strings are immutable, so the rewritten value has to be captured and
    # returned. Splitting into octets first keeps the result correct even if
    # the requested separator itself contains ":" or "-".
    octets = m.group(0).replace("-", ":").split(":")
    return separator.join(octets)
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Checks if a given string is a slug (as created by `slugify()`).

    *Examples:*

    >>> is_slug('my-blog-post-title') # returns true
    >>> is_slug('My blog post title') # returns false

    :param in_str: String to check.
    :param separator: Join sign used by the slug.
    :return: True if slug, false otherwise.
    """
    if not is_full_string(in_str):
        return False

    # A slug starts and ends with [a-z0-9] and contains only such chars and
    # separators in between. The pattern is written without nested
    # quantifiers so a long non-matching input cannot trigger catastrophic
    # backtracking, and fullmatch() rejects a trailing newline that "$"
    # would silently tolerate.
    sep = re.escape(separator)
    rex = r"[a-z\d](?:(?:[a-z\d]|" + sep + r")*[a-z\d])?"
    return re.fullmatch(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Checks if the given string contains HTML/XML tags.

    By design, this function matches ANY type of tag, so don't expect to use it
    as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.

    *Examples:*

    >>> contains_html('my string is <strong>bold</strong>') # returns true
    >>> contains_html('my string is not bold') # returns false

    :param in_str: Text to check.
    :return: True if the string contains html, false otherwise.
    :raises ValueError: if in_str is not a string.
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
def words_count(in_str: str) -> int:
    """
    Returns the number of words contained into the given string.

    This method is smart, it does consider only sequence of one or more letter and/or numbers
    as "words", so a string like this: "! @ # % ... []" will return zero!
    Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
    will be 4 not 1 (even if there are no spaces in the string).

    *Examples:*

    >>> words_count('hello world') # returns 2
    >>> words_count('one,two,three.stop') # returns 4

    :param in_str: String to check.
    :return: Number of words.
    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # one regex match per word
    return sum(1 for _ in WORDS_COUNT_RE.finditer(in_str))
626 def generate_uuid(as_hex: bool = False) -> str:
628 Generated an UUID string (using `uuid.uuid4()`).
632 >>> uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
633 >>> uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
641 def generate_random_alphanumeric_string(size: int) -> str:
643 Returns a string of the specified size containing random
644 characters (uppercase/lowercase ascii letters and digits).
648 >>> random_string(9) # possible output: "cx3QQbzYg"
651 raise ValueError("size must be >= 1")
652 chars = string.ascii_letters + string.digits
653 buffer = [random.choice(chars) for _ in range(size)]
654 return from_char_list(buffer)
657 def reverse(in_str: str) -> str:
659 Returns the string with its chars reversed.
661 if not is_string(in_str):
662 raise ValueError(in_str)
666 def camel_case_to_snake_case(in_str, *, separator="_"):
668 Convert a camel case string into a snake case one.
669 (The original string is returned if is not a valid camel case string)
671 if not is_string(in_str):
672 raise ValueError(in_str)
673 if not is_camel_case(in_str):
675 return CAMEL_CASE_REPLACE_RE.sub(
676 lambda m: m.group(1) + separator, in_str
680 def snake_case_to_camel_case(
681 in_str: str, *, upper_case_first: bool = True, separator: str = "_"
684 Convert a snake case string into a camel case one.
685 (The original string is returned if is not a valid snake case string)
687 if not is_string(in_str):
688 raise ValueError(in_str)
689 if not is_snake_case(in_str, separator=separator):
691 tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
692 if not upper_case_first:
693 tokens[0] = tokens[0].lower()
694 return from_char_list(tokens)
697 def to_char_list(in_str: str) -> List[str]:
698 if not is_string(in_str):
def from_char_list(in_list: List[str]) -> str:
    """
    Concatenates the strings of the given list, in order, into one string.

    :param in_list: List of strings (typically single characters).
    :return: The joined string.
    """
    return "".join(in_list)
def shuffle(in_str: str) -> str:
    """
    Return a new string containing same chars of the given one but in
    a randomized order.

    :param in_str: String to shuffle.
    :return: Shuffled string.
    :raises ValueError: if in_str is not a string.
    """
    if is_string(in_str):
        # strings are immutable: shuffle a list of chars, then join it back
        letters = to_char_list(in_str)
        random.shuffle(letters)
        return from_char_list(letters)
    raise ValueError(in_str)
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Remove html code contained into the given string.

    *Examples:*

    >>> strip_html('test: <a href="foo/bar">click here</a>') # returns 'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True) # returns 'test: click here'

    :param in_str: String to manipulate.
    :param keep_tag_content: True to preserve tag content, False to remove tag and its content too (default).
    :return: String with html removed.
    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    if keep_tag_content:
        # drop only the tags themselves, leaving what they enclose
        pattern = HTML_TAG_ONLY_RE
    else:
        pattern = HTML_RE
    return pattern.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Force string content to be ascii-only by translating all non-ascii chars into the closest possible representation
    (eg: ó -> o, Ë -> E, ç -> c...).

    **Bear in mind**: Some chars may be lost if impossible to translate.

    *Example:*

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË') # returns 'eeuuooaaeynAAACIINOE'

    :param in_str: String to convert.
    :return: Ascii-only string.
    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # NFKD decomposes accented chars into base char + combining marks; the
    # ascii encode with "ignore" then drops everything that is not ascii,
    # and the surviving bytes are decoded back into a string.
    return (
        unicodedata.normalize("NFKD", in_str)
        .encode("ascii", "ignore")
        .decode("utf-8")
    )
761 def slugify(in_str: str, *, separator: str = "-") -> str:
763 Converts a string into a "slug" using provided separator.
764 The returned string has the following properties:
767 - all letters are in lower case
768 - all punctuation signs and non alphanumeric chars are removed
769 - words are divided using provided separator
770 - all chars are encoded as ascii (by using `asciify()`)
775 >>> slugify('Top 10 Reasons To Love Dogs!!!') # returns: 'top-10-reasons-to-love-dogs'
776 >>> slugify('Mönstér Mägnët') # returns 'monster-magnet'
778 if not is_string(in_str):
779 raise ValueError(in_str)
781 # replace any character that is NOT letter or number with spaces
782 out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()
784 # replace spaces with join sign
785 out = SPACES_RE.sub(separator, out)
787 # normalize joins (remove duplicates)
788 out = re.sub(re.escape(separator) + r"+", separator, out)
def to_bool(in_str: str) -> bool:
    """
    Turns a string into a boolean based on its content (CASE INSENSITIVE).

    A positive boolean (True) is returned if the string value is one of the following:

    - "true"
    - "1"
    - "yes"
    - "y"
    - "t"

    Otherwise False is returned.

    :param in_str: String to convert.
    :return: True if the string contains a boolean-like positive value, false otherwise.
    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    truthy_values = ("true", "1", "yes", "y", "t")
    lowered = in_str.lower()
    return lowered in truthy_values
def dedent(in_str: str) -> str:
    """
    Removes tab indentation from multi line strings (inspired by analogous Scala function).

    Every line (lines are delimited by "\\n") has its leading run of
    whitespace, as matched by MARGIN_RE, removed.

    :param in_str: String to dedent.
    :return: The string with the leading margin of each line removed.
    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    newline = '\n'
    return newline.join(
        MARGIN_RE.sub('', row) for row in in_str.split(newline)
    )
def indent(in_str: str, amount: int) -> str:
    """
    Prefixes every line of the given string with `amount` spaces.

    :param in_str: String to indent; lines are delimited by "\\n".
    :param amount: Number of spaces to prepend to each line.
    :return: The indented string.
    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    newline = '\n'
    padding = " " * amount
    return newline.join(padding + row for row in in_str.split(newline))
843 def sprintf(*args, **kwargs) -> str:
846 sep = kwargs.pop("sep", None)
848 if not isinstance(sep, str):
849 raise TypeError("sep must be None or a string")
851 end = kwargs.pop("end", None)
853 if not isinstance(end, str):
854 raise TypeError("end must be None or a string")
857 raise TypeError("invalid keyword arguments to sprint()")
863 for i, arg in enumerate(args):
866 if isinstance(arg, str):