6 from itertools import zip_longest
13 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
15 from uuid import uuid4
17 logger = logging.getLogger(__name__)
# Decimal number: optional sign, then either digits with an optional
# fraction and exponent, or a bare fraction such as ".5".
# NOTE: inside a character class "|" is a literal pipe, not alternation,
# so the classes below list only the characters that are really allowed.
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Prefixed integer literals: optional sign, "0x" / "0o" / "0b" prefix
# (either case), then at least one digit of the matching base.
HEX_NUMBER_RE = re.compile(r"^([+-]?)0[xX]([0-9A-Fa-f]+)$")

OCT_NUMBER_RE = re.compile(r"^([+-]?)0[oO]([0-7]+)$")

BIN_NUMBER_RE = re.compile(r"^([+-]?)0[bB]([01]+)$")
28 r"([a-z-]+://)" # scheme
29 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
31 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
32 r"(:\d{2,})?" # port number
33 r"(/[a-z\d_%+-]*)*" # folders
34 r"(\.[a-z\d_%+-]+)*" # file extension
35 r"(\?[a-z\d_+%-=]*)?" # query string
39 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
41 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
43 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
45 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
47 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
49 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
51 CAMEL_CASE_TEST_RE = re.compile(
52 r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
55 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
57 SNAKE_CASE_TEST_RE = re.compile(
58 r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
61 SNAKE_CASE_TEST_DASH_RE = re.compile(
62 r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
65 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
67 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
70 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
71 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
72 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
73 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
74 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
75 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
78 JSON_WRAPPER_RE = re.compile(
79 r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
83 r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
86 UUID_HEX_OK_RE = re.compile(
87 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
91 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
93 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
# Eight colon-separated groups of up to four hexadecimal digits.  Only hex
# digits are accepted (the IGNORECASE flag covers A-F); the "::" shorthand
# is not handled by these patterns.
IP_V6_RE = re.compile(r"^([a-f\d]{0,4}:){7}[a-f\d]{0,4}$", re.IGNORECASE)

# Same shape as IP_V6_RE but unanchored, for searching inside larger text.
ANYWHERE_IP_V6_RE = re.compile(r"([a-f\d]{0,4}:){7}[a-f\d]{0,4}", re.IGNORECASE)
99 MAC_ADDRESS_RE = re.compile(
100 r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE
103 ANYWHERE_MAC_ADDRESS_RE = re.compile(
104 r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
107 WORDS_COUNT_RE = re.compile(
108 r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
111 HTML_RE = re.compile(
112 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
113 re.IGNORECASE | re.MULTILINE | re.DOTALL,
116 HTML_TAG_ONLY_RE = re.compile(
117 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
118 re.IGNORECASE | re.MULTILINE | re.DOTALL,
121 SPACES_RE = re.compile(r"\s")
123 NO_LETTERS_OR_NUMBERS_RE = re.compile(
124 r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
127 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
# An escape sequence as matched here: the ESC character (0x1b), "[", any run
# of non-letter characters, and one terminating ASCII letter.  "\x1b" is
# spelled out because the re module does not understand "\e".
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Tell whether the input is None or a string that contains nothing but
    whitespace (including the empty string).

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty("   \t   ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    # A string that strips down to nothing counts as empty.
    return not in_str.strip()
def is_string(obj: Any) -> bool:
    """
    Report whether obj is an instance of str.

    >>> is_string('test')
    True
    >>> is_string([1, 2, 3])
    False
    """
    if isinstance(obj, str):
        return True
    return False
def is_empty_string(in_str: Any) -> bool:
    """
    Alias of is_empty(): simply forwards the argument and returns its result.
    """
    outcome = is_empty(in_str)
    return outcome
def is_empty(in_str: Any) -> bool:
    """
    True only when the input is a string that is empty or made up solely
    of whitespace; non-string inputs yield False.

    >>> is_empty('')
    True
    >>> is_empty('    \t\t    ')
    True
    >>> is_empty('test')
    False
    >>> is_empty([1, 2, 3])
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """
    True only when the input is a string with at least one non-whitespace
    character; non-string inputs yield False.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string('      ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """
    Tell whether a string matches NUMBER_RE, i.e. spells a decimal number.

    Raises ValueError (carrying the offending value) when the argument
    is not a string.

    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]
    """
    if is_string(in_str):
        return NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """
    Tell whether the given string represents an integer.

    A string qualifies when it is a number without a "." in it (signed,
    unsigned or using scientific notation) or when it is a prefixed
    hexadecimal, octal or binary integer literal.

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    # is_number() runs first, so a non-string argument raises ValueError
    # from there before any other check is attempted.
    if is_number(in_str) and "." not in in_str:
        return True
    prefixed_checks = (
        is_hexidecimal_integer_number,
        is_octal_integer_number,
        is_binary_integer_number,
    )
    return any(check(in_str) for check in prefixed_checks)
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Tell whether a string matches HEX_NUMBER_RE, i.e. is a "0x"-prefixed
    hexadecimal integer literal.

    Raises ValueError (carrying the offending value) when the argument
    is not a string.

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('0x1A3E')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number('test')
    False
    >>> is_hexidecimal_integer_number(12345)  # Not a string
    Traceback (most recent call last):
    ...
    ValueError: 12345
    """
    if is_string(in_str):
        return HEX_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """
    Tell whether a string matches OCT_NUMBER_RE, i.e. is a "0o"-prefixed
    octal integer literal.

    Raises ValueError (carrying the offending value) when the argument
    is not a string.

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    >>> is_octal_integer_number('test')
    False
    """
    if is_string(in_str):
        return OCT_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """
    Tell whether a string matches BIN_NUMBER_RE, i.e. is a "0b"-prefixed
    binary integer literal.

    Raises ValueError (carrying the offending value) when the argument
    is not a string.

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0B10101')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    >>> is_binary_integer_number('test')
    False
    """
    if is_string(in_str):
        return BIN_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def to_int(in_str: str) -> int:
    """Convert a string to the integer it spells, raising on failure.

    Binary, octal and hexadecimal prefixed literals are parsed in their
    own base; anything else is handed to int() as base 10.  A non-string
    argument raises ValueError carrying the offending value.

    >>> to_int('1234')
    1234
    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Checked in this order: binary, octal, hexadecimal.
    base_detectors = (
        (is_binary_integer_number, 2),
        (is_octal_integer_number, 8),
        (is_hexidecimal_integer_number, 16),
    )
    for detector, base in base_detectors:
        if detector(in_str):
            return int(in_str, base)
    return int(in_str)
def is_decimal_number(in_str: str) -> bool:
    """
    Tell whether the given string is a number that contains a "." --
    signed or unsigned, optionally using scientific notation.

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    if not is_number(in_str):
        return False
    return "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    """
    Return in_str with every match of ESCAPE_SEQUENCE_RE removed.

    >>> strip_escape_sequences('\\x1b[12;11;22mthis is a test!')
    'this is a test!'
    """
    cleaned = ESCAPE_SEQUENCE_RE.sub("", in_str)
    return cleaned
380 def add_thousands_separator(
383 separator_char = ',',
387 Add thousands separator to a numeric string. Also handles numbers.
389 >>> add_thousands_separator('12345678')
391 >>> add_thousands_separator(12345678)
393 >>> add_thousands_separator(12345678.99)
395 >>> add_thousands_separator('test')
396 Traceback (most recent call last):
401 if isinstance(in_str, numbers.Number):
403 if is_number(in_str):
404 return _add_thousands_separator(
406 separator_char = separator_char,
409 raise ValueError(in_str)
def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
    """
    Insert separator_char between groups of `places` digits of the integer
    part of a numeric string, counting from the right.

    A leading "+" or "-" is kept in front of the result instead of being
    grouped as if it were a digit, and separator_char is emitted verbatim
    even when it is longer than one character.  Everything after the first
    "." is appended unchanged.

    Raises ValueError when places is smaller than 1.
    """
    if places < 1:
        raise ValueError(f"places must be >= 1, got {places}")

    # Set the sign aside so that "-123456" becomes "-123,456" rather than
    # "-,123,456".
    sign = ""
    if in_str[:1] in ("+", "-"):
        sign, in_str = in_str[0], in_str[1:]

    digits, _, decimal_part = in_str.partition('.')

    # The leftmost group takes whatever does not fill a whole group.
    head = len(digits) % places
    groups = [digits[:head]] if head else []
    groups.extend(
        digits[start:start + places] for start in range(head, len(digits), places)
    )

    ret = sign + separator_char.join(groups)
    if len(decimal_part) > 0:
        ret += '.' + decimal_part
    return ret
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Check if a string is a valid url according to URL_RE.

    When allowed_schemes is given (and not empty) the string must also
    start with one of the listed schemes.

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('https://mysite.com')
    True
    >>> is_url('.mysite.com')
    False
    """
    if not is_full_string(in_str):
        return False
    if URL_RE.match(in_str) is None:
        return False
    if not allowed_schemes:
        # No scheme restriction requested: the shape check is enough.
        return True
    for scheme in allowed_schemes:
        if in_str.startswith(scheme):
            return True
    return False
448 def is_email(in_str: Any) -> bool:
450 Check if a string is a valid email.
452 Reference: https://tools.ietf.org/html/rfc3696#section-3
456 >>> is_email('@gmail.com')
460 not is_full_string(in_str)
462 or in_str.startswith(".")
467 # we expect 2 tokens, one before "@" and one after, otherwise
468 # we have an exception and the email is not valid.
469 head, tail = in_str.split("@")
471 # head's size must be <= 64, tail <= 255, head must not start
472 # with a dot or contain multiple consecutive dots.
476 or head.endswith(".")
481 # removes escaped spaces, so that later on the test regex will
483 head = head.replace("\\ ", "")
484 if head.startswith('"') and head.endswith('"'):
485 head = head.replace(" ", "")[1:-1]
486 return EMAIL_RE.match(head + "@" + tail) is not None
489 # borderline case in which we have multiple "@" signs but the
490 # head part is correctly escaped.
491 if ESCAPED_AT_SIGN.search(in_str) is not None:
492 # replace "@" with "a" in the head
493 return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
def suffix_string_to_number(in_str: str) -> Optional[int]:
    """Convert a number carrying a size suffix into a plain integer.

    A string that is already an integer is converted directly.  Otherwise
    the last two characters, then the last character, are tried as a
    suffix: the suffix is capitalised, looked up in NUM_SUFFIXES, and the
    text in front of it is multiplied by the value found there.

    Returns None when the input is not a string, when no suffix is
    recognised, or when the text in front of the suffix is not a number.
    """

    def suffix_capitalize(s: str) -> str:
        # Normalise to the "Xy" / "X" spelling: first letter upper case,
        # second (if any) lower case.  Written without recursion so that
        # an empty suffix just yields "" instead of recursing forever.
        if len(s) > 2:
            s = s[0:1]
        return s[:1].upper() + s[1:].lower()

    if not is_string(in_str):
        return None
    if is_integer_number(in_str):
        return to_int(in_str)

    # Prefer the two-character suffix, then fall back to a single character.
    for suffix_len in (2, 1):
        suffix = suffix_capitalize(in_str[-suffix_len:])
        multiplier = NUM_SUFFIXES.get(suffix, None)
        if multiplier is None:
            continue
        number_part = in_str[:-suffix_len]
        if is_integer_number(number_part):
            return to_int(number_part) * multiplier
        if is_decimal_number(number_part):
            # Fractional results are truncated toward zero.
            return int(float(number_part) * multiplier)
    return None
531 def number_to_suffix_string(num: int) -> Optional[str]:
532 """Take a number (of bytes) and returns a string like "43.8Gb".
533 Returns none if the input is invalid.
535 >>> number_to_suffix_string(14066017894)
537 >>> number_to_suffix_string(1024 * 1024)
543 for (sfx, size) in NUM_SUFFIXES.items():
548 if suffix is not None:
549 return f"{d:.1f}{suffix}"
554 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
556 Checks if a string is a valid credit card number.
557 If card type is provided then it checks against that specific type only,
558 otherwise any known credit card number will be accepted.
560 Supported card types are the following:
569 if not is_full_string(in_str):
572 if card_type is not None:
573 if card_type not in CREDIT_CARDS:
575 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
577 return CREDIT_CARDS[card_type].match(in_str) is not None
578 for c in CREDIT_CARDS:
579 if CREDIT_CARDS[c].match(in_str) is not None:
def is_camel_case(in_str: Any) -> bool:
    """
    Checks if a string is formatted as camel case.

    The input must be a non-blank string matching CAMEL_CASE_TEST_RE,
    which means that:

    - it is made only of letters ([a-zA-Z]) and optionally digits ([0-9])
    - it mixes lowercase and uppercase letters
    - it does not start with a digit
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """
    Checks if a string is formatted as "snake case".

    A string is considered snake case when:

    - it is made only of lowercase/uppercase letters and digits
    - it contains at least one underscore (or provided separator)
    - it does not start with a digit

    >>> is_snake_case('this_is_a_test')
    True
    >>> is_snake_case('___This_Is_A_Test_1_2_3___')
    True
    >>> is_snake_case('this-is-a-test')
    False
    >>> is_snake_case('this-is-a-test', separator='-')
    True
    """
    if not is_full_string(in_str):
        return False

    # "_" and "-" have ready-made patterns; any other separator gets one
    # built from the template below.
    precompiled = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
    pattern = precompiled.get(separator)
    if pattern is None:
        template = (
            r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
        )
        pattern = re.compile(
            template.format(sign=re.escape(separator)), re.IGNORECASE
        )
    return pattern.match(in_str) is not None
def is_json(in_str: Any) -> bool:
    """
    Check if a string is valid json whose top level is an object or array.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    # Cheap shape check first: the text must be wrapped in braces/brackets.
    if not is_full_string(in_str) or JSON_WRAPPER_RE.match(in_str) is None:
        return False
    try:
        parsed = json.loads(in_str)
    except (TypeError, ValueError, OverflowError):
        return False
    return isinstance(parsed, (dict, list))
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Check if a string is a valid UUID.  With allow_hex the dashes between
    the groups become optional.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # Casting to str lets a UUID object itself be passed in.
    candidate = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(candidate) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip v4: four dot-separated numbers, each
    between 0 and 255.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str):
        return False
    if SHALLOW_IP_V4_RE.match(in_str) is None:
        return False
    # The regex only checks the shape; verify each number's range here.
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Return the first IPv4-shaped chunk found in the string, or None when
    the input is not a non-blank string or holds no such chunk.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V4_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip_v6(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip v6, i.e. a non-blank string that
    matches IP_V6_RE.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Return the first IPv6-shaped chunk found in the string, or None when
    the input is not a non-blank string or holds no such chunk.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V6_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip (either v4 or v6).

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3.999')
    False
    """
    # The v6 check runs first; v4 is only consulted when it fails.
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Extract an IP address from the string, or return None.  An IPv4 chunk
    is looked for first; IPv6 is tried only when there is none.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4 = extract_ip_v4(in_str)
    if v4 is not None:
        return v4
    return extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """Return True if in_str is a valid MAC address, False otherwise.

    Six two-digit hexadecimal groups separated by ":" or "-" are expected;
    letter case does not matter.

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the first MAC address found in in_str, rewritten so that its
    groups are joined by `separator`.

    Returns None when the input is not a non-blank string or contains no
    MAC address.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'
    >>> extract_mac_address(' MAC Address: 34-29-8F-12-0D-2F')
    '34:29:8F:12:0D:2F'
    >>> extract_mac_address('34:29:8F:12:0D:2F', separator='-')
    '34-29-8F-12-0D-2F'
    >>> extract_mac_address('nothing to see here') is None
    True
    """
    if not is_full_string(in_str):
        return None

    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None

    # Strings are immutable, so the rewritten value has to be kept and
    # returned.  A single translate() pass maps both accepted delimiters
    # without re-scanning characters that the separator itself introduced.
    delimiters = str.maketrans({":": separator, "-": separator})
    return m.group(0).translate(delimiters)
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Checks if a given string is a slug (as created by `slugify()`).

    A slug here is made of lowercase ASCII letters and digits, optionally
    split into words by one or more occurrences of `separator`; it must
    start and end with a letter or digit.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False

    # Words and separators strictly alternate in this pattern, so the regex
    # engine has a single way to split the input and cannot backtrack
    # exponentially on long non-matching strings.
    word = r"[a-z\d]+"
    if separator:
        rex = r"^" + word + r"(?:(?:" + re.escape(separator) + r")+" + word + r")*$"
    else:
        # Without a separator a slug is a single word.
        rex = r"^" + word + r"$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Checks if the given string contains HTML/XML tags.

    Any kind of tag is matched on purpose: this is a detector for
    unwanted markup in text, not an HTML validator.  A non-string
    argument raises ValueError carrying the offending value.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
def words_count(in_str: str) -> int:
    """
    Returns the number of words contained in the given string.

    Only runs of letters and/or digits count as words, so a string such
    as "! @ # % ... []" contains zero words.  Punctuation separates words
    just like spaces do: "one,two,three.stop" counts as 4 words.
    A non-string argument raises ValueError carrying the offending value.

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if is_string(in_str):
        words = WORDS_COUNT_RE.findall(in_str)
        return len(words)
    raise ValueError(in_str)
858 def generate_uuid(as_hex: bool = False) -> str:
860 Generated an UUID string (using `uuid.uuid4()`).
862 generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
863 generate_uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Returns a string of the specified size containing random
    characters (uppercase/lowercase ascii letters and digits).

    generate_random_alphanumeric_string(9)  # possible output: "cx3QQbzYg"

    Raises ValueError when size is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(size):
        picked.append(random.choice(alphabet))
    return from_char_list(picked)
def reverse(in_str: str) -> str:
    """
    Returns the string with its chars reversed.

    A non-string argument raises ValueError carrying the offending value.

    >>> reverse('test')
    'tset'
    """
    if is_string(in_str):
        return "".join(reversed(in_str))
    raise ValueError(in_str)
def camel_case_to_snake_case(in_str, *, separator="_"):
    """
    Convert a camel case string into a snake case one.
    (A string that is not valid camel case is returned untouched.)

    A non-string argument raises ValueError carrying the offending value.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    if is_camel_case(in_str):

        def append_separator(match):
            # Keep the matched letters and put the separator after them.
            return match.group(1) + separator

        separated = CAMEL_CASE_REPLACE_RE.sub(append_separator, in_str)
        return separated.lower()

    return in_str
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Convert a snake case string into a camel case one.
    (A string that is not valid snake case is returned untouched.)

    With upper_case_first=False the first word is lowercased.
    A non-string argument raises ValueError carrying the offending value.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str

    # Blank pieces (from leading/trailing/doubled separators) are dropped.
    words = []
    for piece in in_str.split(separator):
        if is_full_string(piece):
            words.append(piece.title())

    if not upper_case_first:
        words[0] = words[0].lower()
    return from_char_list(words)
941 def to_char_list(in_str: str) -> List[str]:
942 """Convert a string into a list of chars.
944 >>> to_char_list('test')
947 if not is_string(in_str):
def from_char_list(in_list: List[str]) -> str:
    """Glue the strings of a list together, with nothing in between.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
def shuffle(in_str: str) -> str:
    """Return a new string made of the same chars as the given one, in a
    randomized order.

    A non-string argument raises ValueError carrying the offending value.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Shuffling happens in place on a list of chars, which is then
    # turned back into a string.
    letters = to_char_list(in_str)
    random.shuffle(letters)
    scrambled = from_char_list(letters)
    return scrambled
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Remove html code contained in the given string.

    By default a tag is removed together with what it encloses; with
    keep_tag_content=True only the tags themselves are removed.
    A non-string argument raises ValueError carrying the offending value.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if keep_tag_content:
        pattern = HTML_TAG_ONLY_RE
    else:
        pattern = HTML_RE
    return pattern.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Force string content to be ascii-only by translating non-ascii chars
    into their closest ascii representation (eg: ó -> o, Ë -> E).

    N.B. Chars that cannot be translated are dropped.
    A non-string argument raises ValueError carrying the offending value.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # NFKD decomposition, then an ascii encode that silently drops whatever
    # is not representable, then back to str (ascii is valid utf-8).
    return (
        unicodedata.normalize("NFKD", in_str)
        .encode("ascii", "ignore")
        .decode("utf-8")
    )
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Converts a string into a "slug" using provided separator.
    The returned string has the following properties:

    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using provided separator
    - all chars are encoded as ascii (by using `asciify()`)

    A non-string argument raises ValueError carrying the offending value.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Everything that is not a letter or a digit becomes a space, and the
    # outer spaces are trimmed away.
    lowered = in_str.lower()
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", lowered).strip()

    # Each whitespace char becomes the separator...
    joined = SPACES_RE.sub(separator, spaced)

    # ...and runs of separators are then collapsed.
    duplicates = re.escape(separator) + r"+"
    collapsed = re.sub(duplicates, separator, joined)
    return asciify(collapsed)
def to_bool(in_str: str) -> bool:
    """
    Turns a string into a boolean based on its content (CASE INSENSITIVE).

    True is returned when the lowercased string is one of "true", "1",
    "yes", "y" or "t".  Otherwise False is returned.
    A non-string argument raises ValueError carrying the offending value.

    >>> to_bool('True')
    True
    >>> to_bool('YES')
    True
    >>> to_bool('nope')
    False
    """
    if is_string(in_str):
        lowered = in_str.lower()
        return lowered in ("true", "1", "yes", "y", "t")
    raise ValueError(in_str)
1076 def to_date(in_str: str) -> Optional[datetime.date]:
1078 Parses a date string. See DateParser docs for details.
1080 import dateparse.dateparse_utils as dp
1085 except dp.ParseException:
1086 logger.warning(f'Unable to parse date {in_str}.')
1090 def valid_date(in_str: str) -> bool:
1092 True if the string represents a valid date.
1094 import dateparse.dateparse_utils as dp
1099 except dp.ParseException:
1100 logger.warning(f'Unable to parse date {in_str}.')
1104 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1106 Parses a datetime string. See DateParser docs for more info.
1108 import dateparse.dateparse_utils as dp
1111 dt = d.parse(in_str)
1112 if type(dt) == datetime.datetime:
1115 logger.warning(f'Unable to parse datetime {in_str}.')
1119 def valid_datetime(in_str: str) -> bool:
1121 True if the string represents a valid datetime.
1123 _ = to_datetime(in_str)
1126 logger.warning(f'Unable to parse datetime {in_str}.')
def dedent(in_str: str) -> str:
    """
    Removes the leading whitespace matched by MARGIN_RE from every line
    of a multi line string (inspired by the analogous Scala function).

    A non-string argument raises ValueError carrying the offending value.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    trimmed = []
    for line in in_str.split(newline):
        trimmed.append(MARGIN_RE.sub('', line))
    return newline.join(trimmed)
def indent(in_str: str, amount: int) -> str:
    """
    Indents a string by prepending `amount` spaces to each of its lines.

    A non-string argument raises ValueError carrying the offending value.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    padding = " " * amount
    shifted = []
    for line in in_str.split(newline):
        shifted.append(padding + line)
    return newline.join(shifted)
1156 def sprintf(*args, **kwargs) -> str:
1157 """String printf, like in C"""
1160 sep = kwargs.pop("sep", None)
1162 if not isinstance(sep, str):
1163 raise TypeError("sep must be None or a string")
1165 end = kwargs.pop("end", None)
1167 if not isinstance(end, str):
1168 raise TypeError("end must be None or a string")
1171 raise TypeError("invalid keyword arguments to sprint()")
1177 for i, arg in enumerate(args):
1180 if isinstance(arg, str):
class SprintfStdout(object):
    """
    A context manager that captures whatever is written to stdout while
    its block runs.

    Entering it yields a zero-argument callable that returns the text
    captured so far:

        with SprintfStdout() as buf:
            print("test")
        print(buf())
    """

    def __init__(self) -> None:
        # Buffer that receives the redirected output.
        self.destination = io.StringIO()
        # The active redirect_stdout manager; set while inside the block.
        self.recorder = None

    def __enter__(self) -> Callable[[], str]:
        redirect = contextlib.redirect_stdout(self.destination)
        self.recorder = redirect
        redirect.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *args) -> None:
        # Put the real stdout back, then rewind the buffer.
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Falling off the end returns None, so exceptions are not suppressed.
1213 def is_are(n: int) -> str:
1227 def pluralize(n: int) -> str:
1233 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1236 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1245 def thify(n: int) -> str:
1246 """Return the proper cardinal suffix for a number.
1257 assert is_integer_number(digit)
1269 def ngrams(txt: str, n: int):
1270 """Return the ngrams from a string.
1272 >>> [x for x in ngrams('This is a test', 2)]
1273 ['This is', 'is a', 'a test']
1277 return ngrams_presplit(words, n)
def ngrams_presplit(words: Iterable[str], n: int):
    """
    Yield every run of n consecutive words, joined by single spaces.

    `words` may be any iterable (list, tuple, generator, ...); it is
    copied into a list first because building the n-grams requires
    slicing, which one-shot iterators do not support.  Nothing is yielded
    when there are fewer than n words or when n is smaller than 1.

    >>> list(ngrams_presplit(['This', 'is', 'a', 'test'], 2))
    ['This is', 'is a', 'a test']
    >>> list(ngrams_presplit(iter(['a', 'b', 'c']), 3))
    ['a b c']
    """
    sequence = list(words)
    # zip() stops at the shortest of the n shifted views, which is exactly
    # where the last complete n-gram ends.
    for ngram in zip(*[sequence[i:] for i in range(n)]):
        yield ' '.join(ngram)
def bigrams(txt: str):
    """Return the 2-grams of txt; shorthand for ngrams(txt, 2)."""
    size = 2
    return ngrams(txt, size)
def trigrams(txt: str):
    """Return the 3-grams of txt; shorthand for ngrams(txt, 3)."""
    size = 3
    return ngrams(txt, size)
1293 def shuffle_columns_into_list(
1294 input_lines: Iterable[str],
1295 column_specs: Iterable[Iterable[int]],
1298 """Helper to shuffle / parse columnar data and return the results as a
1299 list. The column_specs argument is an iterable collection of
1300 numeric sequences that indicate one or more column numbers to
1303 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1304 >>> shuffle_columns_into_list(
1306 ... [ [8], [2, 3], [5, 6, 7] ],
1309 ['acl_test.py', 'scott wheel', 'Jul 9 11:34']
1314 # Column specs map input lines' columns into outputs.
1316 for spec in column_specs:
1319 chunk = chunk + delim + input_lines[n]
1320 chunk = chunk.strip(delim)
1325 def shuffle_columns_into_dict(
1326 input_lines: Iterable[str],
1327 column_specs: Iterable[Tuple[str, Iterable[int]]],
1329 ) -> Dict[str, str]:
1330 """Helper to shuffle / parse columnar data and return the results
1333 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1334 >>> shuffle_columns_into_dict(
1336 ... [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
1339 {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}
1344 # Column specs map input lines' columns into outputs.
1345 # "key", [col1, col2...]
1346 for spec in column_specs:
1349 chunk = chunk + delim + input_lines[n]
1350 chunk = chunk.strip(delim)
1351 out[spec[0]] = chunk
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the {name} placeholders of txt with the entries of a dict.

    The formatted text is passed through sprintf() with an empty end
    string.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    interpolated = txt.format(**values)
    return sprintf(interpolated, end='')
1366 if __name__ == '__main__':