6 from itertools import zip_longest
13 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
15 from uuid import uuid4
19 logger = logging.getLogger(__name__)
# Decimal literal: optional sign, then either digits with an optional fraction
# and optional exponent, or a bare fraction such as ".5".
# NOTE: inside a character class "|" is a literal pipe, not alternation, so
# the classes below list only the characters that are actually allowed.
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Prefixed integer literals: optional sign, "0x" / "0o" / "0b" (either case),
# then one or more digits valid for that base.
HEX_NUMBER_RE = re.compile(r"^([+-]?)0[xX]([0-9A-Fa-f]+)$")

OCT_NUMBER_RE = re.compile(r"^([+-]?)0[oO]([0-7]+)$")

BIN_NUMBER_RE = re.compile(r"^([+-]?)0[bB]([01]+)$")
30 r"([a-z-]+://)" # scheme
31 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
33 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
34 r"(:\d{2,})?" # port number
35 r"(/[a-z\d_%+-]*)*" # folders
36 r"(\.[a-z\d_%+-]+)*" # file extension
37 r"(\?[a-z\d_+%-=]*)?" # query string
41 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
43 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
45 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
47 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
49 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
51 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
53 CAMEL_CASE_TEST_RE = re.compile(
54 r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
57 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
59 SNAKE_CASE_TEST_RE = re.compile(
60 r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
63 SNAKE_CASE_TEST_DASH_RE = re.compile(
64 r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
67 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
69 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
72 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
73 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
74 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
75 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
76 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
77 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
80 JSON_WRAPPER_RE = re.compile(
81 r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
85 r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
88 UUID_HEX_OK_RE = re.compile(
89 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
93 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
95 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
# Full (uncompressed-layout) IPv6 address: eight colon-separated groups of up
# to four hexadecimal digits.  Only hex digits are allowed in a group; the
# IGNORECASE flag makes both "a-f" and "A-F" acceptable.
IP_V6_RE = re.compile(r"^([a-f\d]{0,4}:){7}[a-f\d]{0,4}$", re.IGNORECASE)

# Same pattern, unanchored, for locating an address inside a longer string.
ANYWHERE_IP_V6_RE = re.compile(r"([a-f\d]{0,4}:){7}[a-f\d]{0,4}", re.IGNORECASE)
101 MAC_ADDRESS_RE = re.compile(
102 r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE
105 ANYWHERE_MAC_ADDRESS_RE = re.compile(
106 r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
109 WORDS_COUNT_RE = re.compile(
110 r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
113 HTML_RE = re.compile(
114 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
115 re.IGNORECASE | re.MULTILINE | re.DOTALL,
118 HTML_TAG_ONLY_RE = re.compile(
119 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
120 re.IGNORECASE | re.MULTILINE | re.DOTALL,
123 SPACES_RE = re.compile(r"\s")
125 NO_LETTERS_OR_NUMBERS_RE = re.compile(
126 r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
129 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
131 ESCAPE_SEQUENCE_RE = re.compile(r"
\e\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Returns true if the input string is either None or an empty string.

    A string made only of whitespace is considered empty as well.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty(" \t ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    # An empty (post-strip) string is falsy, so no explicit len() is needed.
    return in_str is None or not in_str.strip()
163 def is_string(obj: Any) -> bool:
165 Checks if an object is a string.
167 >>> is_string('test')
173 >>> is_string([1, 2, 3])
176 return isinstance(obj, str)
def is_empty_string(in_str: Any) -> bool:
    """Alias of is_empty(): forwards in_str to it and returns its result."""
    return is_empty(in_str)
183 def is_empty(in_str: Any) -> bool:
185 Checks if input is a string and empty or only whitespace.
189 >>> is_empty(' \t\t ')
195 >>> is_empty([1, 2, 3])
198 return is_string(in_str) and in_str.strip() == ""
201 def is_full_string(in_str: Any) -> bool:
203 Checks that input is a string and is not empty ('') or only whitespace.
205 >>> is_full_string('test!')
207 >>> is_full_string('')
209 >>> is_full_string(' ')
211 >>> is_full_string(100.999)
213 >>> is_full_string({"a": 1, "b": 2})
216 return is_string(in_str) and in_str.strip() != ""
219 def is_number(in_str: str) -> bool:
221 Checks if a string is a valid number.
224 Traceback (most recent call last):
227 >>> is_number("100.5")
229 >>> is_number("test")
233 >>> is_number([1, 2, 3])
234 Traceback (most recent call last):
236 ValueError: [1, 2, 3]
238 if not is_string(in_str):
239 raise ValueError(in_str)
240 return NUMBER_RE.match(in_str) is not None
243 def is_integer_number(in_str: str) -> bool:
245 Checks whether the given string represents an integer or not.
247 An integer may be signed or unsigned or use a "scientific notation".
249 >>> is_integer_number('42')
251 >>> is_integer_number('42.0')
255 (is_number(in_str) and "." not in in_str) or
256 is_hexidecimal_integer_number(in_str) or
257 is_octal_integer_number(in_str) or
258 is_binary_integer_number(in_str)
262 def is_hexidecimal_integer_number(in_str: str) -> bool:
264 Checks whether a string is a hex integer number.
266 >>> is_hexidecimal_integer_number('0x12345')
268 >>> is_hexidecimal_integer_number('0x1A3E')
270 >>> is_hexidecimal_integer_number('1234') # Needs 0x
272 >>> is_hexidecimal_integer_number('-0xff')
274 >>> is_hexidecimal_integer_number('test')
276 >>> is_hexidecimal_integer_number(12345) # Not a string
277 Traceback (most recent call last):
280 >>> is_hexidecimal_integer_number(101.4)
281 Traceback (most recent call last):
284 >>> is_hexidecimal_integer_number(0x1A3E)
285 Traceback (most recent call last):
289 if not is_string(in_str):
290 raise ValueError(in_str)
291 return HEX_NUMBER_RE.match(in_str) is not None
294 def is_octal_integer_number(in_str: str) -> bool:
296 Checks whether a string is an octal number.
298 >>> is_octal_integer_number('0o777')
300 >>> is_octal_integer_number('-0O115')
302 >>> is_octal_integer_number('0xFF') # Not octal, needs 0o
304 >>> is_octal_integer_number('7777') # Needs 0o
306 >>> is_octal_integer_number('test')
309 if not is_string(in_str):
310 raise ValueError(in_str)
311 return OCT_NUMBER_RE.match(in_str) is not None
314 def is_binary_integer_number(in_str: str) -> bool:
316 Returns whether a string contains a binary number.
318 >>> is_binary_integer_number('0b10111')
320 >>> is_binary_integer_number('-0b111')
322 >>> is_binary_integer_number('0B10101')
324 >>> is_binary_integer_number('0b10102')
326 >>> is_binary_integer_number('0xFFF')
328 >>> is_binary_integer_number('test')
331 if not is_string(in_str):
332 raise ValueError(in_str)
333 return BIN_NUMBER_RE.match(in_str) is not None
336 def to_int(in_str: str) -> int:
337 """Returns the integral value of the string or raises on error.
342 Traceback (most recent call last):
344 ValueError: invalid literal for int() with base 10: 'test'
346 if not is_string(in_str):
347 raise ValueError(in_str)
348 if is_binary_integer_number(in_str):
349 return int(in_str, 2)
350 if is_octal_integer_number(in_str):
351 return int(in_str, 8)
352 if is_hexidecimal_integer_number(in_str):
353 return int(in_str, 16)
357 def is_decimal_number(in_str: str) -> bool:
359 Checks whether the given string represents a decimal or not.
361 A decimal may be signed or unsigned or use a "scientific notation".
363 >>> is_decimal_number('42.0')
365 >>> is_decimal_number('42')
368 return is_number(in_str) and "." in in_str
371 def strip_escape_sequences(in_str: str) -> str:
373 Remove escape sequences in the input string.
375 >>> strip_escape_sequences('
\e[12;11;22mthis is a test!')
378 in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
382 def add_thousands_separator(
385 separator_char = ',',
389 Add thousands separator to a numeric string. Also handles numbers.
391 >>> add_thousands_separator('12345678')
393 >>> add_thousands_separator(12345678)
395 >>> add_thousands_separator(12345678.99)
397 >>> add_thousands_separator('test')
398 Traceback (most recent call last):
403 if isinstance(in_str, numbers.Number):
405 if is_number(in_str):
406 return _add_thousands_separator(
408 separator_char = separator_char,
411 raise ValueError(in_str)
def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
    """
    Insert separator_char between every group of `places` digits of the
    integer part of a numeric string.

    Args:
        in_str: numeric string, optionally signed and optionally containing
            a single "." followed by a fractional part.
        separator_char: text placed between digit groups.
        places: number of digits per group, counted from the right.

    Returns:
        The grouped string.  A leading sign and the fractional part (if any)
        are preserved verbatim and never take part in the grouping.
    """
    # Peel off a leading sign first; otherwise it would be grouped as if it
    # were a digit and '-123456' would come out as '-,123,456'.
    sign = ""
    if in_str[:1] in ("+", "-"):
        sign, in_str = in_str[0], in_str[1:]

    decimal_part = ""
    if '.' in in_str:
        (in_str, decimal_part) = in_str.split('.')

    # The same iterator repeated `places` times makes zip_longest pull
    # consecutive characters into each tuple, i.e. it chunks the reversed
    # digits into groups of `places`; reversing again restores the order.
    tmp = [iter(in_str[::-1])] * places
    ret = separator_char.join(
        "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]

    ret = sign + ret
    if len(decimal_part) > 0:
        ret += '.' + decimal_part
    return ret
428 # scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
429 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
431 Check if a string is a valid url.
433 >>> is_url('http://www.mysite.com')
435 >>> is_url('https://mysite.com')
437 >>> is_url('.mysite.com')
440 if not is_full_string(in_str):
443 valid = URL_RE.match(in_str) is not None
446 return valid and any([in_str.startswith(s) for s in allowed_schemes])
450 def is_email(in_str: Any) -> bool:
452 Check if a string is a valid email.
454 Reference: https://tools.ietf.org/html/rfc3696#section-3
458 >>> is_email('@gmail.com')
462 not is_full_string(in_str)
464 or in_str.startswith(".")
469 # we expect 2 tokens, one before "@" and one after, otherwise
470 # we have an exception and the email is not valid.
471 head, tail = in_str.split("@")
473 # head's size must be <= 64, tail <= 255, head must not start
474 # with a dot or contain multiple consecutive dots.
478 or head.endswith(".")
483 # removes escaped spaces, so that later on the test regex will
485 head = head.replace("\\ ", "")
486 if head.startswith('"') and head.endswith('"'):
487 head = head.replace(" ", "")[1:-1]
488 return EMAIL_RE.match(head + "@" + tail) is not None
491 # borderline case in which we have multiple "@" signs but the
492 # head part is correctly escaped.
493 if ESCAPED_AT_SIGN.search(in_str) is not None:
494 # replace "@" with "a" in the head
495 return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
499 def suffix_string_to_number(in_str: str) -> Optional[int]:
500 """Take a string like "33Gb" and convert it into a number (of bytes)
501 like 34603008. Return None if the input string is not valid.
503 >>> suffix_string_to_number('1Mb')
505 >>> suffix_string_to_number('13.1Gb')
508 def suffix_capitalize(s: str) -> str:
512 return f"{s[0].upper()}{s[1].lower()}"
513 return suffix_capitalize(s[0:1])
515 if is_string(in_str):
516 if is_integer_number(in_str):
517 return to_int(in_str)
518 suffixes = [in_str[-2:], in_str[-1:]]
519 rest = [in_str[:-2], in_str[:-1]]
520 for x in range(len(suffixes)):
522 s = suffix_capitalize(s)
523 multiplier = NUM_SUFFIXES.get(s, None)
524 if multiplier is not None:
526 if is_integer_number(r):
527 return to_int(r) * multiplier
528 if is_decimal_number(r):
529 return int(float(r) * multiplier)
533 def number_to_suffix_string(num: int) -> Optional[str]:
534 """Take a number (of bytes) and returns a string like "43.8Gb".
535 Returns none if the input is invalid.
537 >>> number_to_suffix_string(14066017894)
539 >>> number_to_suffix_string(1024 * 1024)
545 for (sfx, size) in NUM_SUFFIXES.items():
550 if suffix is not None:
551 return f"{d:.1f}{suffix}"
556 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
558 Checks if a string is a valid credit card number.
559 If card type is provided then it checks against that specific type only,
560 otherwise any known credit card number will be accepted.
562 Supported card types are the following:
571 if not is_full_string(in_str):
574 if card_type is not None:
575 if card_type not in CREDIT_CARDS:
577 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
579 return CREDIT_CARDS[card_type].match(in_str) is not None
580 for c in CREDIT_CARDS:
581 if CREDIT_CARDS[c].match(in_str) is not None:
586 def is_camel_case(in_str: Any) -> bool:
588 Checks if a string is formatted as camel case.
590 A string is considered camel case when:
592 - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
593 - it contains both lowercase and uppercase letters
594 - it does not start with a number
597 is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
601 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
603 Checks if a string is formatted as "snake case".
605 A string is considered snake case when:
607 - it's composed only by lowercase/uppercase letters and digits
608 - it contains at least one underscore (or provided separator)
609 - it does not start with a number
611 >>> is_snake_case('this_is_a_test')
613 >>> is_snake_case('___This_Is_A_Test_1_2_3___')
615 >>> is_snake_case('this-is-a-test')
617 >>> is_snake_case('this-is-a-test', separator='-')
621 if is_full_string(in_str):
622 re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
624 r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
629 re_template.format(sign=re.escape(separator)), re.IGNORECASE
632 return r.match(in_str) is not None
636 def is_json(in_str: Any) -> bool:
638 Check if a string is a valid json.
640 >>> is_json('{"name": "Peter"}')
642 >>> is_json('[1, 2, 3]')
644 >>> is_json('{nope}')
647 if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
649 return isinstance(json.loads(in_str), (dict, list))
650 except (TypeError, ValueError, OverflowError):
655 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
657 Check if a string is a valid UUID.
659 >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
661 >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
663 >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
666 # string casting is used to allow UUID itself as input data type
669 return UUID_HEX_OK_RE.match(s) is not None
670 return UUID_RE.match(s) is not None
673 def is_ip_v4(in_str: Any) -> bool:
675 Checks if a string is a valid ip v4.
677 >>> is_ip_v4('255.200.100.75')
681 >>> is_ip_v4('255.200.100.999') # 999 out of range
684 if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
687 # checks that each entry in the ip is in the valid range (0 to 255)
688 for token in in_str.split("."):
689 if not 0 <= int(token) <= 255:
694 def extract_ip_v4(in_str: Any) -> Optional[str]:
696 Extracts the IPv4 chunk of a string or None.
698 >>> extract_ip_v4(' The secret IP address: 127.0.0.1 (use it wisely) ')
700 >>> extract_ip_v4('Your mom dresses you funny.')
702 if not is_full_string(in_str):
704 m = ANYWHERE_IP_V4_RE.search(in_str)
710 def is_ip_v6(in_str: Any) -> bool:
712 Checks if a string is a valid ip v6.
714 >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
716 >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?') # invalid "?"
719 return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
722 def extract_ip_v6(in_str: Any) -> Optional[str]:
724 Extract IPv6 chunk or None.
726 >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
727 '2001:db8:85a3:0000:0000:8a2e:370:7334'
728 >>> extract_ip_v6("(and she's ugly too, btw)")
730 if not is_full_string(in_str):
732 m = ANYWHERE_IP_V6_RE.search(in_str)
738 def is_ip(in_str: Any) -> bool:
740 Checks if a string is a valid ip (either v4 or v6).
742 >>> is_ip('255.200.100.75')
744 >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
748 >>> is_ip('1.2.3.999')
751 return is_ip_v6(in_str) or is_ip_v4(in_str)
754 def extract_ip(in_str: Any) -> Optional[str]:
756 Extract the IP address or None.
758 >>> extract_ip('Attacker: 255.200.100.75')
760 >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
761 '2001:db8:85a3:0000:0000:8a2e:370:7334'
762 >>> extract_ip('1.2.3')
765 ip = extract_ip_v4(in_str)
767 ip = extract_ip_v6(in_str)
771 def is_mac_address(in_str: Any) -> bool:
772 """Return True if in_str is a valid MAC address false otherwise.
774 >>> is_mac_address("34:29:8F:12:0D:2F")
776 >>> is_mac_address('34:29:8f:12:0d:2f')
778 >>> is_mac_address('34-29-8F-12-0D-2F')
780 >>> is_mac_address("test")
783 return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the MAC address from in_str.

    The first chunk of in_str that looks like a MAC address (six pairs of hex
    digits delimited by ":" or "-") is returned with its delimiters rewritten
    to `separator`.  None is returned when in_str is not a non-blank string
    or when it contains no MAC address.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'
    >>> extract_mac_address('eth0 34-29-8F-12-0D-2F')
    '34:29:8F:12:0D:2F'
    >>> extract_mac_address('34:29:8F:12:0D:2F', separator='-')
    '34-29-8F-12-0D-2F'
    >>> extract_mac_address('no address here') is None
    True
    """
    if not is_full_string(in_str):
        return None

    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None

    # Strings are immutable: str.replace() returns a new string, so calling
    # it and discarding the result left the delimiters untouched.  Splitting
    # on either delimiter and re-joining applies `separator` in one pass and
    # cannot re-replace characters that the separator itself contains.
    return separator.join(re.split(r"[:-]", m.group(0)))
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Checks if a given string is a slug (as created by `slugify()`).

    A slug starts and ends with a lowercase letter or digit and contains
    only lowercase letters, digits and occurrences of `separator`.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False

    # One alphanumeric run, then zero or more "separator(s) + alphanumeric
    # run" groups.  Compared to a nested "(run sep*?)*" form this accepts the
    # same strings but has a single way to match each character, so a
    # non-matching input cannot trigger exponential backtracking.  The
    # separator is wrapped in its own group so that the "+" applies to the
    # whole separator, not just to its last character.
    sep = re.escape(separator)
    rex = r"^[a-z\d]+(?:(?:" + sep + r")+[a-z\d]+)*$"
    return re.match(rex, in_str) is not None
822 def contains_html(in_str: str) -> bool:
824 Checks if the given string contains HTML/XML tags.
826 By design, this function matches ANY type of tag, so don't expect to use it
827 as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
829 >>> contains_html('my string is <strong>bold</strong>')
831 >>> contains_html('my string is not bold')
835 if not is_string(in_str):
836 raise ValueError(in_str)
837 return HTML_RE.search(in_str) is not None
840 def words_count(in_str: str) -> int:
842 Returns the number of words contained into the given string.
844 This method is smart, it does consider only sequence of one or more letter and/or numbers
845 as "words", so a string like this: "! @ # % ... []" will return zero!
846 Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
847 will be 4 not 1 (even if there are no spaces in the string).
849 >>> words_count('hello world')
851 >>> words_count('one,two,three.stop')
855 if not is_string(in_str):
856 raise ValueError(in_str)
857 return len(WORDS_COUNT_RE.findall(in_str))
860 def generate_uuid(as_hex: bool = False) -> str:
862 Generated an UUID string (using `uuid.uuid4()`).
864 generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
865 generate_uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
874 def generate_random_alphanumeric_string(size: int) -> str:
876 Returns a string of the specified size containing random
877 characters (uppercase/lowercase ascii letters and digits).
879 random_string(9) # possible output: "cx3QQbzYg"
883 raise ValueError("size must be >= 1")
884 chars = string.ascii_letters + string.digits
885 buffer = [random.choice(chars) for _ in range(size)]
886 return from_char_list(buffer)
889 def reverse(in_str: str) -> str:
891 Returns the string with its chars reversed.
897 if not is_string(in_str):
898 raise ValueError(in_str)
902 def camel_case_to_snake_case(in_str, *, separator="_"):
904 Convert a camel case string into a snake case one.
905 (The original string is returned if is not a valid camel case string)
907 >>> camel_case_to_snake_case('MacAddressExtractorFactory')
908 'mac_address_extractor_factory'
909 >>> camel_case_to_snake_case('Luke Skywalker')
912 if not is_string(in_str):
913 raise ValueError(in_str)
914 if not is_camel_case(in_str):
916 return CAMEL_CASE_REPLACE_RE.sub(
917 lambda m: m.group(1) + separator, in_str
921 def snake_case_to_camel_case(
922 in_str: str, *, upper_case_first: bool = True, separator: str = "_"
925 Convert a snake case string into a camel case one.
926 (The original string is returned if is not a valid snake case string)
928 >>> snake_case_to_camel_case('this_is_a_test')
930 >>> snake_case_to_camel_case('Han Solo')
933 if not is_string(in_str):
934 raise ValueError(in_str)
935 if not is_snake_case(in_str, separator=separator):
937 tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
938 if not upper_case_first:
939 tokens[0] = tokens[0].lower()
940 return from_char_list(tokens)
943 def to_char_list(in_str: str) -> List[str]:
944 """Convert a string into a list of chars.
946 >>> to_char_list('test')
949 if not is_string(in_str):
954 def from_char_list(in_list: List[str]) -> str:
955 """Convert a char list into a string.
957 >>> from_char_list(['t', 'e', 's', 't'])
960 return "".join(in_list)
963 def shuffle(in_str: str) -> str:
964 """Return a new string containing same chars of the given one but in
967 if not is_string(in_str):
968 raise ValueError(in_str)
970 # turn the string into a list of chars
971 chars = to_char_list(in_str)
972 random.shuffle(chars)
973 return from_char_list(chars)
976 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
978 Remove html code contained into the given string.
980 >>> strip_html('test: <a href="foo/bar">click here</a>')
982 >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
985 if not is_string(in_str):
986 raise ValueError(in_str)
987 r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
988 return r.sub("", in_str)
991 def asciify(in_str: str) -> str:
993 Force string content to be ascii-only by translating all non-ascii
994 chars into the closest possible representation (eg: ó -> o, Ë ->
997 N.B. Some chars may be lost if impossible to translate.
999 >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
1000 'eeuuooaaeynAAACIINOE'
1002 if not is_string(in_str):
1003 raise ValueError(in_str)
1005 # "NFKD" is the algorithm which is able to successfully translate
1006 # the most of non-ascii chars.
1007 normalized = unicodedata.normalize("NFKD", in_str)
1009 # encode string forcing ascii and ignore any errors
1010 # (unrepresentable chars will be stripped out)
1011 ascii_bytes = normalized.encode("ascii", "ignore")
1013 # turns encoded bytes into an utf-8 string
1014 return ascii_bytes.decode("utf-8")
1017 def slugify(in_str: str, *, separator: str = "-") -> str:
1019 Converts a string into a "slug" using provided separator.
1020 The returned string has the following properties:
1023 - all letters are in lower case
1024 - all punctuation signs and non alphanumeric chars are removed
1025 - words are divided using provided separator
1026 - all chars are encoded as ascii (by using `asciify()`)
1029 >>> slugify('Top 10 Reasons To Love Dogs!!!')
1030 'top-10-reasons-to-love-dogs'
1031 >>> slugify('Mönstér Mägnët')
1034 if not is_string(in_str):
1035 raise ValueError(in_str)
1037 # replace any character that is NOT letter or number with spaces
1038 out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()
1040 # replace spaces with join sign
1041 out = SPACES_RE.sub(separator, out)
1043 # normalize joins (remove duplicates)
1044 out = re.sub(re.escape(separator) + r"+", separator, out)
1048 def to_bool(in_str: str) -> bool:
1050 Turns a string into a boolean based on its content (CASE INSENSITIVE).
1052 A positive boolean (True) is returned if the string value is one
1060 Otherwise False is returned.
1081 if not is_string(in_str):
1082 raise ValueError(in_str)
1083 return in_str.lower() in ("true", "1", "yes", "y", "t", "on")
1086 def to_date(in_str: str) -> Optional[datetime.date]:
1088 Parses a date string. See DateParser docs for details.
1090 import dateparse.dateparse_utils as dp
1095 except dp.ParseException:
1096 logger.warning(f'Unable to parse date {in_str}.')
1100 def valid_date(in_str: str) -> bool:
1102 True if the string represents a valid date.
1104 import dateparse.dateparse_utils as dp
1109 except dp.ParseException:
1110 logger.warning(f'Unable to parse date {in_str}.')
1114 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1116 Parses a datetime string. See DateParser docs for more info.
1118 import dateparse.dateparse_utils as dp
1121 dt = d.parse(in_str)
1122 if type(dt) == datetime.datetime:
1125 logger.warning(f'Unable to parse datetime {in_str}.')
1129 def valid_datetime(in_str: str) -> bool:
1131 True if the string represents a valid datetime.
1133 _ = to_datetime(in_str)
1136 logger.warning(f'Unable to parse datetime {in_str}.')
1140 def dedent(in_str: str) -> str:
1142 Removes tab indentation from multi line strings (inspired by analogous Scala function).
1144 if not is_string(in_str):
1145 raise ValueError(in_str)
1146 line_separator = '\n'
1147 lines = [MARGIN_RE.sub('', line) for line in in_str.split(line_separator)]
1148 return line_separator.join(lines)
1151 def indent(in_str: str, amount: int) -> str:
1153 Indents string by prepending amount spaces.
1155 >>> indent('This is a test', 4)
1159 if not is_string(in_str):
1160 raise ValueError(in_str)
1161 line_separator = '\n'
1162 lines = [" " * amount + line for line in in_str.split(line_separator)]
1163 return line_separator.join(lines)
1166 def sprintf(*args, **kwargs) -> str:
1167 """String printf, like in C"""
1170 sep = kwargs.pop("sep", None)
1172 if not isinstance(sep, str):
1173 raise TypeError("sep must be None or a string")
1175 end = kwargs.pop("end", None)
1177 if not isinstance(end, str):
1178 raise TypeError("end must be None or a string")
1181 raise TypeError("invalid keyword arguments to sprint()")
1187 for i, arg in enumerate(args):
1190 if isinstance(arg, str):
class SprintfStdout:
    """
    A context manager that captures outputs to stdout.

    The value bound by `with ... as` is a zero-argument callable that returns
    everything written to stdout since the block was entered:

        with SprintfStdout() as buf:
            print("test")
        captured = buf()    # 'test' followed by a newline
    """

    def __init__(self) -> None:
        # Buffer that receives everything printed inside the with-block.
        self.destination = io.StringIO()
        # The active stdout redirection; only set while inside the block.
        self.recorder: Optional[contextlib.redirect_stdout] = None

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()
        # Hand back the buffer's getter so callers can read the capture both
        # during and after the block.
        return self.destination.getvalue

    def __exit__(self, *args) -> None:
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        return None  # don't suppress exceptions
1223 def is_are(n: int) -> str:
1237 def pluralize(n: int) -> str:
1243 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1246 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1255 def thify(n: int) -> str:
1256 """Return the proper cardinal suffix for a number.
1267 assert is_integer_number(digit)
def ngrams(txt: str, n: int) -> Iterable[str]:
    """Return the ngrams from a string.

    The text is split on whitespace and every run of n consecutive words is
    produced, in order, as a single space-joined string.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    """
    words = txt.split()
    for ngram in ngrams_presplit(words, n):
        # This must be a generator: a plain `return` here would hand back
        # only the first ngram (as a bare string) and drop all the others.
        yield ' '.join(ngram)
def ngrams_presplit(words: Iterable[str], n: int):
    """Return the ngrams of an already-split sequence of words.

    Thin wrapper that forwards both arguments to list_utils.ngrams and
    returns whatever it produces.
    """
    return list_utils.ngrams(words, n)
def bigrams(txt: str):
    """Shorthand for ngrams(txt, 2): the two-word ngrams of txt."""
    return ngrams(txt, 2)
def trigrams(txt: str):
    """Shorthand for ngrams(txt, 3): the three-word ngrams of txt."""
    return ngrams(txt, 3)
1303 def shuffle_columns_into_list(
1304 input_lines: Iterable[str],
1305 column_specs: Iterable[Iterable[int]],
1308 """Helper to shuffle / parse columnar data and return the results as a
1309 list. The column_specs argument is an iterable collection of
1310 numeric sequences that indicate one or more column numbers to
1313 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1314 >>> shuffle_columns_into_list(
1316 ... [ [8], [2, 3], [5, 6, 7] ],
1319 ['acl_test.py', 'scott wheel', 'Jul 9 11:34']
1324 # Column specs map input lines' columns into outputs.
1326 for spec in column_specs:
1329 chunk = chunk + delim + input_lines[n]
1330 chunk = chunk.strip(delim)
1335 def shuffle_columns_into_dict(
1336 input_lines: Iterable[str],
1337 column_specs: Iterable[Tuple[str, Iterable[int]]],
1339 ) -> Dict[str, str]:
1340 """Helper to shuffle / parse columnar data and return the results
1343 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1344 >>> shuffle_columns_into_dict(
1346 ... [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
1349 {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}
1354 # Column specs map input lines' columns into outputs.
1355 # "key", [col1, col2...]
1356 for spec in column_specs:
1359 chunk = chunk + delim + input_lines[n]
1360 chunk = chunk.strip(delim)
1361 out[spec[0]] = chunk
1365 def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
1366 """Interpolate a string with data from a dict.
1368 >>> interpolate_using_dict('This is a {adjective} {noun}.',
1369 ... {'adjective': 'good', 'noun': 'example'})
1370 'This is a good example.'
1373 return sprintf(txt.format(**values), end='')
1376 if __name__ == '__main__':