7 from itertools import zip_longest
25 from uuid import uuid4
30 logger = logging.getLogger(__name__)
# Decimal number: optional sign, then either digits with an optional fraction
# and an optional unsigned exponent, or a bare fraction such as ".5".
# NOTE: inside a character class "|" is a literal, so the previous "[e|E]",
# "[+|-]", "[x|X]" and "[0|1]" classes accepted stray pipe characters
# (e.g. "|0xff" or "1|5").  The capture-group layout is unchanged.
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Hexadecimal integer: optional sign, "0x"/"0X" prefix, hex digits.
HEX_NUMBER_RE = re.compile(r"^([+-]?)0[xX]([0-9A-Fa-f]+)$")

# Octal integer: optional sign, "0o"/"0O" prefix, octal digits.
OCT_NUMBER_RE = re.compile(r"^([+-]?)0[oO]([0-7]+)$")

# Binary integer: optional sign, "0b"/"0B" prefix, binary digits.
BIN_NUMBER_RE = re.compile(r"^([+-]?)0[bB]([01]+)$")
41 r"([a-z-]+://)" # scheme
42 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
44 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
45 r"(:\d{2,})?" # port number
46 r"(/[a-z\d_%+-]*)*" # folders
47 r"(\.[a-z\d_%+-]+)*" # file extension
48 r"(\?[a-z\d_+%-=]*)?" # query string
52 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
54 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
56 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
58 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
60 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
62 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
64 CAMEL_CASE_TEST_RE = re.compile(
65 r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
68 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
70 SNAKE_CASE_TEST_RE = re.compile(
71 r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
74 SNAKE_CASE_TEST_DASH_RE = re.compile(
75 r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
78 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
80 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
83 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
84 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
85 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
86 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
87 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
88 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
91 JSON_WRAPPER_RE = re.compile(
92 r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
96 r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
99 UUID_HEX_OK_RE = re.compile(
100 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
104 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
106 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
# Eight colon-separated groups of up to four *hexadecimal* digits.  The
# groups were previously matched with [a-z\d], which accepted letters that
# can never appear in an IPv6 address (e.g. "zzzz").
IP_V6_RE = re.compile(r"^([a-f\d]{0,4}:){7}[a-f\d]{0,4}$", re.IGNORECASE)

# Same shape as IP_V6_RE but unanchored, for locating an address in text.
ANYWHERE_IP_V6_RE = re.compile(r"([a-f\d]{0,4}:){7}[a-f\d]{0,4}", re.IGNORECASE)
112 MAC_ADDRESS_RE = re.compile(
113 r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE
116 ANYWHERE_MAC_ADDRESS_RE = re.compile(
117 r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
120 WORDS_COUNT_RE = re.compile(
121 r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
124 HTML_RE = re.compile(
125 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
126 re.IGNORECASE | re.MULTILINE | re.DOTALL,
129 HTML_TAG_ONLY_RE = re.compile(
130 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
131 re.IGNORECASE | re.MULTILINE | re.DOTALL,
134 SPACES_RE = re.compile(r"\s")
136 NO_LETTERS_OR_NUMBERS_RE = re.compile(
137 r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
140 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
# An ANSI escape sequence: ESC, "[", any run of non-letters (the parameters)
# and one terminating letter.  ESC is spelled "\x1b" explicitly: a raw ESC
# byte in the source is fragile, and "\e" is not a valid escape for `re`.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Tell whether the input is None or a string without visible content.

    Args:
        in_str: the string to inspect, or None.

    Returns:
        True for None, for '' and for whitespace-only strings;
        False otherwise.

    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    return not in_str.strip()
def is_string(obj: Any) -> bool:
    """
    Report whether an object is a `str` instance.

    Args:
        obj: any object.

    Returns:
        True when obj is a str (or a str subclass), False otherwise.

    >>> is_string('test')
    True
    >>> is_string([1, 2, 3])
    False
    """
    verdict = isinstance(obj, str)
    return verdict
def is_empty_string(in_str: Any) -> bool:
    """Alias of `is_empty`: delegates to it and returns its result."""
    result = is_empty(in_str)
    return result
def is_empty(in_str: Any) -> bool:
    """
    Check that the input is a string that is empty or whitespace-only.

    Args:
        in_str: any object.

    Returns:
        True only for str inputs whose stripped value is ''; non-string
        inputs always give False.

    >>> is_empty('')
    True
    >>> is_empty('test')
    False
    >>> is_empty([1, 2, 3])
    False
    """
    if not isinstance(in_str, str):
        return False
    return in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """
    Check that the input is a string with at least one non-whitespace char.

    Args:
        in_str: any object.

    Returns:
        True for str inputs that are not '' and not whitespace-only;
        False for everything else, including non-strings.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string(100.999)
    False
    """
    if not isinstance(in_str, str):
        return False
    return bool(in_str.strip())
def is_number(in_str: str) -> bool:
    """
    Check whether a string is a valid number according to NUMBER_RE.

    Args:
        in_str: the string to test.

    Returns:
        True when the whole string matches NUMBER_RE, False otherwise.

    Raises:
        ValueError: if in_str is not a string (the offending value is the
            exception argument).
    """
    if is_string(in_str):
        return NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """
    Check whether the given string represents an integer.

    A string qualifies when it is a number (see `is_number`) that contains
    no "." or when it is a prefixed hexadecimal, octal or binary integer.

    Args:
        in_str: the string to test.

    Returns:
        True if the string represents an integer, False otherwise.

    Raises:
        ValueError: if in_str is not a string (raised by `is_number`).
    """
    if is_number(in_str) and "." not in in_str:
        return True
    return (
        is_hexidecimal_integer_number(in_str)
        or is_octal_integer_number(in_str)
        or is_binary_integer_number(in_str)
    )
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Check whether a string is a hexadecimal integer as defined by
    HEX_NUMBER_RE (an optionally signed, "0x"/"0X"-prefixed hex number).

    Args:
        in_str: the string to test.

    Returns:
        True when the string matches HEX_NUMBER_RE, False otherwise.

    Raises:
        ValueError: if in_str is not a string, e.g. an int such as 0x1A3E.
    """
    if is_string(in_str):
        return HEX_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """
    Check whether a string is an octal integer as defined by OCT_NUMBER_RE
    (an optionally signed, "0o"/"0O"-prefixed octal number).

    Args:
        in_str: the string to test.

    Returns:
        True when the string matches OCT_NUMBER_RE, False otherwise.

    Raises:
        ValueError: if in_str is not a string.
    """
    if is_string(in_str):
        return OCT_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """
    Check whether a string is a binary integer as defined by BIN_NUMBER_RE
    (an optionally signed, "0b"/"0B"-prefixed binary number).

    Args:
        in_str: the string to test.

    Returns:
        True when the string matches BIN_NUMBER_RE, False otherwise.

    Raises:
        ValueError: if in_str is not a string.
    """
    if is_string(in_str):
        return BIN_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def to_int(in_str: str) -> int:
    """
    Return the integral value of the string or raise on error.

    Prefixed binary, octal and hexadecimal strings are parsed in their own
    base; anything else is handed to `int()` in base 10.

    Args:
        in_str: the string to convert.

    Returns:
        The parsed integer.

    Raises:
        ValueError: if in_str is not a string, or if `int()` cannot parse it.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # (predicate, base) pairs, tried in the same order as before.
    prefixed_forms = (
        (is_binary_integer_number, 2),
        (is_octal_integer_number, 8),
        (is_hexidecimal_integer_number, 16),
    )
    for recognizes, base in prefixed_forms:
        if recognizes(in_str):
            return int(in_str, base)
    return int(in_str)
def is_decimal_number(in_str: str) -> bool:
    """
    Check whether the given string represents a decimal number, i.e. a
    number (see `is_number`) that contains a ".".

    Args:
        in_str: the string to test.

    Returns:
        True if the string is a number with a decimal point, else False.

    Raises:
        ValueError: if in_str is not a string (raised by `is_number`).
    """
    if not is_number(in_str):
        return False
    return "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    """
    Remove every substring matched by ESCAPE_SEQUENCE_RE from the input.

    Args:
        in_str: the text to clean.

    Returns:
        The input with all ESCAPE_SEQUENCE_RE matches deleted.
    """
    return ESCAPE_SEQUENCE_RE.sub("", in_str)
def add_thousands_separator(
    in_str: str, *, separator_char=',', places=3
) -> str:
    """
    Add a thousands separator to a numeric string.  Also handles numbers.

    Args:
        in_str: a numeric string, or a number (converted to its string form
            before processing).
        separator_char: the text inserted between digit groups.
        places: how many digits make up one group.

    Returns:
        The number with separators inserted.

    Raises:
        ValueError: if the (converted) input is not a valid number string.
    """
    if isinstance(in_str, numbers.Number):
        in_str = f'{in_str}'
    if not is_number(in_str):
        raise ValueError(in_str)
    return _add_thousands_separator(
        in_str, separator_char=separator_char, places=places
    )
def _add_thousands_separator(
    in_str: str, *, separator_char=',', places=3
) -> str:
    """
    Insert separator_char between groups of `places` integer digits.

    Only the integer digits are grouped.  A leading sign, a fractional part
    and an exponent are carried over untouched; previously a sign or an
    exponent was treated like a digit, e.g. "-123456" became "-,123,456".

    Args:
        in_str: a numeric string (already validated by the caller).
        separator_char: the text inserted between digit groups.
        places: how many digits make up one group.

    Returns:
        The formatted number string.
    """
    sign = ''
    if in_str[:1] in ('+', '-'):
        sign, in_str = in_str[0], in_str[1:]

    integer_part, _, decimal_part = in_str.partition('.')

    # With no fraction an exponent ("1e5") sits in the integer part; split
    # it off so its characters are not grouped with the digits.
    exponent = ''
    exp_at = max(integer_part.find('e'), integer_part.find('E'))
    if exp_at != -1:
        integer_part, exponent = integer_part[:exp_at], integer_part[exp_at:]

    # Group from the right: walk the reversed digits `places` at a time
    # (the same iterator repeated `places` times), then flip the result.
    tmp = [iter(integer_part[::-1])] * places
    ret = separator_char.join(
        "".join(x) for x in zip_longest(*tmp, fillvalue="")
    )[::-1]

    ret = sign + ret + exponent
    if len(decimal_part) > 0:
        ret += '.'
        ret += decimal_part
    return ret
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Check if a string is a valid url (as defined by URL_RE).

    Args:
        in_str: the candidate url.
        allowed_schemes: optional list of prefixes; when given and
            non-empty, the url must also start with one of them.

    Returns:
        True if the string is a full string that matches URL_RE and
        satisfies the scheme restriction (if any); False otherwise.
    """
    if not is_full_string(in_str) or URL_RE.match(in_str) is None:
        return False
    if not allowed_schemes:
        return True
    return any(in_str.startswith(scheme) for scheme in allowed_schemes)
459 def is_email(in_str: Any) -> bool:
461 Check if a string is a valid email.
463 Reference: https://tools.ietf.org/html/rfc3696#section-3
467 >>> is_email('@gmail.com')
471 not is_full_string(in_str)
473 or in_str.startswith(".")
478 # we expect 2 tokens, one before "@" and one after, otherwise
479 # we have an exception and the email is not valid.
480 head, tail = in_str.split("@")
482 # head's size must be <= 64, tail <= 255, head must not start
483 # with a dot or contain multiple consecutive dots.
487 or head.endswith(".")
492 # removes escaped spaces, so that later on the test regex will
494 head = head.replace("\\ ", "")
495 if head.startswith('"') and head.endswith('"'):
496 head = head.replace(" ", "")[1:-1]
497 return EMAIL_RE.match(head + "@" + tail) is not None
500 # borderline case in which we have multiple "@" signs but the
501 # head part is correctly escaped.
502 if ESCAPED_AT_SIGN.search(in_str) is not None:
503 # replace "@" with "a" in the head
504 return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
508 def suffix_string_to_number(in_str: str) -> Optional[int]:
509 """Take a string like "33Gb" and convert it into a number (of bytes)
510 like 34603008. Return None if the input string is not valid.
512 >>> suffix_string_to_number('1Mb')
514 >>> suffix_string_to_number('13.1Gb')
518 def suffix_capitalize(s: str) -> str:
522 return f"{s[0].upper()}{s[1].lower()}"
523 return suffix_capitalize(s[0:1])
525 if is_string(in_str):
526 if is_integer_number(in_str):
527 return to_int(in_str)
528 suffixes = [in_str[-2:], in_str[-1:]]
529 rest = [in_str[:-2], in_str[:-1]]
530 for x in range(len(suffixes)):
532 s = suffix_capitalize(s)
533 multiplier = NUM_SUFFIXES.get(s, None)
534 if multiplier is not None:
536 if is_integer_number(r):
537 return to_int(r) * multiplier
538 if is_decimal_number(r):
539 return int(float(r) * multiplier)
543 def number_to_suffix_string(num: int) -> Optional[str]:
544 """Take a number (of bytes) and returns a string like "43.8Gb".
545 Returns none if the input is invalid.
547 >>> number_to_suffix_string(14066017894)
549 >>> number_to_suffix_string(1024 * 1024)
555 for (sfx, size) in NUM_SUFFIXES.items():
560 if suffix is not None:
561 return f"{d:.1f}{suffix}"
566 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
568 Checks if a string is a valid credit card number.
569 If card type is provided then it checks against that specific type only,
570 otherwise any known credit card number will be accepted.
572 Supported card types are the following:
581 if not is_full_string(in_str):
584 if card_type is not None:
585 if card_type not in CREDIT_CARDS:
587 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
589 return CREDIT_CARDS[card_type].match(in_str) is not None
590 for c in CREDIT_CARDS:
591 if CREDIT_CARDS[c].match(in_str) is not None:
def is_camel_case(in_str: Any) -> bool:
    """
    Check if a string is formatted as camel case.

    A string is considered camel case when:

    - it is composed only of letters ([a-zA-Z]) and optionally digits
    - it contains both lowercase and uppercase letters
    - it does not start with a digit

    Args:
        in_str: the candidate string.

    Returns:
        True when in_str is a full string matching CAMEL_CASE_TEST_RE.
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """
    Check if a string is formatted as "snake case".

    A string is considered snake case when:

    - it is composed only of lowercase/uppercase letters and digits
    - it contains at least one underscore (or provided separator)
    - it does not start with a number

    Args:
        in_str: the candidate string.
        separator: the joining sign; "_" and "-" use the precompiled
            SNAKE_CASE_TEST_RE / SNAKE_CASE_TEST_DASH_RE, any other value
            gets an equivalent pattern built on the fly.

    Returns:
        True if the string is snake case for the given separator.
    """
    if not is_full_string(in_str):
        return False
    if separator == "_":
        return SNAKE_CASE_TEST_RE.match(in_str) is not None
    if separator == "-":
        return SNAKE_CASE_TEST_DASH_RE.match(in_str) is not None

    # Custom separator.  The whole string must match: the pattern used to
    # be applied unanchored, so any string with a snake-case *prefix*
    # passed (e.g. 'this.is.a test!!' with separator='.').  The pattern is
    # also only built when actually needed now.
    sign = re.escape(separator)
    pattern = (
        rf"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
    )
    return re.fullmatch(pattern, in_str, re.IGNORECASE) is not None
def is_json(in_str: Any) -> bool:
    """
    Check if a string is valid json whose top level is an object or array.

    Args:
        in_str: the candidate string.

    Returns:
        True if the string looks wrapped in {} or [] (JSON_WRAPPER_RE) and
        `json.loads` parses it into a dict or list; False otherwise.
    """
    if not is_full_string(in_str):
        return False
    if JSON_WRAPPER_RE.match(in_str) is None:
        return False
    try:
        parsed = json.loads(in_str)
    except (TypeError, ValueError, OverflowError):
        return False
    return isinstance(parsed, (dict, list))
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Check if a string is a valid UUID.

    Args:
        in_str: the candidate value; it is converted with `str()` first so
            that a UUID object can be passed directly.
        allow_hex: when True the dashes are optional (UUID_HEX_OK_RE is
            used instead of UUID_RE).

    Returns:
        True if the string form of the input matches the selected pattern.
    """
    # string casting is used to allow UUID itself as input data type
    candidate = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(candidate) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Check if a string is a valid ip v4.

    Args:
        in_str: the candidate string.

    Returns:
        True if the string has the dotted-quad shape (SHALLOW_IP_V4_RE) and
        each of its four numbers lies in the range 0..255; False otherwise.
    """
    if not is_full_string(in_str):
        return False
    if SHALLOW_IP_V4_RE.match(in_str) is None:
        return False
    # the shape is right; now every entry must be in the valid range
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Extract the first IPv4-shaped chunk of a string.

    Args:
        in_str: the text to search.

    Returns:
        The first substring matching ANYWHERE_IP_V4_RE, or None when the
        input is not a full string or nothing matches.
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V4_RE.search(in_str)
    return found.group(0) if found is not None else None
def is_ip_v6(in_str: Any) -> bool:
    """
    Check if a string is a valid ip v6 (as defined by IP_V6_RE).

    Args:
        in_str: the candidate string.

    Returns:
        True when in_str is a full string matching IP_V6_RE.
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Extract the first IPv6-shaped chunk of a string.

    Args:
        in_str: the text to search.

    Returns:
        The first substring matching ANYWHERE_IP_V6_RE, or None when the
        input is not a full string or nothing matches.
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V6_RE.search(in_str)
    return found.group(0) if found is not None else None
def is_ip(in_str: Any) -> bool:
    """
    Check if a string is a valid ip (either v4 or v6).

    Args:
        in_str: the candidate string.

    Returns:
        True if `is_ip_v6` or `is_ip_v4` accepts the string.
    """
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Extract an IP address from a string.

    An IPv4 chunk is looked for first; only when none is found is the
    string searched for an IPv6 chunk.

    Args:
        in_str: the text to search.

    Returns:
        The extracted address, or None if neither kind is present.
    """
    v4 = extract_ip_v4(in_str)
    if v4 is not None:
        return v4
    return extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """
    Return True if in_str is a valid MAC address, False otherwise.

    Args:
        in_str: the candidate string.

    Returns:
        True when in_str is a full string matching MAC_ADDRESS_RE (six hex
        pairs joined by ":" or "-", case-insensitive).
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the first MAC address found in in_str.

    Args:
        in_str: the text to search.
        separator: the sign placed between the six hex pairs of the
            returned address, whatever sign (":" or "-") the input used.

    Returns:
        The MAC address joined with `separator`, or None when the input is
        not a full string or contains no MAC address.
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None
    # str.replace returns a new string; its result used to be discarded, so
    # the requested separator was never applied.  Split on either sign and
    # re-join with the requested one.
    pairs = m.group(0).replace("-", ":").split(":")
    return separator.join(pairs)
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Check if a given string is a slug (as created by `slugify()`).

    Args:
        in_str: the candidate string.
        separator: the sign expected between the slug's words.

    Returns:
        True if in_str is a full string made only of lowercase letters and
        digits, optionally joined by the separator, and ending with a
        letter or digit; False otherwise.
    """
    if not is_full_string(in_str):
        return False
    pattern = "".join((r"^([a-z\d]+", re.escape(separator), r"*?)*[a-z\d]$"))
    return re.match(pattern, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Check if the given string contains HTML/XML tags.

    By design this matches ANY kind of tag; it is not an HTML validator.
    Its goal is to detect "malicious" or undesired tags in the text.

    Args:
        in_str: the text to scan.

    Returns:
        True if HTML_RE finds a match anywhere in the text.

    Raises:
        ValueError: if in_str is not a string.
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
def words_count(in_str: str) -> int:
    """
    Return the number of words contained in the given string.

    Only runs of one or more letters and/or digits count as "words", so a
    string like "! @ # % ... []" gives zero.  Punctuation separates words
    too: "one,two,three.stop" counts 4 even though it has no spaces.

    Args:
        in_str: the text to analyse.

    Returns:
        The number of WORDS_COUNT_RE matches in the text.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return sum(1 for _ in WORDS_COUNT_RE.finditer(in_str))
873 def generate_uuid(omit_dashes: bool = False) -> str:
875 Generated an UUID string (using `uuid.uuid4()`).
877 generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
878 generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Return a string of the specified size made of random characters
    (uppercase/lowercase ascii letters and digits).

    Args:
        size: the desired length, at least 1.

    Returns:
        A random alphanumeric string with `size` characters, drawn with
        `random.choice` (not suitable for security-sensitive tokens).

    Raises:
        ValueError: if size is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    return "".join(random.choice(alphabet) for _ in range(size))
def reverse(in_str: str) -> str:
    """
    Return the string with its characters in reverse order.

    Args:
        in_str: the string to reverse.

    Returns:
        A new string holding the characters of in_str back to front.

    Raises:
        ValueError: if in_str is not a string.

    >>> reverse('test')
    'tset'
    """
    if not isinstance(in_str, str):
        raise ValueError(in_str)
    return "".join(reversed(in_str))
def camel_case_to_snake_case(in_str, *, separator="_"):
    """
    Convert a camel case string into a snake case one.
    (The original string is returned if it is not a valid camel case string.)

    Args:
        in_str: the camel case string.
        separator: the sign inserted at each word boundary.

    Returns:
        The lowercased, separator-joined string, or in_str unchanged when
        `is_camel_case` rejects it.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str

    def _append_separator(match):
        # keep the matched chunk and mark the word boundary after it
        return match.group(1) + separator

    return CAMEL_CASE_REPLACE_RE.sub(_append_separator, in_str).lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Convert a snake case string into a camel case one.
    (The original string is returned if it is not a valid snake case string.)

    Args:
        in_str: the snake case string.
        upper_case_first: when False the first word is lowercased.
        separator: the sign that joins the words of in_str.

    Returns:
        The camel case string, or in_str unchanged when `is_snake_case`
        rejects it.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str

    words = []
    for piece in in_str.split(separator):
        # empty pieces come from leading/trailing/doubled separators
        if is_full_string(piece):
            words.append(piece.title())
    if not upper_case_first:
        words[0] = words[0].lower()
    return "".join(words)
956 def to_char_list(in_str: str) -> List[str]:
957 """Convert a string into a list of chars.
959 >>> to_char_list('test')
962 if not is_string(in_str):
def from_char_list(in_list: List[str]) -> str:
    """
    Convert a char list into a string.

    Args:
        in_list: the strings to concatenate, in order.

    Returns:
        All items joined together with nothing in between.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
def shuffle(in_str: str) -> str:
    """
    Return a new string containing the same chars as the given one, in an
    order produced by `random.shuffle`.

    Args:
        in_str: the string to shuffle.

    Returns:
        The shuffled string.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # shuffle works in place, so go through a mutable list of chars
    letters = to_char_list(in_str)
    random.shuffle(letters)
    return from_char_list(letters)
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Remove html code contained in the given string.

    Args:
        in_str: the text to clean.
        keep_tag_content: when True only the tags are removed
            (HTML_TAG_ONLY_RE); when False the matches of HTML_RE are
            removed instead.

    Returns:
        The text with every match of the selected pattern deleted.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if keep_tag_content:
        pattern = HTML_TAG_ONLY_RE
    else:
        pattern = HTML_RE
    return pattern.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Force string content to be ascii-only by translating all non-ascii
    chars into the closest possible representation (eg: ó -> o).

    N.B. Some chars may be lost if impossible to translate.

    Args:
        in_str: the string to convert.

    Returns:
        The ascii-only version of the string.

    Raises:
        ValueError: if in_str is not a string.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not isinstance(in_str, str):
        raise ValueError(in_str)

    # NFKD splits accented chars into base char + combining marks; encoding
    # to ascii with "ignore" then drops whatever is not representable.
    return (
        unicodedata.normalize("NFKD", in_str)
        .encode("ascii", "ignore")
        .decode("utf-8")
    )
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Convert a string into a "slug" using the provided separator.
    The returned string has the following properties:

    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using the provided separator
    - all chars are encoded as ascii (by using `asciify()`)

    Args:
        in_str: the text to convert.
        separator: the sign placed between words.

    Returns:
        The slug.

    Raises:
        ValueError: if in_str is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # anything that is not a letter or a number becomes a space
    words_only = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # every whitespace char becomes the join sign
    joined = SPACES_RE.sub(separator, words_only)

    # runs of the join sign collapse into a single one
    collapsed = re.sub(re.escape(separator) + r"+", separator, joined)
    return asciify(collapsed)
def to_bool(in_str: str) -> bool:
    """
    Turn a string into a boolean based on its content (CASE INSENSITIVE).

    True is returned when the lowercased string is one of "true", "1",
    "yes", "y", "t" or "on".  Otherwise False is returned.

    Args:
        in_str: the string to interpret.

    Returns:
        The boolean the string stands for.

    Raises:
        ValueError: if in_str is not a string.

    >>> to_bool('YES')
    True
    >>> to_bool('nope')
    False
    """
    if not isinstance(in_str, str):
        raise ValueError(in_str)
    truthy_words = ("true", "1", "yes", "y", "t", "on")
    return in_str.lower() in truthy_words
1099 def to_date(in_str: str) -> Optional[datetime.date]:
1101 Parses a date string. See DateParser docs for details.
1103 import dateparse.dateparse_utils as dp
1109 except dp.ParseException:
1110 msg = f'Unable to parse date {in_str}.'
1115 def valid_date(in_str: str) -> bool:
1117 True if the string represents a valid date.
1119 import dateparse.dateparse_utils as dp
1125 except dp.ParseException:
1126 msg = f'Unable to parse date {in_str}.'
1131 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1133 Parses a datetime string. See DateParser docs for more info.
1135 import dateparse.dateparse_utils as dp
1139 dt = d.parse(in_str)
1140 if type(dt) == datetime.datetime:
1143 msg = f'Unable to parse datetime {in_str}.'
1148 def valid_datetime(in_str: str) -> bool:
1150 True if the string represents a valid datetime.
1152 _ = to_datetime(in_str)
1155 msg = f'Unable to parse datetime {in_str}.'
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Squeeze runs of more than one character_to_squeeze into one.

    Args:
        in_str: the text to process.
        character_to_squeeze: the character (or longer token) whose
            consecutive repetitions are collapsed into a single occurrence.

    Returns:
        The text with each run replaced by one character_to_squeeze.

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'
    """
    run_of_target = re.compile(r'(' + re.escape(character_to_squeeze) + r')+')
    return run_of_target.sub(character_to_squeeze, in_str)
def dedent(in_str: str) -> str:
    """
    Remove the leading margin (see MARGIN_RE) from every line of a
    multi line string (inspired by the analogous Scala function).

    Args:
        in_str: the text to process; lines are separated by a newline.

    Returns:
        The text with the MARGIN_RE match stripped from each line.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    stripped = []
    for line in in_str.split(newline):
        stripped.append(MARGIN_RE.sub('', line))
    return newline.join(stripped)
def indent(in_str: str, amount: int) -> str:
    """
    Indent a string by prepending `amount` spaces to each of its lines.

    Args:
        in_str: the text to indent; lines are separated by a newline.
        amount: the number of spaces to put in front of every line.

    Returns:
        The indented text.

    Raises:
        ValueError: if in_str is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not isinstance(in_str, str):
        raise ValueError(in_str)
    padding = " " * amount
    return '\n'.join(padding + line for line in in_str.split('\n'))
1204 def sprintf(*args, **kwargs) -> str:
1205 """String printf, like in C"""
1208 sep = kwargs.pop("sep", None)
1210 if not isinstance(sep, str):
1211 raise TypeError("sep must be None or a string")
1213 end = kwargs.pop("end", None)
1215 if not isinstance(end, str):
1216 raise TypeError("end must be None or a string")
1219 raise TypeError("invalid keyword arguments to sprint()")
1225 for i, arg in enumerate(args):
1228 if isinstance(arg, str):
class SprintfStdout(object):
    """
    A context manager that captures outputs to stdout.

    Entering the context yields a zero-argument callable that returns
    everything written to stdout so far::

        with SprintfStdout() as buf:
            print("test")
            print("1, 2, 3")
        print(buf(), end='')
    """

    def __init__(self) -> None:
        # in-memory sink that receives the redirected output
        self.destination = io.StringIO()
        # the active redirect_stdout context, set on __enter__
        self.recorder = None

    def __enter__(self) -> Callable[[], str]:
        redirect = contextlib.redirect_stdout(self.destination)
        self.recorder = redirect
        redirect.__enter__()

        def read_captured() -> str:
            return self.destination.getvalue()

        return read_captured

    def __exit__(self, *args) -> None:
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # returning None lets any exception from the body propagate
        return None
1262 def is_are(n: int) -> str:
1276 def pluralize(n: int) -> str:
1282 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1285 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1294 def thify(n: int) -> str:
1295 """Return the proper cardinal suffix for a number.
1306 assert is_integer_number(digit)
1318 def ngrams(txt: str, n: int):
1319 """Return the ngrams from a string.
1321 >>> [x for x in ngrams('This is a test', 2)]
1322 ['This is', 'is a', 'a test']
1326 for ngram in ngrams_presplit(words, n):
def ngrams_presplit(words: Sequence[str], n: int):
    """
    Return the ngrams of an already-split sequence of words by delegating
    to `list_utils.ngrams`.

    Args:
        words: the words, in order.
        n: the size of each ngram.
    """
    grams = list_utils.ngrams(words, n)
    return grams
def bigrams(txt: str):
    """Return the 2-grams of txt (shorthand for `ngrams(txt, 2)`)."""
    return ngrams(txt, n=2)
def trigrams(txt: str):
    """Return the 3-grams of txt (shorthand for `ngrams(txt, 3)`)."""
    return ngrams(txt, n=3)
1345 def shuffle_columns_into_list(
1346 input_lines: Iterable[str], column_specs: Iterable[Iterable[int]], delim=''
1348 """Helper to shuffle / parse columnar data and return the results as a
1349 list. The column_specs argument is an iterable collection of
1350 numeric sequences that indicate one or more column numbers to
1353 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1354 >>> shuffle_columns_into_list(
1356 ... [ [8], [2, 3], [5, 6, 7] ],
1359 ['acl_test.py', 'scott wheel', 'Jul 9 11:34']
1364 # Column specs map input lines' columns into outputs.
1366 for spec in column_specs:
1369 chunk = chunk + delim + input_lines[n]
1370 chunk = chunk.strip(delim)
1375 def shuffle_columns_into_dict(
1376 input_lines: Iterable[str],
1377 column_specs: Iterable[Tuple[str, Iterable[int]]],
1379 ) -> Dict[str, str]:
1380 """Helper to shuffle / parse columnar data and return the results
1383 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1384 >>> shuffle_columns_into_dict(
1386 ... [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
1389 {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}
1394 # Column specs map input lines' columns into outputs.
1395 # "key", [col1, col2...]
1396 for spec in column_specs:
1399 chunk = chunk + delim + input_lines[n]
1400 chunk = chunk.strip(delim)
1401 out[spec[0]] = chunk
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """
    Interpolate a string with data from a dict.

    Args:
        txt: a `str.format` style template with named fields.
        values: the mapping of field name to replacement value.

    Returns:
        The result of passing the formatted text through
        `sprintf(..., end='')`.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    rendered = txt.format(**values)
    return sprintf(rendered, end='')
def to_ascii(x: str):
    """
    Encode as an ascii bytes string.

    Args:
        x: a str, which is encoded as ascii, or a bytes object, which is
            returned unchanged.  Subclasses of both are accepted.

    Returns:
        The ascii bytes.

    Raises:
        TypeError: if x is neither a str nor bytes.
        UnicodeEncodeError: if x is a str holding non-ascii characters.

    >>> to_ascii('test')
    b'test'
    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    # isinstance (rather than an exact type() comparison) so that str and
    # bytes subclasses are handled as well.
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    raise TypeError('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """
    Encode txt and then encode the bytes with a 64-character alphabet.
    This is compatible with uudecode.

    Args:
        txt: the text to encode.
        encoding: the codec used to turn txt into bytes.
        errors: the error handler passed to `str.encode`.

    Returns:
        The base64 data as *bytes* (what `base64.encodebytes` produces,
        including its newline after every 76 characters of output and at
        the end); the annotation used to claim `str`.
    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
def is_base64(txt: str) -> bool:
    """
    Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet which is the same as what uuencode uses).

    Leading and trailing whitespace is ignored, and up to two trailing "="
    padding characters are accepted; padding used to be rejected, so the
    output of `to_base64` failed this check whenever it needed padding.

    Args:
        txt: the str or bytes to test.

    Returns:
        True if every remaining character belongs to the base64 alphabet.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True
    >>> is_base64('another test, how do you like this one?')
    False
    >>> is_base64('aGVsbG8=')    # padded data is fine
    True
    """
    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = set(a.encode('ascii'))
    payload = to_ascii(txt.strip())

    # "=" may only appear as padding at the very end, at most twice
    body = payload.rstrip(b'=')
    if len(payload) - len(body) > 2:
        return False
    return all(char in alphabet for char in body)
def from_base64(b64: str, encoding='utf-8', errors='surrogatepass') -> str:
    """
    Convert base64 encoded data back to a normal string.

    Args:
        b64: the base64 data, as bytes (what `to_base64` returns) or as an
            ascii str.  A str used to be rejected by `base64.decodebytes`
            even though the signature advertises it.
        encoding: the codec used to decode the resulting bytes.
        errors: the error handler passed to `bytes.decode`.

    Returns:
        The decoded text.
    """
    raw = b64.encode('ascii') if isinstance(b64, str) else b64
    return base64.decodebytes(raw).decode(encoding, errors)
1476 def chunk(txt: str, chunk_size):
1477 """Chunk up a string.
1479 >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
1480 '01001101 11000101 10101010 10101010 10011111 10101000'
1483 if len(txt) % chunk_size != 0:
1484 msg = f'String to chunk\'s length ({len(txt)} is not an even multiple of chunk_size ({chunk_size})'
1486 warnings.warn(msg, stacklevel=2)
1487 for x in range(0, len(txt), chunk_size):
1488 yield txt[x : x + chunk_size]
def to_bitstring(
    txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass'
) -> str:
    """
    Encode txt and then chop it into bytes.  Note: only bitstrings
    with delimiter='' are interpretable by from_bitstring.

    Each byte is rendered as exactly eight binary digits.  Going through
    one big integer used to drop leading zero bytes and turned empty input
    into '00000000'; it also ignored `encoding` and `errors`.

    Args:
        txt: the text (or bytes) to convert.
        delimiter: the text placed between consecutive bytes.
        encoding: the codec used to turn a str into bytes.
        errors: the error handler passed to `str.encode`.

    Returns:
        The bits of every byte, most significant bit first.

    Raises:
        TypeError: if txt is neither a str nor a bytes-like object.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'
    """
    if isinstance(txt, str):
        raw = txt.encode(encoding, errors)
    elif isinstance(txt, (bytes, bytearray)):
        raw = txt
    else:
        raise TypeError('to_bitstring works with strings and bytes')
    return delimiter.join(f'{byte:08b}' for byte in raw)
def is_bitstring(txt: str) -> bool:
    """
    Is this a bitstring?

    The text is prefixed with "0b" and handed to
    `is_binary_integer_number`.

    Args:
        txt: the candidate text.

    Returns:
        True if "0b" + txt is a binary integer number.
    """
    candidate = f'0b{txt}'
    return is_binary_integer_number(candidate)
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """
    Convert from bitstring back to bytes then decode into a str.

    The number of bytes is taken from the number of binary digits, so
    leading zero bytes survive; sizing the buffer from the integer's
    bit_length used to drop them.  An empty bitstring decodes to ''.

    Args:
        bits: a run of "0"/"1" characters; surrounding whitespace, "_"
            digit separators and a "0b" prefix are tolerated.
        encoding: the codec used to decode the bytes.
        errors: the error handler passed to `bytes.decode`.

    Returns:
        The decoded text.

    Raises:
        ValueError: if bits is not a valid base-2 literal.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'
    """
    # Count only real binary digits when sizing the result.
    digits = bits.strip().replace('_', '')
    if digits[:2].lower() == '0b':
        digits = digits[2:]
    if not digits:
        return ''

    n = int(bits, 2)
    byte_count = max((n.bit_length() + 7) // 8, (len(digits) + 7) // 8)
    return n.to_bytes(byte_count, 'big').decode(encoding, errors)
1540 def ip_v4_sort_key(txt: str) -> Tuple[int]:
1541 """Turn an IPv4 address into a tuple for sorting purposes.
1543 >>> ip_v4_sort_key('10.0.0.18')
1546 >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
1547 >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
1548 ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
1551 if not is_ip_v4(txt):
1552 print(f"not IP: {txt}")
1554 return tuple([int(x) for x in txt.split('.')])
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str]:
    """
    Chunk up a file path so that parent/ancestor paths sort before
    children/descendant paths.

    Args:
        volume: a "/"-separated path.

    Returns:
        A tuple with the non-empty components of the path, in order.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    components = volume.split('/')
    return tuple(part for part in components if part)
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """
    Execute several replace operations in a row.

    Each character of replace_set is replaced in turn, in order, so a later
    replacement also sees the text produced by the earlier ones.

    Args:
        in_str: the text to process.
        replace_set: the characters to replace.
        replacement: the text substituted for each of them.

    Returns:
        The text after all the replacements.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
1585 if __name__ == '__main__':