7 from itertools import zip_longest
14 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
16 from uuid import uuid4
21 logger = logging.getLogger(__name__)
# Decimal number: optional sign, then either digits with an optional fraction
# and an optional (unsigned) exponent, or a bare fraction such as ".5".
# NOTE: inside a character class "|" is a literal pipe, not alternation, so
# the classes below list only the characters that are really allowed.
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Optionally signed integer literals with a 0x / 0o / 0b prefix (either case).
HEX_NUMBER_RE = re.compile(r"^([+\-]?)0[xX]([0-9A-Fa-f]+)$")

OCT_NUMBER_RE = re.compile(r"^([+\-]?)0[oO]([0-7]+)$")

BIN_NUMBER_RE = re.compile(r"^([+\-]?)0[bB]([01]+)$")
32 r"([a-z-]+://)" # scheme
33 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
35 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
36 r"(:\d{2,})?" # port number
37 r"(/[a-z\d_%+-]*)*" # folders
38 r"(\.[a-z\d_%+-]+)*" # file extension
39 r"(\?[a-z\d_+%-=]*)?" # query string
43 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
45 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
47 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
49 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
51 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
53 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
55 CAMEL_CASE_TEST_RE = re.compile(
56 r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
59 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
61 SNAKE_CASE_TEST_RE = re.compile(
62 r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
65 SNAKE_CASE_TEST_DASH_RE = re.compile(
66 r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
69 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
71 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
74 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
75 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
76 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
77 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
78 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
79 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
82 JSON_WRAPPER_RE = re.compile(
83 r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
87 r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
90 UUID_HEX_OK_RE = re.compile(
91 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
95 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
97 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
99 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
101 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
103 MAC_ADDRESS_RE = re.compile(
104 r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE
107 ANYWHERE_MAC_ADDRESS_RE = re.compile(
108 r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
111 WORDS_COUNT_RE = re.compile(
112 r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
115 HTML_RE = re.compile(
116 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
117 re.IGNORECASE | re.MULTILINE | re.DOTALL,
120 HTML_TAG_ONLY_RE = re.compile(
121 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
122 re.IGNORECASE | re.MULTILINE | re.DOTALL,
125 SPACES_RE = re.compile(r"\s")
127 NO_LETTERS_OR_NUMBERS_RE = re.compile(
128 r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
131 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
# Matches an escape sequence of the form ESC "[" <non-letters> <letter>.
# ESC is spelled \x1b so that no raw control character has to live in the
# source text (a literal one is easily mangled by editors and tooling).
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Tell whether the input is None or a string with no visible content.

    A string made only of whitespace counts as empty.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty("   \t   ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    # An all-whitespace string strips down to "", which is falsy.
    return not in_str.strip()
def is_string(obj: Any) -> bool:
    """
    Tell whether *obj* is a ``str`` instance.

    >>> is_string('test')
    True
    >>> is_string([1, 2, 3])
    False
    """
    return isinstance(obj, str)
def is_empty_string(in_str: Any) -> bool:
    """Alias of :func:`is_empty`; the answer is delegated to it unchanged."""
    return is_empty(in_str)
def is_empty(in_str: Any) -> bool:
    """
    Tell whether the input is a string that is empty or whitespace-only.

    Non-string inputs are never considered empty.

    >>> is_empty(' \t\t ')
    True
    >>> is_empty([1, 2, 3])
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """
    Tell whether the input is a string with some non-whitespace content.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string(' ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """
    Tell whether a string spells a number accepted by ``NUMBER_RE``.

    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]

    Raises:
        ValueError: if the input is not a string.
    """
    if is_string(in_str):
        # A match object is always truthy, so bool() is "did it match".
        return bool(NUMBER_RE.match(in_str))
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """
    Tell whether the given string represents an integer.

    A string qualifies when it is a number without a "." in it, or when it
    is a hexadecimal, octal or binary integer literal.

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    if is_number(in_str) and "." not in in_str:
        return True
    # Fall back to the prefixed notations, tried in the same order as before.
    prefixed_checks = (
        is_hexidecimal_integer_number,
        is_octal_integer_number,
        is_binary_integer_number,
    )
    return any(check(in_str) for check in prefixed_checks)
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is a hexadecimal integer literal.

    The "0x" prefix is required; a leading sign is allowed.

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('0x1A3E')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number('test')
    False

    Raises:
        ValueError: if the input is not a string (e.g. ``12345`` or
            ``0x1A3E`` passed as an int).
    """
    if is_string(in_str):
        return bool(HEX_NUMBER_RE.match(in_str))
    raise ValueError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is an octal integer literal.

    The "0o" prefix is required; a leading sign is allowed.

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    >>> is_octal_integer_number('test')
    False

    Raises:
        ValueError: if the input is not a string.
    """
    if is_string(in_str):
        return bool(OCT_NUMBER_RE.match(in_str))
    raise ValueError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is a binary integer literal.

    The "0b" prefix is required; a leading sign is allowed.

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0B10101')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    >>> is_binary_integer_number('test')
    False

    Raises:
        ValueError: if the input is not a string.
    """
    if is_string(in_str):
        return bool(BIN_NUMBER_RE.match(in_str))
    raise ValueError(in_str)
def to_int(in_str: str) -> int:
    """Return the integral value of the string or raise on error.

    Binary ("0b"), octal ("0o") and hexadecimal ("0x") literals are converted
    with the matching base; anything else is handed to ``int()`` as base 10.

    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'

    Raises:
        ValueError: if the input is not a string or cannot be converted.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # (recognizer, base) pairs, tried in order.
    notations = (
        (is_binary_integer_number, 2),
        (is_octal_integer_number, 8),
        (is_hexidecimal_integer_number, 16),
    )
    for recognizes, base in notations:
        if recognizes(in_str):
            return int(in_str, base)
    return int(in_str)
def is_decimal_number(in_str: str) -> bool:
    """
    Tell whether the given string represents a decimal number, i.e. a
    number (as judged by ``is_number``) that contains a ".".

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    if not is_number(in_str):
        return False
    return "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    """
    Remove every match of ``ESCAPE_SEQUENCE_RE`` from the input string.

    >>> strip_escape_sequences('\\x1b[12;11;22mthis is a test!')
    'this is a test!'
    """
    return ESCAPE_SEQUENCE_RE.sub("", in_str)
384 def add_thousands_separator(
387 separator_char = ',',
391 Add thousands separator to a numeric string. Also handles numbers.
393 >>> add_thousands_separator('12345678')
395 >>> add_thousands_separator(12345678)
397 >>> add_thousands_separator(12345678.99)
399 >>> add_thousands_separator('test')
400 Traceback (most recent call last):
405 if isinstance(in_str, numbers.Number):
407 if is_number(in_str):
408 return _add_thousands_separator(
410 separator_char = separator_char,
413 raise ValueError(in_str)
416 def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
419 (in_str, decimal_part) = in_str.split('.')
420 tmp = [iter(in_str[::-1])] * places
421 ret = separator_char.join(
422 "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
423 if len(decimal_part) > 0:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
431 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
433 Check if a string is a valid url.
435 >>> is_url('http://www.mysite.com')
437 >>> is_url('https://mysite.com')
439 >>> is_url('.mysite.com')
442 if not is_full_string(in_str):
445 valid = URL_RE.match(in_str) is not None
448 return valid and any([in_str.startswith(s) for s in allowed_schemes])
452 def is_email(in_str: Any) -> bool:
454 Check if a string is a valid email.
456 Reference: https://tools.ietf.org/html/rfc3696#section-3
460 >>> is_email('@gmail.com')
464 not is_full_string(in_str)
466 or in_str.startswith(".")
471 # we expect 2 tokens, one before "@" and one after, otherwise
472 # we have an exception and the email is not valid.
473 head, tail = in_str.split("@")
475 # head's size must be <= 64, tail <= 255, head must not start
476 # with a dot or contain multiple consecutive dots.
480 or head.endswith(".")
485 # removes escaped spaces, so that later on the test regex will
487 head = head.replace("\\ ", "")
488 if head.startswith('"') and head.endswith('"'):
489 head = head.replace(" ", "")[1:-1]
490 return EMAIL_RE.match(head + "@" + tail) is not None
493 # borderline case in which we have multiple "@" signs but the
494 # head part is correctly escaped.
495 if ESCAPED_AT_SIGN.search(in_str) is not None:
496 # replace "@" with "a" in the head
497 return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
501 def suffix_string_to_number(in_str: str) -> Optional[int]:
502 """Take a string like "33Gb" and convert it into a number (of bytes)
503 like 34603008. Return None if the input string is not valid.
505 >>> suffix_string_to_number('1Mb')
507 >>> suffix_string_to_number('13.1Gb')
510 def suffix_capitalize(s: str) -> str:
514 return f"{s[0].upper()}{s[1].lower()}"
515 return suffix_capitalize(s[0:1])
517 if is_string(in_str):
518 if is_integer_number(in_str):
519 return to_int(in_str)
520 suffixes = [in_str[-2:], in_str[-1:]]
521 rest = [in_str[:-2], in_str[:-1]]
522 for x in range(len(suffixes)):
524 s = suffix_capitalize(s)
525 multiplier = NUM_SUFFIXES.get(s, None)
526 if multiplier is not None:
528 if is_integer_number(r):
529 return to_int(r) * multiplier
530 if is_decimal_number(r):
531 return int(float(r) * multiplier)
535 def number_to_suffix_string(num: int) -> Optional[str]:
536 """Take a number (of bytes) and returns a string like "43.8Gb".
537 Returns none if the input is invalid.
539 >>> number_to_suffix_string(14066017894)
541 >>> number_to_suffix_string(1024 * 1024)
547 for (sfx, size) in NUM_SUFFIXES.items():
552 if suffix is not None:
553 return f"{d:.1f}{suffix}"
558 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
560 Checks if a string is a valid credit card number.
561 If card type is provided then it checks against that specific type only,
562 otherwise any known credit card number will be accepted.
564 Supported card types are the following:
573 if not is_full_string(in_str):
576 if card_type is not None:
577 if card_type not in CREDIT_CARDS:
579 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
581 return CREDIT_CARDS[card_type].match(in_str) is not None
582 for c in CREDIT_CARDS:
583 if CREDIT_CARDS[c].match(in_str) is not None:
def is_camel_case(in_str: Any) -> bool:
    """
    Tell whether a string is formatted as camel case.

    A string is considered camel case when:

    - it is composed only of letters ([a-zA-Z]) and optionally digits ([0-9])
    - it contains both lowercase and uppercase letters
    - it does not start with a digit
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
603 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
605 Checks if a string is formatted as "snake case".
607 A string is considered snake case when:
609 - it's composed only by lowercase/uppercase letters and digits
610 - it contains at least one underscore (or provided separator)
611 - it does not start with a number
613 >>> is_snake_case('this_is_a_test')
615 >>> is_snake_case('___This_Is_A_Test_1_2_3___')
617 >>> is_snake_case('this-is-a-test')
619 >>> is_snake_case('this-is-a-test', separator='-')
623 if is_full_string(in_str):
624 re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
626 r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
631 re_template.format(sign=re.escape(separator)), re.IGNORECASE
634 return r.match(in_str) is not None
def is_json(in_str: Any) -> bool:
    """
    Tell whether a string is a valid JSON object or array.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    if not is_full_string(in_str):
        return False
    # Cheap pre-check: the text must be wrapped in {...} or [...].
    if JSON_WRAPPER_RE.match(in_str) is None:
        return False
    try:
        parsed = json.loads(in_str)
    except (TypeError, ValueError, OverflowError):
        return False
    return isinstance(parsed, (dict, list))
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Tell whether the input is a valid UUID.

    With ``allow_hex=True`` the dashes between the groups become optional.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # Casting to str lets a UUID object itself be passed in.
    candidate = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(candidate) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Tell whether a string is a valid IPv4 address.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str):
        return False
    if SHALLOW_IP_V4_RE.match(in_str) is None:
        return False
    # The regex only checks the shape; each token must also be in 0..255.
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Return the first IPv4-shaped chunk found in the string, or None.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V4_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip_v6(in_str: Any) -> bool:
    """
    Tell whether a string is a valid IPv6 address (as judged by ``IP_V6_RE``).

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Return the first IPv6-shaped chunk found in the string, or None.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V6_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip(in_str: Any) -> bool:
    """
    Tell whether a string is a valid IP address, either v6 or v4.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3.999')
    False
    """
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Return the IP address found in the string, or None.

    An IPv4 address is looked for first; IPv6 is the fallback.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4 = extract_ip_v4(in_str)
    if v4 is not None:
        return v4
    return extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """Return True if in_str is a valid MAC address, False otherwise.

    Six two-digit hex groups separated by ":" or "-", in either letter case.

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the first MAC address found in in_str.

    Args:
        in_str: text to search; anything that is not a non-blank string
            yields None.
        separator: string placed between the six hex groups of the result,
            replacing whichever of ":" or "-" the input used.

    Returns:
        The MAC address with its groups joined by ``separator``, or None if
        no MAC address is present.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'
    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'
    """
    if not is_full_string(in_str):
        return None

    found = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if found is None:
        return None

    # str methods return a new string, so the result has to be used (the
    # previous code discarded it and the separator was never applied).
    # translate() swaps both delimiters in one pass, which also keeps a
    # separator that itself contains ":" or "-" from being rewritten twice.
    return found.group(0).translate({ord(":"): separator, ord("-"): separator})
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Tell whether a given string is a slug (as created by `slugify()`).

    A slug consists of runs of lowercase ASCII letters and digits joined by
    one or more occurrences of ``separator``; it starts and ends with a
    letter or digit.

    Args:
        in_str: value to test; non-strings and blank strings are not slugs.
        separator: the joining string (may be longer than one character).

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False

    # Each alphanumeric run is delimited by at least one separator, so the
    # pattern has a single way to match any input and cannot backtrack
    # exponentially (the former "([a-z\d]+SEP*?)*" form could).  The
    # separator is grouped so that "+" applies to all of it, not just to its
    # last character.
    sep = re.escape(separator)
    rex = r"^[a-z\d]+(?:(?:" + sep + r")+[a-z\d]+)*$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Tell whether the given string contains HTML/XML tags.

    By design this matches ANY kind of tag, so it is not an HTML validator;
    its goal is to detect "malicious" or undesired tags in the text.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False

    Raises:
        ValueError: if the input is not a string.
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
def words_count(in_str: str) -> int:
    """
    Return the number of words contained in the given string.

    Only sequences of one or more letters and/or digits count as "words",
    so a string like "! @ # % ... []" yields zero.  Punctuation separates
    words too: "one,two,three.stop" counts as 4, not 1, even though it
    contains no spaces.

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4

    Raises:
        ValueError: if the input is not a string.
    """
    if is_string(in_str):
        words = WORDS_COUNT_RE.findall(in_str)
        return len(words)
    raise ValueError(in_str)
865 def generate_uuid(omit_dashes: bool = False) -> str:
867 Generated an UUID string (using `uuid.uuid4()`).
869 generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
870 generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
879 def generate_random_alphanumeric_string(size: int) -> str:
881 Returns a string of the specified size containing random
882 characters (uppercase/lowercase ascii letters and digits).
884 random_string(9) # possible output: "cx3QQbzYg"
888 raise ValueError("size must be >= 1")
889 chars = string.ascii_letters + string.digits
890 buffer = [random.choice(chars) for _ in range(size)]
891 return from_char_list(buffer)
def reverse(in_str: str) -> str:
    """
    Return the string with its characters in reverse order.

    >>> reverse('test')
    'tset'

    Raises:
        ValueError: if the input is not a string.
    """
    if is_string(in_str):
        return "".join(reversed(in_str))
    raise ValueError(in_str)
def camel_case_to_snake_case(in_str, *, separator="_"):
    """
    Convert a camel case string into a snake case one.
    (The original string is returned if it is not a valid camel case string.)

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'

    Raises:
        ValueError: if the input is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str

    def append_separator(match):
        # Keep the matched word ending and put the separator right after it.
        return match.group(1) + separator

    separated = CAMEL_CASE_REPLACE_RE.sub(append_separator, in_str)
    return separated.lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Convert a snake case string into a camel case one.
    (The original string is returned if it is not a valid snake case string.)

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'

    Raises:
        ValueError: if the input is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str

    # Title-case every non-blank piece between separators.
    words = []
    for piece in in_str.split(separator):
        if is_full_string(piece):
            words.append(piece.title())

    if not upper_case_first:
        words[0] = words[0].lower()
    return "".join(words)
948 def to_char_list(in_str: str) -> List[str]:
949 """Convert a string into a list of chars.
951 >>> to_char_list('test')
954 if not is_string(in_str):
def from_char_list(in_list: List[str]) -> str:
    """Concatenate a list of characters (or strings) into a single string.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
def shuffle(in_str: str) -> str:
    """Return a new string containing the same chars as the given one, in a
    randomized order.

    Raises:
        ValueError: if the input is not a string.
    """
    if is_string(in_str):
        # random.shuffle works in place, so it needs a mutable char list.
        scrambled = to_char_list(in_str)
        random.shuffle(scrambled)
        return from_char_list(scrambled)
    raise ValueError(in_str)
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Remove the html code contained in the given string.

    With ``keep_tag_content=True`` only the tags themselves are removed and
    the text between them is kept.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'

    Raises:
        ValueError: if the input is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if keep_tag_content:
        pattern = HTML_TAG_ONLY_RE
    else:
        pattern = HTML_RE
    return pattern.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Force string content to be ascii-only by translating non-ascii chars
    into their closest ascii representation (eg: ó -> o, Ë -> E).

    N.B. Chars that cannot be translated are dropped.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'

    Raises:
        ValueError: if the input is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # NFKD decomposes accented chars into base char + combining marks; the
    # ascii encoding with "ignore" then drops everything that is not ascii,
    # and the remaining bytes are decoded back into a str.
    return (
        unicodedata.normalize("NFKD", in_str)
        .encode("ascii", "ignore")
        .decode("utf-8")
    )
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Convert a string into a "slug" using the provided separator.
    The returned string has the following properties:

    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using the provided separator
    - all chars are encoded as ascii (by using `asciify()`)

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'

    Raises:
        ValueError: if the input is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Everything that is not a letter or a digit becomes a space.
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # Every whitespace char becomes the separator ...
    joined = SPACES_RE.sub(separator, spaced)

    # ... and runs of separators are collapsed into one.
    collapsed = re.sub(re.escape(separator) + r"+", separator, joined)
    return asciify(collapsed)
def to_bool(in_str: str) -> bool:
    """
    Turn a string into a boolean based on its content (CASE INSENSITIVE).

    True is returned when the lower-cased string is one of "true", "1",
    "yes", "y", "t" or "on".  Otherwise False is returned.

    Raises:
        ValueError: if the input is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    truthy_spellings = ("true", "1", "yes", "y", "t", "on")
    return in_str.lower() in truthy_spellings
1091 def to_date(in_str: str) -> Optional[datetime.date]:
1093 Parses a date string. See DateParser docs for details.
1095 import dateparse.dateparse_utils as dp
1100 except dp.ParseException:
1101 msg = f'Unable to parse date {in_str}.'
1106 def valid_date(in_str: str) -> bool:
1108 True if the string represents a valid date.
1110 import dateparse.dateparse_utils as dp
1115 except dp.ParseException:
1116 msg = f'Unable to parse date {in_str}.'
1121 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1123 Parses a datetime string. See DateParser docs for more info.
1125 import dateparse.dateparse_utils as dp
1128 dt = d.parse(in_str)
1129 if type(dt) == datetime.datetime:
1132 msg = f'Unable to parse datetime {in_str}.'
1137 def valid_datetime(in_str: str) -> bool:
1139 True if the string represents a valid datetime.
1141 _ = to_datetime(in_str)
1144 msg = f'Unable to parse datetime {in_str}.'
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Squeeze runs of more than one character_to_squeeze into one.

    Args:
        in_str: the text to process.
        character_to_squeeze: the character (or multi-character token) whose
            consecutive repetitions are collapsed into a single occurrence.
            It is always treated literally, never as a regex or as a
            replacement template.

    Returns:
        A copy of in_str with every run of the token replaced by one token.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '
    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'
    """
    # A callable replacement is inserted verbatim.  Passing the token as a
    # replacement *string* would make re.sub interpret backslashes in it
    # (e.g. squeezing "\\" used to fail with "bad escape").
    return re.sub(
        r'(' + re.escape(character_to_squeeze) + r')+',
        lambda _match: character_to_squeeze,
        in_str,
    )
def dedent(in_str: str) -> str:
    """
    Remove the leading margin from every line of a multi line string
    (inspired by the analogous Scala function).

    Whatever ``MARGIN_RE`` matches at the start of each "\\n"-separated line
    is dropped.

    Raises:
        ValueError: if the input is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    return newline.join(
        MARGIN_RE.sub('', row) for row in in_str.split(newline)
    )
def indent(in_str: str, amount: int) -> str:
    """
    Indent a string by prepending ``amount`` spaces to each of its lines.

    >>> indent('This is a test', 4)
    '    This is a test'

    Raises:
        ValueError: if the input is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    margin = " " * amount
    return newline.join(margin + row for row in in_str.split(newline))
1193 def sprintf(*args, **kwargs) -> str:
1194 """String printf, like in C"""
1197 sep = kwargs.pop("sep", None)
1199 if not isinstance(sep, str):
1200 raise TypeError("sep must be None or a string")
1202 end = kwargs.pop("end", None)
1204 if not isinstance(end, str):
1205 raise TypeError("end must be None or a string")
1208 raise TypeError("invalid keyword arguments to sprint()")
1214 for i, arg in enumerate(args):
1217 if isinstance(arg, str):
class SprintfStdout(object):
    """
    A context manager that captures what is written to stdout.

    The value bound by ``as`` is a zero-argument callable returning the text
    captured so far::

        with SprintfStdout() as buf:
            print("test")
        print(buf())
    """

    def __init__(self) -> None:
        # In-memory buffer that receives the redirected output.
        self.destination = io.StringIO()
        # The active redirect_stdout manager; set on __enter__.
        self.recorder = None

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()

        def captured_text() -> str:
            return self.destination.getvalue()

        return captured_text

    def __exit__(self, *args) -> None:
        # Undo the redirection first, then rewind the buffer.
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Falling off the end returns None, so exceptions are not suppressed.
1250 def is_are(n: int) -> str:
1264 def pluralize(n: int) -> str:
1270 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1273 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1282 def thify(n: int) -> str:
1283 """Return the proper cardinal suffix for a number.
1294 assert is_integer_number(digit)
1306 def ngrams(txt: str, n: int):
1307 """Return the ngrams from a string.
1309 >>> [x for x in ngrams('This is a test', 2)]
1310 ['This is', 'is a', 'a test']
1314 for ngram in ngrams_presplit(words, n):
def ngrams_presplit(words: Sequence[str], n: int):
    """Return the n-grams of an already split word sequence.

    Thin wrapper: the work is delegated to ``list_utils.ngrams``.
    """
    return list_utils.ngrams(words, n)
def bigrams(txt: str):
    """Return the 2-grams of txt; shorthand for ``ngrams(txt, 2)``."""
    return ngrams(txt, 2)
def trigrams(txt: str):
    """Return the 3-grams of txt; shorthand for ``ngrams(txt, 3)``."""
    return ngrams(txt, 3)
1333 def shuffle_columns_into_list(
1334 input_lines: Iterable[str],
1335 column_specs: Iterable[Iterable[int]],
1338 """Helper to shuffle / parse columnar data and return the results as a
1339 list. The column_specs argument is an iterable collection of
1340 numeric sequences that indicate one or more column numbers to
1343 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1344 >>> shuffle_columns_into_list(
1346 ... [ [8], [2, 3], [5, 6, 7] ],
1349 ['acl_test.py', 'scott wheel', 'Jul 9 11:34']
1354 # Column specs map input lines' columns into outputs.
1356 for spec in column_specs:
1359 chunk = chunk + delim + input_lines[n]
1360 chunk = chunk.strip(delim)
1365 def shuffle_columns_into_dict(
1366 input_lines: Iterable[str],
1367 column_specs: Iterable[Tuple[str, Iterable[int]]],
1369 ) -> Dict[str, str]:
1370 """Helper to shuffle / parse columnar data and return the results
1373 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1374 >>> shuffle_columns_into_dict(
1376 ... [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
1379 {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}
1384 # Column specs map input lines' columns into outputs.
1385 # "key", [col1, col2...]
1386 for spec in column_specs:
1389 chunk = chunk + delim + input_lines[n]
1390 chunk = chunk.strip(delim)
1391 out[spec[0]] = chunk
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the ``{name}`` placeholders of txt with values from a dict.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    rendered = txt.format(**values)
    # end='' is passed so sprintf adds nothing after the text.
    return sprintf(rendered, end='')
def to_ascii(x: str) -> bytes:
    """Encode as an ascii bytes string.

    Args:
        x: a string to encode, or a bytes object, which is returned as is.
            Subclasses of str and bytes are accepted as well.

    Returns:
        The ascii-encoded bytes.

    Raises:
        TypeError: if x is neither a string nor bytes.
        UnicodeEncodeError: if a string contains non-ascii characters.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    raise TypeError('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode txt and then encode the resulting bytes with a 64-character
    alphabet.  This is compatible with uudecode.

    Args:
        txt: the text to encode; a bytes object is also accepted and is
            base64-encoded directly.
        encoding: codec used to turn txt into bytes.
        errors: error handler handed to ``str.encode``.

    Returns:
        The base64 data as bytes (not str), newline terminated as produced
        by ``base64.encodebytes``.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    raw = txt if isinstance(txt, bytes) else txt.encode(encoding, errors)
    return base64.encodebytes(raw)
def is_base64(txt: str) -> bool:
    """Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet which is the same as what uuencode uses).

    Surrounding whitespace is ignored; every remaining character must be an
    ascii letter, a digit, "+" or "/".

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True
    """
    symbols = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    # Iterating over bytes yields ints, hence a set of byte values.
    allowed = set(symbols.encode('ascii'))
    return all(byte in allowed for byte in to_ascii(txt.strip()))
def from_base64(b64: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert base64 encoded data back to a normal string.

    Args:
        b64: the base64 data, either as bytes (what ``to_base64`` returns)
            or as an ascii string.
        encoding: codec used to decode the resulting bytes.
        errors: error handler handed to ``bytes.decode``.

    Returns:
        The decoded text.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    # base64.decodebytes only takes bytes-like input; base64 text is plain
    # ascii, so a str argument can be encoded without loss.
    raw = b64.encode('ascii') if isinstance(b64, str) else b64
    return base64.decodebytes(raw).decode(encoding, errors)
def chunk(txt: str, chunk_size):
    """Chunk up a string.

    Yields consecutive slices of txt, each chunk_size long; when the length
    of txt is not a multiple of chunk_size a warning is issued and the last
    chunk is shorter.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    if len(txt) % chunk_size != 0:
        msg = f'String to chunk\'s length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        # stacklevel=2 attributes the warning to the caller.
        warnings.warn(msg, stacklevel=2)
    for start in range(0, len(txt), chunk_size):
        yield txt[start:start + chunk_size]
1481 def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
1482 """Encode txt and then chop it into bytes. Note: only bitstrings
1483 with delimiter='' are interpretable by from_bitstring.
1485 >>> to_bitstring('hello?')
1486 '011010000110010101101100011011000110111100111111'
1488 >>> to_bitstring('test', delimiter=' ')
1489 '01110100 01100101 01110011 01110100'
1491 >>> to_bitstring(b'test')
1492 '01110100011001010111001101110100'
1495 etxt = to_ascii(txt)
1503 return delimiter.join(chunk(bits.zfill(8 * ((len(bits) + 7) // 8)), 8))
def is_bitstring(txt: str) -> bool:
    """Is this a bitstring (a string made of 0s and 1s only)?

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    # Reuse the binary literal check by prepending the prefix it expects.
    prefixed = f'0b{txt}'
    return is_binary_integer_number(prefixed)
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from a bitstring back to bytes, then decode into a str.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'
    """
    value = int(bits, 2)
    # Number of whole bytes needed to hold the value.
    byte_count = (value.bit_length() + 7) // 8
    decoded = value.to_bytes(byte_count, 'big').decode(encoding, errors)
    # A zero value needs zero bytes and decodes to ''; report it as NUL.
    return decoded or '\0'
1530 def ip_v4_sort_key(txt: str) -> Tuple[int]:
1531 """Turn an IPv4 address into a tuple for sorting purposes.
1533 >>> ip_v4_sort_key('10.0.0.18')
1536 >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
1537 >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
1538 ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
1541 if not is_ip_v4(txt):
1542 print(f"not IP: {txt}")
1544 return tuple([int(x) for x in txt.split('.')])
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str]:
    """Split a file path into its components so that parent/ancestor paths
    sort before children/descendant paths.

    Empty components (leading, trailing or doubled "/") are dropped.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    return tuple(part for part in volume.split('/') if part)
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Execute several replace operations in a row.

    Each character of replace_set is, in turn, replaced by replacement.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
1575 if __name__ == '__main__':