7 from itertools import zip_longest
14 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
16 from uuid import uuid4
20 logger = logging.getLogger(__name__)
22 NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")
24 HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")
26 OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")
28 BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")
31 r"([a-z-]+://)" # scheme
32 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
34 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
35 r"(:\d{2,})?" # port number
36 r"(/[a-z\d_%+-]*)*" # folders
37 r"(\.[a-z\d_%+-]+)*" # file extension
38 r"(\?[a-z\d_+%-=]*)?" # query string
42 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
44 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
46 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
48 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
50 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
52 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
54 CAMEL_CASE_TEST_RE = re.compile(
55 r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
58 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
60 SNAKE_CASE_TEST_RE = re.compile(
61 r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
64 SNAKE_CASE_TEST_DASH_RE = re.compile(
65 r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
68 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
70 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
73 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
74 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
75 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
76 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
77 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
78 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
81 JSON_WRAPPER_RE = re.compile(
82 r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
86 r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
89 UUID_HEX_OK_RE = re.compile(
90 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
94 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
96 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
98 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
100 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
102 MAC_ADDRESS_RE = re.compile(
103 r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE
106 ANYWHERE_MAC_ADDRESS_RE = re.compile(
107 r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
110 WORDS_COUNT_RE = re.compile(
111 r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
114 HTML_RE = re.compile(
115 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
116 re.IGNORECASE | re.MULTILINE | re.DOTALL,
119 HTML_TAG_ONLY_RE = re.compile(
120 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
121 re.IGNORECASE | re.MULTILINE | re.DOTALL,
124 SPACES_RE = re.compile(r"\s")
126 NO_LETTERS_OR_NUMBERS_RE = re.compile(
127 r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
130 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
132 ESCAPE_SEQUENCE_RE = re.compile(r"
\e\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Tell whether the input is None or a string that is empty once
    surrounding whitespace has been stripped.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty(" \t ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    # Whitespace-only input strips down to "" and therefore counts as empty.
    return not in_str.strip()
def is_string(obj: Any) -> bool:
    """
    Report whether the given object is an instance of `str`.

    >>> is_string('test')
    True
    >>> is_string(123)
    False
    >>> is_string([1, 2, 3])
    False
    """
    outcome = isinstance(obj, str)
    return outcome
def is_empty_string(in_str: Any) -> bool:
    """
    Same check as `is_empty`: True only when the input is a string that
    is empty or made up solely of whitespace.
    """
    return is_string(in_str) and in_str.strip() == ""
def is_empty(in_str: Any) -> bool:
    """
    True when the input is a string that is empty or whitespace-only.
    Non-string inputs always yield False.

    >>> is_empty('')
    True
    >>> is_empty(' \t\t ')
    True
    >>> is_empty('test')
    False
    >>> is_empty([1, 2, 3])
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """
    True when the input is a string holding at least one non-whitespace
    character. Non-string inputs always yield False.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string(' ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """
    Tell whether a string matches NUMBER_RE, i.e. looks like a number.

    Raises ValueError (carrying the offending value) when the argument
    is not a string.

    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]
    """
    if is_string(in_str):
        return NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """
    Tell whether the given string represents an integer.

    A string qualifies when it is a number without a "." or when it is
    a prefixed hexadecimal, octal or binary integer literal.

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    # The plain-number test runs first, exactly as before; it is also
    # the one that raises ValueError for non-string input.
    if is_number(in_str) and "." not in in_str:
        return True
    if is_hexidecimal_integer_number(in_str):
        return True
    if is_octal_integer_number(in_str):
        return True
    return is_binary_integer_number(in_str)
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is a hexadecimal integer literal as accepted
    by HEX_NUMBER_RE (optional sign, mandatory 0x/0X prefix).

    Raises ValueError when the argument is not a string.

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('0x1A3E')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number('test')
    False
    >>> is_hexidecimal_integer_number(12345)  # Not a string
    Traceback (most recent call last):
    ...
    ValueError: 12345
    """
    if is_string(in_str):
        return HEX_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is an octal integer literal as accepted by
    OCT_NUMBER_RE (optional sign, mandatory 0o/0O prefix).

    Raises ValueError when the argument is not a string.

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    >>> is_octal_integer_number('test')
    False
    """
    if is_string(in_str):
        return OCT_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is a binary integer literal as accepted by
    BIN_NUMBER_RE (optional sign, mandatory 0b/0B prefix).

    Raises ValueError when the argument is not a string.

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0B10101')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    >>> is_binary_integer_number('test')
    False
    """
    if is_string(in_str):
        return BIN_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def to_int(in_str: str) -> int:
    """Convert a string to an int, honouring 0b / 0o / 0x prefixes.

    Raises ValueError when the argument is not a string or cannot be
    parsed as an integer.

    >>> to_int('1234')
    1234
    >>> to_int('0x1f')
    31
    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Recognizers are tried in the original order: binary, octal, hex.
    prefixed_forms = (
        (is_binary_integer_number, 2),
        (is_octal_integer_number, 8),
        (is_hexidecimal_integer_number, 16),
    )
    for recognizes, base in prefixed_forms:
        if recognizes(in_str):
            return int(in_str, base)

    # No known prefix: fall back to base 10 and let int() raise.
    return int(in_str)
def is_decimal_number(in_str: str) -> bool:
    """
    Tell whether the given string is a number that contains a decimal
    point.

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    if not is_number(in_str):
        return False
    return "." in in_str
372 def strip_escape_sequences(in_str: str) -> str:
374 Remove escape sequences in the input string.
376 >>> strip_escape_sequences('
\e[12;11;22mthis is a test!')
379 in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
383 def add_thousands_separator(
386 separator_char = ',',
390 Add thousands separator to a numeric string. Also handles numbers.
392 >>> add_thousands_separator('12345678')
394 >>> add_thousands_separator(12345678)
396 >>> add_thousands_separator(12345678.99)
398 >>> add_thousands_separator('test')
399 Traceback (most recent call last):
404 if isinstance(in_str, numbers.Number):
406 if is_number(in_str):
407 return _add_thousands_separator(
409 separator_char = separator_char,
412 raise ValueError(in_str)
415 def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
418 (in_str, decimal_part) = in_str.split('.')
419 tmp = [iter(in_str[::-1])] * places
420 ret = separator_char.join(
421 "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
422 if len(decimal_part) > 0:
# Full url example:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Check if a string is a valid url.

    Args:
        in_str: the value to test; anything that is not a non-blank
            string is rejected.
        allowed_schemes: optional list of prefixes (e.g. ``'https'``);
            when non-empty, the url must also start with one of them.

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('https://mysite.com')
    True
    >>> is_url('.mysite.com')
    False
    """
    if not is_full_string(in_str):
        return False

    if URL_RE.match(in_str) is None:
        return False

    if allowed_schemes:
        # str.startswith accepts a tuple of candidates, so no
        # intermediate list of booleans is needed.
        return in_str.startswith(tuple(allowed_schemes))
    return True
451 def is_email(in_str: Any) -> bool:
453 Check if a string is a valid email.
455 Reference: https://tools.ietf.org/html/rfc3696#section-3
459 >>> is_email('@gmail.com')
463 not is_full_string(in_str)
465 or in_str.startswith(".")
470 # we expect 2 tokens, one before "@" and one after, otherwise
471 # we have an exception and the email is not valid.
472 head, tail = in_str.split("@")
474 # head's size must be <= 64, tail <= 255, head must not start
475 # with a dot or contain multiple consecutive dots.
479 or head.endswith(".")
484 # removes escaped spaces, so that later on the test regex will
486 head = head.replace("\\ ", "")
487 if head.startswith('"') and head.endswith('"'):
488 head = head.replace(" ", "")[1:-1]
489 return EMAIL_RE.match(head + "@" + tail) is not None
492 # borderline case in which we have multiple "@" signs but the
493 # head part is correctly escaped.
494 if ESCAPED_AT_SIGN.search(in_str) is not None:
495 # replace "@" with "a" in the head
496 return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
500 def suffix_string_to_number(in_str: str) -> Optional[int]:
501 """Take a string like "33Gb" and convert it into a number (of bytes)
502 like 34603008. Return None if the input string is not valid.
504 >>> suffix_string_to_number('1Mb')
506 >>> suffix_string_to_number('13.1Gb')
509 def suffix_capitalize(s: str) -> str:
513 return f"{s[0].upper()}{s[1].lower()}"
514 return suffix_capitalize(s[0:1])
516 if is_string(in_str):
517 if is_integer_number(in_str):
518 return to_int(in_str)
519 suffixes = [in_str[-2:], in_str[-1:]]
520 rest = [in_str[:-2], in_str[:-1]]
521 for x in range(len(suffixes)):
523 s = suffix_capitalize(s)
524 multiplier = NUM_SUFFIXES.get(s, None)
525 if multiplier is not None:
527 if is_integer_number(r):
528 return to_int(r) * multiplier
529 if is_decimal_number(r):
530 return int(float(r) * multiplier)
534 def number_to_suffix_string(num: int) -> Optional[str]:
535 """Take a number (of bytes) and returns a string like "43.8Gb".
536 Returns none if the input is invalid.
538 >>> number_to_suffix_string(14066017894)
540 >>> number_to_suffix_string(1024 * 1024)
546 for (sfx, size) in NUM_SUFFIXES.items():
551 if suffix is not None:
552 return f"{d:.1f}{suffix}"
def is_credit_card(in_str: Any, card_type: Optional[str] = None) -> bool:
    """
    Checks if a string is a valid credit card number.
    If card type is provided then it checks against that specific type only,
    otherwise any known credit card number will be accepted.

    Supported card types are the keys of CREDIT_CARDS.

    Args:
        in_str: the value to test; anything that is not a non-blank
            string is rejected.
        card_type: optional key of CREDIT_CARDS to restrict the check.

    Raises:
        KeyError: if card_type is given but is not a key of CREDIT_CARDS.
    """
    if not is_full_string(in_str):
        return False

    if card_type is not None:
        if card_type not in CREDIT_CARDS:
            raise KeyError(
                f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
            )
        return CREDIT_CARDS[card_type].match(in_str) is not None

    # No type requested: accept the number if any known pattern matches.
    return any(
        pattern.match(in_str) is not None for pattern in CREDIT_CARDS.values()
    )
def is_camel_case(in_str: Any) -> bool:
    """
    Checks if a string is formatted as camel case.

    A string is considered camel case when:

    - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
    - it contains both lowercase and uppercase letters
    - it does not start with a number
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
602 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
604 Checks if a string is formatted as "snake case".
606 A string is considered snake case when:
608 - it's composed only by lowercase/uppercase letters and digits
609 - it contains at least one underscore (or provided separator)
610 - it does not start with a number
612 >>> is_snake_case('this_is_a_test')
614 >>> is_snake_case('___This_Is_A_Test_1_2_3___')
616 >>> is_snake_case('this-is-a-test')
618 >>> is_snake_case('this-is-a-test', separator='-')
622 if is_full_string(in_str):
623 re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
625 r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
630 re_template.format(sign=re.escape(separator)), re.IGNORECASE
633 return r.match(in_str) is not None
def is_json(in_str: Any) -> bool:
    """
    Check if a string is valid json whose top level is an object or an
    array.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    # Cheap pre-filter: must be a non-blank string wrapped in {} or [].
    if not is_full_string(in_str) or JSON_WRAPPER_RE.match(in_str) is None:
        return False
    try:
        parsed = json.loads(in_str)
    except (TypeError, ValueError, OverflowError):
        return False
    return isinstance(parsed, (dict, list))
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Check if a string is a valid UUID. With allow_hex set, the dashes
    between the groups become optional.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # Casting to str lets a UUID object itself be passed in.
    candidate = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(candidate) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip v4.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # The shallow regex only checks the shape; every octet must also be
    # within 0..255.
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Return the first IPv4-shaped chunk found in the string, or None.

    >>> extract_ip_v4(' The secret IP address: 127.0.0.1 (use it wisely) ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V4_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip_v6(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip v6 according to IP_V6_RE.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')  # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Return the first IPv6-shaped chunk found in the string, or None.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V6_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip (either v4 or v6).

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3.999')
    False
    """
    # v6 is tried first, v4 only when that fails.
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Return the first IP address found in the string (IPv4 preferred,
    IPv6 as fallback), or None.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4 = extract_ip_v4(in_str)
    if v4 is not None:
        return v4
    return extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """Return True if in_str is a valid MAC address, False otherwise.

    Both ":" and "-" are accepted between the six hex pairs, in either
    letter case.

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the first MAC address found in in_str.

    Args:
        in_str: text to search; anything that is not a non-blank string
            yields None.
        separator: string placed between the six hex pairs of the
            returned address, whichever of ":" or "-" the input used.

    Returns:
        The MAC address with its pairs joined by `separator`, or None
        when no address is found.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'

    >>> extract_mac_address('34:29:8F:12:0D:2F', separator='-')
    '34-29-8F-12-0D-2F'
    """
    if not is_full_string(in_str):
        return None

    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None

    # Strings are immutable: the result of the substitution has to be
    # returned, not discarded. translate() rewrites both delimiters in
    # one pass, so a separator that itself contains ":" or "-" is never
    # substituted twice.
    return m.group(0).translate({ord(":"): separator, ord("-"): separator})
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Checks if a given string is a slug (as created by `slugify()`).

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False
    escaped = re.escape(separator)
    pattern = rf"^([a-z\d]+{escaped}*?)*[a-z\d]$"
    return re.match(pattern, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Checks if the given string contains HTML/XML tags.

    By design this matches ANY type of tag; it is not an HTML validator,
    its goal is to detect "malicious" or undesired tags in the text.

    Raises ValueError when the argument is not a string.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
def words_count(in_str: str) -> int:
    """
    Returns the number of words contained in the given string.

    Only runs of one or more letters and/or digits count as "words", so
    a string like "! @ # % ... []" yields zero. Punctuation separates
    words as well, so "one,two,three.stop" counts as 4, not 1.

    Raises ValueError when the argument is not a string.

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if is_string(in_str):
        matches = WORDS_COUNT_RE.findall(in_str)
        return len(matches)
    raise ValueError(in_str)
def generate_uuid(as_hex: bool = False) -> str:
    """
    Generate a UUID string (using `uuid.uuid4()`).

    generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
    generate_uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
    """
    fresh = uuid4()
    return fresh.hex if as_hex else str(fresh)
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Returns a string of the specified size containing random
    characters (uppercase/lowercase ascii letters and digits).

    Raises ValueError when size is smaller than 1.

    generate_random_alphanumeric_string(9) # possible output: "cx3QQbzYg"
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    picked = [random.choice(alphabet) for _ in range(size)]
    return "".join(picked)
def reverse(in_str: str) -> str:
    """
    Returns the string with its chars reversed.

    Raises ValueError when the argument is not a string.

    >>> reverse('test')
    'tset'
    """
    if is_string(in_str):
        return "".join(reversed(in_str))
    raise ValueError(in_str)
def camel_case_to_snake_case(in_str, *, separator="_"):
    """
    Convert a camel case string into a snake case one.
    (The original string is returned if it is not a valid camel case string.)

    Raises ValueError when the argument is not a string.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str

    def append_separator(match):
        # Keep the matched chunk and put the separator right after it.
        return match.group(1) + separator

    separated = CAMEL_CASE_REPLACE_RE.sub(append_separator, in_str)
    return separated.lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Convert a snake case string into a camel case one.
    (The original string is returned if it is not a valid snake case string.)

    Raises ValueError when the argument is not a string.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str

    # Blank pieces (from leading/trailing/repeated separators) are dropped.
    words = [
        piece.title() for piece in in_str.split(separator) if is_full_string(piece)
    ]
    if not upper_case_first:
        words[0] = words[0].lower()
    return "".join(words)
def to_char_list(in_str: str) -> List[str]:
    """Convert a string into a list of chars; non-strings give [].

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    return list(in_str) if is_string(in_str) else []
def from_char_list(in_list: List[str]) -> str:
    """Concatenate a list of strings (typically chars) into one string.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
def shuffle(in_str: str) -> str:
    """Return a new string containing the same chars as the given one,
    in a random order.

    Raises ValueError when the argument is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # random.shuffle works in place, so go through a mutable char list.
    letters = to_char_list(in_str)
    random.shuffle(letters)
    return "".join(letters)
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Remove html code contained in the given string. By default the text
    enclosed by a tag pair is removed too; with keep_tag_content only
    the tags themselves are dropped.

    Raises ValueError when the argument is not a string.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    pattern = HTML_RE
    if keep_tag_content:
        pattern = HTML_TAG_ONLY_RE
    return pattern.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Force string content to be ascii-only by translating all non-ascii
    chars into the closest possible representation (eg: ó -> o).

    N.B. Some chars may be lost if impossible to translate.

    Raises ValueError when the argument is not a string.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # NFKD decomposes accented chars into base char + combining mark;
    # encoding to ascii with "ignore" then drops whatever is still
    # non-ascii, and the remaining bytes decode cleanly.
    return (
        unicodedata.normalize("NFKD", in_str)
        .encode("ascii", "ignore")
        .decode("utf-8")
    )
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Converts a string into a "slug" using provided separator.
    The returned string has the following properties:

    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using provided separator
    - all chars are encoded as ascii (by using `asciify()`)

    Raises ValueError when the argument is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Anything that is not a letter or digit becomes a space; outer
    # spaces are trimmed.
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # Every whitespace char becomes the separator...
    joined = SPACES_RE.sub(separator, spaced)

    # ...and runs of separators collapse into a single one.
    collapsed = re.sub(re.escape(separator) + r"+", separator, joined)
    return asciify(collapsed)
def to_bool(in_str: str) -> bool:
    """
    Turns a string into a boolean based on its content (CASE INSENSITIVE).

    True is returned if the lowercased string is one of "true", "1",
    "yes", "y", "t" or "on". Otherwise False is returned.

    Raises ValueError when the argument is not a string.

    >>> to_bool('True')
    True
    >>> to_bool('no')
    False
    """
    if is_string(in_str):
        return in_str.lower() in ("true", "1", "yes", "y", "t", "on")
    raise ValueError(in_str)
1090 def to_date(in_str: str) -> Optional[datetime.date]:
1092 Parses a date string. See DateParser docs for details.
1094 import dateparse.dateparse_utils as dp
1099 except dp.ParseException:
1100 logger.warning(f'Unable to parse date {in_str}.')
1104 def valid_date(in_str: str) -> bool:
1106 True if the string represents a valid date.
1108 import dateparse.dateparse_utils as dp
1113 except dp.ParseException:
1114 logger.warning(f'Unable to parse date {in_str}.')
1118 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1120 Parses a datetime string. See DateParser docs for more info.
1122 import dateparse.dateparse_utils as dp
1125 dt = d.parse(in_str)
1126 if type(dt) == datetime.datetime:
1129 logger.warning(f'Unable to parse datetime {in_str}.')
1133 def valid_datetime(in_str: str) -> bool:
1135 True if the string represents a valid datetime.
1137 _ = to_datetime(in_str)
1140 logger.warning(f'Unable to parse datetime {in_str}.')
def dedent(in_str: str) -> str:
    """
    Removes the leading margin (as matched by MARGIN_RE) from every line
    of a multi line string (inspired by the analogous Scala function).

    Raises ValueError when the argument is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    return newline.join(
        MARGIN_RE.sub('', row) for row in in_str.split(newline)
    )
def indent(in_str: str, amount: int) -> str:
    """
    Indents every line of the string by prepending `amount` spaces.

    Raises ValueError when in_str is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    padding = " " * amount
    return newline.join(padding + row for row in in_str.split(newline))
1170 def sprintf(*args, **kwargs) -> str:
1171 """String printf, like in C"""
1174 sep = kwargs.pop("sep", None)
1176 if not isinstance(sep, str):
1177 raise TypeError("sep must be None or a string")
1179 end = kwargs.pop("end", None)
1181 if not isinstance(end, str):
1182 raise TypeError("end must be None or a string")
1185 raise TypeError("invalid keyword arguments to sprint()")
1191 for i, arg in enumerate(args):
1194 if isinstance(arg, str):
class SprintfStdout(object):
    """
    A context manager that captures outputs to stdout.

    with SprintfStdout() as buf:
        print("test")
    print(buf())

    Entering the context yields a zero-argument callable that returns
    everything written to stdout so far.
    """

    def __init__(self) -> None:
        # In-memory sink that stdout is redirected into.
        self.destination = io.StringIO()
        # The active redirect_stdout manager; set on __enter__.
        self.recorder = None

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()

        def read_captured() -> str:
            return self.destination.getvalue()

        return read_captured

    def __exit__(self, *args) -> None:
        # Restore the previous stdout, then rewind the buffer.
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Returning None means exceptions are not suppressed.
        return None
1227 def is_are(n: int) -> str:
1241 def pluralize(n: int) -> str:
1247 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1250 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1259 def thify(n: int) -> str:
1260 """Return the proper cardinal suffix for a number.
1271 assert is_integer_number(digit)
1283 def ngrams(txt: str, n: int):
1284 """Return the ngrams from a string.
1286 >>> [x for x in ngrams('This is a test', 2)]
1287 ['This is', 'is a', 'a test']
1291 for ngram in ngrams_presplit(words, n):
def ngrams_presplit(words: Sequence[str], n: int):
    """Return the n-grams of an already tokenized sequence by delegating
    to `list_utils.ngrams`."""
    grams = list_utils.ngrams(words, n)
    return grams
def bigrams(txt: str):
    """Return the 2-grams of txt; shorthand for `ngrams(txt, 2)`."""
    return ngrams(txt, n=2)
def trigrams(txt: str):
    """Return the 3-grams of txt; shorthand for `ngrams(txt, 3)`."""
    return ngrams(txt, n=3)
1310 def shuffle_columns_into_list(
1311 input_lines: Iterable[str],
1312 column_specs: Iterable[Iterable[int]],
1315 """Helper to shuffle / parse columnar data and return the results as a
1316 list. The column_specs argument is an iterable collection of
1317 numeric sequences that indicate one or more column numbers to
1320 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1321 >>> shuffle_columns_into_list(
1323 ... [ [8], [2, 3], [5, 6, 7] ],
1326 ['acl_test.py', 'scott wheel', 'Jul 9 11:34']
1331 # Column specs map input lines' columns into outputs.
1333 for spec in column_specs:
1336 chunk = chunk + delim + input_lines[n]
1337 chunk = chunk.strip(delim)
1342 def shuffle_columns_into_dict(
1343 input_lines: Iterable[str],
1344 column_specs: Iterable[Tuple[str, Iterable[int]]],
1346 ) -> Dict[str, str]:
1347 """Helper to shuffle / parse columnar data and return the results
1350 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1351 >>> shuffle_columns_into_dict(
1353 ... [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
1356 {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}
1361 # Column specs map input lines' columns into outputs.
1362 # "key", [col1, col2...]
1363 for spec in column_specs:
1366 chunk = chunk + delim + input_lines[n]
1367 chunk = chunk.strip(delim)
1368 out[spec[0]] = chunk
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the `{name}` placeholders of txt with data from a dict.

    The template is expanded with `str.format(**values)` and the result
    is passed through `sprintf` with an empty `end`.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    rendered = txt.format(**values)
    return sprintf(rendered, end='')
def to_ascii(x: str) -> bytes:
    """Encode as ascii bytes string.

    Args:
        x: a str to encode, or bytes which are returned unchanged.
            Subclasses of either are accepted too.

    Returns:
        The ascii-encoded bytes.

    Raises:
        UnicodeEncodeError: if a str contains non-ascii characters.
        TypeError: if x is neither str nor bytes. TypeError derives from
            Exception, so handlers written against the previous bare
            Exception keep working.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    raise TypeError('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode txt and then encode the bytes with a 64-character
    alphabet. This is compatible with uudecode.

    Args:
        txt: the text to encode.
        encoding: codec used to turn txt into bytes first.
        errors: error handler passed to `str.encode`.

    Returns:
        The base64 representation as *bytes*, as produced by
        `base64.encodebytes` (newline terminated).

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    return base64.encodebytes(txt.encode(encoding, errors))
def is_base64(txt: str) -> bool:
    """Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet which is the same as what uuencode uses).

    Surrounding whitespace, the line breaks that `base64.encodebytes`
    inserts, and up to two trailing "=" padding characters are allowed;
    every other byte must belong to the 64-character alphabet.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64(b'aGk=\\n')    # Padding is ok.
    True
    """
    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = set(a.encode('ascii'))

    # encodebytes() wraps long output with newlines; they carry no data.
    data = to_ascii(txt.strip()).replace(b'\n', b'')

    # Padding may only appear at the very end, and at most twice.
    body = data.rstrip(b'=')
    if len(data) - len(body) > 2:
        return False
    return all(byte in alphabet for byte in body)
def from_base64(b64: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert base64 encoded string back to normal strings.

    Args:
        b64: the base64 data, either as bytes or as an ascii-only str.
        encoding: codec used to decode the recovered bytes.
        errors: error handler passed to `bytes.decode`.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    >>> from_base64('aGVsbG8/')
    'hello?'
    """
    # base64.decodebytes only accepts bytes-like input, whereas the
    # signature advertises str; accept both.
    if isinstance(b64, str):
        b64 = b64.encode('ascii')
    return base64.decodebytes(b64).decode(encoding, errors)
def chunk(txt: str, chunk_size):
    """Chunk up a string.

    Yields consecutive slices of txt, each chunk_size characters long.
    When len(txt) is not a multiple of chunk_size a warning is logged
    and the final slice is shorter than the others.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    if len(txt) % chunk_size != 0:
        logger.warning(
            f'String to chunk\'s length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})'
        )
    for start in range(0, len(txt), chunk_size):
        yield txt[start:start + chunk_size]
1457 def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
1458 """Encode txt and then chop it into bytes. Note: only bitstrings
1459 with delimiter='' are interpretable by from_bitstring.
1461 >>> to_bitstring('hello?')
1462 '011010000110010101101100011011000110111100111111'
1464 >>> to_bitstring('test', delimiter=' ')
1465 '01110100 01100101 01110011 01110100'
1467 >>> to_bitstring(b'test')
1468 '01110100011001010111001101110100'
1471 etxt = to_ascii(txt)
1479 return delimiter.join(chunk(bits.zfill(8 * ((len(bits) + 7) // 8)), 8))
def is_bitstring(txt: str) -> bool:
    """Is this a bitstring, i.e. would it be a binary integer literal
    once prefixed with "0b"?

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    prefixed = f'0b{txt}'
    return is_binary_integer_number(prefixed)
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    Args:
        bits: a string of 0/1 digits, as accepted by `int(bits, 2)`.
        encoding: codec used to decode the recovered bytes.
        errors: error handler passed to `bytes.decode`.

    Returns:
        The decoded text; a lone NUL character when no bytes result.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    >>> from_bitstring('0000000001100001')
    '\\x00a'
    """
    n = int(bits, 2)
    # bit_length() ignores leading zero bits, which used to drop whole
    # leading NUL bytes. Every complete group of 8 input digits is a
    # byte of output, so take whichever count is larger.
    byte_count = max((n.bit_length() + 7) // 8, len(bits) // 8)
    return n.to_bytes(byte_count, 'big').decode(encoding, errors) or '\0'
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple for sorting purposes.

    Returns:
        A tuple with the numeric value of each dot-separated part, or
        None (after printing a diagnostic to stdout) when txt is not a
        valid IPv4 address.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if not is_ip_v4(txt):
        print(f"not IP: {txt}")
        return None
    return tuple(int(part) for part in txt.split('.'))
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Chunk up a file path so that parent/ancestor paths sort before
    children/descendant paths.

    The path is split on "/" and empty components (leading, trailing or
    doubled slashes) are discarded.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    return tuple(part for part in volume.split('/') if part)
1538 if __name__ == '__main__':