7 from itertools import zip_longest
14 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
16 from uuid import uuid4
20 logger = logging.getLogger(__name__)
# Numeric-literal matchers.  Inside a character class "|" is a literal pipe,
# not alternation, so every class below lists only the characters it is
# actually meant to accept (e.g. "[xX]", never "[x|X]").

# Optionally signed decimal number: digits with an optional fraction and an
# optional unsigned exponent, or a bare fraction such as ".5".
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Optionally signed hexadecimal literal; the 0x / 0X prefix is mandatory.
HEX_NUMBER_RE = re.compile(r"^([+\-]?)0[xX]([0-9A-Fa-f]+)$")

# Optionally signed octal literal; the 0o / 0O prefix is mandatory.
OCT_NUMBER_RE = re.compile(r"^([+\-]?)0[Oo]([0-7]+)$")

# Optionally signed binary literal; the 0b / 0B prefix is mandatory.
BIN_NUMBER_RE = re.compile(r"^([+\-]?)0[Bb]([01]+)$")
31 r"([a-z-]+://)" # scheme
32 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
34 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
35 r"(:\d{2,})?" # port number
36 r"(/[a-z\d_%+-]*)*" # folders
37 r"(\.[a-z\d_%+-]+)*" # file extension
38 r"(\?[a-z\d_+%-=]*)?" # query string
42 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
44 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
46 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
48 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
50 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
52 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
54 CAMEL_CASE_TEST_RE = re.compile(
55 r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
58 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
60 SNAKE_CASE_TEST_RE = re.compile(
61 r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
64 SNAKE_CASE_TEST_DASH_RE = re.compile(
65 r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
68 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
70 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
73 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
74 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
75 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
76 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
77 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
78 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
81 JSON_WRAPPER_RE = re.compile(
82 r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
86 r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
89 UUID_HEX_OK_RE = re.compile(
90 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
94 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
96 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
# Shallow IPv6 matchers: eight colon-separated groups of up to four
# hexadecimal digits.  Only hex digits (0-9, a-f, case-insensitive) are
# accepted in a group; "::" shorthand with fewer than seven colons is not
# recognized by these patterns.
IP_V6_RE = re.compile(r"^([a-f\d]{0,4}:){7}[a-f\d]{0,4}$", re.IGNORECASE)

# Same pattern without anchors, for locating an address inside larger text.
ANYWHERE_IP_V6_RE = re.compile(r"([a-f\d]{0,4}:){7}[a-f\d]{0,4}", re.IGNORECASE)
102 MAC_ADDRESS_RE = re.compile(
103 r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE
106 ANYWHERE_MAC_ADDRESS_RE = re.compile(
107 r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
110 WORDS_COUNT_RE = re.compile(
111 r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
114 HTML_RE = re.compile(
115 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
116 re.IGNORECASE | re.MULTILINE | re.DOTALL,
119 HTML_TAG_ONLY_RE = re.compile(
120 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
121 re.IGNORECASE | re.MULTILINE | re.DOTALL,
124 SPACES_RE = re.compile(r"\s")
126 NO_LETTERS_OR_NUMBERS_RE = re.compile(
127 r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
130 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
# Matches one ANSI-style escape sequence: the ESC character (0x1b), a "[",
# any run of non-letter parameter characters, and the terminating letter.
# ESC is spelled "\x1b" so no raw control character lives in the source.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Tell whether the input is None or a blank string.

    A string counts as empty when nothing is left after stripping its
    leading and trailing whitespace.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty(" \t ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    # An empty result after stripping is falsy.
    return not in_str.strip()
164 def is_string(obj: Any) -> bool:
166 Checks if an object is a string.
168 >>> is_string('test')
174 >>> is_string([1, 2, 3])
177 return isinstance(obj, str)
180 def is_empty_string(in_str: Any) -> bool:
181 return is_empty(in_str)
184 def is_empty(in_str: Any) -> bool:
186 Checks if input is a string and empty or only whitespace.
190 >>> is_empty(' \t\t ')
196 >>> is_empty([1, 2, 3])
199 return is_string(in_str) and in_str.strip() == ""
202 def is_full_string(in_str: Any) -> bool:
204 Checks that input is a string and is not empty ('') or only whitespace.
206 >>> is_full_string('test!')
208 >>> is_full_string('')
210 >>> is_full_string(' ')
212 >>> is_full_string(100.999)
214 >>> is_full_string({"a": 1, "b": 2})
217 return is_string(in_str) and in_str.strip() != ""
220 def is_number(in_str: str) -> bool:
222 Checks if a string is a valid number.
225 Traceback (most recent call last):
228 >>> is_number("100.5")
230 >>> is_number("test")
234 >>> is_number([1, 2, 3])
235 Traceback (most recent call last):
237 ValueError: [1, 2, 3]
239 if not is_string(in_str):
240 raise ValueError(in_str)
241 return NUMBER_RE.match(in_str) is not None
244 def is_integer_number(in_str: str) -> bool:
246 Checks whether the given string represents an integer or not.
248 An integer may be signed or unsigned or use a "scientific notation".
250 >>> is_integer_number('42')
252 >>> is_integer_number('42.0')
256 (is_number(in_str) and "." not in in_str) or
257 is_hexidecimal_integer_number(in_str) or
258 is_octal_integer_number(in_str) or
259 is_binary_integer_number(in_str)
263 def is_hexidecimal_integer_number(in_str: str) -> bool:
265 Checks whether a string is a hex integer number.
267 >>> is_hexidecimal_integer_number('0x12345')
269 >>> is_hexidecimal_integer_number('0x1A3E')
271 >>> is_hexidecimal_integer_number('1234') # Needs 0x
273 >>> is_hexidecimal_integer_number('-0xff')
275 >>> is_hexidecimal_integer_number('test')
277 >>> is_hexidecimal_integer_number(12345) # Not a string
278 Traceback (most recent call last):
281 >>> is_hexidecimal_integer_number(101.4)
282 Traceback (most recent call last):
285 >>> is_hexidecimal_integer_number(0x1A3E)
286 Traceback (most recent call last):
290 if not is_string(in_str):
291 raise ValueError(in_str)
292 return HEX_NUMBER_RE.match(in_str) is not None
295 def is_octal_integer_number(in_str: str) -> bool:
297 Checks whether a string is an octal number.
299 >>> is_octal_integer_number('0o777')
301 >>> is_octal_integer_number('-0O115')
303 >>> is_octal_integer_number('0xFF') # Not octal, needs 0o
305 >>> is_octal_integer_number('7777') # Needs 0o
307 >>> is_octal_integer_number('test')
310 if not is_string(in_str):
311 raise ValueError(in_str)
312 return OCT_NUMBER_RE.match(in_str) is not None
315 def is_binary_integer_number(in_str: str) -> bool:
317 Returns whether a string contains a binary number.
319 >>> is_binary_integer_number('0b10111')
321 >>> is_binary_integer_number('-0b111')
323 >>> is_binary_integer_number('0B10101')
325 >>> is_binary_integer_number('0b10102')
327 >>> is_binary_integer_number('0xFFF')
329 >>> is_binary_integer_number('test')
332 if not is_string(in_str):
333 raise ValueError(in_str)
334 return BIN_NUMBER_RE.match(in_str) is not None
337 def to_int(in_str: str) -> int:
338 """Returns the integral value of the string or raises on error.
343 Traceback (most recent call last):
345 ValueError: invalid literal for int() with base 10: 'test'
347 if not is_string(in_str):
348 raise ValueError(in_str)
349 if is_binary_integer_number(in_str):
350 return int(in_str, 2)
351 if is_octal_integer_number(in_str):
352 return int(in_str, 8)
353 if is_hexidecimal_integer_number(in_str):
354 return int(in_str, 16)
358 def is_decimal_number(in_str: str) -> bool:
360 Checks whether the given string represents a decimal or not.
362 A decimal may be signed or unsigned or use a "scientific notation".
364 >>> is_decimal_number('42.0')
366 >>> is_decimal_number('42')
369 return is_number(in_str) and "." in in_str
372 def strip_escape_sequences(in_str: str) -> str:
374 Remove escape sequences in the input string.
376 >>> strip_escape_sequences('\x1b[12;11;22mthis is a test!')
379 in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
383 def add_thousands_separator(
386 separator_char = ',',
390 Add thousands separator to a numeric string. Also handles numbers.
392 >>> add_thousands_separator('12345678')
394 >>> add_thousands_separator(12345678)
396 >>> add_thousands_separator(12345678.99)
398 >>> add_thousands_separator('test')
399 Traceback (most recent call last):
404 if isinstance(in_str, numbers.Number):
406 if is_number(in_str):
407 return _add_thousands_separator(
409 separator_char = separator_char,
412 raise ValueError(in_str)
415 def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
418 (in_str, decimal_part) = in_str.split('.')
419 tmp = [iter(in_str[::-1])] * places
420 ret = separator_char.join(
421 "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
422 if len(decimal_part) > 0:
429 # scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
430 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
432 Check if a string is a valid url.
434 >>> is_url('http://www.mysite.com')
436 >>> is_url('https://mysite.com')
438 >>> is_url('.mysite.com')
441 if not is_full_string(in_str):
444 valid = URL_RE.match(in_str) is not None
447 return valid and any([in_str.startswith(s) for s in allowed_schemes])
451 def is_email(in_str: Any) -> bool:
453 Check if a string is a valid email.
455 Reference: https://tools.ietf.org/html/rfc3696#section-3
459 >>> is_email('@gmail.com')
463 not is_full_string(in_str)
465 or in_str.startswith(".")
470 # we expect 2 tokens, one before "@" and one after, otherwise
471 # we have an exception and the email is not valid.
472 head, tail = in_str.split("@")
474 # head's size must be <= 64, tail <= 255, head must not start
475 # with a dot or contain multiple consecutive dots.
479 or head.endswith(".")
484 # removes escaped spaces, so that later on the test regex will
486 head = head.replace("\\ ", "")
487 if head.startswith('"') and head.endswith('"'):
488 head = head.replace(" ", "")[1:-1]
489 return EMAIL_RE.match(head + "@" + tail) is not None
492 # borderline case in which we have multiple "@" signs but the
493 # head part is correctly escaped.
494 if ESCAPED_AT_SIGN.search(in_str) is not None:
495 # replace "@" with "a" in the head
496 return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
500 def suffix_string_to_number(in_str: str) -> Optional[int]:
501 """Take a string like "33Gb" and convert it into a number (of bytes)
502 like 34603008. Return None if the input string is not valid.
504 >>> suffix_string_to_number('1Mb')
506 >>> suffix_string_to_number('13.1Gb')
509 def suffix_capitalize(s: str) -> str:
513 return f"{s[0].upper()}{s[1].lower()}"
514 return suffix_capitalize(s[0:1])
516 if is_string(in_str):
517 if is_integer_number(in_str):
518 return to_int(in_str)
519 suffixes = [in_str[-2:], in_str[-1:]]
520 rest = [in_str[:-2], in_str[:-1]]
521 for x in range(len(suffixes)):
523 s = suffix_capitalize(s)
524 multiplier = NUM_SUFFIXES.get(s, None)
525 if multiplier is not None:
527 if is_integer_number(r):
528 return to_int(r) * multiplier
529 if is_decimal_number(r):
530 return int(float(r) * multiplier)
534 def number_to_suffix_string(num: int) -> Optional[str]:
535 """Take a number (of bytes) and returns a string like "43.8Gb".
536 Returns none if the input is invalid.
538 >>> number_to_suffix_string(14066017894)
540 >>> number_to_suffix_string(1024 * 1024)
546 for (sfx, size) in NUM_SUFFIXES.items():
551 if suffix is not None:
552 return f"{d:.1f}{suffix}"
557 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
559 Checks if a string is a valid credit card number.
560 If card type is provided then it checks against that specific type only,
561 otherwise any known credit card number will be accepted.
563 Supported card types are the following:
572 if not is_full_string(in_str):
575 if card_type is not None:
576 if card_type not in CREDIT_CARDS:
578 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
580 return CREDIT_CARDS[card_type].match(in_str) is not None
581 for c in CREDIT_CARDS:
582 if CREDIT_CARDS[c].match(in_str) is not None:
587 def is_camel_case(in_str: Any) -> bool:
589 Checks if a string is formatted as camel case.
591 A string is considered camel case when:
593 - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
594 - it contains both lowercase and uppercase letters
595 - it does not start with a number
598 is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
602 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
604 Checks if a string is formatted as "snake case".
606 A string is considered snake case when:
608 - it's composed only by lowercase/uppercase letters and digits
609 - it contains at least one underscore (or provided separator)
610 - it does not start with a number
612 >>> is_snake_case('this_is_a_test')
614 >>> is_snake_case('___This_Is_A_Test_1_2_3___')
616 >>> is_snake_case('this-is-a-test')
618 >>> is_snake_case('this-is-a-test', separator='-')
622 if is_full_string(in_str):
623 re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
625 r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
630 re_template.format(sign=re.escape(separator)), re.IGNORECASE
633 return r.match(in_str) is not None
637 def is_json(in_str: Any) -> bool:
639 Check if a string is a valid json.
641 >>> is_json('{"name": "Peter"}')
643 >>> is_json('[1, 2, 3]')
645 >>> is_json('{nope}')
648 if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
650 return isinstance(json.loads(in_str), (dict, list))
651 except (TypeError, ValueError, OverflowError):
656 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
658 Check if a string is a valid UUID.
660 >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
662 >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
664 >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
667 # string casting is used to allow UUID itself as input data type
670 return UUID_HEX_OK_RE.match(s) is not None
671 return UUID_RE.match(s) is not None
674 def is_ip_v4(in_str: Any) -> bool:
676 Checks if a string is a valid ip v4.
678 >>> is_ip_v4('255.200.100.75')
682 >>> is_ip_v4('255.200.100.999') # 999 out of range
685 if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
688 # checks that each entry in the ip is in the valid range (0 to 255)
689 for token in in_str.split("."):
690 if not 0 <= int(token) <= 255:
695 def extract_ip_v4(in_str: Any) -> Optional[str]:
697 Extracts the IPv4 chunk of a string or None.
699 >>> extract_ip_v4(' The secret IP address: 127.0.0.1 (use it wisely) ')
701 >>> extract_ip_v4('Your mom dresses you funny.')
703 if not is_full_string(in_str):
705 m = ANYWHERE_IP_V4_RE.search(in_str)
711 def is_ip_v6(in_str: Any) -> bool:
713 Checks if a string is a valid ip v6.
715 >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
717 >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?') # invalid "?"
720 return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
723 def extract_ip_v6(in_str: Any) -> Optional[str]:
725 Extract IPv6 chunk or None.
727 >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
728 '2001:db8:85a3:0000:0000:8a2e:370:7334'
729 >>> extract_ip_v6("(and she's ugly too, btw)")
731 if not is_full_string(in_str):
733 m = ANYWHERE_IP_V6_RE.search(in_str)
739 def is_ip(in_str: Any) -> bool:
741 Checks if a string is a valid ip (either v4 or v6).
743 >>> is_ip('255.200.100.75')
745 >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
749 >>> is_ip('1.2.3.999')
752 return is_ip_v6(in_str) or is_ip_v4(in_str)
755 def extract_ip(in_str: Any) -> Optional[str]:
757 Extract the IP address or None.
759 >>> extract_ip('Attacker: 255.200.100.75')
761 >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
762 '2001:db8:85a3:0000:0000:8a2e:370:7334'
763 >>> extract_ip('1.2.3')
766 ip = extract_ip_v4(in_str)
768 ip = extract_ip_v6(in_str)
772 def is_mac_address(in_str: Any) -> bool:
773 """Return True if in_str is a valid MAC address false otherwise.
775 >>> is_mac_address("34:29:8F:12:0D:2F")
777 >>> is_mac_address('34:29:8f:12:0d:2f')
779 >>> is_mac_address('34-29-8F-12-0D-2F')
781 >>> is_mac_address("test")
784 return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the first MAC address found in in_str.

    Args:
        in_str: the text to search.  Anything that is not a non-blank
            string yields None.
        separator: the string placed between the six octets of the
            returned address, replacing whichever of ":" or "-" the
            input used.

    Returns:
        The first MAC address in in_str, rewritten with the requested
        separator, or None when no address is present.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'
    >>> extract_mac_address('34-29-8F-12-0D-2F')
    '34:29:8F:12:0D:2F'
    >>> extract_mac_address('34:29:8F:12:0D:2F', separator='-')
    '34-29-8F-12-0D-2F'
    >>> extract_mac_address('no address here') is None
    True
    """
    if not is_full_string(in_str):
        return None

    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None

    # Strings are immutable, so the normalized value must be captured.
    # translate() swaps both accepted delimiters in a single pass, which
    # also stays correct when the separator itself contains ":" or "-".
    return m.group(0).translate({ord(":"): separator, ord("-"): separator})
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Checks if a given string is a slug (as created by `slugify()`).

    A slug is made of runs of lowercase letters and digits joined by one
    or more occurrences of the separator; it must start and end with a
    letter or digit.

    Args:
        in_str: the value to test.  Anything that is not a non-blank
            string is not a slug.
        separator: the string expected between the words of the slug.

    Returns:
        True if in_str is a slug, False otherwise.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False

    # Every repetition of the group must consume at least one separator,
    # so the pattern cannot backtrack exponentially on long inputs that
    # fail to match (the nested "(x+)*" shape can).  The separator is
    # grouped so that a multi-character separator repeats as a unit.
    sep = re.escape(separator)
    rex = r"^[a-z\d]+(?:(?:" + sep + r")+[a-z\d]+)*$"
    return re.match(rex, in_str) is not None
823 def contains_html(in_str: str) -> bool:
825 Checks if the given string contains HTML/XML tags.
827 By design, this function matches ANY type of tag, so don't expect to use it
828 as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
830 >>> contains_html('my string is <strong>bold</strong>')
832 >>> contains_html('my string is not bold')
836 if not is_string(in_str):
837 raise ValueError(in_str)
838 return HTML_RE.search(in_str) is not None
841 def words_count(in_str: str) -> int:
843 Returns the number of words contained into the given string.
845 This method is smart, it does consider only sequence of one or more letter and/or numbers
846 as "words", so a string like this: "! @ # % ... []" will return zero!
847 Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
848 will be 4 not 1 (even if there are no spaces in the string).
850 >>> words_count('hello world')
852 >>> words_count('one,two,three.stop')
856 if not is_string(in_str):
857 raise ValueError(in_str)
858 return len(WORDS_COUNT_RE.findall(in_str))
861 def generate_uuid(as_hex: bool = False) -> str:
863 Generated an UUID string (using `uuid.uuid4()`).
865 generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
866 generate_uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
875 def generate_random_alphanumeric_string(size: int) -> str:
877 Returns a string of the specified size containing random
878 characters (uppercase/lowercase ascii letters and digits).
880 random_string(9) # possible output: "cx3QQbzYg"
884 raise ValueError("size must be >= 1")
885 chars = string.ascii_letters + string.digits
886 buffer = [random.choice(chars) for _ in range(size)]
887 return from_char_list(buffer)
890 def reverse(in_str: str) -> str:
892 Returns the string with its chars reversed.
898 if not is_string(in_str):
899 raise ValueError(in_str)
903 def camel_case_to_snake_case(in_str, *, separator="_"):
905 Convert a camel case string into a snake case one.
906 (The original string is returned if is not a valid camel case string)
908 >>> camel_case_to_snake_case('MacAddressExtractorFactory')
909 'mac_address_extractor_factory'
910 >>> camel_case_to_snake_case('Luke Skywalker')
913 if not is_string(in_str):
914 raise ValueError(in_str)
915 if not is_camel_case(in_str):
917 return CAMEL_CASE_REPLACE_RE.sub(
918 lambda m: m.group(1) + separator, in_str
922 def snake_case_to_camel_case(
923 in_str: str, *, upper_case_first: bool = True, separator: str = "_"
926 Convert a snake case string into a camel case one.
927 (The original string is returned if is not a valid snake case string)
929 >>> snake_case_to_camel_case('this_is_a_test')
931 >>> snake_case_to_camel_case('Han Solo')
934 if not is_string(in_str):
935 raise ValueError(in_str)
936 if not is_snake_case(in_str, separator=separator):
938 tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
939 if not upper_case_first:
940 tokens[0] = tokens[0].lower()
941 return from_char_list(tokens)
944 def to_char_list(in_str: str) -> List[str]:
945 """Convert a string into a list of chars.
947 >>> to_char_list('test')
950 if not is_string(in_str):
955 def from_char_list(in_list: List[str]) -> str:
956 """Convert a char list into a string.
958 >>> from_char_list(['t', 'e', 's', 't'])
961 return "".join(in_list)
964 def shuffle(in_str: str) -> str:
965 """Return a new string containing same chars of the given one but in
968 if not is_string(in_str):
969 raise ValueError(in_str)
971 # turn the string into a list of chars
972 chars = to_char_list(in_str)
973 random.shuffle(chars)
974 return from_char_list(chars)
977 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
979 Remove html code contained into the given string.
981 >>> strip_html('test: <a href="foo/bar">click here</a>')
983 >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
986 if not is_string(in_str):
987 raise ValueError(in_str)
988 r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
989 return r.sub("", in_str)
992 def asciify(in_str: str) -> str:
994 Force string content to be ascii-only by translating all non-ascii
995 chars into the closest possible representation (eg: ó -> o, Ë ->
998 N.B. Some chars may be lost if impossible to translate.
1000 >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
1001 'eeuuooaaeynAAACIINOE'
1003 if not is_string(in_str):
1004 raise ValueError(in_str)
1006 # "NFKD" is the algorithm which is able to successfully translate
1007 # the most of non-ascii chars.
1008 normalized = unicodedata.normalize("NFKD", in_str)
1010 # encode string forcing ascii and ignore any errors
1011 # (unrepresentable chars will be stripped out)
1012 ascii_bytes = normalized.encode("ascii", "ignore")
1014 # turns encoded bytes into an utf-8 string
1015 return ascii_bytes.decode("utf-8")
1018 def slugify(in_str: str, *, separator: str = "-") -> str:
1020 Converts a string into a "slug" using provided separator.
1021 The returned string has the following properties:
1024 - all letters are in lower case
1025 - all punctuation signs and non alphanumeric chars are removed
1026 - words are divided using provided separator
1027 - all chars are encoded as ascii (by using `asciify()`)
1030 >>> slugify('Top 10 Reasons To Love Dogs!!!')
1031 'top-10-reasons-to-love-dogs'
1032 >>> slugify('Mönstér Mägnët')
1035 if not is_string(in_str):
1036 raise ValueError(in_str)
1038 # replace any character that is NOT letter or number with spaces
1039 out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()
1041 # replace spaces with join sign
1042 out = SPACES_RE.sub(separator, out)
1044 # normalize joins (remove duplicates)
1045 out = re.sub(re.escape(separator) + r"+", separator, out)
1049 def to_bool(in_str: str) -> bool:
1051 Turns a string into a boolean based on its content (CASE INSENSITIVE).
1053 A positive boolean (True) is returned if the string value is one
1061 Otherwise False is returned.
1082 if not is_string(in_str):
1083 raise ValueError(in_str)
1084 return in_str.lower() in ("true", "1", "yes", "y", "t", "on")
1087 def to_date(in_str: str) -> Optional[datetime.date]:
1089 Parses a date string. See DateParser docs for details.
1091 import dateparse.dateparse_utils as dp
1096 except dp.ParseException:
1097 logger.warning(f'Unable to parse date {in_str}.')
1101 def valid_date(in_str: str) -> bool:
1103 True if the string represents a valid date.
1105 import dateparse.dateparse_utils as dp
1110 except dp.ParseException:
1111 logger.warning(f'Unable to parse date {in_str}.')
1115 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1117 Parses a datetime string. See DateParser docs for more info.
1119 import dateparse.dateparse_utils as dp
1122 dt = d.parse(in_str)
1123 if type(dt) == datetime.datetime:
1126 logger.warning(f'Unable to parse datetime {in_str}.')
1130 def valid_datetime(in_str: str) -> bool:
1132 True if the string represents a valid datetime.
1134 _ = to_datetime(in_str)
1137 logger.warning(f'Unable to parse datetime {in_str}.')
1141 def dedent(in_str: str) -> str:
1143 Removes tab indentation from multi line strings (inspired by analogous Scala function).
1145 if not is_string(in_str):
1146 raise ValueError(in_str)
1147 line_separator = '\n'
1148 lines = [MARGIN_RE.sub('', line) for line in in_str.split(line_separator)]
1149 return line_separator.join(lines)
1152 def indent(in_str: str, amount: int) -> str:
1154 Indents string by prepending amount spaces.
1156 >>> indent('This is a test', 4)
1160 if not is_string(in_str):
1161 raise ValueError(in_str)
1162 line_separator = '\n'
1163 lines = [" " * amount + line for line in in_str.split(line_separator)]
1164 return line_separator.join(lines)
1167 def sprintf(*args, **kwargs) -> str:
1168 """String printf, like in C"""
1171 sep = kwargs.pop("sep", None)
1173 if not isinstance(sep, str):
1174 raise TypeError("sep must be None or a string")
1176 end = kwargs.pop("end", None)
1178 if not isinstance(end, str):
1179 raise TypeError("end must be None or a string")
1182 raise TypeError("invalid keyword arguments to sprint()")
1188 for i, arg in enumerate(args):
1191 if isinstance(arg, str):
1199 class SprintfStdout(object):
1201 A context manager that captures outputs to stdout.
1203 with SprintfStdout() as buf:
1209 def __init__(self) -> None:
1210 self.destination = io.StringIO()
1211 self.recorder = None
1213 def __enter__(self) -> Callable[[], str]:
1214 self.recorder = contextlib.redirect_stdout(self.destination)
1215 self.recorder.__enter__()
1216 return lambda: self.destination.getvalue()
1218 def __exit__(self, *args) -> None:
1219 self.recorder.__exit__(*args)
1220 self.destination.seek(0)
1221 return None # don't suppress exceptions
1224 def is_are(n: int) -> str:
1238 def pluralize(n: int) -> str:
1244 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1247 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1256 def thify(n: int) -> str:
1257 """Return the proper cardinal suffix for a number.
1268 assert is_integer_number(digit)
1280 def ngrams(txt: str, n: int):
1281 """Return the ngrams from a string.
1283 >>> [x for x in ngrams('This is a test', 2)]
1284 ['This is', 'is a', 'a test']
1288 for ngram in ngrams_presplit(words, n):
1295 def ngrams_presplit(words: Sequence[str], n: int):
1296 return list_utils.ngrams(words, n)
1299 def bigrams(txt: str):
1300 return ngrams(txt, 2)
1303 def trigrams(txt: str):
1304 return ngrams(txt, 3)
1307 def shuffle_columns_into_list(
1308 input_lines: Iterable[str],
1309 column_specs: Iterable[Iterable[int]],
1312 """Helper to shuffle / parse columnar data and return the results as a
1313 list. The column_specs argument is an iterable collection of
1314 numeric sequences that indicate one or more column numbers to
1317 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1318 >>> shuffle_columns_into_list(
1320 ... [ [8], [2, 3], [5, 6, 7] ],
1323 ['acl_test.py', 'scott wheel', 'Jul 9 11:34']
1328 # Column specs map input lines' columns into outputs.
1330 for spec in column_specs:
1333 chunk = chunk + delim + input_lines[n]
1334 chunk = chunk.strip(delim)
1339 def shuffle_columns_into_dict(
1340 input_lines: Iterable[str],
1341 column_specs: Iterable[Tuple[str, Iterable[int]]],
1343 ) -> Dict[str, str]:
1344 """Helper to shuffle / parse columnar data and return the results
1347 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1348 >>> shuffle_columns_into_dict(
1350 ... [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
1353 {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}
1358 # Column specs map input lines' columns into outputs.
1359 # "key", [col1, col2...]
1360 for spec in column_specs:
1363 chunk = chunk + delim + input_lines[n]
1364 chunk = chunk.strip(delim)
1365 out[spec[0]] = chunk
1369 def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
1370 """Interpolate a string with data from a dict.
1372 >>> interpolate_using_dict('This is a {adjective} {noun}.',
1373 ... {'adjective': 'good', 'noun': 'example'})
1374 'This is a good example.'
1377 return sprintf(txt.format(**values), end='')
def to_ascii(x: str) -> bytes:
    """Encode as ascii bytes string.

    Args:
        x: the text to encode.  A bytes object is also accepted and is
            returned unchanged.

    Returns:
        The ASCII-encoded bytes.

    Raises:
        TypeError: if x is neither a string nor bytes.
        UnicodeEncodeError: if x is a string containing non-ASCII chars.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    # isinstance (rather than an exact type() comparison) lets subclasses
    # of str and bytes through as well.
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    raise TypeError('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode txt and then encode the bytes with a 64-character
    alphabet.  This is compatible with uudecode.

    Args:
        txt: the text to encode.
        encoding: the codec used to turn txt into bytes first.
        errors: the error handler passed to str.encode.

    Returns:
        The base64 data as bytes (not str), as produced by
        base64.encodebytes, including its trailing newline.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    return base64.encodebytes(txt.encode(encoding, errors))
def is_base64(txt: str) -> bool:
    """Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet which is the same as what uuencode uses).

    Surrounding whitespace is ignored and up to two trailing "="
    padding characters are accepted, so data whose length is not a
    multiple of three bytes is still recognized.

    Args:
        txt: the candidate data, as a string or as bytes.

    Returns:
        True if every remaining character is in the base64 alphabet.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64(b'aGVsbG8=\\n')    # So is '=' padding.
    True
    """
    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = set(a.encode('ascii'))
    payload = to_ascii(txt.strip())

    # "=" is only valid as padding: at most two of them, at the very end.
    for _ in range(2):
        if payload.endswith(b'='):
            payload = payload[:-1]

    # Iterating bytes yields ints, matching the ints held in alphabet.
    return all(char in alphabet for char in payload)
def from_base64(b64: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert base64 encoded string back to normal strings.

    Args:
        b64: the base64 data, as a string or as bytes.
        encoding: the codec used to decode the resulting bytes.
        errors: the error handler passed to bytes.decode.

    Returns:
        The decoded text.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    >>> from_base64('aGVsbG8/')
    'hello?'
    """
    # base64.decodebytes only takes bytes-like input; base64 text is pure
    # ASCII, so a str argument can be encoded losslessly first.
    raw = b64.encode('ascii') if isinstance(b64, str) else b64
    return base64.decodebytes(raw).decode(encoding, errors)
1440 def chunk(txt: str, chunk_size):
1441 """Chunk up a string.
1443 >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
1444 '01001101 11000101 10101010 10101010 10011111 10101000'
1447 if len(txt) % chunk_size != 0:
1449 f'String to chunk\'s length ({len(txt)} is not an even multiple of chunk_size ({chunk_size})')
1450 for x in range(0, len(txt), chunk_size):
1451 yield txt[x:x+chunk_size]
1454 def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
1455 """Encode txt and then chop it into bytes. Note: only bitstrings
1456 with delimiter='' are interpretable by from_bitstring.
1458 >>> to_bitstring('hello?')
1459 '011010000110010101101100011011000110111100111111'
1461 >>> to_bitstring('test', delimiter=' ')
1462 '01110100 01100101 01110011 01110100'
1467 txt.encode(encoding, errors),
1472 return delimiter.join(chunk(bits.zfill(8 * ((len(bits) + 7) // 8)), 8))
1475 def is_bitstring(txt: str) -> bool:
1476 """Is this a bitstring?
1478 >>> is_bitstring('011010000110010101101100011011000110111100111111')
1481 >>> is_bitstring('1234')
1485 return is_binary_integer_number(f'0b{txt}')
1488 def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
1489 """Convert from bitstring back to bytes then decode into a str.
1491 >>> from_bitstring('011010000110010101101100011011000110111100111111')
1496 return n.to_bytes((n.bit_length() + 7) // 8, 'big').decode(encoding, errors) or '\0'
1499 if __name__ == '__main__':