6 from itertools import zip_longest
13 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
15 from uuid import uuid4
17 logger = logging.getLogger(__name__)
# Decimal literal: optional sign, then either digits with an optional
# fraction and optional exponent, or a bare fraction such as ".5".
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Prefixed integer literals.  Inside a character class "|" is a literal
# character, not alternation, so the classes list only the intended characters.
HEX_NUMBER_RE = re.compile(r"^([+\-]?)0[xX]([0-9A-Fa-f]+)$")

OCT_NUMBER_RE = re.compile(r"^([+\-]?)0[Oo]([0-7]+)$")

BIN_NUMBER_RE = re.compile(r"^([+\-]?)0[Bb]([01]+)$")
28 r"([a-z-]+://)" # scheme
29 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
31 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
32 r"(:\d{2,})?" # port number
33 r"(/[a-z\d_%+-]*)*" # folders
34 r"(\.[a-z\d_%+-]+)*" # file extension
35 r"(\?[a-z\d_+%-=]*)?" # query string
39 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
41 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
43 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
45 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
47 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
49 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
51 CAMEL_CASE_TEST_RE = re.compile(
52 r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
55 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
57 SNAKE_CASE_TEST_RE = re.compile(
58 r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
61 SNAKE_CASE_TEST_DASH_RE = re.compile(
62 r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
65 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
67 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
70 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
71 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
72 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
73 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
74 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
75 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
78 JSON_WRAPPER_RE = re.compile(
79 r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
83 r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
86 UUID_HEX_OK_RE = re.compile(
87 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
91 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
93 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
95 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
97 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
99 MAC_ADDRESS_RE = re.compile(
100 r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE
103 ANYWHERE_MAC_ADDRESS_RE = re.compile(
104 r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
107 WORDS_COUNT_RE = re.compile(
108 r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
111 HTML_RE = re.compile(
112 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
113 re.IGNORECASE | re.MULTILINE | re.DOTALL,
116 HTML_TAG_ONLY_RE = re.compile(
117 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
118 re.IGNORECASE | re.MULTILINE | re.DOTALL,
121 SPACES_RE = re.compile(r"\s")
123 NO_LETTERS_OR_NUMBERS_RE = re.compile(
124 r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
127 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
# Terminal control sequence: ESC, "[", any run of non-letter parameter
# characters, then a single terminating letter.  ESC is spelled \x1b because
# Python's re module has no \e escape and a raw control byte is fragile.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Tell whether the input is None or a blank string.

    A string consisting only of whitespace counts as empty.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty("   ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    return not in_str.strip()
def is_string(obj: Any) -> bool:
    """
    Report whether the given object is a str instance.

    >>> is_string('test')
    True
    >>> is_string([1, 2, 3])
    False
    """
    obj_is_str = isinstance(obj, str)
    return obj_is_str
def is_empty_string(in_str: Any) -> bool:
    """
    Report whether the input is a string that is empty or whitespace-only.

    Non-string inputs are never considered empty strings.

    >>> is_empty_string('')
    True
    >>> is_empty_string('    \\t\\t    ')
    True
    >>> is_empty_string('test')
    False
    >>> is_empty_string(100.88)
    False
    >>> is_empty_string([1, 2, 3])
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """
    Report whether the input is a string with at least one non-whitespace char.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string('      ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """
    Report whether a string is a valid (decimal) number per NUMBER_RE.

    Raises:
        ValueError: if the argument is not a string.

    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]
    """
    if is_string(in_str):
        return NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """
    Report whether the given string represents an integer.

    An integer may be signed or unsigned or use a "scientific notation";
    0x / 0o / 0b prefixed literals are accepted as well.

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    # A plain number without a decimal point is an integer.
    if is_number(in_str) and "." not in in_str:
        return True
    # Otherwise fall back to the prefixed notations, in the same order.
    if is_hexidecimal_integer_number(in_str):
        return True
    if is_octal_integer_number(in_str):
        return True
    return is_binary_integer_number(in_str)
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Report whether a string is a 0x-prefixed hexadecimal integer literal.

    Raises:
        ValueError: if the argument is not a string.

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('0x1A3E')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number('test')
    False
    >>> is_hexidecimal_integer_number(12345)  # Not a string
    Traceback (most recent call last):
    ...
    ValueError: 12345
    """
    if is_string(in_str):
        return HEX_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """
    Report whether a string is a 0o-prefixed octal integer literal.

    Raises:
        ValueError: if the argument is not a string.

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    >>> is_octal_integer_number('test')
    False
    """
    if is_string(in_str):
        return OCT_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """
    Report whether a string is a 0b-prefixed binary integer literal.

    Raises:
        ValueError: if the argument is not a string.

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0B10101')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    >>> is_binary_integer_number('test')
    False
    """
    if is_string(in_str):
        return BIN_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def to_int(in_str: str) -> int:
    """Return the integral value of the string or raise on error.

    Binary (0b), octal (0o) and hexadecimal (0x) prefixed literals are
    parsed in their own base; anything else is handed to int() as base 10.

    Raises:
        ValueError: if the argument is not a string or cannot be parsed.

    >>> to_int('1234')
    1234
    >>> to_int('0xff')
    255
    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # Checked in the same order as before: binary, octal, hexadecimal.
    prefixed_forms = (
        (is_binary_integer_number, 2),
        (is_octal_integer_number, 8),
        (is_hexidecimal_integer_number, 16),
    )
    for recognizes, base in prefixed_forms:
        if recognizes(in_str):
            return int(in_str, base)
    return int(in_str)
def is_decimal_number(in_str: str) -> bool:
    """
    Report whether the given string represents a decimal (has a ".").

    A decimal may be signed or unsigned or use a "scientific notation".

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    # is_number runs first so that non-string input still raises ValueError.
    if not is_number(in_str):
        return False
    return "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    r"""
    Remove escape sequences (as matched by ESCAPE_SEQUENCE_RE) from a string.

    The example spells the escape character as \x1b rather than embedding
    a raw control byte in the source.

    >>> strip_escape_sequences('\x1b[12;11;22mthis is a test!')
    'this is a test!'
    """
    return ESCAPE_SEQUENCE_RE.sub("", in_str)
376 def add_thousands_separator(
379 separator_char = ',',
383 Add thousands separator to a numeric string. Also handles numbers.
385 >>> add_thousands_separator('12345678')
387 >>> add_thousands_separator(12345678)
389 >>> add_thousands_separator(12345678.99)
391 >>> add_thousands_separator('test')
392 Traceback (most recent call last):
397 if isinstance(in_str, numbers.Number):
399 if is_number(in_str):
400 return _add_thousands_separator(
402 separator_char = separator_char,
405 raise ValueError(in_str)
408 def _add_thousands_separator(in_str: str, *, separator_char = ',', places = 3) -> str:
411 (in_str, decimal_part) = in_str.split('.')
412 tmp = [iter(in_str[::-1])] * places
413 ret = separator_char.join(
414 "".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
415 if len(decimal_part) > 0:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Check whether a string is a valid url.

    When allowed_schemes is given, the string must additionally start
    with one of the listed schemes.

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('https://mysite.com')
    True
    >>> is_url('.mysite.com')
    False
    """
    if not is_full_string(in_str):
        return False

    matches_url = URL_RE.match(in_str) is not None
    if allowed_schemes is None:
        return matches_url
    return matches_url and any(
        in_str.startswith(scheme) for scheme in allowed_schemes
    )
444 def is_email(in_str: Any) -> bool:
446 Check if a string is a valid email.
448 Reference: https://tools.ietf.org/html/rfc3696#section-3
452 >>> is_email('@gmail.com')
456 not is_full_string(in_str)
458 or in_str.startswith(".")
463 # we expect 2 tokens, one before "@" and one after, otherwise
464 # we have an exception and the email is not valid.
465 head, tail = in_str.split("@")
467 # head's size must be <= 64, tail <= 255, head must not start
468 # with a dot or contain multiple consecutive dots.
472 or head.endswith(".")
477 # removes escaped spaces, so that later on the test regex will
479 head = head.replace("\\ ", "")
480 if head.startswith('"') and head.endswith('"'):
481 head = head.replace(" ", "")[1:-1]
482 return EMAIL_RE.match(head + "@" + tail) is not None
485 # borderline case in which we have multiple "@" signs but the
486 # head part is correctly escaped.
487 if ESCAPED_AT_SIGN.search(in_str) is not None:
488 # replace "@" with "a" in the head
489 return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
493 def suffix_string_to_number(in_str: str) -> Optional[int]:
494 """Take a string like "33Gb" and convert it into a number (of bytes)
495 like 34603008. Return None if the input string is not valid.
497 >>> suffix_string_to_number('1Mb')
499 >>> suffix_string_to_number('13.1Gb')
502 def suffix_capitalize(s: str) -> str:
506 return f"{s[0].upper()}{s[1].lower()}"
507 return suffix_capitalize(s[0:1])
509 if is_string(in_str):
510 if is_integer_number(in_str):
511 return to_int(in_str)
512 suffixes = [in_str[-2:], in_str[-1:]]
513 rest = [in_str[:-2], in_str[:-1]]
514 for x in range(len(suffixes)):
516 s = suffix_capitalize(s)
517 multiplier = NUM_SUFFIXES.get(s, None)
518 if multiplier is not None:
520 if is_integer_number(r):
521 return to_int(r) * multiplier
522 if is_decimal_number(r):
523 return int(float(r) * multiplier)
527 def number_to_suffix_string(num: int) -> Optional[str]:
528 """Take a number (of bytes) and returns a string like "43.8Gb".
529 Returns none if the input is invalid.
531 >>> number_to_suffix_string(14066017894)
533 >>> number_to_suffix_string(1024 * 1024)
539 for (sfx, size) in NUM_SUFFIXES.items():
544 if suffix is not None:
545 return f"{d:.1f}{suffix}"
def is_credit_card(in_str: Any, card_type: Optional[str] = None) -> bool:
    """
    Check whether a string is a valid credit card number.

    If card_type is provided the number is checked against that specific
    type only, otherwise any known credit card number is accepted.

    Supported card types are the keys of CREDIT_CARDS:

    - VISA
    - MASTERCARD
    - AMERICAN_EXPRESS
    - DINERS_CLUB
    - DISCOVER
    - JCB

    Raises:
        KeyError: if card_type is given but is not a supported type.
    """
    if not is_full_string(in_str):
        return False

    if card_type is not None:
        if card_type not in CREDIT_CARDS:
            # Render the names as a plain list, not a dict_keys([...]) repr.
            valid_types = ", ".join(CREDIT_CARDS)
            raise KeyError(
                f'Invalid card type "{card_type}". Valid types are: {valid_types}'
            )
        return CREDIT_CARDS[card_type].match(in_str) is not None

    return any(
        pattern.match(in_str) is not None for pattern in CREDIT_CARDS.values()
    )
def is_camel_case(in_str: Any) -> bool:
    """
    Check whether a string is formatted as camel case.

    A string is considered camel case when:

    - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
    - it contains both lowercase and uppercase letters
    - it does not start with a number
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """
    Check whether a string is formatted as "snake case".

    A string is considered snake case when:

    - it's composed only by lowercase/uppercase letters and digits
    - it contains at least one underscore (or provided separator)
    - it does not start with a number

    >>> is_snake_case('this_is_a_test')
    True
    >>> is_snake_case('___This_Is_A_Test_1_2_3___')
    True
    >>> is_snake_case('this-is-a-test')
    False
    >>> is_snake_case('this-is-a-test', separator='-')
    True
    """
    if not is_full_string(in_str):
        return False

    # The two common separators have precompiled module-level patterns.
    if separator == "_":
        return SNAKE_CASE_TEST_RE.match(in_str) is not None
    if separator == "-":
        return SNAKE_CASE_TEST_DASH_RE.match(in_str) is not None

    # Any other separator: build the pattern only now, and anchor it at the
    # end like the built-in ones so trailing garbage is rejected.
    sign = re.escape(separator)
    pattern = (
        r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)$"
    ).format(sign=sign)
    return re.match(pattern, in_str, re.IGNORECASE) is not None
def is_json(in_str: Any) -> bool:
    """
    Check whether a string is valid json (an object or an array).

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    # Cheap shape check first: must look like {...} or [...].
    if not is_full_string(in_str) or JSON_WRAPPER_RE.match(in_str) is None:
        return False
    try:
        parsed = json.loads(in_str)
    except (TypeError, ValueError, OverflowError):
        return False
    return isinstance(parsed, (dict, list))
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Check whether a string is a valid UUID.

    With allow_hex=True the dashes between groups become optional.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # string casting is used to allow UUID itself as input data type
    candidate = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(candidate) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Check whether a string is a valid ip v4.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('nope')
    False
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # every dotted entry must be in the valid range (0 to 255)
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Extract the first IPv4-looking chunk of a string, or None.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V4_RE.search(in_str)
    return found.group(0) if found is not None else None
def is_ip_v6(in_str: Any) -> bool:
    """
    Check whether a string is a valid ip v6.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Extract the first IPv6-looking chunk of a string, or None.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V6_RE.search(in_str)
    return found.group(0) if found is not None else None
def is_ip(in_str: Any) -> bool:
    """
    Check whether a string is a valid ip (either v4 or v6).

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3.999')
    False
    """
    # v6 is tried first, v4 only when that fails.
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Extract an IP address (v4 preferred, then v6) from a string, or None.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4_address = extract_ip_v4(in_str)
    if v4_address is not None:
        return v4_address
    return extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """Return True if in_str is a valid MAC address, False otherwise.

    Both ":" and "-" are accepted between the six hex pairs.

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the first MAC address found in in_str.

    Args:
        in_str: text to search.
        separator: text to place between the hex pairs of the result,
            replacing whichever of ":" or "-" the input used.

    Returns:
        The MAC address using the requested separator, or None when
        in_str is not a non-blank string or contains no MAC address.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'
    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F', separator='-')
    '34-29-8F-12-0D-2F'
    """
    if not is_full_string(in_str):
        return None

    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None

    # str is immutable, so the converted value has to be returned; a single
    # translate pass swaps both accepted delimiters for the requested one.
    return m.group(0).translate({ord(":"): separator, ord("-"): separator})
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Check whether a given string is a slug (as created by `slugify()`).

    A slug is one or more runs of lowercase letters/digits, with runs
    divided by one or more separators; it neither starts nor ends with
    a separator.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False
    # "word (sep+ word)*" keeps matching unambiguous; a nested-quantifier
    # form such as "(word sep*)*" backtracks exponentially on long
    # non-matching input.
    sep = re.escape(separator)
    rex = r"^[a-z\d]+(?:(?:" + sep + r")+[a-z\d]+)*$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Check whether the given string contains HTML/XML tags.

    By design, this function matches ANY type of tag, so don't expect to
    use it as an HTML validator; its goal is to detect "malicious" or
    undesired tags in the text.

    Raises:
        ValueError: if the argument is not a string.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
def words_count(in_str: str) -> int:
    """
    Return the number of words contained in the given string.

    Only sequences of one or more letters and/or numbers count as
    "words", so a string like "! @ # % ... []" yields zero.  Punctuation
    separates words too: "one,two,three.stop" counts as 4, not 1, even
    though it contains no spaces.

    Raises:
        ValueError: if the argument is not a string.

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if is_string(in_str):
        found_words = WORDS_COUNT_RE.findall(in_str)
        return len(found_words)
    raise ValueError(in_str)
856 def generate_uuid(as_hex: bool = False) -> str:
858 Generated an UUID string (using `uuid.uuid4()`).
860 generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
861 generate_uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
870 def generate_random_alphanumeric_string(size: int) -> str:
872 Returns a string of the specified size containing random
873 characters (uppercase/lowercase ascii letters and digits).
875 random_string(9) # possible output: "cx3QQbzYg"
879 raise ValueError("size must be >= 1")
880 chars = string.ascii_letters + string.digits
881 buffer = [random.choice(chars) for _ in range(size)]
882 return from_char_list(buffer)
def reverse(in_str: str) -> str:
    """
    Return the string with its chars in reverse order.

    Raises:
        ValueError: if the argument is not a string.

    >>> reverse('test')
    'tset'
    """
    if is_string(in_str):
        return "".join(reversed(in_str))
    raise ValueError(in_str)
def camel_case_to_snake_case(in_str, *, separator="_"):
    """
    Convert a camel case string into a snake case one.
    (The original string is returned if it is not a valid camel case string.)

    Raises:
        ValueError: if the argument is not a string.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str

    def append_separator(match):
        # Keep the matched chunk and follow it with the separator.
        return match.group(1) + separator

    separated = CAMEL_CASE_REPLACE_RE.sub(append_separator, in_str)
    return separated.lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Convert a snake case string into a camel case one.
    (The original string is returned if it is not a valid snake case string.)

    Raises:
        ValueError: if the argument is not a string.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str

    # Title-case every non-blank piece between separators.
    words = [
        piece.title()
        for piece in in_str.split(separator)
        if is_full_string(piece)
    ]
    if not upper_case_first:
        words[0] = words[0].lower()
    return from_char_list(words)
939 def to_char_list(in_str: str) -> List[str]:
940 """Convert a string into a list of chars.
942 >>> to_char_list('test')
945 if not is_string(in_str):
def from_char_list(in_list: List[str]) -> str:
    """Concatenate a list of chars (or strings) into one string.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
def shuffle(in_str: str) -> str:
    """Return a new string containing the same chars as the given one,
    in randomized order.

    Raises:
        ValueError: if the argument is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # random.shuffle works in place, so go through a mutable char list
    letters = to_char_list(in_str)
    random.shuffle(letters)
    shuffled = from_char_list(letters)
    return shuffled
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Remove html code contained in the given string.

    With keep_tag_content=True only the tags themselves are removed and
    the text between them is preserved.

    Raises:
        ValueError: if the argument is not a string.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if keep_tag_content:
        pattern = HTML_TAG_ONLY_RE
    else:
        pattern = HTML_RE
    return pattern.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Force string content to be ascii-only by translating all non-ascii
    chars into the closest possible representation (eg: ó -> o).

    N.B. Some chars may be lost if impossible to translate.

    Raises:
        ValueError: if the argument is not a string.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # NFKD normalization, then an ascii encode that ignores (drops)
    # whatever cannot be represented, then decode the bytes back to str.
    return (
        unicodedata.normalize("NFKD", in_str)
        .encode("ascii", "ignore")
        .decode("utf-8")
    )
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Convert a string into a "slug" using the provided separator.
    The returned string has the following properties:

    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using provided separator
    - all chars are encoded as ascii (by using `asciify()`)

    Raises:
        ValueError: if the argument is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # anything that is not a letter or a number becomes a space
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # each whitespace char becomes the join sign
    joined = SPACES_RE.sub(separator, spaced)

    # collapse runs of the join sign into a single one
    collapsed = re.sub(re.escape(separator) + r"+", separator, joined)
    return asciify(collapsed)
def to_bool(in_str: str) -> bool:
    """
    Turn a string into a boolean based on its content (CASE INSENSITIVE).

    True is returned if the lowercased string is one of:

    - "true"
    - "1"
    - "yes"
    - "y"
    - "t"

    Otherwise False is returned.

    Raises:
        ValueError: if the argument is not a string.

    >>> to_bool('True')
    True
    >>> to_bool('y')
    True
    >>> to_bool('nope')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    truthy_words = ("true", "1", "yes", "y", "t")
    return in_str.lower() in truthy_words
1074 def to_date(in_str: str) -> Optional[datetime.date]:
1076 Parses a date string. See DateParser docs for details.
1078 import dateparse.dateparse_utils as dp
1083 except dp.ParseException:
1084 logger.warning(f'Unable to parse date {in_str}.')
1088 def valid_date(in_str: str) -> bool:
1090 True if the string represents a valid date.
1092 import dateparse.dateparse_utils as dp
1097 except dp.ParseException:
1098 logger.warning(f'Unable to parse date {in_str}.')
1102 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1104 Parses a datetime string. See DateParser docs for more info.
1106 import dateparse.dateparse_utils as dp
1109 dt = d.parse(in_str)
1110 if type(dt) == datetime.datetime:
1113 logger.warning(f'Unable to parse datetime {in_str}.')
1117 def valid_datetime(in_str: str) -> bool:
1119 True if the string represents a valid datetime.
1121 _ = to_datetime(in_str)
1124 logger.warning(f'Unable to parse datetime {in_str}.')
def dedent(in_str: str) -> str:
    """
    Strip the leading margin (as matched by MARGIN_RE) from every line of
    a multi line string (inspired by the analogous Scala function).

    Raises:
        ValueError: if the argument is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    return newline.join(
        MARGIN_RE.sub('', row) for row in in_str.split(newline)
    )
def indent(in_str: str, amount: int) -> str:
    """
    Indent every line of the string by prepending `amount` spaces.

    Raises:
        ValueError: if in_str is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    padding = " " * amount
    return newline.join(padding + row for row in in_str.split(newline))
1154 def sprintf(*args, **kwargs) -> str:
1155 """String printf, like in C"""
1158 sep = kwargs.pop("sep", None)
1160 if not isinstance(sep, str):
1161 raise TypeError("sep must be None or a string")
1163 end = kwargs.pop("end", None)
1165 if not isinstance(end, str):
1166 raise TypeError("end must be None or a string")
1169 raise TypeError("invalid keyword arguments to sprint()")
1175 for i, arg in enumerate(args):
1178 if isinstance(arg, str):
class SprintfStdout(object):
    """
    A context manager that captures outputs to stdout.

    The value bound by the with statement is a zero-argument callable
    returning everything captured so far:

        with SprintfStdout() as buf:
            print("test")
        print(buf())
    """

    def __init__(self) -> None:
        # In-memory sink that receives everything written to stdout.
        self.destination = io.StringIO()
        # The active redirect_stdout context; set on __enter__.
        self.recorder = None

    def __enter__(self) -> Callable[[], str]:
        redirect = contextlib.redirect_stdout(self.destination)
        self.recorder = redirect
        redirect.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *args) -> None:
        # Restore the real stdout, then rewind the buffer.
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Falling through returns None: exceptions are not suppressed.
1211 def is_are(n: int) -> str:
1225 def pluralize(n: int) -> str:
1231 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1234 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1243 def thify(n: int) -> str:
1244 """Return the proper cardinal suffix for a number.
1255 assert is_integer_number(digit)
def ngrams(txt: str, n: int):
    """Return the ngrams (runs of n whitespace-separated words) of a string.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']
    """
    return ngrams_presplit(txt.split(), n)
def ngrams_presplit(words: Iterable[str], n: int):
    """Yield every run of n consecutive words, joined by single spaces.

    Args:
        words: the already-split words; any iterable is accepted.
        n: how many words go into each ngram.

    >>> list(ngrams_presplit(['This', 'is', 'a', 'test'], 2))
    ['This is', 'is a', 'a test']
    """
    # Slicing needs a real sequence, but the annotation promises any
    # iterable (generators included), so materialize it once up front.
    sequence = list(words)
    for ngram in zip(*[sequence[i:] for i in range(n)]):
        yield ' '.join(ngram)
def bigrams(txt: str):
    """Return the 2-word ngrams of a string (see ngrams)."""
    pair_size = 2
    return ngrams(txt, pair_size)
def trigrams(txt: str):
    """Return the 3-word ngrams of a string (see ngrams)."""
    triple_size = 3
    return ngrams(txt, triple_size)
1291 def shuffle_columns_into_list(
1292 input_lines: Iterable[str],
1293 column_specs: Iterable[Iterable[int]],
1296 """Helper to shuffle / parse columnar data and return the results as a
1297 list. The column_specs argument is an iterable collection of
1298 numeric sequences that indicate one or more column numbers to
1301 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1302 >>> shuffle_columns_into_list(
1304 ... [ [8], [2, 3], [5, 6, 7] ],
1307 ['acl_test.py', 'scott wheel', 'Jul 9 11:34']
1312 # Column specs map input lines' columns into outputs.
1314 for spec in column_specs:
1317 chunk = chunk + delim + input_lines[n]
1318 chunk = chunk.strip(delim)
1323 def shuffle_columns_into_dict(
1324 input_lines: Iterable[str],
1325 column_specs: Iterable[Tuple[str, Iterable[int]]],
1327 ) -> Dict[str, str]:
1328 """Helper to shuffle / parse columnar data and return the results
1331 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1332 >>> shuffle_columns_into_dict(
1334 ... [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
1337 {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}
1342 # Column specs map input lines' columns into outputs.
1343 # "key", [col1, col2...]
1344 for spec in column_specs:
1347 chunk = chunk + delim + input_lines[n]
1348 chunk = chunk.strip(delim)
1349 out[spec[0]] = chunk
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the {name} placeholders of a string with data from a dict.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    rendered = txt.format(**values)
    return sprintf(rendered, end='')
1364 if __name__ == '__main__':