3 """The MIT License (MIT)
5 Copyright (c) 2016-2020 Davide Zanotti
6 Modifications Copyright (c) 2021-2022 Scott Gasch
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
15 The above copyright notice and this permission notice shall be included in all
16 copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 This class is based on: https://github.com/daveoncode/python-string-utils.
30 import contextlib # type: ignore
33 from itertools import zip_longest
51 from uuid import uuid4
56 logger = logging.getLogger(__name__)
# NOTE: inside a character class "|" is a literal pipe, not alternation, so
# the sign / radix / exponent markers are listed without it.  Otherwise inputs
# such as "|0xff" or "1|5" would be accepted as numbers.

# Decimal number: optional sign, digits with optional fraction and optional
# exponent, or a bare fraction such as ".5".
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Hexadecimal integer literal: optional sign, "0x"/"0X", hex digits.
HEX_NUMBER_RE = re.compile(r"^([+-]?)0[xX]([0-9A-Fa-f]+)$")

# Octal integer literal: optional sign, "0o"/"0O", octal digits.
OCT_NUMBER_RE = re.compile(r"^([+-]?)0[oO]([0-7]+)$")

# Binary integer literal: optional sign, "0b"/"0B", binary digits.
BIN_NUMBER_RE = re.compile(r"^([+-]?)0[bB]([01]+)$")
67 r"([a-z-]+://)" # scheme
68 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
70 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
71 r"(:\d{2,})?" # port number
72 r"(/[a-z\d_%+-]*)*" # folders
73 r"(\.[a-z\d_%+-]+)*" # file extension
74 r"(\?[a-z\d_+%-=]*)?" # query string
78 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
80 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
82 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
85 r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
88 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
90 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
92 CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")
94 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
96 SNAKE_CASE_TEST_RE = re.compile(
97 r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
100 SNAKE_CASE_TEST_DASH_RE = re.compile(
101 r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
104 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
106 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
109 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
110 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
111 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
112 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
113 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
114 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
117 JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)
119 UUID_RE = re.compile(
120 r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
123 UUID_HEX_OK_RE = re.compile(
124 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
128 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
130 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
132 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
134 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
136 MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
138 ANYWHERE_MAC_ADDRESS_RE = re.compile(
139 r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
142 WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)
144 HTML_RE = re.compile(
145 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
146 re.IGNORECASE | re.MULTILINE | re.DOTALL,
149 HTML_TAG_ONLY_RE = re.compile(
150 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
151 re.IGNORECASE | re.MULTILINE | re.DOTALL,
154 SPACES_RE = re.compile(r"\s")
156 NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)
158 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
# Matches one terminal escape sequence: ESC, "[", any run of non-letter
# parameter characters, then the terminating letter.  ESC is spelled \x1b
# because Python's re module has no "\e" escape and a raw control byte in the
# source is easily mangled by editors and tooling.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Tell whether the input is None or holds nothing but whitespace.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty("   \t   ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    # A string that strips down to "" is falsy.
    return not in_str.strip()
def is_string(obj: Any) -> bool:
    """
    Report whether the given object is a str instance.

    >>> is_string('test')
    True
    >>> is_string(b'test')
    False
    >>> is_string(123)
    False
    >>> is_string([1, 2, 3])
    False
    """
    result = isinstance(obj, str)
    return result
def is_empty_string(in_str: Any) -> bool:
    """Alternate name for is_empty(); simply forwards the call."""
    result = is_empty(in_str)
    return result
def is_empty(in_str: Any) -> bool:
    """
    True only for strings that are empty or made up solely of whitespace;
    any non-string input yields False.

    >>> is_empty('')
    True
    >>> is_empty('    \t\t    ')
    True
    >>> is_empty('test')
    False
    >>> is_empty(100.88)
    False
    >>> is_empty([1, 2, 3])
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """
    True only for strings that contain at least one non-whitespace
    character; any non-string input yields False.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string('      ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """
    Decide whether a string spells a number (as defined by NUMBER_RE).

    Raises ValueError when the argument is not a string.

    >>> is_number(100.5)
    Traceback (most recent call last):
    ...
    ValueError: 100.5
    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number("99")
    True
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]
    """
    if is_string(in_str):
        return NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """
    Decide whether the string represents an integer.

    Accepted forms are a number without a "." (signed or unsigned,
    optionally in "scientific notation") or a hex / octal / binary literal.

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    # Plain decimal integer: a number with no fractional separator.
    if is_number(in_str) and "." not in in_str:
        return True
    # Otherwise try each prefixed radix form in turn.
    if is_hexidecimal_integer_number(in_str):
        return True
    if is_octal_integer_number(in_str):
        return True
    return is_binary_integer_number(in_str)
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Decide whether a string is a hex integer literal (HEX_NUMBER_RE).

    Raises ValueError when the argument is not a string.

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('0x1A3E')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number('test')
    False
    >>> is_hexidecimal_integer_number(12345)  # Not a string
    Traceback (most recent call last):
    ...
    ValueError: 12345
    >>> is_hexidecimal_integer_number(101.4)
    Traceback (most recent call last):
    ...
    ValueError: 101.4
    >>> is_hexidecimal_integer_number(0x1A3E)
    Traceback (most recent call last):
    ...
    ValueError: 6718
    """
    if is_string(in_str):
        return HEX_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """
    Decide whether a string is an octal integer literal (OCT_NUMBER_RE).

    Raises ValueError when the argument is not a string.

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    >>> is_octal_integer_number('test')
    False
    """
    if is_string(in_str):
        return OCT_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """
    Decide whether a string is a binary integer literal (BIN_NUMBER_RE).

    Raises ValueError when the argument is not a string.

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0B10101')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    >>> is_binary_integer_number('test')
    False
    """
    if is_string(in_str):
        return BIN_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def to_int(in_str: str) -> int:
    """Convert a string to its integer value, raising ValueError on failure.

    Binary ("0b"), octal ("0o") and hex ("0x") literals are parsed in their
    own base; everything else is handed to int() in base 10.

    >>> to_int('1234')
    1234
    >>> to_int('0x1F')
    31
    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # Probe the prefixed forms in the same order: binary, octal, hex.
    radix_checks = (
        (is_binary_integer_number, 2),
        (is_octal_integer_number, 8),
        (is_hexidecimal_integer_number, 16),
    )
    for matches_radix, base in radix_checks:
        if matches_radix(in_str):
            return int(in_str, base)
    return int(in_str)
def is_decimal_number(in_str: str) -> bool:
    """
    Decide whether the string is a number written with a decimal point.

    The number may be signed or unsigned or use a "scientific notation".

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    if not is_number(in_str):
        return False
    return "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    """
    Return the input with every match of ESCAPE_SEQUENCE_RE removed.

    >>> strip_escape_sequences('\\x1b[12;11;22mthis is a test!')
    'this is a test!'
    """
    cleaned = ESCAPE_SEQUENCE_RE.sub("", in_str)
    return cleaned
def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """
    Insert separator_char between groups of `places` digits in a numeric
    string.  Numbers are accepted too and are stringified first.

    Raises ValueError when the (stringified) input is not a number.

    >>> add_thousands_separator('12345678')
    '12,345,678'
    >>> add_thousands_separator(12345678)
    '12,345,678'
    >>> add_thousands_separator(12345678.99)
    '12,345,678.99'
    >>> add_thousands_separator('test')
    Traceback (most recent call last):
    ...
    ValueError: test
    """
    text = f'{in_str}' if isinstance(in_str, numbers.Number) else in_str
    if not is_number(text):
        raise ValueError(text)
    return _add_thousands_separator(text, separator_char=separator_char, places=places)
def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """
    Group the integer digits of an already-validated numeric string.

    Args:
        in_str: numeric text, optionally signed and optionally with a
            fractional part after a ".".
        separator_char: text inserted between digit groups.
        places: number of digits per group, counted from the right.

    Returns:
        The string with separators inserted in the integer part; the sign
        and the fractional part are passed through untouched.
    """
    # Peel off a leading sign first; otherwise it is grouped as if it were a
    # digit and '-123456' comes out as '-,123,456'.
    sign = ""
    if in_str[:1] in ("+", "-"):
        sign, in_str = in_str[0], in_str[1:]

    integer_part, _, decimal_part = in_str.partition('.')

    # The same iterator repeated `places` times makes zip_longest pull
    # consecutive characters, i.e. chunk the reversed digits into groups.
    tmp = [iter(integer_part[::-1])] * places
    ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
    if decimal_part:
        ret += '.' + decimal_part
    return sign + ret
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Decide whether a string is a URL according to URL_RE.

    When allowed_schemes is given (and non-empty) the string must also
    start with one of the listed schemes.

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('https://mysite.com')
    True
    >>> is_url('.mysite.com')
    False
    """
    if not is_full_string(in_str):
        return False
    if URL_RE.match(in_str) is None:
        return False
    if not allowed_schemes:
        # No scheme restriction requested.
        return True
    return any(in_str.startswith(scheme) for scheme in allowed_schemes)
471 def is_email(in_str: Any) -> bool:
473 Check if a string is a valid email.
475 Reference: https://tools.ietf.org/html/rfc3696#section-3
479 >>> is_email('@gmail.com')
482 if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
486 # we expect 2 tokens, one before "@" and one after, otherwise
487 # we have an exception and the email is not valid.
488 head, tail = in_str.split("@")
490 # head's size must be <= 64, tail <= 255, head must not start
491 # with a dot or contain multiple consecutive dots.
492 if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
495 # removes escaped spaces, so that later on the test regex will
497 head = head.replace("\\ ", "")
498 if head.startswith('"') and head.endswith('"'):
499 head = head.replace(" ", "")[1:-1]
500 return EMAIL_RE.match(head + "@" + tail) is not None
503 # borderline case in which we have multiple "@" signs but the
504 # head part is correctly escaped.
505 if ESCAPED_AT_SIGN.search(in_str) is not None:
506 # replace "@" with "a" in the head
507 return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
511 def suffix_string_to_number(in_str: str) -> Optional[int]:
512 """Take a string like "33Gb" and convert it into a number (of bytes)
513 like 34603008. Return None if the input string is not valid.
515 >>> suffix_string_to_number('1Mb')
517 >>> suffix_string_to_number('13.1Gb')
521 def suffix_capitalize(s: str) -> str:
525 return f"{s[0].upper()}{s[1].lower()}"
526 return suffix_capitalize(s[0:1])
528 if is_string(in_str):
529 if is_integer_number(in_str):
530 return to_int(in_str)
531 suffixes = [in_str[-2:], in_str[-1:]]
532 rest = [in_str[:-2], in_str[:-1]]
533 for x in range(len(suffixes)):
535 s = suffix_capitalize(s)
536 multiplier = NUM_SUFFIXES.get(s, None)
537 if multiplier is not None:
539 if is_integer_number(r):
540 return to_int(r) * multiplier
541 if is_decimal_number(r):
542 return int(float(r) * multiplier)
546 def number_to_suffix_string(num: int) -> Optional[str]:
547 """Take a number (of bytes) and returns a string like "43.8Gb".
548 Returns none if the input is invalid.
550 >>> number_to_suffix_string(14066017894)
552 >>> number_to_suffix_string(1024 * 1024)
558 for (sfx, size) in NUM_SUFFIXES.items():
563 if suffix is not None:
564 return f"{d:.1f}{suffix}"
569 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
571 Checks if a string is a valid credit card number.
572 If card type is provided then it checks against that specific type only,
573 otherwise any known credit card number will be accepted.
575 Supported card types are the following:
584 if not is_full_string(in_str):
587 if card_type is not None:
588 if card_type not in CREDIT_CARDS:
590 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
592 return CREDIT_CARDS[card_type].match(in_str) is not None
593 for c in CREDIT_CARDS:
594 if CREDIT_CARDS[c].match(in_str) is not None:
def is_camel_case(in_str: Any) -> bool:
    """
    Decide whether a string is written in camel case.

    A string counts as camel case when:

    - it is made only of letters ([a-zA-Z]) and optionally digits ([0-9])
    - it mixes lowercase and uppercase letters
    - it does not begin with a digit
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
612 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
614 Checks if a string is formatted as "snake case".
616 A string is considered snake case when:
618 - it's composed only by lowercase/uppercase letters and digits
619 - it contains at least one underscore (or provided separator)
620 - it does not start with a number
622 >>> is_snake_case('this_is_a_test')
624 >>> is_snake_case('___This_Is_A_Test_1_2_3___')
626 >>> is_snake_case('this-is-a-test')
628 >>> is_snake_case('this-is-a-test', separator='-')
632 if is_full_string(in_str):
633 re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
634 re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
637 re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
639 return r.match(in_str) is not None
def is_json(in_str: Any) -> bool:
    """
    Decide whether a string holds a JSON object or array.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    # Cheap pre-filter: must be a non-blank string wrapped in {} or [].
    if not is_full_string(in_str):
        return False
    if JSON_WRAPPER_RE.match(in_str) is None:
        return False
    try:
        parsed = json.loads(in_str)
    except (TypeError, ValueError, OverflowError):
        return False
    return isinstance(parsed, (dict, list))
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Decide whether the input is a UUID; with allow_hex the dashes become
    optional.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # Cast to str so that a UUID object itself can be passed in.
    candidate = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(candidate) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Decide whether a string is a dotted-quad IPv4 address.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('nope')
    False
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str):
        return False
    if SHALLOW_IP_V4_RE.match(in_str) is None:
        return False
    # The regex only checks the shape; each octet must also be 0..255.
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Return the first IPv4-shaped chunk found in the string, or None.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V4_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip_v6(in_str: Any) -> bool:
    """
    Decide whether a string is an IPv6 address (as matched by IP_V6_RE).

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Return the first IPv6-shaped chunk found in the string, or None.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V6_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip(in_str: Any) -> bool:
    """
    Decide whether a string is an IP address of either family.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3')
    False
    >>> is_ip('1.2.3.999')
    False
    """
    # v6 is tried first, then v4.
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Return the first IP address found in the string (IPv4 is looked for
    first, then IPv6), or None.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4 = extract_ip_v4(in_str)
    if v4 is not None:
        return v4
    return extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """Decide whether in_str is a MAC address (as matched by MAC_ADDRESS_RE).

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the first MAC address found in in_str.

    Args:
        in_str: text to search; anything that is not a non-blank string
            yields None.
        separator: text placed between the six octets of the result,
            whatever delimiter (":" or "-") the input used.

    Returns:
        The MAC address with its delimiters replaced by separator, or None
        when no MAC address is present.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'

    >>> extract_mac_address('34:29:8F:12:0D:2F', separator='-')
    '34-29-8F-12-0D-2F'
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None
    # Strings are immutable, so the converted value must be kept (calling
    # replace() and dropping the result does nothing).  translate() swaps
    # both delimiters in one pass, which stays correct even when separator
    # itself contains ":" or "-".
    delimiter_map = {ord(":"): separator, ord("-"): separator}
    return m.group(0).translate(delimiter_map)
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Checks if a given string is a slug (as created by `slugify()`).

    A slug consists of runs of lowercase ASCII letters / digits joined by
    one or more occurrences of separator; it must start and end with a
    letter or digit.

    Args:
        in_str: value to test; non-strings and blank strings yield False.
        separator: the text expected between words.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False
    # Unambiguous form of "words joined by separators".  The former
    # "(word+ sep*?)* word" shape had nested quantifiers that backtrack
    # exponentially on long non-matching input, and applied its quantifier
    # only to the last character of a multi-character separator.
    sep = re.escape(separator)
    rex = r"^[a-z\d]+(?:(?:" + sep + r")+[a-z\d]+)*$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Report whether the string contains anything that looks like an
    HTML/XML tag.

    This deliberately matches ANY kind of tag: it is not an HTML validator,
    its purpose is to spot "malicious" or unwanted tags in text.

    Raises ValueError when the argument is not a string.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
def words_count(in_str: str) -> int:
    """
    Count the words in a string.

    Only runs of letters and/or digits count as words, so a string such as
    "! @ # % ... []" has zero words.  Punctuation separates words as well:
    "one,two,three.stop" counts as 4 even though it contains no spaces.

    Raises ValueError when the argument is not a string.

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if is_string(in_str):
        matches = WORDS_COUNT_RE.findall(in_str)
        return len(matches)
    raise ValueError(in_str)
870 def generate_uuid(omit_dashes: bool = False) -> str:
872 Generated an UUID string (using `uuid.uuid4()`).
874 generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
875 generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Build a string of `size` randomly chosen characters drawn from the
    ASCII letters (both cases) and digits.

    Raises ValueError when size is smaller than 1.

    random_string(9) # possible output: "cx3QQbzYg"
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    picks = []
    for _ in range(size):
        picks.append(random.choice(alphabet))
    return from_char_list(picks)
def reverse(in_str: str) -> str:
    """
    Return the string with its characters in reverse order.

    Raises ValueError when the argument is not a string.

    >>> reverse('test')
    'tset'
    """
    if is_string(in_str):
        return in_str[::-1]
    raise ValueError(in_str)
def camel_case_to_snake_case(in_str, *, separator="_"):
    """
    Turn a camel case string into a snake case one.
    (Input that is not valid camel case is returned unchanged.)

    Raises ValueError when the argument is not a string.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str

    def _append_separator(match):
        # Keep the matched chunk and put the separator right after it.
        return match.group(1) + separator

    separated = CAMEL_CASE_REPLACE_RE.sub(_append_separator, in_str)
    return separated.lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Turn a snake case string into a camel case one.
    (Input that is not valid snake case is returned unchanged.)

    Raises ValueError when the argument is not a string.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str
    # Title-case every non-blank piece between separators.
    words = []
    for piece in in_str.split(separator):
        if is_full_string(piece):
            words.append(piece.title())
    if not upper_case_first:
        words[0] = words[0].lower()
    return from_char_list(words)
def to_char_list(in_str: str) -> List[str]:
    """Split a string into a list of its characters; non-string input
    gives an empty list.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    if is_string(in_str):
        return list(in_str)
    return []
def from_char_list(in_list: List[str]) -> str:
    """Concatenate a list of characters (or strings) into one string.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
def shuffle(in_str: str) -> str:
    """Return a new string made of the same characters as the input, in a
    randomized order.

    Raises ValueError when the argument is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # random.shuffle works in place, so go through a mutable char list.
    letters = to_char_list(in_str)
    random.shuffle(letters)
    return from_char_list(letters)
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Strip HTML from the given string.  By default a tag is removed together
    with the content it wraps; with keep_tag_content only the tags go.

    Raises ValueError when the argument is not a string.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    pattern = HTML_RE
    if keep_tag_content:
        pattern = HTML_TAG_ONLY_RE
    return pattern.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Make the string ascii-only by mapping each non-ascii character to its
    closest ascii representation (eg: ó -> o, Ë -> E).

    N.B. Characters that cannot be translated are dropped.

    Raises ValueError when the argument is not a string.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # NFKD decomposes characters, separating base letters from their
    # combining marks; encoding to ascii with "ignore" then discards
    # whatever is not representable.
    decomposed = unicodedata.normalize("NFKD", in_str)
    return decomposed.encode("ascii", "ignore").decode("utf-8")
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Turn a string into a "slug" using the given separator.
    The result has these properties:

    - all letters are lower case
    - punctuation and other non alphanumeric characters are removed
    - words are joined with the separator
    - every character is ascii (via `asciify()`)

    Raises ValueError when the argument is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Anything that is not a letter or a digit becomes a space.
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # Spaces become the join sign ...
    joined = SPACES_RE.sub(separator, spaced)

    # ... and runs of the join sign collapse into a single one.
    collapsed = re.sub(re.escape(separator) + r"+", separator, joined)
    return asciify(collapsed)
def to_bool(in_str: str) -> bool:
    """
    Interpret a string as a boolean (CASE INSENSITIVE).

    True is returned when the lowercased string is one of "true", "1",
    "yes", "y", "t" or "on".  Otherwise False is returned.

    Raises ValueError when the argument is not a string.

    >>> to_bool('True')
    True
    >>> to_bool('no')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    truthy_words = ("true", "1", "yes", "y", "t", "on")
    return in_str.lower() in truthy_words
1094 def to_date(in_str: str) -> Optional[datetime.date]:
1096 Parses a date string. See DateParser docs for details.
1098 import dateparse.dateparse_utils as dp
1104 except dp.ParseException:
1105 msg = f'Unable to parse date {in_str}.'
1110 def valid_date(in_str: str) -> bool:
1112 True if the string represents a valid date.
1114 import dateparse.dateparse_utils as dp
1120 except dp.ParseException:
1121 msg = f'Unable to parse date {in_str}.'
1126 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1128 Parses a datetime string. See DateParser docs for more info.
1130 import dateparse.dateparse_utils as dp
1134 dt = d.parse(in_str)
1135 if type(dt) == datetime.datetime:
1138 msg = f'Unable to parse datetime {in_str}.'
1143 def valid_datetime(in_str: str) -> bool:
1145 True if the string represents a valid datetime.
1147 _ = to_datetime(in_str)
1150 msg = f'Unable to parse datetime {in_str}.'
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Squeeze runs of more than one character_to_squeeze into one.

    Args:
        in_str: the text to process.
        character_to_squeeze: the character (or multi-character token)
            whose consecutive repetitions are collapsed.

    Returns:
        in_str with every run of character_to_squeeze replaced by a single
        occurrence.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'
    """
    pattern = r'(' + re.escape(character_to_squeeze) + r')+'
    # A callable replacement is inserted literally.  A plain string would be
    # parsed as a regex template, so a token containing a backslash would
    # raise re.error or be rewritten.
    return re.sub(pattern, lambda _match: character_to_squeeze, in_str)
def dedent(in_str: str) -> str:
    """
    Remove the leading margin (as matched by MARGIN_RE) from every line of
    a multi line string (inspired by the analogous Scala function).

    Raises ValueError when the argument is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    stripped_lines = []
    for row in in_str.split(newline):
        stripped_lines.append(MARGIN_RE.sub('', row))
    return newline.join(stripped_lines)
def indent(in_str: str, amount: int) -> str:
    """
    Indent every line of the string by prepending `amount` spaces.

    Raises ValueError when the argument is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    padding = " " * amount
    return newline.join(padding + row for row in in_str.split(newline))
1199 def sprintf(*args, **kwargs) -> str:
1200 """String printf, like in C"""
1203 sep = kwargs.pop("sep", None)
1205 if not isinstance(sep, str):
1206 raise TypeError("sep must be None or a string")
1208 end = kwargs.pop("end", None)
1210 if not isinstance(end, str):
1211 raise TypeError("end must be None or a string")
1214 raise TypeError("invalid keyword arguments to sprint()")
1220 for i, arg in enumerate(args):
1223 if isinstance(arg, str):
class SprintfStdout(object):
    """
    Context manager that redirects stdout into an in-memory buffer.

    The value bound by ``as`` is a zero-argument callable returning the
    text captured so far:

    with SprintfStdout() as buf:
        print("test")
    print(buf())
    """

    def __init__(self) -> None:
        # Buffer that receives everything written to stdout while active.
        self.destination = io.StringIO()
        # The redirect_stdout manager; created in __enter__.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        redirect = contextlib.redirect_stdout(self.destination)
        self.recorder = redirect
        redirect.__enter__()
        # Hand back an accessor rather than the buffer itself.
        return self.destination.getvalue

    def __exit__(self, *args) -> None:
        # Restore stdout, then rewind the buffer.
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        return None  # don't suppress exceptions
1257 def is_are(n: int) -> str:
1271 def pluralize(n: int) -> str:
1277 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1280 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1289 def thify(n: int) -> str:
1290 """Return the proper cardinal suffix for a number.
1301 assert is_integer_number(digit)
1313 def ngrams(txt: str, n: int):
1314 """Return the ngrams from a string.
1316 >>> [x for x in ngrams('This is a test', 2)]
1317 ['This is', 'is a', 'a test']
1321 for ngram in ngrams_presplit(words, n):
def ngrams_presplit(words: Sequence[str], n: int):
    """Return the n-grams of an already split word sequence by delegating
    to list_utils.ngrams."""
    result = list_utils.ngrams(words, n)
    return result
def bigrams(txt: str):
    """Shorthand for ngrams(txt, 2)."""
    pair_size = 2
    return ngrams(txt, pair_size)
def trigrams(txt: str):
    """Shorthand for ngrams(txt, 3)."""
    triple_size = 3
    return ngrams(txt, triple_size)
1340 def shuffle_columns_into_list(
1341 input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
1343 """Helper to shuffle / parse columnar data and return the results as a
1344 list. The column_specs argument is an iterable collection of
1345 numeric sequences that indicate one or more column numbers to
1348 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1349 >>> shuffle_columns_into_list(
1351 ... [ [8], [2, 3], [5, 6, 7] ],
1354 ['acl_test.py', 'scott wheel', 'Jul 9 11:34']
1359 # Column specs map input lines' columns into outputs.
1361 for spec in column_specs:
1364 chunk = chunk + delim + input_lines[n]
1365 chunk = chunk.strip(delim)
1370 def shuffle_columns_into_dict(
1371 input_lines: Sequence[str],
1372 column_specs: Iterable[Tuple[str, Iterable[int]]],
1374 ) -> Dict[str, str]:
1375 """Helper to shuffle / parse columnar data and return the results
1378 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1379 >>> shuffle_columns_into_dict(
1381 ... [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
1384 {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}
1389 # Column specs map input lines' columns into outputs.
1390 # "key", [col1, col2...]
1391 for spec in column_specs:
1394 chunk = chunk + delim + input_lines[n]
1395 chunk = chunk.strip(delim)
1396 out[spec[0]] = chunk
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the {placeholders} of a string with values taken from a dict.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    rendered = txt.format(**values)
    return sprintf(rendered, end='')
1411 def to_ascii(x: str):
1412 """Encode as ascii bytes string.
1414 >>> to_ascii('test')
1417 >>> to_ascii(b'1, 2, 3')
1422 return x.encode('ascii')
1423 if type(x) is bytes:
1425 raise Exception('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode txt to bytes, then encode those bytes with a 64-character
    alphabet.  This is compatible with uudecode.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
1439 def is_base64(txt: str) -> bool:
1440 """Determine whether a string is base64 encoded (with Python's standard
1441 base64 alphabet which is the same as what uuencode uses).
1443 >>> is_base64('test') # all letters in the b64 alphabet
1446 >>> is_base64('another test, how do you like this one?')
1449 >>> is_base64(b'aGVsbG8/\\n') # Ending newline is ok.
1453 a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
1454 alphabet = set(a.encode('ascii'))
1455 for char in to_ascii(txt.strip()):
1456 if char not in alphabet:
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Decode base64 bytes and then decode the result into a string.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    raw = base64.decodebytes(b64)
    return raw.decode(encoding, errors)
def chunk(txt: str, chunk_size):
    """Yield successive pieces of txt that are chunk_size characters long.

    When the length of txt is not a multiple of chunk_size a warning is
    logged and emitted, and the final piece is shorter.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    total = len(txt)
    if total % chunk_size != 0:
        msg = f'String to chunk\'s length ({len(txt)} is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    yield from (txt[start : start + chunk_size] for start in range(0, total, chunk_size))
1487 txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass'
1489 """Encode txt and then chop it into bytes. Note: only bitstrings
1490 with delimiter='' are interpretable by from_bitstring.
1492 >>> to_bitstring('hello?')
1493 '011010000110010101101100011011000110111100111111'
1495 >>> to_bitstring('test', delimiter=' ')
1496 '01110100 01100101 01110011 01110100'
1498 >>> to_bitstring(b'test')
1499 '01110100011001010111001101110100'
1502 etxt = to_ascii(txt)
1503 bits = bin(int.from_bytes(etxt, 'big'))
1505 return delimiter.join(chunk(bits.zfill(8 * ((len(bits) + 7) // 8)), 8))
def is_bitstring(txt: str) -> bool:
    """Is this a bitstring (i.e. does it read as a binary literal once
    prefixed with "0b")?

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    literal = f'0b{txt}'
    return is_binary_integer_number(literal)
1521 def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
1522 """Convert from bitstring back to bytes then decode into a str.
1524 >>> from_bitstring('011010000110010101101100011011000110111100111111')
1529 return n.to_bytes((n.bit_length() + 7) // 8, 'big').decode(encoding, errors) or '\0'
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple of ints usable as a sort key.
    Input that is not an IPv4 address is reported on stdout and gives None.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if is_ip_v4(txt):
        return tuple(map(int, txt.split('.')))
    print(f"not IP: {txt}")
    return None
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Break a file path into its non-empty components so that, when used
    as a sort key, parent/ancestor paths come before children/descendants.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    # Empty pieces (leading, trailing or doubled slashes) are dropped.
    components = filter(None, volume.split('/'))
    return tuple(components)
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Replace, one after the other, every character of replace_set found
    in in_str with replacement.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
1577 if __name__ == '__main__':