3 """The MIT License (MIT)
5 Copyright (c) 2016-2020 Davide Zanotti
6 Modifications Copyright (c) 2021-2022 Scott Gasch
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
15 The above copyright notice and this permission notice shall be included in all
16 copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 This class is based on: https://github.com/daveoncode/python-string-utils.
30 import contextlib # type: ignore
41 from itertools import zip_longest
42 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
43 from uuid import uuid4
47 logger = logging.getLogger(__name__)
# Decimal numbers: optional sign, then digits with an optional fraction and
# an optional (unsigned) exponent, or a bare fraction such as ".5".
# NOTE: inside a character class "|" is a literal pipe, not alternation, so
# the classes below list only the characters that are really allowed.
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Hexadecimal integers: optional sign, "0x"/"0X" prefix, hex digits.
HEX_NUMBER_RE = re.compile(r"^([+-]?)0[xX]([0-9A-Fa-f]+)$")

# Octal integers: optional sign, "0o"/"0O" prefix, octal digits.
OCT_NUMBER_RE = re.compile(r"^([+-]?)0[Oo]([0-7]+)$")

# Binary integers: optional sign, "0b"/"0B" prefix, binary digits.
BIN_NUMBER_RE = re.compile(r"^([+-]?)0[Bb]([01]+)$")
58 r"([a-z-]+://)" # scheme
59 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
61 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
62 r"(:\d{2,})?" # port number
63 r"(/[a-z\d_%+-]*)*" # folders
64 r"(\.[a-z\d_%+-]+)*" # file extension
65 r"(\?[a-z\d_+%-=]*)?" # query string
69 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
71 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
73 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
75 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
77 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
79 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
81 CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")
83 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
85 SNAKE_CASE_TEST_RE = re.compile(r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE)
87 SNAKE_CASE_TEST_DASH_RE = re.compile(r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE)
89 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
91 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
94 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
95 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
96 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
97 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
98 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
99 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
102 JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)
104 UUID_RE = re.compile(r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE)
106 UUID_HEX_OK_RE = re.compile(
107 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
111 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
113 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
115 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
117 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
119 MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
121 ANYWHERE_MAC_ADDRESS_RE = re.compile(r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE)
123 WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)
125 HTML_RE = re.compile(
126 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
127 re.IGNORECASE | re.MULTILINE | re.DOTALL,
130 HTML_TAG_ONLY_RE = re.compile(
131 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
132 re.IGNORECASE | re.MULTILINE | re.DOTALL,
135 SPACES_RE = re.compile(r"\s")
137 NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)
139 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
# Matches an ESC character followed by "[", any run of non-letters, and one
# terminating letter. The ESC byte is spelled as \x1b so that no raw control
# character has to live in the source file.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Tell whether the input is None or a string with no content.

    A string made only of whitespace counts as empty.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty("   \t   ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    return not in_str.strip()
def is_string(obj: Any) -> bool:
    """
    Report whether obj is an instance of str.

    >>> is_string('test')
    True
    >>> is_string(123)
    False
    >>> is_string(b'test')
    False
    >>> is_string([1, 2, 3])
    False
    """
    verdict = isinstance(obj, str)
    return verdict
189 def is_empty_string(in_str: Any) -> bool:
190 return is_empty(in_str)
def is_empty(in_str: Any) -> bool:
    """
    Report whether the input is a string that is empty or whitespace-only.

    Anything that is not a string yields False.

    >>> is_empty('')
    True
    >>> is_empty('    \t\t    ')
    True
    >>> is_empty('test')
    False
    >>> is_empty(100.88)
    False
    >>> is_empty([1, 2, 3])
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """
    Report whether the input is a string with at least one
    non-whitespace character.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string('      ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """
    Report whether a string is a valid number according to NUMBER_RE.

    Raises ValueError (carrying the offending value) when the input is
    not a string.

    >>> is_number(100.5)
    Traceback (most recent call last):
    ...
    ValueError: 100.5
    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number("99")
    True
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]
    """
    if is_string(in_str):
        matched = NUMBER_RE.match(in_str)
        return matched is not None
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """
    Report whether the given string represents an integer.

    A string qualifies when it is a number without a "." in it, or when
    it is a hexadecimal, octal or binary integer literal.

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    # Plain decimal form first, exactly as the prefixed forms are tried
    # afterwards in hex, octal, binary order.
    if is_number(in_str) and "." not in in_str:
        return True
    prefixed_checks = (
        is_hexidecimal_integer_number,
        is_octal_integer_number,
        is_binary_integer_number,
    )
    return any(check(in_str) for check in prefixed_checks)
272 def is_hexidecimal_integer_number(in_str: str) -> bool:
274 Checks whether a string is a hex integer number.
276 >>> is_hexidecimal_integer_number('0x12345')
278 >>> is_hexidecimal_integer_number('0x1A3E')
280 >>> is_hexidecimal_integer_number('1234') # Needs 0x
282 >>> is_hexidecimal_integer_number('-0xff')
284 >>> is_hexidecimal_integer_number('test')
286 >>> is_hexidecimal_integer_number(12345) # Not a string
287 Traceback (most recent call last):
290 >>> is_hexidecimal_integer_number(101.4)
291 Traceback (most recent call last):
294 >>> is_hexidecimal_integer_number(0x1A3E)
295 Traceback (most recent call last):
299 if not is_string(in_str):
300 raise ValueError(in_str)
301 return HEX_NUMBER_RE.match(in_str) is not None
304 def is_octal_integer_number(in_str: str) -> bool:
306 Checks whether a string is an octal number.
308 >>> is_octal_integer_number('0o777')
310 >>> is_octal_integer_number('-0O115')
312 >>> is_octal_integer_number('0xFF') # Not octal, needs 0o
314 >>> is_octal_integer_number('7777') # Needs 0o
316 >>> is_octal_integer_number('test')
319 if not is_string(in_str):
320 raise ValueError(in_str)
321 return OCT_NUMBER_RE.match(in_str) is not None
324 def is_binary_integer_number(in_str: str) -> bool:
326 Returns whether a string contains a binary number.
328 >>> is_binary_integer_number('0b10111')
330 >>> is_binary_integer_number('-0b111')
332 >>> is_binary_integer_number('0B10101')
334 >>> is_binary_integer_number('0b10102')
336 >>> is_binary_integer_number('0xFFF')
338 >>> is_binary_integer_number('test')
341 if not is_string(in_str):
342 raise ValueError(in_str)
343 return BIN_NUMBER_RE.match(in_str) is not None
346 def to_int(in_str: str) -> int:
347 """Returns the integral value of the string or raises on error.
352 Traceback (most recent call last):
354 ValueError: invalid literal for int() with base 10: 'test'
356 if not is_string(in_str):
357 raise ValueError(in_str)
358 if is_binary_integer_number(in_str):
359 return int(in_str, 2)
360 if is_octal_integer_number(in_str):
361 return int(in_str, 8)
362 if is_hexidecimal_integer_number(in_str):
363 return int(in_str, 16)
def is_decimal_number(in_str: str) -> bool:
    """
    Report whether the given string is a number that contains a ".".

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    if not is_number(in_str):
        return False
    return "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    r"""
    Remove every substring matched by ESCAPE_SEQUENCE_RE from the input.

    Args:
        in_str: the text to clean.

    Returns:
        A copy of in_str with the matched escape sequences deleted.

    >>> strip_escape_sequences('\x1b[12;11;22mthis is a test!')
    'this is a test!'
    """
    return ESCAPE_SEQUENCE_RE.sub("", in_str)
def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """
    Add thousands separator to a numeric string.  Also handles numbers.

    Args:
        in_str: a numeric string, or a number (which is stringified first).
        separator_char: the text inserted between digit groups.
        places: how many digits make up one group.

    Returns:
        The input with separator_char inserted into its integer part.

    Raises:
        ValueError: if the input is not a valid number.

    >>> add_thousands_separator('12345678')
    '12,345,678'
    >>> add_thousands_separator(12345678)
    '12,345,678'
    >>> add_thousands_separator(12345678.99)
    '12,345,678.99'
    >>> add_thousands_separator(-1234567)
    '-1,234,567'
    >>> add_thousands_separator('test')
    Traceback (most recent call last):
    ...
    ValueError: test
    """
    if isinstance(in_str, numbers.Number):
        in_str = f'{in_str}'
    if is_number(in_str):
        return _add_thousands_separator(in_str, separator_char=separator_char, places=places)
    raise ValueError(in_str)


def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """Group the integer digits of an already-validated numeric string.

    NOTE: an exponent suffix (e.g. "1e10") is not treated specially.
    """
    # Peel off a leading sign so it is never counted as a digit; otherwise
    # "-123456" would come out as "-,123,456".
    sign = ""
    if in_str[:1] in ("+", "-"):
        sign, in_str = in_str[0], in_str[1:]

    integer_part, _, decimal_part = in_str.partition('.')

    # Walk the digits right-to-left in groups of `places`, then flip back.
    tmp = [iter(integer_part[::-1])] * places
    ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
    if decimal_part:
        ret += '.' + decimal_part
    return sign + ret
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
429 def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
431 Check if a string is a valid url.
433 >>> is_url('http://www.mysite.com')
435 >>> is_url('https://mysite.com')
437 >>> is_url('.mysite.com')
440 if not is_full_string(in_str):
443 valid = URL_RE.match(in_str) is not None
446 return valid and any([in_str.startswith(s) for s in allowed_schemes])
450 def is_email(in_str: Any) -> bool:
452 Check if a string is a valid email.
454 Reference: https://tools.ietf.org/html/rfc3696#section-3
458 >>> is_email('@gmail.com')
461 if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
465 # we expect 2 tokens, one before "@" and one after, otherwise
466 # we have an exception and the email is not valid.
467 head, tail = in_str.split("@")
# head's size must be <= 64, tail <= 255, head must not end
# with a dot or contain multiple consecutive dots.
471 if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
474 # removes escaped spaces, so that later on the test regex will
476 head = head.replace("\\ ", "")
477 if head.startswith('"') and head.endswith('"'):
478 head = head.replace(" ", "")[1:-1]
479 return EMAIL_RE.match(head + "@" + tail) is not None
482 # borderline case in which we have multiple "@" signs but the
483 # head part is correctly escaped.
484 if ESCAPED_AT_SIGN.search(in_str) is not None:
485 # replace "@" with "a" in the head
486 return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
490 def suffix_string_to_number(in_str: str) -> Optional[int]:
491 """Take a string like "33Gb" and convert it into a number (of bytes)
492 like 34603008. Return None if the input string is not valid.
494 >>> suffix_string_to_number('1Mb')
496 >>> suffix_string_to_number('13.1Gb')
500 def suffix_capitalize(s: str) -> str:
504 return f"{s[0].upper()}{s[1].lower()}"
505 return suffix_capitalize(s[0:1])
507 if is_string(in_str):
508 if is_integer_number(in_str):
509 return to_int(in_str)
510 suffixes = [in_str[-2:], in_str[-1:]]
511 rest = [in_str[:-2], in_str[:-1]]
512 for x in range(len(suffixes)):
514 s = suffix_capitalize(s)
515 multiplier = NUM_SUFFIXES.get(s, None)
516 if multiplier is not None:
518 if is_integer_number(r):
519 return to_int(r) * multiplier
520 if is_decimal_number(r):
521 return int(float(r) * multiplier)
525 def number_to_suffix_string(num: int) -> Optional[str]:
526 """Take a number (of bytes) and returns a string like "43.8Gb".
527 Returns none if the input is invalid.
529 >>> number_to_suffix_string(14066017894)
531 >>> number_to_suffix_string(1024 * 1024)
537 for (sfx, size) in NUM_SUFFIXES.items():
542 if suffix is not None:
543 return f"{d:.1f}{suffix}"
548 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
550 Checks if a string is a valid credit card number.
551 If card type is provided then it checks against that specific type only,
552 otherwise any known credit card number will be accepted.
554 Supported card types are the following:
563 if not is_full_string(in_str):
566 if card_type is not None:
567 if card_type not in CREDIT_CARDS:
569 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
571 return CREDIT_CARDS[card_type].match(in_str) is not None
572 for c in CREDIT_CARDS:
573 if CREDIT_CARDS[c].match(in_str) is not None:
578 def is_camel_case(in_str: Any) -> bool:
580 Checks if a string is formatted as camel case.
582 A string is considered camel case when:
584 - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
585 - it contains both lowercase and uppercase letters
586 - it does not start with a number
588 return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
591 def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
593 Checks if a string is formatted as "snake case".
595 A string is considered snake case when:
597 - it's composed only by lowercase/uppercase letters and digits
598 - it contains at least one underscore (or provided separator)
599 - it does not start with a number
601 >>> is_snake_case('this_is_a_test')
603 >>> is_snake_case('___This_Is_A_Test_1_2_3___')
605 >>> is_snake_case('this-is-a-test')
607 >>> is_snake_case('this-is-a-test', separator='-')
611 if is_full_string(in_str):
612 re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
613 re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
616 re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
618 return r.match(in_str) is not None
622 def is_json(in_str: Any) -> bool:
624 Check if a string is a valid json.
626 >>> is_json('{"name": "Peter"}')
628 >>> is_json('[1, 2, 3]')
630 >>> is_json('{nope}')
633 if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
635 return isinstance(json.loads(in_str), (dict, list))
636 except (TypeError, ValueError, OverflowError):
641 def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
643 Check if a string is a valid UUID.
645 >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
647 >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
649 >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
652 # string casting is used to allow UUID itself as input data type
655 return UUID_HEX_OK_RE.match(s) is not None
656 return UUID_RE.match(s) is not None
659 def is_ip_v4(in_str: Any) -> bool:
661 Checks if a string is a valid ip v4.
663 >>> is_ip_v4('255.200.100.75')
667 >>> is_ip_v4('255.200.100.999') # 999 out of range
670 if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
673 # checks that each entry in the ip is in the valid range (0 to 255)
674 for token in in_str.split("."):
675 if not 0 <= int(token) <= 255:
680 def extract_ip_v4(in_str: Any) -> Optional[str]:
682 Extracts the IPv4 chunk of a string or None.
684 >>> extract_ip_v4(' The secret IP address: 127.0.0.1 (use it wisely) ')
686 >>> extract_ip_v4('Your mom dresses you funny.')
688 if not is_full_string(in_str):
690 m = ANYWHERE_IP_V4_RE.search(in_str)
696 def is_ip_v6(in_str: Any) -> bool:
698 Checks if a string is a valid ip v6.
700 >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
702 >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?') # invalid "?"
705 return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
708 def extract_ip_v6(in_str: Any) -> Optional[str]:
710 Extract IPv6 chunk or None.
712 >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
713 '2001:db8:85a3:0000:0000:8a2e:370:7334'
714 >>> extract_ip_v6("(and she's ugly too, btw)")
716 if not is_full_string(in_str):
718 m = ANYWHERE_IP_V6_RE.search(in_str)
def is_ip(in_str: Any) -> bool:
    """
    Report whether a string is a valid IP address, v6 or v4.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3')
    False
    >>> is_ip('1.2.3.999')
    False
    """
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
740 def extract_ip(in_str: Any) -> Optional[str]:
742 Extract the IP address or None.
744 >>> extract_ip('Attacker: 255.200.100.75')
746 >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
747 '2001:db8:85a3:0000:0000:8a2e:370:7334'
748 >>> extract_ip('1.2.3')
751 ip = extract_ip_v4(in_str)
753 ip = extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """Report whether in_str is, in its entirety, a MAC address.

    Both ":" and "-" are accepted between the six hex pairs, in either case.

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the first MAC address found in in_str.

    Args:
        in_str: the text to search.
        separator: the text placed between the hex pairs of the result;
            both ":" and "-" in the matched address are rewritten to it.

    Returns:
        The MAC address using the requested separator, or None when the
        input is not a non-blank string or holds no MAC address.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'

    >>> extract_mac_address('34-29-8F-12-0D-2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('34:29:8F:12:0D:2F', separator='-')
    '34-29-8F-12-0D-2F'
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None
    # str is immutable, so the rewritten value has to be returned; a single
    # translate pass also keeps a multi-character separator from being
    # rewritten a second time.
    return m.group(0).translate({ord(":"): separator, ord("-"): separator})
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Checks if a given string is a slug (as created by `slugify()`).

    A slug is made of runs of lowercase letters/digits joined by one or
    more occurrences of the separator; it neither starts nor ends with
    the separator.

    Args:
        in_str: the value to test.
        separator: the text expected between the words of the slug.

    Returns:
        True if in_str is a slug, False otherwise (including when it is
        not a non-blank string).

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False
    if separator:
        # Each repetition must consume a separator before more word chars,
        # which avoids nested ambiguous quantifiers (and the exponential
        # backtracking they cause on non-matching input).  Grouping the
        # escaped separator makes "+" apply to the whole separator rather
        # than to its last character only.
        rex = r"^[a-z\d]+(?:(?:" + re.escape(separator) + r")+[a-z\d]+)*$"
    else:
        rex = r"^[a-z\d]+$"
    return re.match(rex, in_str) is not None
811 def contains_html(in_str: str) -> bool:
813 Checks if the given string contains HTML/XML tags.
815 By design, this function matches ANY type of tag, so don't expect to use it
816 as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.
818 >>> contains_html('my string is <strong>bold</strong>')
820 >>> contains_html('my string is not bold')
824 if not is_string(in_str):
825 raise ValueError(in_str)
826 return HTML_RE.search(in_str) is not None
829 def words_count(in_str: str) -> int:
831 Returns the number of words contained into the given string.
833 This method is smart, it does consider only sequence of one or more letter and/or numbers
834 as "words", so a string like this: "! @ # % ... []" will return zero!
835 Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
836 will be 4 not 1 (even if there are no spaces in the string).
838 >>> words_count('hello world')
840 >>> words_count('one,two,three.stop')
844 if not is_string(in_str):
845 raise ValueError(in_str)
846 return len(WORDS_COUNT_RE.findall(in_str))
849 def generate_uuid(omit_dashes: bool = False) -> str:
851 Generated an UUID string (using `uuid.uuid4()`).
853 generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
854 generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Returns a string of the specified size containing random
    characters (uppercase/lowercase ascii letters and digits).

    Args:
        size: the number of characters to generate; must be >= 1.

    Returns:
        The random string.

    Raises:
        ValueError: if size is less than 1.

    generate_random_alphanumeric_string(9) # possible output: "cx3QQbzYg"

    NOTE: characters are drawn with the `random` module, so the result
    is not suitable for security-sensitive tokens.
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    chars = string.ascii_letters + string.digits
    return "".join(random.choice(chars) for _ in range(size))
878 def reverse(in_str: str) -> str:
880 Returns the string with its chars reversed.
886 if not is_string(in_str):
887 raise ValueError(in_str)
891 def camel_case_to_snake_case(in_str, *, separator="_"):
893 Convert a camel case string into a snake case one.
894 (The original string is returned if is not a valid camel case string)
896 >>> camel_case_to_snake_case('MacAddressExtractorFactory')
897 'mac_address_extractor_factory'
898 >>> camel_case_to_snake_case('Luke Skywalker')
901 if not is_string(in_str):
902 raise ValueError(in_str)
903 if not is_camel_case(in_str):
905 return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower()
908 def snake_case_to_camel_case(
909 in_str: str, *, upper_case_first: bool = True, separator: str = "_"
912 Convert a snake case string into a camel case one.
913 (The original string is returned if is not a valid snake case string)
915 >>> snake_case_to_camel_case('this_is_a_test')
917 >>> snake_case_to_camel_case('Han Solo')
920 if not is_string(in_str):
921 raise ValueError(in_str)
922 if not is_snake_case(in_str, separator=separator):
924 tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
925 if not upper_case_first:
926 tokens[0] = tokens[0].lower()
927 return from_char_list(tokens)
930 def to_char_list(in_str: str) -> List[str]:
931 """Convert a string into a list of chars.
933 >>> to_char_list('test')
936 if not is_string(in_str):
def from_char_list(in_list: List[str]) -> str:
    """Glue a list of characters (or strings) back into one string.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
950 def shuffle(in_str: str) -> str:
951 """Return a new string containing same chars of the given one but in
954 if not is_string(in_str):
955 raise ValueError(in_str)
957 # turn the string into a list of chars
958 chars = to_char_list(in_str)
959 random.shuffle(chars)
960 return from_char_list(chars)
963 def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
965 Remove html code contained into the given string.
967 >>> strip_html('test: <a href="foo/bar">click here</a>')
969 >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
972 if not is_string(in_str):
973 raise ValueError(in_str)
974 r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
975 return r.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Reduce a string to ascii-only content, mapping non-ascii characters
    to their closest ascii form where one exists (eg: ó -> o, Ë -> E).

    N.B. Characters with no ascii form are dropped.

    Raises ValueError when the input is not a string.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # NFKD decomposition splits accented characters into a base character
    # plus combining marks; encoding to ascii with "ignore" then discards
    # whatever cannot be represented, and the bytes are decoded back to str.
    return unicodedata.normalize("NFKD", in_str).encode("ascii", "ignore").decode("utf-8")
1004 def slugify(in_str: str, *, separator: str = "-") -> str:
1006 Converts a string into a "slug" using provided separator.
1007 The returned string has the following properties:
1010 - all letters are in lower case
1011 - all punctuation signs and non alphanumeric chars are removed
1012 - words are divided using provided separator
1013 - all chars are encoded as ascii (by using `asciify()`)
1016 >>> slugify('Top 10 Reasons To Love Dogs!!!')
1017 'top-10-reasons-to-love-dogs'
1018 >>> slugify('Mönstér Mägnët')
1021 if not is_string(in_str):
1022 raise ValueError(in_str)
1024 # replace any character that is NOT letter or number with spaces
1025 out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()
1027 # replace spaces with join sign
1028 out = SPACES_RE.sub(separator, out)
1030 # normalize joins (remove duplicates)
1031 out = re.sub(re.escape(separator) + r"+", separator, out)
def to_bool(in_str: str) -> bool:
    """
    Interpret a string as a boolean, ignoring case.

    True is returned when the lowercased string is one of "true", "1",
    "yes", "y", "t" or "on".  Otherwise False is returned.

    Raises ValueError when the input is not a string.

    >>> to_bool('True')
    True
    >>> to_bool('1')
    True
    >>> to_bool('yes')
    True
    >>> to_bool('no')
    False
    >>> to_bool('huh?')
    False
    >>> to_bool('on')
    True
    """
    if is_string(in_str):
        lowered = in_str.lower()
        return lowered in ("true", "1", "yes", "y", "t", "on")
    raise ValueError(in_str)
1073 def to_date(in_str: str) -> Optional[datetime.date]:
1075 Parses a date string. See DateParser docs for details.
1077 import dateparse.dateparse_utils as dp # type: ignore
1083 except dp.ParseException:
1084 msg = f'Unable to parse date {in_str}.'
1089 def valid_date(in_str: str) -> bool:
1091 True if the string represents a valid date.
1093 import dateparse.dateparse_utils as dp
1099 except dp.ParseException:
1100 msg = f'Unable to parse date {in_str}.'
1105 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1107 Parses a datetime string. See DateParser docs for more info.
1109 import dateparse.dateparse_utils as dp
1113 dt = d.parse(in_str)
1114 if type(dt) == datetime.datetime:
1117 msg = f'Unable to parse datetime {in_str}.'
1122 def valid_datetime(in_str: str) -> bool:
1124 True if the string represents a valid datetime.
1126 _ = to_datetime(in_str)
1129 msg = f'Unable to parse datetime {in_str}.'
1134 def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
1136 Squeeze runs of more than one character_to_squeeze into one.
1138 >>> squeeze(' this is a test ')
1141 >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
1146 r'(' + re.escape(character_to_squeeze) + r')+',
1147 character_to_squeeze,
def dedent(in_str: str) -> str:
    """
    Strip the leading margin from every line of a multi line string
    (inspired by the analogous Scala function).

    The margin is whatever MARGIN_RE matches at the start of each line.
    Lines are split and rejoined on "\\n".

    Raises ValueError when the input is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    return newline.join(MARGIN_RE.sub('', row) for row in in_str.split(newline))
def indent(in_str: str, amount: int) -> str:
    """
    Prefix every line of the string with `amount` spaces.

    Lines are split and rejoined on "\\n".
    Raises ValueError when in_str is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    padded_rows = []
    for row in in_str.split(newline):
        padded_rows.append(" " * amount + row)
    return newline.join(padded_rows)
1178 def sprintf(*args, **kwargs) -> str:
1179 """String printf, like in C"""
1182 sep = kwargs.pop("sep", None)
1184 if not isinstance(sep, str):
1185 raise TypeError("sep must be None or a string")
1187 end = kwargs.pop("end", None)
1189 if not isinstance(end, str):
1190 raise TypeError("end must be None or a string")
1193 raise TypeError("invalid keyword arguments to sprint()")
1199 for i, arg in enumerate(args):
1202 if isinstance(arg, str):
class SprintfStdout(object):
    """
    Context manager that records whatever is written to stdout while
    it is active.  Entering it yields a zero-argument callable that
    returns the text captured so far.

        with SprintfStdout() as buf:
            print("test")
        print(buf())

        'test\\n'
    """

    def __init__(self) -> None:
        # In-memory sink that stdout is redirected into.
        self.destination = io.StringIO()
        # The active redirection; created on __enter__.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        redirect = contextlib.redirect_stdout(self.destination)
        self.recorder = redirect
        redirect.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *args) -> None:
        # Undo the redirection, then rewind the buffer.
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Falling through returns None, so exceptions are not suppressed.
def capitalize_first_letter(txt: str) -> str:
    """Capitalize the first letter of a string.

    The rest of the string is left untouched.  An empty string is
    returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    # Slicing (rather than indexing) keeps the empty string from raising.
    return txt[:1].upper() + txt[1:]
1248 def it_they(n: int) -> str:
1262 def is_are(n: int) -> str:
1276 def pluralize(n: int) -> str:
1282 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1285 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1294 def make_contractions(txt: str) -> str:
1295 """Glue words together to form contractions.
1297 >>> make_contractions('It is nice today.')
1300 >>> make_contractions('I can not even...')
1303 >>> make_contractions('She could not see!')
1306 >>> make_contractions('But she will not go.')
1309 >>> make_contractions('Verily, I shall not.')
1312 >>> make_contractions('No you cannot.')
1315 >>> make_contractions('I said you can not go.')
1316 "I said you can't go."
1353 ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
1357 # Special cases: can't, shan't and won't.
1358 txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
1359 txt = re.sub(r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE)
1361 r'\b(w)ill\s*(n)(o)(t)\b',
1365 flags=re.IGNORECASE,
1368 for first_list, second_list in first_second:
1369 for first in first_list:
1370 for second in second_list:
1371 # Disallow there're/where're. They're valid English
1373 if (first == 'there' or first == 'where') and second == 'a(re)':
1376 pattern = fr'\b({first})\s+{second}\b'
1377 if second == '(n)o(t)':
1378 replacement = r"\1\2'\3"
1380 replacement = r"\1'\2"
1381 txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)
1386 def thify(n: int) -> str:
1387 """Return the proper cardinal suffix for a number.
1398 assert is_integer_number(digit)
1410 def ngrams(txt: str, n: int):
1411 """Return the ngrams from a string.
1413 >>> [x for x in ngrams('This is a test', 2)]
1414 ['This is', 'is a', 'a test']
1418 for ngram in ngrams_presplit(words, n):
1425 def ngrams_presplit(words: Sequence[str], n: int):
1426 return list_utils.ngrams(words, n)
1429 def bigrams(txt: str):
1430 return ngrams(txt, 2)
1433 def trigrams(txt: str):
1434 return ngrams(txt, 3)
1437 def shuffle_columns_into_list(
1438 input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
1440 """Helper to shuffle / parse columnar data and return the results as a
1441 list. The column_specs argument is an iterable collection of
1442 numeric sequences that indicate one or more column numbers to
1445 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1446 >>> shuffle_columns_into_list(
1448 ... [ [8], [2, 3], [5, 6, 7] ],
1451 ['acl_test.py', 'scott wheel', 'Jul 9 11:34']
1456 # Column specs map input lines' columns into outputs.
1458 for spec in column_specs:
1461 chunk = chunk + delim + input_lines[n]
1462 chunk = chunk.strip(delim)
1467 def shuffle_columns_into_dict(
1468 input_lines: Sequence[str],
1469 column_specs: Iterable[Tuple[str, Iterable[int]]],
1471 ) -> Dict[str, str]:
1472 """Helper to shuffle / parse columnar data and return the results
1475 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1476 >>> shuffle_columns_into_dict(
1478 ... [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
1481 {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}
1486 # Column specs map input lines' columns into outputs.
1487 # "key", [col1, col2...]
1488 for spec in column_specs:
1491 chunk = chunk + delim + input_lines[n]
1492 chunk = chunk.strip(delim)
1493 out[spec[0]] = chunk
1497 def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
1498 """Interpolate a string with data from a dict.
1500 >>> interpolate_using_dict('This is a {adjective} {noun}.',
1501 ... {'adjective': 'good', 'noun': 'example'})
1502 'This is a good example.'
1505 return sprintf(txt.format(**values), end='')
def to_ascii(x: str):
    """Encode as ascii bytes string.

    Args:
        x: a str to encode, or bytes which are passed through unchanged.

    Returns:
        The ascii-encoded bytes.

    Raises:
        TypeError: if x is neither str nor bytes.
        UnicodeEncodeError: if x is a str with non-ascii characters.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    # isinstance (rather than an exact type comparison) also admits
    # subclasses of str and bytes.
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    raise TypeError('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    r"""Turn txt into bytes using the given encoding and error policy,
    then base64-encode those bytes with `base64.encodebytes`.

    >>> to_base64('hello?')
    b'aGVsbG8/\n'
    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
1536 def is_base64(txt: str) -> bool:
1537 """Determine whether a string is base64 encoded (with Python's standard
1538 base64 alphabet which is the same as what uuencode uses).
1540 >>> is_base64('test') # all letters in the b64 alphabet
1543 >>> is_base64('another test, how do you like this one?')
1546 >>> is_base64(b'aGVsbG8/\\n') # Ending newline is ok.
1550 a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
1551 alphabet = set(a.encode('ascii'))
1552 for char in to_ascii(txt.strip()):
1553 if char not in alphabet:
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    r"""Base64-decode the given bytes with `base64.decodebytes`, then
    decode the result into a str using the given encoding and error policy.

    >>> from_base64(b'aGVsbG8/\n')
    'hello?'
    """
    payload = base64.decodebytes(b64)
    return payload.decode(encoding, errors)
def chunk(txt: str, chunk_size):
    """Chunk up a string.

    Yields consecutive slices of txt, each chunk_size characters long.
    When len(txt) is not a multiple of chunk_size a warning is logged
    and issued, and the final slice is shorter than the others.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    if len(txt) % chunk_size != 0:
        msg = f'String to chunk\'s length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for x in range(0, len(txt), chunk_size):
        yield txt[x : x + chunk_size]
1583 def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
1584 """Encode txt and then chop it into bytes. Note: only bitstrings
1585 with delimiter='' are interpretable by from_bitstring.
1587 >>> to_bitstring('hello?')
1588 '011010000110010101101100011011000110111100111111'
1590 >>> to_bitstring('test', delimiter=' ')
1591 '01110100 01100101 01110011 01110100'
1593 >>> to_bitstring(b'test')
1594 '01110100011001010111001101110100'
1597 etxt = to_ascii(txt)
1598 bits = bin(int.from_bytes(etxt, 'big'))
1600 return delimiter.join(chunk(bits.zfill(8 * ((len(bits) + 7) // 8)), 8))
def is_bitstring(txt: str) -> bool:
    """Report whether txt consists solely of binary digits.

    The check is made by prefixing "0b" and testing the result as a
    binary integer literal.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    candidate = '0b{}'.format(txt)
    return is_binary_integer_number(candidate)
1616 def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
1617 """Convert from bitstring back to bytes then decode into a str.
1619 >>> from_bitstring('011010000110010101101100011011000110111100111111')
1624 return n.to_bytes((n.bit_length() + 7) // 8, 'big').decode(encoding, errors) or '\0'
1627 def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
1628 """Turn an IPv4 address into a tuple for sorting purposes.
1630 >>> ip_v4_sort_key('10.0.0.18')
1633 >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
1634 >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
1635 ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
1638 if not is_ip_v4(txt):
1639 print(f"not IP: {txt}")
1641 return tuple([int(x) for x in txt.split('.')])
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Split a file path into its non-empty "/"-separated components so
    that parent/ancestor paths sort before children/descendant paths.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    return tuple(part for part in volume.split('/') if part)
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Apply one replace operation per character of replace_set, in
    order, swapping each occurrence of that character for replacement.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
1672 if __name__ == '__main__':