3 """The MIT License (MIT)
5 Copyright (c) 2016-2020 Davide Zanotti
6 Modifications Copyright (c) 2021-2022 Scott Gasch
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
15 The above copyright notice and this permission notice shall be included in all
16 copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 This class is based on: https://github.com/daveoncode/python-string-utils.
30 import contextlib # type: ignore
41 from itertools import zip_longest
42 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
43 from uuid import uuid4
logger = logging.getLogger(__name__)

# NOTE: inside a character class "|" is a literal, not alternation, so
# classes such as [e|E] or [+|-] would also accept a pipe character.  The
# numeric patterns below therefore list only the intended characters.

# Decimal number: optional sign, digits with optional fraction and exponent.
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Prefixed integer literals (0x.., 0o.., 0b..) with an optional sign.
HEX_NUMBER_RE = re.compile(r"^([+-]?)0[xX]([0-9A-Fa-f]+)$")

OCT_NUMBER_RE = re.compile(r"^([+-]?)0[Oo]([0-7]+)$")

BIN_NUMBER_RE = re.compile(r"^([+-]?)0[Bb]([01]+)$")

URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Whole-string URL match vs. URLs embedded anywhere in a text.
URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)

# "@" signs that are quoted or backslash-escaped inside an email local part.
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

EMAILS_RAW_STRING = (
    r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
)

EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))

EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))

CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")

# Card-number shape per issuer (prefix and length only; no checksum).
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}

JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)

UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# "Shallow": checks the dotted-quad shape only, not the 0-255 range.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)

MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)

ANYWHERE_MAC_ADDRESS_RE = re.compile(
    r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)

WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)

HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

SPACES_RE = re.compile(r"\s")

NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Leading horizontal whitespace of a line (never crosses a line break).
MARGIN_RE = re.compile(r"^[^\S\r\n]+")

# Terminal escape sequence: ESC, "[", parameter characters, one final letter.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Report whether the input is ``None``, empty, or only whitespace.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty("     ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    # A string that strips down to nothing counts as empty.
    return not in_str.strip()
def is_string(obj: Any) -> bool:
    """
    Tell whether ``obj`` is a ``str`` (or an instance of a ``str`` subclass).

    >>> is_string('test')
    True
    >>> is_string(b'test')
    False
    >>> is_string([1, 2, 3])
    False
    """
    return isinstance(obj, str)
def is_empty_string(in_str: Any) -> bool:
    """Alias of :func:`is_empty`: True for an empty or whitespace-only string."""
    return is_empty(in_str)
def is_empty(in_str: Any) -> bool:
    """
    Check that the input is a string holding nothing but (optional) whitespace.

    >>> is_empty('')
    True
    >>> is_empty('  ')
    True
    >>> is_empty('test')
    False
    >>> is_empty([1, 2, 3])
    False
    """
    if not isinstance(in_str, str):
        return False
    return in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """
    Check that the input is a string with at least one non-whitespace char.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string('      ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    if not isinstance(in_str, str):
        return False
    return in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """
    Check whether a string spells a number (as defined by ``NUMBER_RE``).

    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number("99")
    True
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]

    Raises:
        ValueError: if the argument is not a string.
    """
    if is_string(in_str):
        return NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """
    Check whether the string represents an integer.

    Plain decimal numbers without a "." qualify, and so do 0x/0o/0b
    prefixed literals.

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False

    Raises:
        ValueError: if the argument is not a string.
    """
    if is_number(in_str) and "." not in in_str:
        return True
    return (
        is_hexidecimal_integer_number(in_str)
        or is_octal_integer_number(in_str)
        or is_binary_integer_number(in_str)
    )
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Check whether a string is a hexadecimal integer literal ("0x" prefixed).

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('0x1A3E')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number('test')
    False
    >>> is_hexidecimal_integer_number(12345)  # Not a string
    Traceback (most recent call last):
    ...
    ValueError: 12345

    Raises:
        ValueError: if the argument is not a string.
    """
    if is_string(in_str):
        return HEX_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """
    Check whether a string is an octal integer literal ("0o" prefixed).

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    >>> is_octal_integer_number('test')
    False

    Raises:
        ValueError: if the argument is not a string.
    """
    if is_string(in_str):
        return OCT_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """
    Check whether a string is a binary integer literal ("0b" prefixed).

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0B10101')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    >>> is_binary_integer_number('test')
    False

    Raises:
        ValueError: if the argument is not a string.
    """
    if is_string(in_str):
        return BIN_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def to_int(in_str: str) -> int:
    """Convert a string to its integer value, honoring 0b/0o/0x prefixes.

    >>> to_int('1234')
    1234
    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'

    Raises:
        ValueError: if the argument is not a string or cannot be parsed.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # Probe the prefixed notations in a fixed order; fall back to base 10.
    notations = (
        (is_binary_integer_number, 2),
        (is_octal_integer_number, 8),
        (is_hexidecimal_integer_number, 16),
    )
    for matches, base in notations:
        if matches(in_str):
            return int(in_str, base)
    return int(in_str)
def is_decimal_number(in_str: str) -> bool:
    """
    Check whether the string is a number written with a decimal point.

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False

    Raises:
        ValueError: if the argument is not a string.
    """
    if not is_number(in_str):
        return False
    return "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    """
    Return the input with every match of ``ESCAPE_SEQUENCE_RE`` removed.

    >>> strip_escape_sequences('\\x1b[12;11;22mthis is a test!')
    'this is a test!'
    """
    return ESCAPE_SEQUENCE_RE.sub("", in_str)
def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """
    Insert a grouping separator into a numeric string (numbers accepted too).

    >>> add_thousands_separator('12345678')
    '12,345,678'
    >>> add_thousands_separator(12345678)
    '12,345,678'
    >>> add_thousands_separator(12345678.99)
    '12,345,678.99'
    >>> add_thousands_separator('test')
    Traceback (most recent call last):
    ...
    ValueError: test

    Args:
        in_str: the number, as a string or a numeric value.
        separator_char: text placed between digit groups.
        places: number of digits per group.

    Raises:
        ValueError: if the input is not numeric.
    """
    # Numeric inputs are rendered to text first so one code path handles both.
    text = f'{in_str}' if isinstance(in_str, numbers.Number) else in_str
    if not is_number(text):
        raise ValueError(text)
    return _add_thousands_separator(
        text, separator_char=separator_char, places=places
    )
def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """
    Group the integer digits of an already-validated numeric string.

    A leading sign is kept in front of the digits instead of being grouped
    with them, and the separator is inserted verbatim even when it is longer
    than one character.

    Args:
        in_str: numeric text such as ``'-1234567.89'``.
        separator_char: text placed between digit groups.
        places: number of digits per group; must be >= 1.

    Returns:
        The grouped string, e.g. ``'-1,234,567.89'``.

    Raises:
        ValueError: if ``places`` is smaller than 1.
    """
    if places < 1:
        raise ValueError("places must be >= 1")

    # Peel the sign off so it cannot end up in a digit group ("-,123,456").
    sign = ""
    if in_str[:1] in ("+", "-"):
        sign, in_str = in_str[0], in_str[1:]

    integer_part, _, decimal_part = in_str.partition('.')

    # Slice from the right so that a short group, if any, ends up leftmost.
    groups = []
    end = len(integer_part)
    while end > 0:
        groups.append(integer_part[max(0, end - places) : end])
        end -= places
    ret = sign + separator_char.join(reversed(groups))

    if decimal_part:
        ret += '.' + decimal_part
    return ret
# Full URL format: scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Check whether a string is a valid URL.

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('https://mysite.com')
    True
    >>> is_url('.mysite.com')
    False

    Args:
        in_str: candidate URL.
        allowed_schemes: optional list of prefixes; when given (and not
            empty) the URL must start with one of them.
    """
    if not is_full_string(in_str):
        return False
    if URL_RE.match(in_str) is None:
        return False
    if allowed_schemes:
        return any(in_str.startswith(scheme) for scheme in allowed_schemes)
    return True
def is_email(in_str: Any) -> bool:
    """
    Check whether a string is a valid email address.

    Reference: https://tools.ietf.org/html/rfc3696#section-3

    >>> is_email('my.email@the-provider.com')
    True
    >>> is_email('@gmail.com')
    False
    """
    if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
        return False

    parts = in_str.split("@")
    if len(parts) != 2:
        # Anything other than exactly one "@": only acceptable when the
        # extra signs are quoted/escaped in the head.  Those get replaced
        # with "a" and the result is validated again.
        if ESCAPED_AT_SIGN.search(in_str) is None:
            return False
        return is_email(ESCAPED_AT_SIGN.sub("a", in_str))

    head, tail = parts

    # Size limits: head <= 64 chars, tail <= 255 chars.  The head must not
    # end with a dot nor contain consecutive dots.
    if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
        return False

    # Drop escaped spaces so the regex below accepts the head.
    head = head.replace("\\ ", "")
    if head.startswith('"') and head.endswith('"'):
        # Quoted head: spaces are legal inside; remove them and the quotes.
        head = head.replace(" ", "")[1:-1]
    return EMAIL_RE.match(head + "@" + tail) is not None
def suffix_string_to_number(in_str: str) -> Optional[int]:
    """Take a string like "33Gb" and convert it into a number (of bytes)
    like 34603008.  Return None if the input string is not valid.

    >>> suffix_string_to_number('1Mb')
    1048576
    >>> suffix_string_to_number('13.1Gb')
    14066017894

    Plain integer strings are returned as their integer value.  Suffix
    matching is case-insensitive and looks the suffix up in NUM_SUFFIXES.
    An empty string yields None.
    """
    if not is_string(in_str):
        return None
    if is_integer_number(in_str):
        return to_int(in_str)

    # Try the two-character suffix ("Gb") before the one-character one ("G").
    for width in (2, 1):
        # There must be at least one character left to hold the number; this
        # also keeps empty input from ever reaching the suffix normalization.
        if len(in_str) <= width:
            continue
        suffix = in_str[-width:]
        normalized = suffix[0].upper() + suffix[1:].lower()
        multiplier = NUM_SUFFIXES.get(normalized)
        if multiplier is None:
            continue
        magnitude = in_str[:-width]
        if is_integer_number(magnitude):
            return to_int(magnitude) * multiplier
        if is_decimal_number(magnitude):
            return int(float(magnitude) * multiplier)
    return None
def number_to_suffix_string(num: int) -> Optional[str]:
    """Take a number (of bytes) and return a string like "43.8Gb".

    >>> number_to_suffix_string(14066017894)
    '13.1Gb'
    >>> number_to_suffix_string(1024 * 1024)
    '1.0Mb'

    The first entry of NUM_SUFFIXES (in its iteration order) whose size does
    not exceed ``num`` is used.  When no entry fits, the number is rendered
    as a plain integer string.
    """
    match = next(
        ((sfx, size) for (sfx, size) in NUM_SUFFIXES.items() if num >= size),
        None,
    )
    if match is None:
        return f'{num:d}'
    suffix, size = match
    scaled = num / size
    return f"{scaled:.1f}{suffix}"
def is_credit_card(in_str: Any, card_type: str = None) -> bool:
    """
    Check whether a string has the shape of a credit card number.

    With ``card_type`` given, only that issuer's pattern is tried; otherwise
    any pattern in ``CREDIT_CARDS`` may match.

    Supported card types are the keys of ``CREDIT_CARDS``: VISA, MASTERCARD,
    AMERICAN_EXPRESS, DINERS_CLUB, DISCOVER and JCB.

    Raises:
        KeyError: if ``card_type`` is not a known card type.
    """
    if not is_full_string(in_str):
        return False

    if card_type is None:
        return any(
            pattern.match(in_str) is not None for pattern in CREDIT_CARDS.values()
        )

    if card_type not in CREDIT_CARDS:
        raise KeyError(
            f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
        )
    return CREDIT_CARDS[card_type].match(in_str) is not None
def is_camel_case(in_str: Any) -> bool:
    """
    Check whether a string is formatted as camel case.

    A string is considered camel case when:

    - it's composed only of letters ([a-zA-Z]) and optionally numbers ([0-9])
    - it contains both lowercase and uppercase letters
    - it does not start with a number
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """
    Check whether a string is formatted as "snake case".

    A string is considered snake case when:

    - it's composed only of lowercase/uppercase letters and digits
    - it contains at least one underscore (or provided separator)
    - it does not start with a number

    >>> is_snake_case('this_is_a_test')
    True
    >>> is_snake_case('___This_Is_A_Test_1_2_3___')
    True
    >>> is_snake_case('this-is-a-test')
    False
    >>> is_snake_case('this-is-a-test', separator='-')
    True

    Args:
        in_str: candidate string; non-strings and blank strings yield False.
        separator: the word separator to look for.
    """
    if not isinstance(in_str, str) or not in_str.strip():
        return False

    # One template serves every separator.  It is anchored at both ends so a
    # custom separator cannot validate a string that merely starts with a
    # snake-case prefix.  The re module caches compiled patterns.
    sign = re.escape(separator)
    pattern = rf"^([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)$"
    return re.match(pattern, in_str, re.IGNORECASE) is not None
def is_json(in_str: Any) -> bool:
    """
    Check whether a string is valid JSON whose top level is an object or array.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    # Cheap pre-filter: the text must be wrapped in {...} or [...].
    if not is_full_string(in_str) or JSON_WRAPPER_RE.match(in_str) is None:
        return False
    try:
        parsed = json.loads(in_str)
    except (TypeError, ValueError, OverflowError):
        return False
    return isinstance(parsed, (dict, list))
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Check whether the input is a valid UUID.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True

    Args:
        in_str: value to test; it is converted with ``str()`` first so that
            UUID objects can be passed directly.
        allow_hex: also accept the form without dashes.
    """
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(str(in_str)) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Check whether a string is a valid IPv4 address.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('nope')
    False
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str):
        return False
    if SHALLOW_IP_V4_RE.match(in_str) is None:
        return False
    # The regex only checks the shape; each octet must also be within 0..255.
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Return the first IPv4-looking chunk found in the string, or None.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if is_full_string(in_str):
        found = ANYWHERE_IP_V4_RE.search(in_str)
        if found is not None:
            return found.group(0)
    return None
def is_ip_v6(in_str: Any) -> bool:
    """
    Check whether a string is a valid IPv6 address (per ``IP_V6_RE``).

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')  # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Return the first IPv6-looking chunk found in the string, or None.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if is_full_string(in_str):
        found = ANYWHERE_IP_V6_RE.search(in_str)
        if found is not None:
            return found.group(0)
    return None
def is_ip(in_str: Any) -> bool:
    """
    Check whether a string is a valid IP address (either v4 or v6).

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3')
    False
    >>> is_ip('1.2.3.999')
    False
    """
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Extract an IP address from the string, preferring IPv4 over IPv6.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4 = extract_ip_v4(in_str)
    if v4 is not None:
        return v4
    return extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """Return True if in_str is a valid MAC address, False otherwise.

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the first MAC address found in in_str.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'

    Args:
        in_str: text to search.
        separator: text placed between the octets of the returned address,
            whichever of ":" or "-" the input used.

    Returns:
        The address with its octets joined by ``separator``, or None when no
        address is found.
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if found is None:
        return None
    # str.replace returns a new string, so the result has to be kept.  A
    # single pass with a callable keeps the separator literal and avoids
    # re-replacing characters that the separator itself contains.
    return re.sub(r"[:-]", lambda _m: separator, found.group(0))
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Check whether a given string is a slug (as created by `slugify()`).

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False

    A slug is made of runs of lowercase letters/digits joined by one or more
    separators; it starts and ends with a letter or digit.

    Args:
        in_str: candidate string; non-strings and blank strings yield False.
        separator: the join text, matched as a whole (may be several chars).
    """
    if not isinstance(in_str, str) or not in_str.strip():
        return False
    if not separator:
        # No separator at all: the slug is a single alphanumeric run.
        return re.match(r"^[a-z\d]+$", in_str) is not None
    # Written without nested quantifiers over the same characters, so a
    # near-miss input cannot trigger exponential backtracking.
    rex = r"^[a-z\d]+(?:(?:" + re.escape(separator) + r")+[a-z\d]+)*$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Check whether the given string contains HTML/XML tags.

    By design this matches ANY kind of tag; it is not an HTML validator.  Its
    goal is to detect "malicious" or undesired tags in a text.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False

    Raises:
        ValueError: if the argument is not a string.
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
def words_count(in_str: str) -> int:
    """
    Return the number of words contained in the given string.

    Only runs of letters and/or digits count as words, so a string such as
    "! @ # % ... []" yields zero.  Punctuation separates words as well:
    "one,two,three.stop" counts as 4, not 1.

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4

    Raises:
        ValueError: if the argument is not a string.
    """
    if is_string(in_str):
        matches = WORDS_COUNT_RE.findall(in_str)
        return len(matches)
    raise ValueError(in_str)
def generate_uuid(omit_dashes: bool = False) -> str:
    """
    Generate a random UUID string (using `uuid.uuid4()`).

    generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
    generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'

    Args:
        omit_dashes: return the 32-char hex form instead of the dashed
            36-char form.
    """
    fresh = uuid4()
    return fresh.hex if omit_dashes else str(fresh)
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Return a string of the requested size made of random characters
    (uppercase/lowercase ascii letters and digits).

    generate_random_alphanumeric_string(9) # possible output: "cx3QQbzYg"

    Raises:
        ValueError: if ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    pool = string.ascii_letters + string.digits
    return "".join([random.choice(pool) for _ in range(size)])
def reverse(in_str: str) -> str:
    """
    Return the string with its characters in reverse order.

    >>> reverse('test')
    'tset'

    Raises:
        ValueError: if the argument is not a string.
    """
    if isinstance(in_str, str):
        return "".join(reversed(in_str))
    raise ValueError(in_str)
def camel_case_to_snake_case(in_str, *, separator="_"):
    """
    Convert a camel case string into a snake case one.
    (The original string is returned if it is not valid camel case.)

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'

    Raises:
        ValueError: if the argument is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str

    def append_separator(match):
        # Each matched chunk sits right before an uppercase letter.
        return match.group(1) + separator

    return CAMEL_CASE_REPLACE_RE.sub(append_separator, in_str).lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Convert a snake case string into a camel case one.
    (The original string is returned if it is not valid snake case.)

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'

    Args:
        in_str: the snake case string.
        upper_case_first: when False, the first token is lowercased.
        separator: the word separator used in ``in_str``.

    Raises:
        ValueError: if ``in_str`` is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str
    # Blank pieces (from leading/trailing/doubled separators) are skipped.
    words = [piece.title() for piece in in_str.split(separator) if is_full_string(piece)]
    if not upper_case_first:
        words[0] = words[0].lower()
    return from_char_list(words)
def to_char_list(in_str: str) -> List[str]:
    """Split a string into a list of its characters.

    A non-string input yields an empty list.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    return list(in_str) if isinstance(in_str, str) else []
def from_char_list(in_list: List[str]) -> str:
    """Concatenate a list of characters (or strings) into one string.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    joined = "".join(in_list)
    return joined
def shuffle(in_str: str) -> str:
    """Return a new string containing the same chars as the given one but in
    a randomized order.

    Raises:
        ValueError: if the argument is not a string.
    """
    if not isinstance(in_str, str):
        raise ValueError(in_str)

    # random.shuffle works in place, so it needs a mutable list of chars.
    letters = list(in_str)
    random.shuffle(letters)
    return "".join(letters)
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Remove HTML code contained in the given string.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'

    Args:
        in_str: text to clean.
        keep_tag_content: when True only the tags are removed and the text
            between them is kept.

    Raises:
        ValueError: if ``in_str`` is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if keep_tag_content:
        return HTML_TAG_ONLY_RE.sub("", in_str)
    return HTML_RE.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Force string content to be ascii-only by translating all non-ascii
    chars into their closest possible representation (eg: ó -> o, Ë -> E).

    N.B. Some chars may be lost if impossible to translate.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'

    Raises:
        ValueError: if the argument is not a string.
    """
    if not isinstance(in_str, str):
        raise ValueError(in_str)

    # NFKD decomposes accented chars into base char + combining marks; the
    # marks (and anything else non-ascii) are then dropped by the "ignore"
    # error handler, and the surviving ascii bytes are turned back into text.
    decomposed = unicodedata.normalize("NFKD", in_str)
    return decomposed.encode("ascii", "ignore").decode("utf-8")
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Converts a string into a "slug" using provided separator.
    The returned string has the following properties:

    - it has no leading or trailing separator
    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using provided separator
    - all chars are encoded as ascii (by using `asciify()`)

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'

    Args:
        in_str: text to convert.
        separator: text placed between words; used literally, so it may be
            longer than one character or contain backslashes.

    Raises:
        ValueError: if ``in_str`` is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Replace any character that is NOT a letter or number with a space.
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # Splitting on whitespace runs and joining puts exactly one separator
    # between words.  No regex replacement is involved, so the separator is
    # never interpreted as a pattern or a replacement template.
    out = separator.join(out.split())

    return asciify(out)
def to_bool(in_str: str) -> bool:
    """
    Turn a string into a boolean based on its content (CASE INSENSITIVE).

    True is returned when the lowercased value is one of "true", "1", "yes",
    "y", "t" or "on".  Otherwise False is returned.

    >>> to_bool('True')
    True
    >>> to_bool('no')
    False

    Raises:
        ValueError: if the argument is not a string.
    """
    if not isinstance(in_str, str):
        raise ValueError(in_str)
    truthy = ("true", "1", "yes", "y", "t", "on")
    return in_str.lower() in truthy
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Parse a date string.  See DateParser docs for details.

    Returns:
        The parsed date, or None (after logging a warning) when the parser
        raises ParseException.
    """
    import dateparse.dateparse_utils as dp  # type: ignore

    try:
        parser = dp.DateParser()  # type: ignore
        parser.parse(in_str)
        return parser.get_date()
    except dp.ParseException:  # type: ignore
        msg = f'Unable to parse date {in_str}.'
        logger.warning(msg)
        return None
def valid_date(in_str: str) -> bool:
    """
    True if the string represents a valid date.

    A failed parse (ParseException) is logged as a warning and reported as
    False.
    """
    import dateparse.dateparse_utils as dp

    try:
        parser = dp.DateParser()  # type: ignore
        parser.parse(in_str)
    except dp.ParseException:  # type: ignore
        msg = f'Unable to parse date {in_str}.'
        logger.warning(msg)
        return False
    return True
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Parse a datetime string.  See DateParser docs for more info.

    Returns:
        The parsed datetime, or None when the parser's result is not a
        datetime or when parsing fails (a warning is logged on failure).
    """
    import dateparse.dateparse_utils as dp

    try:
        d = dp.DateParser()  # type: ignore
        dt = d.parse(in_str)
        # isinstance also accepts datetime subclasses.
        if isinstance(dt, datetime.datetime):
            return dt
    except (dp.ParseException, ValueError):  # type: ignore
        # Same failure handling as to_date()/valid_date(): log and return None.
        msg = f'Unable to parse datetime {in_str}.'
        logger.warning(msg)
    return None
def valid_datetime(in_str: str) -> bool:
    """
    True if the string represents a valid datetime.

    Delegates to to_datetime(); a None result is logged as a warning and
    reported as False.
    """
    if to_datetime(in_str) is not None:
        return True
    msg = f'Unable to parse datetime {in_str}.'
    logger.warning(msg)
    return False
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Squeeze runs of more than one character_to_squeeze into one.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    Args:
        in_str: text to process.
        character_to_squeeze: the text whose consecutive repetitions are
            collapsed; may be longer than one character.  An empty value
            leaves the input unchanged.
    """
    if not character_to_squeeze:
        return in_str
    # A callable replacement inserts the text literally; a plain replacement
    # string would have its backslashes parsed as template escapes.
    return re.sub(
        r'(?:' + re.escape(character_to_squeeze) + r')+',
        lambda _match: character_to_squeeze,
        in_str,
    )
def dedent(in_str: str) -> str:
    """
    Remove leading whitespace (the margin matched by ``MARGIN_RE``) from
    every line of a multi line string (inspired by the analogous Scala
    function).

    Raises:
        ValueError: if the argument is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    return newline.join(MARGIN_RE.sub('', row) for row in in_str.split(newline))
def indent(in_str: str, amount: int) -> str:
    """
    Indent a string by prepending ``amount`` spaces to each of its lines.

    >>> indent('This is a test', 4)
    '    This is a test'

    Raises:
        ValueError: if ``in_str`` is not a string.
    """
    if not isinstance(in_str, str):
        raise ValueError(in_str)
    newline = '\n'
    padding = " " * amount
    return newline.join(padding + row for row in in_str.split(newline))
def sprintf(*args, **kwargs) -> str:
    """String printf, like in C.

    Builds the text that ``print(*args, sep=..., end=...)`` would emit and
    returns it.

    Keyword Args:
        sep: text between arguments (default: one space).
        end: text appended at the end (default: a newline).

    Raises:
        TypeError: if ``sep``/``end`` is neither None nor a string, or if any
            other keyword argument is passed.
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    if kwargs:
        raise TypeError("invalid keyword arguments to sprint()")

    if sep is None:
        sep = " "
    if end is None:
        end = "\n"

    rendered = [arg if isinstance(arg, str) else str(arg) for arg in args]
    return sep.join(rendered) + end
class SprintfStdout(object):
    """
    A context manager that captures outputs to stdout.

    with SprintfStdout() as buf:
        print("test")
    print(buf())

    'test\\n'
    """

    def __init__(self) -> None:
        # In-memory sink that receives everything written to stdout.
        self.destination = io.StringIO()
        # The active redirect; created when the context is entered.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        """Start redirecting stdout; return a getter for the captured text."""
        recorder = contextlib.redirect_stdout(self.destination)
        self.recorder = recorder
        recorder.__enter__()
        return lambda: self.destination.getvalue()

    def __exit__(self, *args) -> None:
        """Stop redirecting stdout and rewind the capture buffer."""
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Returning None lets any in-flight exception propagate.
        return None
def is_are(n: int) -> str:
    """Pick the verb form matching a count: "is" for one, "are" otherwise.

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'
    """
    return "is" if n == 1 else "are"
def pluralize(n: int) -> str:
    """Return the plural suffix for a count: "" for one, "s" otherwise.

    >>> pluralize(15)
    's'
    >>> pluralize(1)
    ''
    """
    return "" if n == 1 else "s"
def thify(n: int) -> str:
    """Return the proper ordinal suffix for a number.

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    >>> thify(112)
    'th'
    """
    digits = str(n)
    assert is_integer_number(digits)
    # The "teens" are irregular: 11th, 12th, 13th (also 111th, 212th, ...),
    # so the last two digits are checked before the last one.
    if digits[-2:] in ("11", "12", "13"):
        return "th"
    by_last_digit = {"1": "st", "2": "nd", "3": "rd"}
    return by_last_digit.get(digits[-1:], "th")
def ngrams(txt: str, n: int):
    """Yield the n-grams of a string, each as space-joined words.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']

    Args:
        txt: text to split on whitespace.
        n: number of words per n-gram.
    """
    tokens = txt.split()
    for gram in ngrams_presplit(tokens, n):
        pieces = [f'{token} ' for token in gram]
        yield ''.join(pieces).strip()
def ngrams_presplit(words: Sequence[str], n: int):
    """Return the n-grams of an already tokenized sequence.

    Thin wrapper that forwards both arguments to ``list_utils.ngrams``.
    """
    grams = list_utils.ngrams(words, n)
    return grams
def bigrams(txt: str):
    """Return the 2-grams (word pairs) of a string; see ngrams()."""
    pair_size = 2
    return ngrams(txt, pair_size)
def trigrams(txt: str):
    """Return the 3-grams (word triples) of a string; see ngrams()."""
    triple_size = 3
    return ngrams(txt, triple_size)
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.  The column_specs argument is an iterable collection of
    numeric sequences that indicate one or more column numbers to
    copy into each output entry.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']

    Args:
        input_lines: the input columns.
        column_specs: for each output entry, the indexes of the columns to
            combine.
        delim: text placed between combined columns; its characters are
            also stripped from both ends of each entry.
    """
    # Each spec lists the input columns that are glued into one output entry.
    return [
        delim.join(input_lines[index] for index in spec).strip(delim)
        for spec in column_specs
    ]
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim='',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}

    Args:
        input_lines: the input columns.
        column_specs: ``(key, [col1, col2...])`` pairs naming each output
            entry and the indexes of the columns to combine into it.
        delim: text placed between combined columns; its characters are
            also stripped from both ends of each value.
    """
    result: Dict[str, str] = {}
    for spec in column_specs:
        # spec is laid out as: "key", [col1, col2...]
        key, indexes = spec[0], spec[1]
        result[key] = delim.join(input_lines[index] for index in indexes).strip(delim)
    return result
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the ``{name}`` placeholders of a string with values from a dict.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'

    Raises:
        KeyError: if a placeholder has no entry in ``values``.
    """
    filled = txt.format(**values)
    return filled
def to_ascii(x: str) -> bytes:
    """Encode as ascii bytes string.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'

    Args:
        x: a str (encoded as ascii) or bytes (returned unchanged).
            Subclasses of either are accepted.

    Raises:
        TypeError: if ``x`` is neither str nor bytes.
        UnicodeEncodeError: if a str contains non-ascii characters.
    """
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    # TypeError is still an Exception, so callers catching Exception keep
    # working.
    raise TypeError('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode txt and then encode the bytes with a 64-character
    alphabet.  This is compatible with uudecode.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    Args:
        txt: text to encode.
        encoding: codec used to turn ``txt`` into bytes.
        errors: error handler passed to ``str.encode``.
    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
def is_base64(txt: str) -> bool:
    """Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet which is the same as what uuencode uses).

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64(b'aGVsbG8=\\n')    # So is trailing "=" padding.
    True

    Args:
        txt: str or bytes to check.

    Returns:
        True when, ignoring surrounding whitespace, line breaks and up to
        two trailing "=" padding characters, every character belongs to the
        base64 alphabet.  Text with non-ascii characters is never base64.

    Raises:
        TypeError: if ``txt`` is neither str nor bytes.
    """
    if isinstance(txt, str):
        try:
            raw = txt.encode('ascii')
        except UnicodeEncodeError:
            return False
    elif isinstance(txt, (bytes, bytearray)):
        raw = bytes(txt)
    else:
        raise TypeError('is_base64 works with strings and bytes')

    # base64.encodebytes() breaks long output into lines; drop the breaks.
    payload = raw.strip().translate(None, b"\r\n")

    # Padding may only appear at the very end, and at most twice.
    unpadded = payload.rstrip(b"=")
    if len(payload) - len(unpadded) > 2:
        return False

    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = frozenset(a.encode('ascii'))
    return all(char in alphabet for char in unpadded)
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert base64 encoded bytes back into a normal string.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    Args:
        b64: base64 encoded bytes.
        encoding: codec used to decode the resulting bytes.
        errors: error handler passed to ``bytes.decode``.
    """
    raw = base64.decodebytes(b64)
    return raw.decode(encoding, errors)
def chunk(txt: str, chunk_size):
    """Yield consecutive slices of ``txt`` that are ``chunk_size`` long.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    When the length of ``txt`` is not a multiple of ``chunk_size`` a warning
    is logged and issued, and the final slice is shorter.
    """
    total = len(txt)
    if total % chunk_size != 0:
        msg = f'String to chunk\'s length ({len(txt)} is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    yield from (txt[start : start + chunk_size] for start in range(0, total, chunk_size))
def to_bitstring(
    txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass'
) -> str:
    """Encode txt and then chop it into bytes.  Note: only bitstrings
    with delimiter='' are interpretable by from_bitstring.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    Args:
        txt: str (encoded with ``encoding``/``errors``) or bytes (used as is).
        delimiter: text placed between the 8-bit groups.
        encoding: codec used to turn a str into bytes.
        errors: error handler passed to ``str.encode``.

    Raises:
        TypeError: if ``txt`` is neither str nor bytes.
    """
    if isinstance(txt, (bytes, bytearray)):
        raw = bytes(txt)
    elif isinstance(txt, str):
        raw = txt.encode(encoding, errors)
    else:
        raise TypeError('to_bitstring works with strings and bytes')

    if not raw:
        # Empty input has always been rendered as a single zero byte.
        raw = b'\0'

    # Format byte by byte: going through one big integer would silently
    # drop leading zero bytes.
    return delimiter.join(f'{byte:08b}' for byte in raw)
def is_bitstring(txt: str) -> bool:
    """Is this a bitstring?

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    The check prefixes the text with "0b" and asks
    is_binary_integer_number() about the result.
    """
    candidate = f'0b{txt}'
    return is_binary_integer_number(candidate)
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    Args:
        bits: string of 0/1 digits (anything ``int(bits, 2)`` accepts).
        encoding: codec used to decode the bytes.
        errors: error handler passed to ``bytes.decode``.

    Raises:
        ValueError: if ``bits`` is not a valid base-2 number.
    """
    n = int(bits, 2)

    # Size the result from the number of digits supplied, not from
    # n.bit_length(): the latter ignores leading zeros and would drop
    # leading zero bytes.
    digits = bits.strip().replace('_', '')
    if digits[:1] in ('+', '-'):
        digits = digits[1:]
    if digits[:2].lower() == '0b':
        digits = digits[2:]
    num_bytes = (len(digits) + 7) // 8

    return n.to_bytes(num_bytes, 'big').decode(encoding, errors) or '\0'
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple for sorting purposes.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']

    Returns:
        The four octets as integers, or None (after printing a notice to
        stdout) when the input is not a valid IPv4 address.
    """
    if is_ip_v4(txt):
        return tuple(int(octet) for octet in txt.split('.'))
    print(f"not IP: {txt}")
    return None
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Chunk up a file path so that parent/ancestor paths sort before
    children/descendant paths.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    # Empty pieces (leading, trailing or doubled slashes) are dropped.
    return tuple(part for part in volume.split('/') if part)
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Execute several replace operations in a row.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'

    Args:
        in_str: text to process.
        replace_set: the characters to replace, handled one after another.
        replacement: text substituted for each of them.
    """
    result = in_str
    for target in replace_set:
        # Skipping absent characters changes nothing in the outcome.
        if target in result:
            result = result.replace(target, replacement)
    return result
1568 if __name__ == '__main__':