2 # -*- coding: utf-8 -*-
4 """The MIT License (MIT)
6 Copyright (c) 2016-2020 Davide Zanotti
7 Modifications Copyright (c) 2021-2022 Scott Gasch
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
16 The above copyright notice and this permission notice shall be included in all
17 copies or substantial portions of the Software.
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 This class is based on: https://github.com/daveoncode/python-string-utils.
31 import contextlib # type: ignore
42 from itertools import zip_longest
43 from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple
44 from uuid import uuid4
logger = logging.getLogger(__name__)

# Signed decimal number with optional fraction and (unsigned) exponent, or a
# bare fraction such as ".5".  NOTE: inside a character class "|" is a literal
# character, not alternation, so classes are written as [eE], [xX], etc.
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Optionally signed hexadecimal literal with a mandatory 0x / 0X prefix.
HEX_NUMBER_RE = re.compile(r"^([+\-]?)0[xX]([0-9A-Fa-f]+)$")

# Optionally signed octal literal with a mandatory 0o / 0O prefix.
OCT_NUMBER_RE = re.compile(r"^([+\-]?)0[Oo]([0-7]+)$")

# Optionally signed binary literal with a mandatory 0b / 0B prefix.
BIN_NUMBER_RE = re.compile(r"^([+\-]?)0[Bb]([01]+)$")
# Building blocks of a URL; concatenated into a single pattern string.
# NOTE(review): the "www." and "#hash" groups were not visible in the damaged
# source and are restored from the URL shape documented above is_url().
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Whole-string URL match / URL anywhere in a string.
URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)

# "@" signs that are quoted or backslash-escaped inside an email local part.
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"

EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))

EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))

CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

SNAKE_CASE_TEST_RE = re.compile(r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE)

SNAKE_CASE_TEST_DASH_RE = re.compile(r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE)

SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")

# Per-issuer card number shapes (prefix and length only; no checksum).
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}

# Cheap pre-filter: text that starts with [ or { and ends with ] or }.
JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)

UUID_RE = re.compile(r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE)

# Same as UUID_RE but every dash is optional (accepts the plain hex form).
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# "Shallow": checks the dotted shape only, not the 0-255 range of each part.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)

MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)

ANYWHERE_MAC_ADDRESS_RE = re.compile(r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE)

WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)

# An opening tag plus (optionally) everything up to a closing tag, a comment,
# or a doctype declaration.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Tags only (opening, closing, comment, doctype); text between tags is kept.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

SPACES_RE = re.compile(r"\s")

NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Leading horizontal whitespace (anything blank except CR / LF).
MARGIN_RE = re.compile(r"^[^\S\r\n]+")
# An ANSI escape sequence: ESC, "[", any run of non-letters (the parameters),
# then one terminating letter.  ESC is spelled \x1b because the re module has
# no "\e" escape and a raw control byte does not survive copy/paste.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Returns true if the input string is either None or an empty string.
    A string made only of whitespace counts as empty.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty("   \t   ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    return in_str is None or len(in_str.strip()) == 0
def is_string(obj: Any) -> bool:
    """
    Checks if an object is a string.

    >>> is_string('test')
    True
    >>> is_string(123)
    False
    >>> is_string([1, 2, 3])
    False
    """
    return isinstance(obj, str)
def is_empty_string(in_str: Any) -> bool:
    """Alias of :func:`is_empty`; see that function for the exact rules."""
    return is_empty(in_str)
def is_empty(in_str: Any) -> bool:
    """
    Checks if input is a string and empty or only whitespace.
    Non-string input is never "empty" (returns False).

    >>> is_empty('')
    True
    >>> is_empty('    \t\t    ')
    True
    >>> is_empty('test')
    False
    >>> is_empty([1, 2, 3])
    False
    """
    return is_string(in_str) and in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """
    Checks that input is a string and is not empty ('') or only whitespace.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string('      ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    return is_string(in_str) and in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """
    Checks if a string is a valid number.

    Raises ValueError when the argument is not a string at all.

    >>> is_number(100.5)
    Traceback (most recent call last):
    ...
    ValueError: 100.5
    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return NUMBER_RE.match(in_str) is not None
def is_integer_number(in_str: str) -> bool:
    """
    Checks whether the given string represents an integer or not.

    An integer may be signed or unsigned or use a "scientific notation".
    Prefixed hexadecimal (0x), octal (0o) and binary (0b) literals also
    count as integers.

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    return (
        (is_number(in_str) and "." not in in_str)
        or is_hexidecimal_integer_number(in_str)
        or is_octal_integer_number(in_str)
        or is_binary_integer_number(in_str)
    )
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Checks whether a string is a hex integer number.

    Raises ValueError when the argument is not a string.

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('0x1A3E')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number('test')
    False
    >>> is_hexidecimal_integer_number(12345)  # Not a string
    Traceback (most recent call last):
    ...
    ValueError: 12345
    >>> is_hexidecimal_integer_number(101.4)
    Traceback (most recent call last):
    ...
    ValueError: 101.4
    >>> is_hexidecimal_integer_number(0x1A3E)
    Traceback (most recent call last):
    ...
    ValueError: 6718
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return HEX_NUMBER_RE.match(in_str) is not None
def is_octal_integer_number(in_str: str) -> bool:
    """
    Checks whether a string is an octal number.

    Raises ValueError when the argument is not a string.

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    >>> is_octal_integer_number('test')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return OCT_NUMBER_RE.match(in_str) is not None
def is_binary_integer_number(in_str: str) -> bool:
    """
    Returns whether a string contains a binary number.

    Raises ValueError when the argument is not a string.

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0B10101')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    >>> is_binary_integer_number('test')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return BIN_NUMBER_RE.match(in_str) is not None
def to_int(in_str: str) -> int:
    """Returns the integral value of the string or raises on error.

    Prefixed binary (0b), octal (0o) and hexadecimal (0x) literals are
    parsed in their own base; anything else is handed to ``int()``.
    Raises ValueError for non-string input or text ``int()`` rejects.

    >>> to_int('1234')
    1234
    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # int() accepts the 0b / 0o / 0x prefix when the matching base is given.
    if is_binary_integer_number(in_str):
        return int(in_str, 2)
    if is_octal_integer_number(in_str):
        return int(in_str, 8)
    if is_hexidecimal_integer_number(in_str):
        return int(in_str, 16)
    return int(in_str)
def is_decimal_number(in_str: str) -> bool:
    """
    Checks whether the given string represents a decimal or not.

    A decimal may be signed or unsigned or use a "scientific notation".
    It must contain a decimal point.

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    return is_number(in_str) and "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    """
    Remove escape sequences in the input string.

    Every match of ESCAPE_SEQUENCE_RE is deleted; other text is untouched.

    >>> strip_escape_sequences('\x1b[12;11;22mthis is a test!')
    'this is a test!'
    """
    return ESCAPE_SEQUENCE_RE.sub("", in_str)
def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """
    Add thousands separator to a numeric string.  Also handles numbers.

    Raises ValueError when the (stringified) input is not a number.

    >>> add_thousands_separator('12345678')
    '12,345,678'
    >>> add_thousands_separator(12345678)
    '12,345,678'
    >>> add_thousands_separator(12345678.99)
    '12,345,678.99'
    >>> add_thousands_separator('test')
    Traceback (most recent call last):
    ...
    ValueError: test
    """
    # Real numbers are formatted to text first so one code path handles both.
    if isinstance(in_str, numbers.Number):
        in_str = f'{in_str}'
    if is_number(in_str):
        return _add_thousands_separator(in_str, separator_char=separator_char, places=places)
    raise ValueError(in_str)
416 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
419 (in_str, decimal_part) = in_str.split('.')
420 tmp = [iter(in_str[::-1])] * places
421 ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
422 if len(decimal_part) > 0:
# Full URL shape this module understands:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Check if a string is a valid url.

    When allowed_schemes is given (and non-empty) the URL must also start
    with one of those strings.

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('https://mysite.com')
    True
    >>> is_url('.mysite.com')
    False
    """
    if not is_full_string(in_str):
        return False

    valid = URL_RE.match(in_str) is not None

    if allowed_schemes:
        # str.startswith accepts a tuple of candidate prefixes.
        return valid and in_str.startswith(tuple(allowed_schemes))
    return valid
def is_email(in_str: Any) -> bool:
    """
    Check if a string is a valid email.

    Reference: https://tools.ietf.org/html/rfc3696#section-3

    >>> is_email('my.email@the-provider.com')
    True
    >>> is_email('@gmail.com')
    False
    """
    if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
        return False

    try:
        # we expect 2 tokens, one before "@" and one after, otherwise
        # we have an exception and the email is not valid.
        head, tail = in_str.split("@")

        # head's size must be <= 64, tail <= 255, head must not end
        # with a dot or contain multiple consecutive dots.
        if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
            return False

        # removes escaped spaces, so that later on the test regex will
        # accept the string.
        head = head.replace("\\ ", "")
        if head.startswith('"') and head.endswith('"'):
            head = head.replace(" ", "")[1:-1]
        return EMAIL_RE.match(head + "@" + tail) is not None

    except ValueError:
        # borderline case in which we have multiple "@" signs but the
        # head part is correctly escaped.
        if ESCAPED_AT_SIGN.search(in_str) is not None:
            # replace "@" with "a" in the head
            return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
        return False
def suffix_string_to_number(in_str: str) -> Optional[int]:
    """Take a string like "33Gb" and convert it into a number (of bytes)
    like 34603008.  Return None if the input string is not valid.

    A plain integer string is returned as that integer.  Otherwise the
    last two, then the last one, characters are looked up (case
    insensitively) in NUM_SUFFIXES and the rest is scaled accordingly.

    >>> suffix_string_to_number('1Mb')
    1048576
    >>> suffix_string_to_number('13.1Gb')
    14066017894
    """

    def suffix_capitalize(s: str) -> str:
        # Normalize to the "Xx" / "X" spelling used by the lookup table.
        # An empty suffix stays empty (it simply will not be found).
        if len(s) == 2:
            return f"{s[0].upper()}{s[1].lower()}"
        return s[:1].upper()

    if not is_string(in_str):
        return None
    if is_integer_number(in_str):
        return to_int(in_str)

    # Try the two-character suffix first, then the one-character suffix.
    candidates = ((in_str[-2:], in_str[:-2]), (in_str[-1:], in_str[:-1]))
    for suffix, rest in candidates:
        multiplier = NUM_SUFFIXES.get(suffix_capitalize(suffix), None)
        if multiplier is None:
            continue
        if is_integer_number(rest):
            return to_int(rest) * multiplier
        if is_decimal_number(rest):
            return int(float(rest) * multiplier)
    return None
def number_to_suffix_string(num: int) -> Optional[str]:
    """Take a number (of bytes) and returns a string like "43.8Gb".
    Returns none if the input is invalid.

    The first NUM_SUFFIXES entry whose size does not exceed num is used;
    numbers smaller than every entry are returned as plain digits.
    NOTE(review): this presumes NUM_SUFFIXES is ordered largest-first.

    >>> number_to_suffix_string(14066017894)
    '13.1Gb'
    >>> number_to_suffix_string(1024 * 1024)
    '1.0Mb'
    """
    d = 0.0
    suffix = None
    for (sfx, size) in NUM_SUFFIXES.items():
        if num >= size:
            d = num / size
            suffix = sfx
            break
    if suffix is not None:
        return f"{d:.1f}{suffix}"
    return f'{num:d}'
def is_credit_card(in_str: Any, card_type: Optional[str] = None) -> bool:
    """
    Checks if a string is a valid credit card number.
    If card type is provided then it checks against that specific type only,
    otherwise any known credit card number will be accepted.

    Supported card types are the following:

    - VISA
    - MASTERCARD
    - AMERICAN_EXPRESS
    - DINERS_CLUB
    - DISCOVER
    - JCB

    Raises KeyError when card_type is not one of the names above.
    """
    if not is_full_string(in_str):
        return False

    if card_type is not None:
        if card_type not in CREDIT_CARDS:
            raise KeyError(
                f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
            )
        return CREDIT_CARDS[card_type].match(in_str) is not None
    return any(pattern.match(in_str) is not None for pattern in CREDIT_CARDS.values())
def is_camel_case(in_str: Any) -> bool:
    """
    Checks if a string is formatted as camel case.

    A string is considered camel case when:

    - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
    - it contains both lowercase and uppercase letters
    - it does not start with a number

    >>> is_camel_case('MyString')
    True
    >>> is_camel_case('mystring')
    False
    """
    return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """
    Checks if a string is formatted as "snake case".

    A string is considered snake case when:

    - it's composed only by lowercase/uppercase letters and digits
    - it contains at least one underscore (or provided separator)
    - it does not start with a number

    >>> is_snake_case('this_is_a_test')
    True
    >>> is_snake_case('___This_Is_A_Test_1_2_3___')
    True
    >>> is_snake_case('this-is-a-test')
    False
    >>> is_snake_case('this-is-a-test', separator='-')
    True
    """
    if not is_full_string(in_str):
        return False

    # The two common separators have precompiled patterns.
    if separator == "_":
        return SNAKE_CASE_TEST_RE.match(in_str) is not None
    if separator == "-":
        return SNAKE_CASE_TEST_DASH_RE.match(in_str) is not None

    # Any other separator: build the pattern only when it is needed and
    # require it to cover the whole string, as the precompiled ones do.
    re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
    pattern = re_template.format(sign=re.escape(separator))
    return re.fullmatch(pattern, in_str, re.IGNORECASE) is not None
def is_json(in_str: Any) -> bool:
    """
    Check if a string is a valid json.

    Only JSON documents whose top level is an object or an array count.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    # The regex is a cheap shape check before the real parse is attempted.
    if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
        try:
            return isinstance(json.loads(in_str), (dict, list))
        except (TypeError, ValueError, OverflowError):
            return False
    return False
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Check if a string is a valid UUID.

    With allow_hex=True the dash-less 32 character hex form is accepted too.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # string casting is used to allow UUID itself as input data type
    s = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(s) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip v4.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('nope')
    False
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # checks that each entry in the ip is in the valid range (0 to 255)
    return all(0 <= int(token) <= 255 for token in in_str.split("."))
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Extracts the IPv4 chunk of a string or None.

    Only the dotted shape is checked; octets are not range-validated.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_IP_V4_RE.search(in_str)
    return m.group(0) if m is not None else None
def is_ip_v6(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip v6.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Extract IPv6 chunk or None.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_IP_V6_RE.search(in_str)
    return m.group(0) if m is not None else None
def is_ip(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip (either v4 or v6).

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3')
    False
    >>> is_ip('1.2.3.999')
    False
    """
    return is_ip_v6(in_str) or is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Extract the IP address or None.

    An IPv4 address is looked for first; IPv6 is the fallback.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    ip = extract_ip_v4(in_str)
    if ip is None:
        ip = extract_ip_v6(in_str)
    return ip
def is_mac_address(in_str: Any) -> bool:
    """Return True if in_str is a valid MAC address false otherwise.

    Six two-digit hex groups separated by ":" or "-", in either case.

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the MAC address from in_str.

    The first MAC-shaped chunk is returned with its ":" / "-" group
    separators rewritten to `separator`; None when nothing matches.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'

    >>> extract_mac_address('34-29-8F-12-0D-2F', separator='.')
    '34.29.8F.12.0D.2F'
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None
    # Strings are immutable, so the rewritten value has to be returned.
    # A callable replacement keeps `separator` from being read as a
    # regex replacement template.
    return re.sub(r"[:-]", lambda _m: separator, m.group(0))
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Checks if a given string is a slug (as created by `slugify()`).

    A slug is one or more runs of lowercase letters / digits joined by
    the separator; it neither starts nor ends with the separator.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False
    # Written without nested open-ended quantifiers so that a near-miss
    # input cannot trigger exponential backtracking, and with the whole
    # (possibly multi-character) separator grouped before repeating it.
    sep = re.escape(separator)
    rex = r"^[a-z\d]+(?:(?:" + sep + r")+[a-z\d]+)*$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Checks if the given string contains HTML/XML tags.

    By design, this function matches ANY type of tag, so don't expect to use it
    as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.

    Raises ValueError when the argument is not a string.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return HTML_RE.search(in_str) is not None
def words_count(in_str: str) -> int:
    """
    Returns the number of words contained into the given string.

    This method is smart, it does consider only sequence of one or more letter and/or numbers
    as "words", so a string like this: "! @ # % ... []" will return zero!
    Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
    will be 4 not 1 (even if there are no spaces in the string).

    Raises ValueError when the argument is not a string.

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return len(WORDS_COUNT_RE.findall(in_str))
def generate_uuid(omit_dashes: bool = False) -> str:
    """
    Generated an UUID string (using `uuid.uuid4()`).

    generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
    generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
    """
    uid = uuid4()

    # .hex is the 32-digit form without dashes; str() is the dashed form.
    return uid.hex if omit_dashes else str(uid)
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Returns a string of the specified size containing random
    characters (uppercase/lowercase ascii letters and digits).

    Raises ValueError when size is smaller than 1.  Uses the `random`
    module, so the result is not suitable for secrets.

    generate_random_alphanumeric_string(9) # possible output: "cx3QQbzYg"
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    chars = string.ascii_letters + string.digits
    return "".join(random.choice(chars) for _ in range(size))
def reverse(in_str: str) -> str:
    """
    Returns the string with its chars reversed.

    Raises ValueError when the argument is not a string.

    >>> reverse('test')
    'tset'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return in_str[::-1]
def camel_case_to_snake_case(in_str, *, separator="_"):
    """
    Convert a camel case string into a snake case one.
    (The original string is returned if is not a valid camel case string)

    Raises ValueError when the argument is not a string.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str
    # Put the separator after every chunk that precedes an uppercase
    # letter, then lowercase the whole result.
    return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Convert a snake case string into a camel case one.
    (The original string is returned if is not a valid snake case string)

    Raises ValueError when the argument is not a string.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str
    # Empty pieces (from leading / doubled separators) are dropped.
    tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
    if not upper_case_first:
        tokens[0] = tokens[0].lower()
    return from_char_list(tokens)
def to_char_list(in_str: str) -> List[str]:
    """Convert a string into a list of chars.

    Non-string input yields an empty list rather than an error.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    if not is_string(in_str):
        return []
    return list(in_str)
def from_char_list(in_list: List[str]) -> str:
    """Convert a char list into a string.

    The items are concatenated with no separator; they may be longer
    than one character.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    return "".join(in_list)
def shuffle(in_str: str) -> str:
    """Return a new string containing same chars of the given one but in
    a randomized order.

    Raises ValueError when the argument is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # turn the string into a list of chars
    chars = to_char_list(in_str)
    random.shuffle(chars)
    return from_char_list(chars)
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Remove html code contained into the given string.

    By default the text enclosed by a tag pair is removed together with
    the tags; keep_tag_content=True removes only the tags themselves.
    Raises ValueError when the argument is not a string.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
    return r.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Force string content to be ascii-only by translating all non-ascii
    chars into the closest possible representation (eg: ó -> o, Ë -> E).

    N.B. Some chars may be lost if impossible to translate.
    Raises ValueError when the argument is not a string.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # "NFKD" is the algorithm which is able to successfully translate
    # the most of non-ascii chars.
    normalized = unicodedata.normalize("NFKD", in_str)

    # encode string forcing ascii and ignore any errors
    # (unrepresentable chars will be stripped out)
    ascii_bytes = normalized.encode("ascii", "ignore")

    # turns encoded bytes into an utf-8 string
    return ascii_bytes.decode("utf-8")
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Converts a string into a "slug" using provided separator.
    The returned string has the following properties:

    - it has no spaces
    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using provided separator
    - all chars are encoded as ascii (by using `asciify()`)

    Raises ValueError when the argument is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # replace spaces with join sign (a callable replacement keeps the
    # separator from being read as a regex replacement template)
    out = SPACES_RE.sub(lambda _m: separator, out)

    # normalize joins (remove duplicates); the separator is grouped so the
    # repetition applies to all of it, not just its last character
    out = re.sub(r"(?:" + re.escape(separator) + r")+", lambda _m: separator, out)
    return asciify(out)
def to_bool(in_str: str) -> bool:
    """
    Turns a string into a boolean based on its content (CASE INSENSITIVE).

    A positive boolean (True) is returned if the string value is one
    of the following:

    - "true"
    - "1"
    - "yes"
    - "y"
    - "t"
    - "on"

    Otherwise False is returned.
    Raises ValueError when the argument is not a string.

    >>> to_bool('True')
    True
    >>> to_bool('on')
    True
    >>> to_bool('no')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return in_str.lower() in ("true", "1", "yes", "y", "t", "on")
1074 def to_date(in_str: str) -> Optional[datetime.date]:
1076 Parses a date string. See DateParser docs for details.
1078 import dateparse.dateparse_utils as du
1081 d = du.DateParser() # type: ignore
1084 except du.ParseException: # type: ignore
1085 msg = f'Unable to parse date {in_str}.'
def valid_date(in_str: str) -> bool:
    """
    True if the string represents a valid date.

    Validity is decided by the project's DateParser: a parse that raises
    ParseException is logged and reported as False.
    NOTE(review): the parse/return lines were missing from the damaged
    source and are restored from the surrounding code -- confirm.
    """
    import dateparse.dateparse_utils as dp

    try:
        d = dp.DateParser()  # type: ignore
        _ = d.parse(in_str)
        return True
    except dp.ParseException:  # type: ignore
        msg = f'Unable to parse date {in_str}.'
        logger.warning(msg)
    return False
1106 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1108 Parses a datetime string. See DateParser docs for more info.
1110 import dateparse.dateparse_utils as dp
1113 d = dp.DateParser() # type: ignore
1114 dt = d.parse(in_str)
1115 if isinstance(dt, datetime.datetime):
1118 msg = f'Unable to parse datetime {in_str}.'
def valid_datetime(in_str: str) -> bool:
    """
    True if the string represents a valid datetime.

    Delegates to to_datetime(); a None result is logged and reported
    as False.
    """
    if to_datetime(in_str) is not None:
        return True
    msg = f'Unable to parse datetime {in_str}.'
    logger.warning(msg)
    return False
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Squeeze runs of more than one character_to_squeeze into one.

    character_to_squeeze may be longer than one character; it is matched
    literally and as a whole.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'
    """
    # The replacement is a callable so that backslashes in
    # character_to_squeeze are not parsed as a replacement template.
    return re.sub(
        r'(' + re.escape(character_to_squeeze) + r')+',
        lambda _m: character_to_squeeze,
        in_str,
    )
def dedent(in_str: str) -> str:
    """
    Removes tab indentation from multi line strings (inspired by analogous Scala function).

    All leading whitespace (not only tabs) is stripped from every line.
    Raises ValueError when the argument is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    line_separator = '\n'
    lines = [MARGIN_RE.sub('', line) for line in in_str.split(line_separator)]
    return line_separator.join(lines)
def indent(in_str: str, amount: int) -> str:
    """
    Indents string by prepending amount spaces.

    Every line (split on "\\n") is indented, empty lines included.
    Raises ValueError when the first argument is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    line_separator = '\n'
    margin = " " * amount
    return line_separator.join(margin + line for line in in_str.split(line_separator))
def sprintf(*args, **kwargs) -> str:
    """String printf, like in C

    Builds the text that ``print(*args, sep=..., end=...)`` would write
    and returns it instead.  `sep` defaults to one space and `end` to a
    newline.  Raises TypeError for a non-string `sep` / `end` or for any
    other keyword argument.
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # Anything still left in kwargs is not understood.
    if kwargs:
        raise TypeError("invalid keyword arguments to sprintf()")

    if sep is None:
        sep = " "
    if end is None:
        end = "\n"
    pieces = (arg if isinstance(arg, str) else str(arg) for arg in args)
    return sep.join(pieces) + end
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that captures outputs to stdout.

    with SprintfStdout() as buf:
        print("test")
    print(buf())

    'test\\n'

    The value bound by ``as`` is a callable returning everything captured
    so far; it remains usable after the block has ended.
    """

    def __init__(self) -> None:
        # In-memory buffer that receives everything written to stdout.
        self.destination = io.StringIO()
        # Created in __enter__; declared here for type checkers only.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()
        return lambda: self.destination.getvalue()

    def __exit__(self, *args) -> Literal[False]:
        # Restore the previous stdout, then rewind the buffer.
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Never swallow an exception raised inside the with-block.
        return False
def capitalize_first_letter(txt: str) -> str:
    """Capitalize the first letter of a string.

    Only the first character is touched; the rest keeps its case.  An
    empty string is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    # Slicing (rather than indexing) is safe on the empty string.
    return txt[:1].upper() + txt[1:]
1249 def it_they(n: int) -> str:
1263 def is_are(n: int) -> str:
1277 def pluralize(n: int) -> str:
1283 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1286 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1295 def make_contractions(txt: str) -> str:
1296 """Glue words together to form contractions.
1298 >>> make_contractions('It is nice today.')
1301 >>> make_contractions('I can not even...')
1304 >>> make_contractions('She could not see!')
1307 >>> make_contractions('But she will not go.')
1310 >>> make_contractions('Verily, I shall not.')
1313 >>> make_contractions('No you cannot.')
1316 >>> make_contractions('I said you can not go.')
1317 "I said you can't go."
1354 ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
1358 # Special cases: can't, shan't and won't.
1359 txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
1360 txt = re.sub(r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE)
1362 r'\b(w)ill\s*(n)(o)(t)\b',
1366 flags=re.IGNORECASE,
1369 for first_list, second_list in first_second:
1370 for first in first_list:
1371 for second in second_list:
1372 # Disallow there're/where're. They're valid English
1374 if (first in ('there', 'where')) and second == 'a(re)':
1377 pattern = fr'\b({first})\s+{second}\b'
1378 if second == '(n)o(t)':
1379 replacement = r"\1\2'\3"
1381 replacement = r"\1'\2"
1382 txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)
1387 def thify(n: int) -> str:
1388 """Return the proper cardinal suffix for a number.
1399 assert is_integer_number(digit)
def ngrams(txt: str, n: int):
    """Return the ngrams from a string.

    The text is split on whitespace and every run of n consecutive words
    is yielded as one space-joined string.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']
    """
    words = txt.split()
    for ngram in ngrams_presplit(words, n):
        yield ' '.join(ngram)
def ngrams_presplit(words: Sequence[str], n: int):
    """Return the ngrams of an already-split word sequence.

    Thin wrapper around the project's ``list_utils.ngrams``.
    NOTE(review): presumably yields one sequence of n words per ngram --
    confirm against list_utils.
    """
    return list_utils.ngrams(words, n)
def bigrams(txt: str):
    """Return the 2-word ngrams of txt; shorthand for ``ngrams(txt, 2)``."""
    return ngrams(txt, 2)
def trigrams(txt: str):
    """Return the 3-word ngrams of txt; shorthand for ``ngrams(txt, 3)``."""
    return ngrams(txt, 3)
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
) -> List[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.  The column_specs argument is an iterable collection of
    numeric sequences that indicate one or more column numbers to
    copy; the chosen columns are joined with delim into one output item
    per spec.

    >>> cols = '-rwxr-xr-x  1 scott  wheel  3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']
    """
    # Column specs map input lines' columns into outputs.
    # [col1, col2...]
    # join() puts delim only *between* columns, so nothing has to be
    # stripped afterwards and the column text itself is never altered.
    return [delim.join(input_lines[n] for n in spec) for spec in column_specs]
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim='',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.  Each spec is a (key, column numbers) pair; the chosen
    columns are joined with delim and stored under that key.

    >>> cols = '-rwxr-xr-x  1 scott  wheel  3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}
    """
    # Column specs map input lines' columns into outputs.
    # "key", [col1, col2...]
    # join() puts delim only *between* columns, so nothing has to be
    # stripped afterwards and the column text itself is never altered.
    return {key: delim.join(input_lines[n] for n in columns) for key, columns in column_specs}
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Interpolate a string with data from a dict.

    txt uses ``str.format`` placeholders; a placeholder with no matching
    key raises KeyError.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    return txt.format(**values)
def to_ascii(x: str):
    """Encode as ascii bytes string.

    A str is encoded with the ascii codec (UnicodeEncodeError if it holds
    non-ascii characters); bytes are returned as they are.  Anything else
    raises TypeError.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    raise TypeError('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Turn text into base64 bytes.

    The text is first encoded to bytes with the given encoding and
    error handler; those bytes are then passed to base64.encodebytes,
    whose output ends with a newline.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
def is_base64(txt: str) -> bool:
    """Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet: A-Z, a-z, 0-9, '+' and '/').

    Surrounding whitespace is ignored, as are the line breaks that
    base64.encodebytes inserts into long output.  Up to two trailing
    '=' padding characters are accepted, provided the padded data is a
    multiple of four characters long.  Text containing non-ASCII
    characters is reported as not base64 rather than raising.

    Args:
        txt: the str (or bytes) to examine.

    Returns:
        True if every character belongs to the base64 alphabet (plus
        valid trailing padding), False otherwise.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64(b'aGVsbG8=\\n')    # So is '=' padding.
    True

    """
    data = txt.strip()
    if isinstance(data, str):
        try:
            data = data.encode('ascii')
        except UnicodeEncodeError:
            # The base64 alphabet is pure ASCII.
            return False

    # encodebytes() wraps its output into lines; drop the line breaks.
    data = bytes(data).translate(None, b'\r\n')

    # Padding may only appear at the very end: at most two '=' characters,
    # and padded data always has a length that is a multiple of four.
    payload = data.rstrip(b'=')
    pad_len = len(data) - len(payload)
    if pad_len > 2 or (pad_len and len(data) % 4 != 0):
        return False

    alphabet = set((string.ascii_letters + string.digits + '+/').encode('ascii'))
    # Any '=' left inside payload is misplaced and fails this check too.
    return all(byte in alphabet for byte in payload)
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Decode base64 bytes back into text.

    The input is decoded with base64.decodebytes and the resulting
    bytes are then decoded to a str using the given encoding and error
    handler.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    """
    decoded = base64.decodebytes(b64)
    return decoded.decode(encoding, errors)
def chunk(txt: str, chunk_size):
    """Chunk up a string.

    Yields consecutive slices of txt, each chunk_size characters long.
    If the length of txt is not a multiple of chunk_size a warning is
    logged and issued via the warnings module, and the final chunk is
    shorter than the others.

    This is a generator, so the argument check and the warning happen
    when iteration starts, not when chunk() is called.

    Args:
        txt: the string to split up.
        chunk_size: the number of characters per chunk; must be >= 1.

    Yields:
        Successive substrings of txt.

    Raises:
        ValueError: if chunk_size is less than one.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    """
    if chunk_size < 1:
        # Zero used to surface as ZeroDivisionError and a negative size
        # silently produced no chunks at all; reject both explicitly.
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size!r}')
    if len(txt) % chunk_size != 0:
        msg = f"String to chunk's length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})"
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for start in range(0, len(txt), chunk_size):
        yield txt[start : start + chunk_size]
def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
    """Encode txt and then chop it into bytes.  Note: only bitstrings
    with delimiter='' are interpretable by from_bitstring.

    Args:
        txt: the text to convert.  A str is encoded with encoding and
            errors; bytes are used as they are.
        delimiter: text placed between the 8-bit groups of the result.
        encoding: the codec used to turn a str into bytes.
        errors: the codec error handler used when encoding a str.

    Returns:
        One group of eight '0'/'1' characters per byte, most significant
        bit first, joined by delimiter.  Empty input yields '00000000'.

    Raises:
        TypeError: if txt is neither a str nor bytes.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    """
    if isinstance(txt, str):
        # Honor the caller's encoding instead of insisting on ASCII.
        raw = txt.encode(encoding, errors)
    elif isinstance(txt, (bytes, bytearray)):
        raw = bytes(txt)
    else:
        raise TypeError('to_bitstring works with strings and bytes')

    if not raw:
        # Empty input has always been rendered as a single zero byte.
        return '00000000'

    # Formatting byte by byte (instead of going through one big integer)
    # keeps leading zero bytes, which an integer conversion would drop.
    return delimiter.join(format(byte, '08b') for byte in raw)
def is_bitstring(txt: str) -> bool:
    """Report whether txt is a bitstring.

    The text is prefixed with '0b' to form a binary literal and the
    decision is delegated to is_binary_integer_number.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    """
    candidate = f'0b{txt}'
    return is_binary_integer_number(candidate)
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    Args:
        bits: a string of '0' and '1' characters, most significant bit
            first, with no delimiter.  Anything int(bits, 2) accepts
            (an optional '0b' prefix, underscores) is tolerated.
        encoding: the codec used to decode the resulting bytes.
        errors: the codec error handler used when decoding.

    Returns:
        The decoded text, or a single NUL character if decoding
        produces an empty string.

    Raises:
        ValueError: if bits is not a valid base-2 number (e.g. empty).
        OverflowError: if bits denotes a negative number.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    Leading zero bytes are preserved:

    >>> from_bitstring('0000000001100001')
    '\\x00a'

    """
    n = int(bits, 2)

    # Size the result from the digits supplied rather than from the
    # integer's bit length, so that leading zero bytes are not lost.
    # Strip what int() tolerates but which is not a digit first.
    digits = bits.strip().replace('_', '')
    if digits[:1] in ('+', '-'):
        digits = digits[1:]
    if digits[:2].lower() == '0b':
        digits = digits[2:]
    num_bytes = (max(len(digits), n.bit_length()) + 7) // 8

    return n.to_bytes(num_bytes, 'big').decode(encoding, errors) or '\0'
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple for sorting purposes.

    Args:
        txt: the dotted-quad address to convert.

    Returns:
        A tuple holding the integer value of each dot-separated field,
        or None (after logging a warning) if is_ip_v4 rejects txt.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']

    """
    if not is_ip_v4(txt):
        # Report through the module logger rather than writing to stdout
        # from library code.
        logger.warning('not IP: %s', txt)
        return None
    return tuple(int(octet) for octet in txt.split('.'))
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Split a file path into a tuple of its non-empty components.

    Used as a sort key, the tuple makes a parent path compare lower
    than any of its descendants, because the parent's tuple is a
    prefix of the descendant's tuple.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']

    """
    components = volume.split('/')
    # filter(None, ...) discards the empty strings that a leading,
    # trailing or doubled '/' leaves behind.
    return tuple(filter(None, components))
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Replace every character of replace_set found in in_str.

    The characters of replace_set are handled one after another, in
    order, each pass working on the result of the previous one; every
    occurrence is swapped for replacement.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'

    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
1673 if __name__ == '__main__':