2 # -*- coding: utf-8 -*-
4 """The MIT License (MIT)
6 Copyright (c) 2016-2020 Davide Zanotti
7 Modifications Copyright (c) 2021-2022 Scott Gasch
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
16 The above copyright notice and this permission notice shall be included in all
17 copies or substantial portions of the Software.
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 This class is based on: https://github.com/daveoncode/python-string-utils.
31 import contextlib # type: ignore
42 from itertools import zip_longest
43 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
44 from uuid import uuid4
48 logger = logging.getLogger(__name__)
# Numeric-literal patterns.  Inside a character class "|" is a literal pipe,
# not alternation, so each class lists only the characters actually allowed
# (previously "1|5", "|0xff" and "0b1|0" were all accepted).
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

HEX_NUMBER_RE = re.compile(r"^([+\-]?)0[xX]([0-9A-Fa-f]+)$")

OCT_NUMBER_RE = re.compile(r"^([+\-]?)0[Oo]([0-7]+)$")

BIN_NUMBER_RE = re.compile(r"^([+\-]?)0[Bb]([01]+)$")
59 r"([a-z-]+://)" # scheme
60 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
62 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
63 r"(:\d{2,})?" # port number
64 r"(/[a-z\d_%+-]*)*" # folders
65 r"(\.[a-z\d_%+-]+)*" # file extension
66 r"(\?[a-z\d_+%-=]*)?" # query string
70 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
72 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
74 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
76 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
78 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
80 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
82 CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")
84 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
86 SNAKE_CASE_TEST_RE = re.compile(r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE)
88 SNAKE_CASE_TEST_DASH_RE = re.compile(r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE)
90 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
92 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
95 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
96 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
97 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
98 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
99 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
100 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
103 JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)
105 UUID_RE = re.compile(r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE)
107 UUID_HEX_OK_RE = re.compile(
108 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
112 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
114 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
# Eight colon-separated groups of up to four hexadecimal digits.  Only a-f are
# valid hex letters; the compressed "::" notation is not recognised.
IP_V6_RE = re.compile(r"^([a-f\d]{0,4}:){7}[a-f\d]{0,4}$", re.IGNORECASE)

ANYWHERE_IP_V6_RE = re.compile(r"([a-f\d]{0,4}:){7}[a-f\d]{0,4}", re.IGNORECASE)
120 MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
122 ANYWHERE_MAC_ADDRESS_RE = re.compile(r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE)
124 WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)
126 HTML_RE = re.compile(
127 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
128 re.IGNORECASE | re.MULTILINE | re.DOTALL,
131 HTML_TAG_ONLY_RE = re.compile(
132 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
133 re.IGNORECASE | re.MULTILINE | re.DOTALL,
136 SPACES_RE = re.compile(r"\s")
138 NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)
140 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
# Matches a terminal escape sequence: ESC (0x1b), "[", any run of non-letter
# parameter characters, then the single letter that ends the sequence.
# "\x1b" is used because Python's re module has no "\e" escape.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Tell whether the input is None or a string with no non-whitespace content.

    Args:
        in_str: the string to test, or None.

    Returns:
        True for None, '' and whitespace-only strings; False otherwise.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty("   \t   ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    # Nothing left after stripping whitespace means "empty".
    return not in_str.strip()
def is_string(obj: Any) -> bool:
    """
    Tell whether an object is a str (or an instance of a str subclass).

    Args:
        obj: any object.

    Returns:
        True if obj is a string, False otherwise (bytes are not strings).

    >>> is_string('test')
    True
    >>> is_string(123)
    False
    >>> is_string([1, 2, 3])
    False
    """
    result = isinstance(obj, str)
    return result
def is_empty_string(in_str: Any) -> bool:
    """Alias for is_empty(): True if in_str is a string that is empty or
    contains only whitespace."""
    result = is_empty(in_str)
    return result
def is_empty(in_str: Any) -> bool:
    """
    Tell whether the input is a string that is empty or only whitespace.

    Args:
        in_str: any object.

    Returns:
        True only for strings with no non-whitespace characters; non-string
        inputs always yield False.

    >>> is_empty('')
    True
    >>> is_empty('    \t\t    ')
    True
    >>> is_empty('test')
    False
    >>> is_empty([1, 2, 3])
    False
    """
    if not isinstance(in_str, str):
        return False
    return not in_str.strip()
def is_full_string(in_str: Any) -> bool:
    """
    Tell whether the input is a string with at least one non-whitespace
    character.

    Args:
        in_str: any object.

    Returns:
        True for non-blank strings; False for '', whitespace-only strings and
        every non-string input.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string('      ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    if not isinstance(in_str, str):
        return False
    return bool(in_str.strip())
def is_number(in_str: str) -> bool:
    """
    Tell whether a string matches NUMBER_RE, i.e. looks like a decimal number.

    Args:
        in_str: the string to test.

    Returns:
        True if the whole string matches NUMBER_RE, False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]
    """
    if is_string(in_str):
        return NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """
    Tell whether the given string represents an integer.

    A string qualifies when it is a number (per is_number) without a decimal
    point, or when it is a prefixed hexadecimal, octal or binary literal.

    Args:
        in_str: the string to test.

    Returns:
        True if the string represents an integer, False otherwise.

    Raises:
        ValueError: if in_str is not a string (raised by is_number).

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    if is_number(in_str) and "." not in in_str:
        return True
    # Same order as before: hex, then octal, then binary.
    prefixed_checks = (
        is_hexidecimal_integer_number,
        is_octal_integer_number,
        is_binary_integer_number,
    )
    return any(check(in_str) for check in prefixed_checks)
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is a hexadecimal integer literal (matches
    HEX_NUMBER_RE: optional sign, "0x"/"0X" prefix, hex digits).

    Args:
        in_str: the string to test.

    Returns:
        True if the string is a hex literal, False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('0x1A3E')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number('test')
    False
    >>> is_hexidecimal_integer_number(12345)  # Not a string
    Traceback (most recent call last):
    ...
    ValueError: 12345
    """
    if is_string(in_str):
        return HEX_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is an octal integer literal (matches OCT_NUMBER_RE:
    optional sign, "0o"/"0O" prefix, digits 0-7).

    Args:
        in_str: the string to test.

    Returns:
        True if the string is an octal literal, False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    >>> is_octal_integer_number('test')
    False
    """
    if is_string(in_str):
        return OCT_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is a binary integer literal (matches BIN_NUMBER_RE:
    optional sign, "0b"/"0B" prefix, binary digits).

    Args:
        in_str: the string to test.

    Returns:
        True if the string is a binary literal, False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0B10101')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    >>> is_binary_integer_number('test')
    False
    """
    if is_string(in_str):
        return BIN_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def to_int(in_str: str) -> int:
    """Return the integral value of the string or raise on error.

    Prefixed binary ("0b"), octal ("0o") and hexadecimal ("0x") literals are
    parsed in their own base; anything else is handed to int() in base 10.

    Args:
        in_str: the string to convert.

    Returns:
        The integer value.

    Raises:
        ValueError: if in_str is not a string or cannot be parsed.

    >>> to_int('1234')
    1234
    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # Checked in the same order as before: binary, octal, hexadecimal.
    recognizers = (
        (is_binary_integer_number, 2),
        (is_octal_integer_number, 8),
        (is_hexidecimal_integer_number, 16),
    )
    for recognizes, base in recognizers:
        if recognizes(in_str):
            return int(in_str, base)
    return int(in_str)
def is_decimal_number(in_str: str) -> bool:
    """
    Tell whether the given string represents a decimal: a number (per
    is_number) that contains a decimal point.

    Args:
        in_str: the string to test.

    Returns:
        True if the string is a number with a ".", False otherwise.

    Raises:
        ValueError: if in_str is not a string (raised by is_number).

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    if not is_number(in_str):
        return False
    return "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    r"""
    Remove every substring matching ESCAPE_SEQUENCE_RE from the input.

    Args:
        in_str: the text to clean.

    Returns:
        The text with all matches deleted.

    >>> strip_escape_sequences('\x1b[12;11;22mthis is a test!')
    'this is a test!'
    """
    return ESCAPE_SEQUENCE_RE.sub("", in_str)
def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """
    Add a thousands separator to a numeric string.  Numbers are accepted too;
    they are formatted to text first.

    Args:
        in_str: a numeric string or a number.
        separator_char: text inserted between digit groups.
        places: number of digits per group.

    Returns:
        The input with separators inserted into its integer part.

    Raises:
        ValueError: if the input is not a number or numeric string.

    >>> add_thousands_separator('12345678')
    '12,345,678'
    >>> add_thousands_separator(12345678)
    '12,345,678'
    >>> add_thousands_separator(12345678.99)
    '12,345,678.99'
    >>> add_thousands_separator('test')
    Traceback (most recent call last):
    ...
    ValueError: test
    """
    text = f'{in_str}' if isinstance(in_str, numbers.Number) else in_str
    if not is_number(text):
        raise ValueError(text)
    return _add_thousands_separator(text, separator_char=separator_char, places=places)
416 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
419 (in_str, decimal_part) = in_str.split('.')
420 tmp = [iter(in_str[::-1])] * places
421 ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
422 if len(decimal_part) > 0:
# Full url example: scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Check if a string is a valid url (i.e. it fully matches URL_RE).

    Args:
        in_str: the object to test.
        allowed_schemes: optional list of acceptable schemes, written either
            bare ("http") or with the separator ("http://").  When given, the
            url's scheme must equal one of them exactly, compared
            case-insensitively, so "http" no longer also admits "https://...".

    Returns:
        True if in_str is a valid url with an acceptable scheme.

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('https://mysite.com')
    True
    >>> is_url('.mysite.com')
    False
    """
    if not is_full_string(in_str):
        return False
    if URL_RE.match(in_str) is None:
        return False
    if not allowed_schemes:
        return True

    # URL_RE guarantees a "scheme://" prefix, so the text before the first
    # ":" is the scheme itself.
    scheme = in_str.partition(":")[0].lower()
    return any(scheme == s.partition(":")[0].lower() for s in allowed_schemes)
def is_email(in_str: Any) -> bool:
    """
    Check if a string is a valid email.

    Reference: https://tools.ietf.org/html/rfc3696#section-3

    Args:
        in_str: the object to test.

    Returns:
        True if in_str is a well-formed email address, False otherwise.

    >>> is_email('my.email@the-provider.com')
    True
    >>> is_email('@gmail.com')
    False
    """
    # Cheap rejections: not a real string, too long, or starts with a dot.
    if not is_full_string(in_str):
        return False
    if len(in_str) > 320 or in_str.startswith("."):
        return False

    pieces = in_str.split("@")
    if len(pieces) != 2:
        # Borderline case: zero or several "@" signs.  If the extra ones are
        # escaped or quoted, replace them with "a" and validate the result.
        if ESCAPED_AT_SIGN.search(in_str) is None:
            return False
        return is_email(ESCAPED_AT_SIGN.sub("a", in_str))

    head, tail = pieces

    # head's size must be <= 64, tail's <= 255; head must not end with a dot
    # or contain consecutive dots.
    if len(head) > 64 or len(tail) > 255:
        return False
    if head.endswith(".") or ".." in head:
        return False

    # Drop escaped spaces so the test regex accepts the string.
    head = head.replace("\\ ", "")
    if head.startswith('"') and head.endswith('"'):
        # Quoted local part: remove inner spaces and the surrounding quotes.
        head = head.replace(" ", "")[1:-1]
    candidate = head + "@" + tail
    return EMAIL_RE.match(candidate) is not None
def suffix_string_to_number(in_str: str) -> Optional[int]:
    """Take a string such as "1Mb" or "13.1Gb" and convert it into a number
    (of bytes).  Return None if the input string is not valid.

    A plain integer string is returned as its integer value.  Otherwise the
    last two characters, then the last one, are tried as a suffix looked up in
    NUM_SUFFIXES; the remaining text must be an integer or decimal number.

    Args:
        in_str: the text to convert.

    Returns:
        The value as an int, or None when the input is not a string or cannot
        be interpreted.

    >>> suffix_string_to_number('1Mb')
    1048576
    >>> suffix_string_to_number('13.1Gb')
    14066017894
    """

    def suffix_capitalize(s: str) -> str:
        # Normalize to "Xx" for two characters, otherwise to the upper-cased
        # first character.  The empty string maps to "" (it used to recurse
        # forever, so suffix_string_to_number("") raised RecursionError).
        if len(s) == 2:
            return f"{s[0].upper()}{s[1].lower()}"
        return s[:1].upper()

    if not is_string(in_str):
        return None
    if is_integer_number(in_str):
        return to_int(in_str)

    # Try the longer (two character) suffix before the single character one.
    candidates = ((in_str[-2:], in_str[:-2]), (in_str[-1:], in_str[:-1]))
    for suffix, rest in candidates:
        multiplier = NUM_SUFFIXES.get(suffix_capitalize(suffix), None)
        if multiplier is None:
            continue
        if is_integer_number(rest):
            return to_int(rest) * multiplier
        if is_decimal_number(rest):
            return int(float(rest) * multiplier)
    return None
526 def number_to_suffix_string(num: int) -> Optional[str]:
527 """Take a number (of bytes) and returns a string like "43.8Gb".
528 Returns none if the input is invalid.
530 >>> number_to_suffix_string(14066017894)
532 >>> number_to_suffix_string(1024 * 1024)
538 for (sfx, size) in NUM_SUFFIXES.items():
543 if suffix is not None:
544 return f"{d:.1f}{suffix}"
def is_credit_card(in_str: Any, card_type: Optional[str] = None) -> bool:
    """
    Checks if a string is a valid credit card number.
    If card type is provided then it checks against that specific type only,
    otherwise any known credit card number will be accepted.

    Supported card types are the keys of CREDIT_CARDS.

    Args:
        in_str: the object to test.
        card_type: optional key of CREDIT_CARDS to restrict the check to.

    Returns:
        True if in_str matches the requested (or any known) card pattern.

    Raises:
        KeyError: if card_type is given but is not a key of CREDIT_CARDS.
    """
    if not is_full_string(in_str):
        return False

    if card_type is None:
        # No type requested: accept a match against any known pattern.
        return any(
            pattern.match(in_str) is not None for pattern in CREDIT_CARDS.values()
        )

    if card_type not in CREDIT_CARDS:
        raise KeyError(
            f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
        )
    return CREDIT_CARDS[card_type].match(in_str) is not None
def is_camel_case(in_str: Any) -> bool:
    """
    Checks if a string is formatted as camel case.

    A string is considered camel case when:

    - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
    - it contains both lowercase and uppercase letters
    - it does not start with a number

    Args:
        in_str: the object to test.

    Returns:
        True if in_str is a non-blank string matching CAMEL_CASE_TEST_RE.
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """
    Checks if a string is formatted as "snake case".

    A string is considered snake case when:

    - it's composed only by lowercase/uppercase letters and digits
    - it contains at least one underscore (or provided separator)
    - it does not start with a number

    Args:
        in_str: the object to test.
        separator: the word separator; "_" and "-" use the precompiled
            module patterns, anything else gets a pattern built on demand.

    Returns:
        True if in_str is snake case for the given separator.

    >>> is_snake_case('this_is_a_test')
    True
    >>> is_snake_case('___This_Is_A_Test_1_2_3___')
    True
    >>> is_snake_case('this-is-a-test')
    False
    >>> is_snake_case('this-is-a-test', separator='-')
    True
    """
    if not is_full_string(in_str):
        return False

    re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
    r = re_map.get(separator)
    if r is None:
        # Build the pattern only when it is needed.  It ends with "$" like the
        # precompiled ones, so trailing junk ("a.b!!!" with separator ".") no
        # longer passes.
        re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)$"
        r = re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE)
    return r.match(in_str) is not None
def is_json(in_str: Any) -> bool:
    """
    Check if a string is valid json whose top-level value is an object or an
    array.

    Args:
        in_str: the object to test.

    Returns:
        True if in_str is wrapped in {} or [] and parses to a dict or list.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    if not is_full_string(in_str):
        return False
    # Cheap shape check first, so obviously wrong input is never parsed.
    if JSON_WRAPPER_RE.match(in_str) is None:
        return False
    try:
        parsed = json.loads(in_str)
    except (TypeError, ValueError, OverflowError):
        return False
    return isinstance(parsed, (dict, list))
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Check if a string is a valid UUID.

    Args:
        in_str: the value to test; it is converted with str() first, so a
            UUID object is accepted as well as text.
        allow_hex: when True, the dashes between groups are optional.

    Returns:
        True if the text form of in_str is a UUID.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # string casting is used to allow UUID itself as input data type
    text = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(text) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip v4.

    Args:
        in_str: the object to test.

    Returns:
        True if in_str is four dot-separated numbers, each between 0 and 255.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('nope')
    False
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str):
        return False
    if SHALLOW_IP_V4_RE.match(in_str) is None:
        return False
    # The regex only checks the shape; verify every octet is in 0..255.
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Extracts the IPv4 chunk of a string or None.

    Args:
        in_str: the text to search.

    Returns:
        The first substring matching ANYWHERE_IP_V4_RE, or None when the
        input is not a non-blank string or nothing matches.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V4_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip_v6(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip v6.

    Args:
        in_str: the object to test.

    Returns:
        True if in_str is a non-blank string matching IP_V6_RE.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Extract IPv6 chunk or None.

    Args:
        in_str: the text to search.

    Returns:
        The first substring matching ANYWHERE_IP_V6_RE, or None when the
        input is not a non-blank string or nothing matches.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V6_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip (either v4 or v6).

    Args:
        in_str: the object to test.

    Returns:
        True if in_str passes is_ip_v6 or is_ip_v4.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3')
    False
    >>> is_ip('1.2.3.999')
    False
    """
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Extract the IP address or None.  IPv4 is tried first, then IPv6.

    Args:
        in_str: the text to search.

    Returns:
        The extracted address, or None if neither kind is found.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4 = extract_ip_v4(in_str)
    if v4 is not None:
        return v4
    return extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """Return True if in_str is a valid MAC address, False otherwise.

    Args:
        in_str: the object to test.

    Returns:
        True if in_str is a non-blank string matching MAC_ADDRESS_RE.

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the MAC address from in_str.

    Args:
        in_str: the text to search.
        separator: the character placed between octets in the result; both
            ":" and "-" in the matched address are rewritten to it.

    Returns:
        The first MAC address found, using `separator` between octets, or
        None when the input is not a non-blank string or nothing matches.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'
    """
    if not is_full_string(in_str):
        return None

    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None

    # str.replace returns a new string; the results used to be discarded,
    # which silently ignored `separator`.
    mac = m.group(0)
    return mac.replace(":", separator).replace("-", separator)
def is_slug(in_str: Any, separator: str = "-") -> bool:
    r"""
    Checks if a given string is a slug (as created by `slugify()`).

    A slug is one or more runs of lowercase letters / digits, joined by one
    or more occurrences of the separator, starting and ending with a letter
    or digit.

    The pattern is written as ``run(sep+ run)*`` rather than the previous
    ``(run sep*?)* char``.  It accepts the same strings for single-character
    separators but has no nested ambiguous quantifiers, so long non-matching
    input cannot trigger exponential backtracking.  A multi-character
    separator is now repeated as a whole.

    Args:
        in_str: the object to test.
        separator: the join text used between words.

    Returns:
        True if in_str is a slug, False otherwise (including non-strings).

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not isinstance(in_str, str) or not in_str.strip():
        return False
    if separator:
        rex = r"^[a-z\d]+(?:(?:" + re.escape(separator) + r")+[a-z\d]+)*$"
    else:
        # No separator at all: a slug is a single alphanumeric run.
        rex = r"^[a-z\d]+$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Checks if the given string contains HTML/XML tags.

    By design, this function matches ANY type of tag, so don't expect to use
    it as an HTML validator; its goal is to detect "malicious" or undesired
    tags in the text.

    Args:
        in_str: the text to inspect.

    Returns:
        True if HTML_RE finds a match anywhere in the text.

    Raises:
        ValueError: if in_str is not a string.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
def words_count(in_str: str) -> int:
    """
    Returns the number of words contained in the given string.

    Only sequences of one or more letters and/or digits count as "words", so
    a string like "! @ # % ... []" yields zero.  Punctuation also separates
    words, so "one,two,three.stop" counts as 4, not 1, even though it
    contains no spaces.

    Args:
        in_str: the text to count words in.

    Returns:
        The number of WORDS_COUNT_RE matches in the text.

    Raises:
        ValueError: if in_str is not a string.

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if is_string(in_str):
        matches = WORDS_COUNT_RE.findall(in_str)
        return len(matches)
    raise ValueError(in_str)
850 def generate_uuid(omit_dashes: bool = False) -> str:
852 Generated an UUID string (using `uuid.uuid4()`).
854 generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
855 generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Returns a string of the specified size containing random
    characters (uppercase/lowercase ascii letters and digits).

    Args:
        size: the desired length; must be >= 1.

    Returns:
        A random alphanumeric string of exactly `size` characters.

    Raises:
        ValueError: if size is less than 1.

    generate_random_alphanumeric_string(9) # possible output: "cx3QQbzYg"
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    return "".join(random.choice(alphabet) for _ in range(size))
def reverse(in_str: str) -> str:
    """
    Returns the string with its chars reversed.

    Args:
        in_str: the string to reverse.

    Returns:
        A new string with the characters in opposite order.

    Raises:
        ValueError: if in_str is not a string.

    >>> reverse('test')
    'tset'
    """
    if not isinstance(in_str, str):
        raise ValueError(in_str)
    return "".join(reversed(in_str))
def camel_case_to_snake_case(in_str, *, separator="_"):
    """
    Convert a camel case string into a snake case one.
    (The original string is returned if it is not a valid camel case string.)

    Args:
        in_str: the text to convert.
        separator: text inserted at each word boundary.

    Returns:
        The lower-cased, separator-joined form, or in_str unchanged when it
        is not camel case.

    Raises:
        ValueError: if in_str is not a string.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str

    def _append_separator(match):
        # Keep the matched run and put the separator right after it.
        return match.group(1) + separator

    joined = CAMEL_CASE_REPLACE_RE.sub(_append_separator, in_str)
    return joined.lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Convert a snake case string into a camel case one.
    (The original string is returned if it is not a valid snake case string.)

    Args:
        in_str: the text to convert.
        upper_case_first: when False, the first word is lower-cased.
        separator: the word separator used in in_str.

    Returns:
        The camel case form, or in_str unchanged when it is not snake case.

    Raises:
        ValueError: if in_str is not a string.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str

    # Title-case each non-blank piece; blank pieces come from repeated or
    # leading/trailing separators.
    words = []
    for piece in in_str.split(separator):
        if is_full_string(piece):
            words.append(piece.title())
    if not upper_case_first:
        words[0] = words[0].lower()
    return from_char_list(words)
def to_char_list(in_str: str) -> List[str]:
    """Convert a string into a list of chars.

    Args:
        in_str: the string to split into characters.

    Returns:
        One list element per character; an empty list for non-strings.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    return list(in_str) if isinstance(in_str, str) else []
def from_char_list(in_list: List[str]) -> str:
    """Convert a char list into a string.

    Args:
        in_list: the strings to concatenate, in order.

    Returns:
        All elements joined with no separator.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    joined = "".join(in_list)
    return joined
def shuffle(in_str: str) -> str:
    """Return a new string containing the same chars as the given one, in
    random order.

    Args:
        in_str: the string to shuffle.

    Returns:
        A string with the same characters, rearranged by random.shuffle.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not isinstance(in_str, str):
        raise ValueError(in_str)

    # random.shuffle works in place, so it needs a mutable list of chars.
    letters = list(in_str)
    random.shuffle(letters)
    return "".join(letters)
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Remove html code contained in the given string.

    Args:
        in_str: the text to clean.
        keep_tag_content: when True only the tags are removed
            (HTML_TAG_ONLY_RE); when False the tags and whatever HTML_RE
            matches with them are removed.

    Returns:
        The text with the matched markup deleted.

    Raises:
        ValueError: if in_str is not a string.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if keep_tag_content:
        pattern = HTML_TAG_ONLY_RE
    else:
        pattern = HTML_RE
    return pattern.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Force string content to be ascii-only by translating all non-ascii
    chars into the closest possible representation (eg: ó -> o, Ë -> E).

    N.B. Some chars may be lost if impossible to translate.

    Args:
        in_str: the text to convert.

    Returns:
        An ascii-only version of the text.

    Raises:
        ValueError: if in_str is not a string.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not isinstance(in_str, str):
        raise ValueError(in_str)

    # NFKD decomposes accented characters into a base letter plus combining
    # marks; encoding to ascii with errors ignored then drops the marks and
    # anything else that has no ascii form.
    decomposed = unicodedata.normalize("NFKD", in_str)
    return decomposed.encode("ascii", "ignore").decode("utf-8")
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Converts a string into a "slug" using provided separator.
    The returned string has the following properties:

    - it has no spaces
    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using provided separator
    - all chars are encoded as ascii (by using `asciify()`)

    The separator is always inserted literally: it is passed to the regex
    engine through a callable, so a separator containing a backslash is not
    parsed as a replacement template.  An empty separator simply
    concatenates the words (it used to raise re.error).

    Args:
        in_str: the text to convert.
        separator: text placed between words.

    Returns:
        The slug.

    Raises:
        ValueError: if in_str is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # replace spaces with join sign
    out = SPACES_RE.sub(lambda _m: separator, out)

    # normalize joins (remove duplicates)
    if separator:
        out = re.sub("(?:" + re.escape(separator) + ")+", lambda _m: separator, out)
    return asciify(out)
def to_bool(in_str: str) -> bool:
    """
    Turns a string into a boolean based on its content (CASE INSENSITIVE).

    True is returned if the lower-cased string is one of: "true", "1",
    "yes", "y", "t", "on".  Otherwise False is returned.

    Args:
        in_str: the text to interpret.

    Returns:
        The boolean the text stands for.

    Raises:
        ValueError: if in_str is not a string.

    >>> to_bool('True')
    True
    >>> to_bool('no')
    False
    """
    if not isinstance(in_str, str):
        raise ValueError(in_str)
    truthy = ("true", "1", "yes", "y", "t", "on")
    normalized = in_str.lower()
    return normalized in truthy
1074 def to_date(in_str: str) -> Optional[datetime.date]:
1076 Parses a date string. See DateParser docs for details.
1078 import dateparse.dateparse_utils as du
1081 d = du.DateParser() # type: ignore
1084 except du.ParseException: # type: ignore
1085 msg = f'Unable to parse date {in_str}.'
1090 def valid_date(in_str: str) -> bool:
1092 True if the string represents a valid date.
1095 import dateparse.dateparse_utils as dp
1098 d = dp.DateParser() # type: ignore
1101 except dp.ParseException: # type: ignore
1102 msg = f'Unable to parse date {in_str}.'
1107 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1109 Parses a datetime string. See DateParser docs for more info.
1111 import dateparse.dateparse_utils as dp
1114 d = dp.DateParser() # type: ignore
1115 dt = d.parse(in_str)
1116 if type(dt) == datetime.datetime:
1119 msg = f'Unable to parse datetime {in_str}.'
1124 def valid_datetime(in_str: str) -> bool:
1126 True if the string represents a valid datetime.
1128 _ = to_datetime(in_str)
1131 msg = f'Unable to parse datetime {in_str}.'
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Squeeze runs of more than one character_to_squeeze into one.

    The replacement is supplied through a callable so that it is inserted
    literally.  Passing it as a plain string made re.sub parse it as a
    template, which raised re.error for values containing a backslash.

    Args:
        in_str: the text to process.
        character_to_squeeze: the character (or multi-character token) whose
            consecutive repetitions are collapsed.  An empty value leaves
            the text unchanged.

    Returns:
        The text with every run of the token reduced to one occurrence.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'
    """
    if not character_to_squeeze:
        return in_str
    pattern = r'(?:' + re.escape(character_to_squeeze) + r')+'
    return re.sub(pattern, lambda _match: character_to_squeeze, in_str)
def dedent(in_str: str) -> str:
    """
    Removes the leading margin (whatever MARGIN_RE matches at the start of a
    line) from every line of a multi line string (inspired by the analogous
    Scala function).

    Args:
        in_str: the text to dedent; lines are split on newline characters.

    Returns:
        The text with each line's margin removed.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    return newline.join(MARGIN_RE.sub('', row) for row in in_str.split(newline))
def indent(in_str: str, amount: int) -> str:
    """
    Indents string by prepending amount spaces to every line.

    Args:
        in_str: the text to indent; lines are split on newline characters.
        amount: how many spaces to put in front of each line.

    Returns:
        The indented text.

    Raises:
        ValueError: if in_str is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not isinstance(in_str, str):
        raise ValueError(in_str)
    padding = " " * amount
    newline = '\n'
    return newline.join(padding + row for row in in_str.split(newline))
def sprintf(*args, **kwargs) -> str:
    """String printf, like in C.

    Builds the text that print(*args, sep=..., end=...) would write and
    returns it instead.

    Args:
        *args: values to render; non-strings are converted with str().
        **kwargs: only "sep" (default " ") and "end" (default newline) are
            accepted; each must be None or a string.

    Returns:
        The rendered values joined by sep and followed by end.

    Raises:
        TypeError: if sep or end is neither None nor a string, or if any
            other keyword argument is given.
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # Anything still left in kwargs is an option we do not understand.
    if kwargs:
        raise TypeError("invalid keyword arguments to sprint()")

    if sep is None:
        sep = " "
    if end is None:
        end = "\n"

    rendered = [arg if isinstance(arg, str) else str(arg) for arg in args]
    return sep.join(rendered) + end
class SprintfStdout:
    """
    A context manager that captures outputs to stdout.

    The value bound by ``as`` is a zero-argument callable that returns
    everything captured so far.

    with SprintfStdout() as buf:
        print("test")
    print(buf())
    """

    def __init__(self) -> None:
        # In-memory buffer that receives whatever is written to stdout.
        self.destination = io.StringIO()
        # Created in __enter__; declared here for type checkers only.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        """Start redirecting stdout and return a getter for the captured text."""
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *args) -> None:
        """Stop redirecting stdout and rewind the buffer to its start."""
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        return None  # don't suppress exceptions
def capitalize_first_letter(txt: str) -> str:
    """Capitalize the first letter of a string, leaving the rest untouched.

    Slicing is used instead of indexing so that an empty string is returned
    unchanged rather than raising IndexError.

    Args:
        txt: the text to capitalize.

    Returns:
        The text with its first character upper-cased.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    return txt[:1].upper() + txt[1:]
1250 def it_they(n: int) -> str:
1264 def is_are(n: int) -> str:
1278 def pluralize(n: int) -> str:
1284 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1287 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1296 def make_contractions(txt: str) -> str:
1297 """Glue words together to form contractions.
1299 >>> make_contractions('It is nice today.')
1302 >>> make_contractions('I can not even...')
1305 >>> make_contractions('She could not see!')
1308 >>> make_contractions('But she will not go.')
1311 >>> make_contractions('Verily, I shall not.')
1314 >>> make_contractions('No you cannot.')
1317 >>> make_contractions('I said you can not go.')
1318 "I said you can't go."
1355 ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
1359 # Special cases: can't, shan't and won't.
1360 txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
1361 txt = re.sub(r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE)
1363 r'\b(w)ill\s*(n)(o)(t)\b',
1367 flags=re.IGNORECASE,
1370 for first_list, second_list in first_second:
1371 for first in first_list:
1372 for second in second_list:
1373 # Disallow there're/where're. They're valid English
1375 if (first == 'there' or first == 'where') and second == 'a(re)':
1378 pattern = fr'\b({first})\s+{second}\b'
1379 if second == '(n)o(t)':
1380 replacement = r"\1\2'\3"
1382 replacement = r"\1'\2"
1383 txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)
1388 def thify(n: int) -> str:
1389 """Return the proper cardinal suffix for a number.
1400 assert is_integer_number(digit)
1412 def ngrams(txt: str, n: int):
1413 """Return the ngrams from a string.
1415 >>> [x for x in ngrams('This is a test', 2)]
1416 ['This is', 'is a', 'a test']
1420 for ngram in ngrams_presplit(words, n):
def ngrams_presplit(words: Sequence[str], n: int):
    """Return the n-grams of an already-split word sequence by delegating to
    list_utils.ngrams.

    Args:
        words: the words, in order.
        n: the number of words per n-gram.

    Returns:
        Whatever list_utils.ngrams(words, n) returns.
    """
    result = list_utils.ngrams(words, n)
    return result
def bigrams(txt: str):
    """Return the 2-grams of txt; shorthand for ngrams(txt, 2)."""
    size = 2
    return ngrams(txt, size)
def trigrams(txt: str):
    """Return the 3-grams of txt; shorthand for ngrams(txt, 3)."""
    size = 3
    return ngrams(txt, size)
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.  The column_specs argument is an iterable collection of
    numeric sequences that indicate one or more column numbers to
    copy; the columns named by one spec are joined with delim.

    Args:
        input_lines: the source columns, indexed by the numbers in the specs.
        column_specs: one sequence of column indexes per output element.
        delim: text placed between joined columns.  Any leading or trailing
            characters of the result that appear in delim are stripped.

    Returns:
        A list with one joined string per spec, in spec order.

    >>> cols = '-rwxr-xr-x  1 scott  wheel  3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']
    """
    # Column specs map input lines' columns into outputs.
    return [
        delim.join(input_lines[n] for n in spec).strip(delim)
        for spec in column_specs
    ]
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim='',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict keyed by the name given in each spec.

    Args:
        input_lines: the source columns, indexed by the numbers in the specs.
        column_specs: (key, column indexes) pairs; the named columns are
            joined with delim and stored under key.
        delim: text placed between joined columns.  Any leading or trailing
            characters of the result that appear in delim are stripped.

    Returns:
        A dict mapping each spec's key to its joined string.

    >>> cols = '-rwxr-xr-x  1 scott  wheel  3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}
    """
    # Column specs map input lines' columns into outputs.
    # "key", [col1, col2...]
    return {
        spec[0]: delim.join(input_lines[n] for n in spec[1]).strip(delim)
        for spec in column_specs
    }
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Interpolate a string with data from a dict.

    Args:
        txt: a str.format-style template with named {placeholders}.
        values: the placeholder names and their replacement values.

    Returns:
        The template with every placeholder filled in.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    filled = txt.format(**values)
    # end='' keeps sprintf from appending anything after the text.
    return sprintf(filled, end='')
def to_ascii(x: str):
    """Encode as ascii bytes string.

    isinstance() is used rather than an exact type() comparison, so
    subclasses of str and bytes are accepted as well.

    Args:
        x: text to encode, or bytes to pass through unchanged.

    Returns:
        The ascii-encoded bytes for a str; the input itself for bytes.

    Raises:
        TypeError: if x is neither str nor bytes.  TypeError is a subclass
            of the plain Exception raised before, so existing handlers
            still catch it.
        UnicodeEncodeError: if x is a str containing non-ascii characters.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    raise TypeError('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Turn txt into bytes using the given encoding / error policy and
    then base64-encode those bytes (64-character alphabet) with
    base64.encodebytes.  This is compatible with uudecode.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    """
    raw_bytes = txt.encode(encoding, errors)
    return base64.encodebytes(raw_bytes)
def is_base64(txt: str) -> bool:
    """Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet which is the same as what uuencode uses).

    txt may be a str or bytes.  Surrounding whitespace and line breaks
    inside the data are ignored, and up to two '=' padding characters
    are accepted at the end of the data.  Every remaining character
    must belong to the base64 alphabet (A-Z, a-z, 0-9, '+', '/').
    The length of the data is not checked.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64(b'aGVsbG8=\\n')    # Trailing padding is ok.
    True

    >>> is_base64('aGVs=bG8')    # Padding in the middle is not.
    False

    """
    alphabet = set((string.ascii_letters + string.digits + '+/').encode('ascii'))
    payload = to_ascii(txt.strip())

    # base64.encodebytes breaks long output into lines; the line breaks
    # carry no data, so they must not cause a rejection.
    payload = payload.replace(b'\n', b'').replace(b'\r', b'')

    # '=' is only legal as (at most two characters of) trailing padding,
    # and only after some actual data.
    body = payload.rstrip(b'=')
    pad_count = len(payload) - len(body)
    if pad_count > 2 or (pad_count and not body):
        return False

    # Iterating bytes yields ints, which is what the alphabet set holds.
    return all(byte in alphabet for byte in body)
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Undo base64 encoding: decode the base64 bytes with
    base64.decodebytes and then turn the resulting bytes into a str
    using the given encoding / error policy.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    """
    raw_bytes = base64.decodebytes(b64)
    return raw_bytes.decode(encoding, errors)
def chunk(txt: str, chunk_size):
    """Chunk up a string.

    This is a generator: it yields consecutive slices of txt that are
    chunk_size characters long.  When len(txt) is not a multiple of
    chunk_size a warning is issued via warnings.warn and the final
    slice is shorter than the others.

    Raises ValueError if chunk_size is less than one.  Because this is
    a generator, the check (and the warning) happen when iteration
    starts, not when chunk() is called.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    """
    # A zero size used to surface as ZeroDivisionError and a negative
    # size silently produced no chunks at all; reject both explicitly.
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size}')
    if len(txt) % chunk_size != 0:
        msg = f"String to chunk's length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})"
        warnings.warn(msg, stacklevel=2)
    for start in range(0, len(txt), chunk_size):
        yield txt[start : start + chunk_size]
def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
    """Encode txt and then chop it into bytes.  Note: only bitstrings
    with delimiter='' are interpretable by from_bitstring.

    A str is encoded using encoding / errors; bytes are used as they
    are.  Every byte becomes exactly eight binary digits (most
    significant bit first) and the per-byte groups are joined with
    delimiter.  Empty input produces a single all-zero group.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    """
    if isinstance(txt, (bytes, bytearray)):
        data = bytes(txt)
    else:
        # Honor the caller's encoding / errors instead of forcing ASCII.
        data = txt.encode(encoding, errors)

    if not data:
        # Historical result for empty input: one all-zero byte.
        return '0' * 8

    # Format byte by byte so that leading zero bytes are preserved; going
    # through one big integer would drop them.
    return delimiter.join(f'{byte:08b}' for byte in data)
def is_bitstring(txt: str) -> bool:
    """Is this a bitstring?

    The answer is whatever is_binary_integer_number says about txt
    once a '0b' prefix has been put in front of it.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    """
    prefixed = f'0b{txt}'
    return is_binary_integer_number(prefixed)
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    bits is parsed as a base-2 integer, converted to big-endian bytes
    and decoded using encoding / errors.  When bits consists solely of
    '0' / '1' characters its length decides how many bytes are
    produced, so leading all-zero bytes survive.  Raises ValueError if
    bits is not a valid base-2 number.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    """
    n = int(bits, 2)
    if isinstance(bits, str) and bits and not bits.strip('01'):
        # Pure binary digits: size the output from the digit count so that
        # leading zero bytes are not dropped.
        bit_count = len(bits)
    else:
        # int() also accepts prefixes, signs, underscores and padding
        # whitespace; for those inputs fall back to the value's own width.
        bit_count = n.bit_length()
    return n.to_bytes((bit_count + 7) // 8, 'big').decode(encoding, errors) or '\0'
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple for sorting purposes.

    The dotted fields are converted to ints so that addresses compare
    numerically instead of as text.  If is_ip_v4 rejects txt, a
    message is printed and None is returned.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']

    """
    if is_ip_v4(txt):
        return tuple(int(field) for field in txt.split('.'))
    print(f"not IP: {txt}")
    return None
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Split a file path on '/' into a tuple of its non-empty components
    so that, when used as a sort key, parent/ancestor paths sort before
    children/descendant paths.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']

    """
    # Empty pieces (leading, trailing or doubled slashes) are dropped.
    return tuple(part for part in volume.split('/') if part)
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Replace every occurrence of each character of replace_set in
    in_str with replacement.  The characters are processed one after
    another, in the order they appear in replace_set.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'

    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
1674 if __name__ == '__main__':