2 # -*- coding: utf-8 -*-
4 """The MIT License (MIT)
6 Copyright (c) 2016-2020 Davide Zanotti
7 Modifications Copyright (c) 2021-2022 Scott Gasch
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
16 The above copyright notice and this permission notice shall be included in all
17 copies or substantial portions of the Software.
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 This class is based on: https://github.com/daveoncode/python-string-utils.
31 import contextlib # type: ignore
42 from itertools import zip_longest
54 from uuid import uuid4
58 logger = logging.getLogger(__name__)
# Decimal number: optional sign, then digits with an optional fraction and an
# optional unsigned exponent, or a bare fraction such as ".5".
# NOTE: inside a character class "|" is a literal pipe, not alternation, so
# the classes below list only the characters that are actually allowed.
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Optionally signed integer literal with a 0x / 0X prefix.
HEX_NUMBER_RE = re.compile(r"^([+\-]?)0[xX]([0-9A-Fa-f]+)$")

# Optionally signed integer literal with a 0o / 0O prefix.
OCT_NUMBER_RE = re.compile(r"^([+\-]?)0[oO]([0-7]+)$")

# Optionally signed integer literal with a 0b / 0B prefix.
BIN_NUMBER_RE = re.compile(r"^([+\-]?)0[bB]([01]+)$")
69 r"([a-z-]+://)" # scheme
70 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
72 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
73 r"(:\d{2,})?" # port number
74 r"(/[a-z\d_%+-]*)*" # folders
75 r"(\.[a-z\d_%+-]+)*" # file extension
76 r"(\?[a-z\d_+%-=]*)?" # query string
80 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
82 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
84 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
86 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
88 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
90 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
92 CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")
94 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
96 SNAKE_CASE_TEST_RE = re.compile(r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE)
98 SNAKE_CASE_TEST_DASH_RE = re.compile(r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE)
100 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
102 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
105 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
106 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
107 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
108 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
109 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
110 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
113 JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)
115 UUID_RE = re.compile(r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE)
117 UUID_HEX_OK_RE = re.compile(
118 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
122 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
124 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
# IPv6 address: eight colon-separated groups of up to four hexadecimal digits.
# Only hex digits (0-9, a-f) are accepted; matching is case-insensitive.
IP_V6_RE = re.compile(r"^([a-f\d]{0,4}:){7}[a-f\d]{0,4}$", re.IGNORECASE)

# Same shape as IP_V6_RE but unanchored, for locating an address inside text.
ANYWHERE_IP_V6_RE = re.compile(r"([a-f\d]{0,4}:){7}[a-f\d]{0,4}", re.IGNORECASE)
130 MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
132 ANYWHERE_MAC_ADDRESS_RE = re.compile(r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE)
134 WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)
136 HTML_RE = re.compile(
137 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
138 re.IGNORECASE | re.MULTILINE | re.DOTALL,
141 HTML_TAG_ONLY_RE = re.compile(
142 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
143 re.IGNORECASE | re.MULTILINE | re.DOTALL,
146 SPACES_RE = re.compile(r"\s")
148 NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)
150 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
# Terminal escape sequence: ESC, "[", any run of non-letter parameter
# characters, then one terminating letter.  ESC is spelled \x1b because
# "\e" is not an escape recognised by Python or by the re module.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Tell whether the input is None or a string with no non-whitespace content.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty("   \t   ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    # A string that strips down to nothing counts as empty.
    return not in_str.strip()
def is_string(obj: Any) -> bool:
    """
    Tell whether an object is a ``str`` instance.

    >>> is_string('test')
    True
    >>> is_string(123)
    False
    >>> is_string([1, 2, 3])
    False
    """
    verdict = isinstance(obj, str)
    return verdict
def is_empty_string(in_str: Any) -> bool:
    """Alias for :func:`is_empty`; delegates to it unchanged."""
    return is_empty(in_str)
def is_empty(in_str: Any) -> bool:
    """
    Tell whether the input is a string that is empty or whitespace-only.

    Non-string inputs are never considered empty.

    >>> is_empty('')
    True
    >>> is_empty('    \t\t    ')
    True
    >>> is_empty('test')
    False
    >>> is_empty([1, 2, 3])
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """
    Tell whether the input is a string with some non-whitespace content.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string('      ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """
    Tell whether a string is a valid number according to ``NUMBER_RE``.

    Raises:
        ValueError: if the argument is not a string.

    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]
    """
    if is_string(in_str):
        return NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """
    Tell whether the given string represents an integer.

    A string qualifies when it is a number without a "." or when it is a
    hexadecimal, octal or binary integer literal.

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    # Plain decimal form first, then the prefixed forms in the same order
    # the checks have always been tried.
    if is_number(in_str) and "." not in in_str:
        return True
    prefixed_checks = (
        is_hexidecimal_integer_number,
        is_octal_integer_number,
        is_binary_integer_number,
    )
    return any(check(in_str) for check in prefixed_checks)
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is a hexadecimal integer literal (``0x`` prefix).

    Raises:
        ValueError: if the argument is not a string.

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number(12345)  # Not a string
    Traceback (most recent call last):
    ...
    ValueError: 12345
    """
    if is_string(in_str):
        return HEX_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is an octal integer literal (``0o`` prefix).

    Raises:
        ValueError: if the argument is not a string.

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    """
    if is_string(in_str):
        return OCT_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is a binary integer literal (``0b`` prefix).

    Raises:
        ValueError: if the argument is not a string.

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    """
    if is_string(in_str):
        return BIN_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def to_int(in_str: str) -> int:
    """Return the integral value of the string or raise on error.

    Binary, octal and hexadecimal literals are recognised by their prefix;
    anything else is handed to ``int()`` as base 10.

    Raises:
        ValueError: if the argument is not a string or cannot be parsed.

    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # (recogniser, base) pairs, tried in order.
    prefixed_forms = (
        (is_binary_integer_number, 2),
        (is_octal_integer_number, 8),
        (is_hexidecimal_integer_number, 16),
    )
    for recognises, base in prefixed_forms:
        if recognises(in_str):
            return int(in_str, base)
    return int(in_str)
def is_decimal_number(in_str: str) -> bool:
    """
    Tell whether the given string is a number that contains a decimal point.

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    if not is_number(in_str):
        return False
    return "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    r"""
    Remove every match of ``ESCAPE_SEQUENCE_RE`` from the input string.

    >>> strip_escape_sequences('\x1b[12;11;22mthis is a test!')
    'this is a test!'
    """
    cleaned = ESCAPE_SEQUENCE_RE.sub("", in_str)
    return cleaned
def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """
    Add a thousands separator to a numeric string.  Numbers are accepted
    too and are converted to text first.

    Raises:
        ValueError: if the (stringified) input is not a valid number.

    >>> add_thousands_separator('12345678')
    '12,345,678'
    >>> add_thousands_separator(12345678)
    '12,345,678'
    >>> add_thousands_separator(12345678.99)
    '12,345,678.99'
    >>> add_thousands_separator('test')
    Traceback (most recent call last):
    ...
    ValueError: test
    """
    if isinstance(in_str, numbers.Number):
        in_str = f'{in_str}'
    if not is_number(in_str):
        raise ValueError(in_str)
    return _add_thousands_separator(in_str, separator_char=separator_char, places=places)
426 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
429 (in_str, decimal_part) = in_str.split('.')
430 tmp = [iter(in_str[::-1])] * places
431 ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
432 if len(decimal_part) > 0:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Check whether a string is a valid url.

    Args:
        in_str: candidate value; anything that is not a non-blank string
            is rejected.
        allowed_schemes: when given and non-empty, the url must also start
            with one of these prefixes.

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('https://mysite.com')
    True
    >>> is_url('.mysite.com')
    False
    """
    if not is_full_string(in_str):
        return False
    if URL_RE.match(in_str) is None:
        return False
    if allowed_schemes:
        # str.startswith accepts a tuple of alternatives.
        return in_str.startswith(tuple(allowed_schemes))
    return True
def is_email(in_str: Any) -> bool:
    """
    Check whether a string is a valid email.

    Reference: https://tools.ietf.org/html/rfc3696#section-3

    >>> is_email('my.email@the-provider.com')
    True
    >>> is_email('@gmail.com')
    False
    """
    if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
        return False

    try:
        # Exactly two tokens are expected, one on each side of the "@";
        # any other count makes the unpacking raise ValueError.
        head, tail = in_str.split("@")
    except ValueError:
        # Borderline case: several "@" signs, but the ones in the head
        # are escaped.  Swap them for "a" and validate the result.
        if ESCAPED_AT_SIGN.search(in_str) is not None:
            return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
        return False

    # Head is limited to 64 chars and tail to 255; the head must not end
    # with a dot or contain consecutive dots (a leading dot was already
    # rejected above).
    if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
        return False

    # Drop escaped spaces so they do not trip the regex below.
    head = head.replace("\\ ", "")
    if head.startswith('"') and head.endswith('"'):
        # Quoted head: remove inner spaces, then the surrounding quotes.
        head = head.replace(" ", "")[1:-1]
    return EMAIL_RE.match(head + "@" + tail) is not None
501 def suffix_string_to_number(in_str: str) -> Optional[int]:
502 """Take a string like "33Gb" and convert it into a number (of bytes)
503 like 34603008. Return None if the input string is not valid.
505 >>> suffix_string_to_number('1Mb')
507 >>> suffix_string_to_number('13.1Gb')
511 def suffix_capitalize(s: str) -> str:
515 return f"{s[0].upper()}{s[1].lower()}"
516 return suffix_capitalize(s[0:1])
518 if is_string(in_str):
519 if is_integer_number(in_str):
520 return to_int(in_str)
521 suffixes = [in_str[-2:], in_str[-1:]]
522 rest = [in_str[:-2], in_str[:-1]]
523 for x in range(len(suffixes)):
525 s = suffix_capitalize(s)
526 multiplier = NUM_SUFFIXES.get(s, None)
527 if multiplier is not None:
529 if is_integer_number(r):
530 return to_int(r) * multiplier
531 if is_decimal_number(r):
532 return int(float(r) * multiplier)
536 def number_to_suffix_string(num: int) -> Optional[str]:
537 """Take a number (of bytes) and returns a string like "43.8Gb".
538 Returns none if the input is invalid.
540 >>> number_to_suffix_string(14066017894)
542 >>> number_to_suffix_string(1024 * 1024)
548 for (sfx, size) in NUM_SUFFIXES.items():
553 if suffix is not None:
554 return f"{d:.1f}{suffix}"
559 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
561 Checks if a string is a valid credit card number.
562 If card type is provided then it checks against that specific type only,
563 otherwise any known credit card number will be accepted.
565 Supported card types are the following:
574 if not is_full_string(in_str):
577 if card_type is not None:
578 if card_type not in CREDIT_CARDS:
580 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
582 return CREDIT_CARDS[card_type].match(in_str) is not None
583 for c in CREDIT_CARDS:
584 if CREDIT_CARDS[c].match(in_str) is not None:
def is_camel_case(in_str: Any) -> bool:
    """
    Check whether a string is formatted as camel case.

    A string is considered camel case when:

    - it is composed only of letters ([a-zA-Z]) and optionally numbers ([0-9])
    - it contains both lowercase and uppercase letters
    - it does not start with a number
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """
    Check whether a string is formatted as "snake case".

    A string is considered snake case when:

    - it is composed only of lowercase/uppercase letters and digits
    - it contains at least one underscore (or provided separator)
    - it does not start with a number

    Args:
        in_str: candidate value; non-strings and blank strings are rejected.
        separator: the joining sign; "_" and "-" use precompiled patterns,
            any other value gets a pattern built on demand.

    >>> is_snake_case('this_is_a_test')
    True
    >>> is_snake_case('___This_Is_A_Test_1_2_3___')
    True
    >>> is_snake_case('this-is-a-test')
    False
    >>> is_snake_case('this-is-a-test', separator='-')
    True
    """
    if not is_full_string(in_str):
        return False
    if separator == "_":
        return SNAKE_CASE_TEST_RE.match(in_str) is not None
    if separator == "-":
        return SNAKE_CASE_TEST_DASH_RE.match(in_str) is not None

    # Custom separator: build the pattern only when it is needed, and
    # require the whole string to match so that a snake-case prefix
    # followed by other characters is not accepted.
    sign = re.escape(separator)
    pattern = rf"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
    return re.fullmatch(pattern, in_str, re.IGNORECASE) is not None
def is_json(in_str: Any) -> bool:
    """
    Check whether a string is valid JSON whose top level is an object or
    an array.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    # Cheap shape test before paying for a real parse.
    if not is_full_string(in_str) or JSON_WRAPPER_RE.match(in_str) is None:
        return False
    try:
        parsed = json.loads(in_str)
    except (TypeError, ValueError, OverflowError):
        return False
    return isinstance(parsed, (dict, list))
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Check whether a value is a valid UUID.

    Args:
        in_str: candidate value; it is converted with ``str()`` so that a
            UUID object can be passed directly.
        allow_hex: also accept the form without dashes.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(str(in_str)) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Check whether a string is a valid IPv4 address.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
        return False
    # The shape is right; now every dotted field must be within 0..255.
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Return the first IPv4-shaped chunk found in a string, or None.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V4_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip_v6(in_str: Any) -> bool:
    """
    Check whether a string is a valid IPv6 address according to ``IP_V6_RE``.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')  # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Return the first IPv6-shaped chunk found in a string, or None.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V6_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip(in_str: Any) -> bool:
    """
    Check whether a string is a valid IP address (either v4 or v6).

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3.999')
    False
    """
    # The v6 check runs first, as it always has.
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Return the IP address found in a string, or None.  An IPv4 address
    is looked for first; IPv6 is tried only when none is found.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4 = extract_ip_v4(in_str)
    if v4 is not None:
        return v4
    return extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """Return True if in_str is a valid MAC address, False otherwise.

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the first MAC address found in in_str.

    Args:
        in_str: text to search; anything that is not a non-blank string
            yields None.
        separator: text that replaces every ":" or "-" between the octets
            of the address that was found.

    Returns:
        The MAC address with its octets joined by ``separator``, or None
        when no address is present.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None
    # str.replace returns a new string, so the result has to be kept.
    # One pass with a callable replacement means the separator is inserted
    # verbatim and is never rewritten itself.
    return re.sub(r"[:-]", lambda _: separator, m.group(0))
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Check whether a given string is a slug (as created by `slugify()`).

    A slug consists of runs of lowercase letters and digits joined by one
    or more occurrences of ``separator``; it starts and ends with a letter
    or digit.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False
    if separator:
        sep = re.escape(separator)
        # Every alternative starts with a distinct kind of character, so
        # the pattern cannot backtrack exponentially; the separator is
        # grouped so a multi-character one is repeated as a whole.
        rex = rf"^[a-z\d]+(?:(?:{sep})+[a-z\d]+)*$"
    else:
        # No separator at all: only a single run of letters and digits.
        rex = r"^[a-z\d]+$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Check whether the given string contains HTML/XML tags.

    By design this matches ANY type of tag, so it is not an HTML
    validator; its goal is to detect "malicious" or undesired tags in text.

    Raises:
        ValueError: if the argument is not a string.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
def words_count(in_str: str) -> int:
    """
    Return the number of words contained in the given string.

    Only sequences of one or more letters and/or digits count as "words",
    so a string like "! @ # % ... []" yields zero.  Punctuation separates
    words too: "one,two,three.stop" counts as 4 even without spaces.

    Raises:
        ValueError: if the argument is not a string.

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return sum(1 for _ in WORDS_COUNT_RE.finditer(in_str))
def word_count(in_str: str) -> int:
    """Alias for :func:`words_count`; delegates to it unchanged."""
    return words_count(in_str)
864 def generate_uuid(omit_dashes: bool = False) -> str:
866 Generated an UUID string (using `uuid.uuid4()`).
868 generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
869 generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Return a string of the specified size made of random characters
    (uppercase/lowercase ascii letters and digits).

    Raises:
        ValueError: if size is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    # One random.choice call per character.
    return from_char_list([random.choice(alphabet) for _ in range(size)])
def reverse(in_str: str) -> str:
    """
    Return the string with its chars in reverse order.

    Raises:
        ValueError: if the argument is not a string.

    >>> reverse('test')
    'tset'
    """
    if is_string(in_str):
        return "".join(reversed(in_str))
    raise ValueError(in_str)
def camel_case_to_snake_case(in_str, *, separator="_"):
    """
    Convert a camel case string into a snake case one.
    (The original string is returned if it is not a valid camel case string.)

    Raises:
        ValueError: if the argument is not a string.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str

    def append_separator(match):
        # Keep the matched chunk and put the separator right after it.
        return match.group(1) + separator

    joined = CAMEL_CASE_REPLACE_RE.sub(append_separator, in_str)
    return joined.lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Convert a snake case string into a camel case one.
    (The original string is returned if it is not a valid snake case string.)

    Args:
        in_str: the text to convert.
        upper_case_first: when False the first word is lower-cased.
        separator: the sign that joins the words of the input.

    Raises:
        ValueError: if in_str is not a string.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str
    # Blank pieces (from leading, trailing or doubled separators) are dropped.
    words = [piece.title() for piece in in_str.split(separator) if is_full_string(piece)]
    if not upper_case_first:
        words[0] = words[0].lower()
    return from_char_list(words)
947 def to_char_list(in_str: str) -> List[str]:
948 """Convert a string into a list of chars.
950 >>> to_char_list('test')
953 if not is_string(in_str):
def from_char_list(in_list: List[str]) -> str:
    """Concatenate a list of chars (or strings) into a single string.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
def shuffle(in_str: str) -> str:
    """Return a new string containing the same chars as the given one,
    rearranged by ``random.shuffle``.

    Raises:
        ValueError: if the argument is not a string.
    """
    if is_string(in_str):
        # Work on a list of chars, since strings cannot be shuffled in place.
        letters = to_char_list(in_str)
        random.shuffle(letters)
        return from_char_list(letters)
    raise ValueError(in_str)
def scramble(in_str: str) -> str:
    """Alias for :func:`shuffle`; delegates to it unchanged."""
    return shuffle(in_str)
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Remove html code contained in the given string.

    Args:
        in_str: the text to clean.
        keep_tag_content: when True only the tags themselves are removed;
            otherwise tags are removed together with what they enclose.

    Raises:
        ValueError: if in_str is not a string.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if keep_tag_content:
        pattern = HTML_TAG_ONLY_RE
    else:
        pattern = HTML_RE
    return pattern.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Force string content to be ascii-only by translating non-ascii chars
    into the closest possible representation (eg: ó -> o).

    N.B. Some chars may be lost if they cannot be translated.

    Raises:
        ValueError: if the argument is not a string.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # NFKD decomposition splits accented chars into a base char plus
    # combining marks; encoding to ascii with errors ignored then drops
    # whatever has no ascii representation.
    decomposed = unicodedata.normalize("NFKD", in_str)
    return decomposed.encode("ascii", "ignore").decode("utf-8")
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Convert a string into a "slug" using the provided separator.
    The returned string has the following properties:

    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using the provided separator
    - all chars are encoded as ascii (by using `asciify()`)

    Raises:
        ValueError: if in_str is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Anything that is not a letter or a digit becomes a space.
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # Each whitespace char becomes the join sign ...
    joined = SPACES_RE.sub(separator, spaced)

    # ... and runs of join signs collapse into a single one.
    collapsed = re.sub(re.escape(separator) + r"+", separator, joined)
    return asciify(collapsed)
def to_bool(in_str: str) -> bool:
    """
    Turn a string into a boolean based on its content (CASE INSENSITIVE).

    True is returned when the lower-cased string is one of "true", "1",
    "yes", "y", "t" or "on".  Otherwise False is returned.

    Raises:
        ValueError: if the argument is not a string.

    >>> to_bool('True')
    True
    >>> to_bool('no')
    False
    """
    if is_string(in_str):
        truthy_words = ("true", "1", "yes", "y", "t", "on")
        return in_str.lower() in truthy_words
    raise ValueError(in_str)
1094 def to_date(in_str: str) -> Optional[datetime.date]:
1096 Parses a date string. See DateParser docs for details.
1098 import dateparse.dateparse_utils as du
1101 d = du.DateParser() # type: ignore
1104 except du.ParseException: # type: ignore
1105 msg = f'Unable to parse date {in_str}.'
1110 def valid_date(in_str: str) -> bool:
1112 True if the string represents a valid date.
1114 import dateparse.dateparse_utils as dp
1117 d = dp.DateParser() # type: ignore
1120 except dp.ParseException: # type: ignore
1121 msg = f'Unable to parse date {in_str}.'
1126 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1128 Parses a datetime string. See DateParser docs for more info.
1130 import dateparse.dateparse_utils as dp
1133 d = dp.DateParser() # type: ignore
1134 dt = d.parse(in_str)
1135 if isinstance(dt, datetime.datetime):
1138 msg = f'Unable to parse datetime {in_str}.'
1143 def valid_datetime(in_str: str) -> bool:
1145 True if the string represents a valid datetime.
1147 _ = to_datetime(in_str)
1150 msg = f'Unable to parse datetime {in_str}.'
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Squeeze runs of more than one character_to_squeeze into one.

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'
    """
    # One or more back-to-back copies of the (escaped) token ...
    run_of_tokens = r'(' + re.escape(character_to_squeeze) + r')+'
    # ... are replaced by a single copy.
    return re.sub(run_of_tokens, character_to_squeeze, in_str)
def dedent(in_str: str) -> str:
    """
    Remove the leading margin (whatever ``MARGIN_RE`` matches at the
    start of each line) from a multi line string.

    Raises:
        ValueError: if the argument is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    return newline.join(MARGIN_RE.sub('', row) for row in in_str.split(newline))
def indent(in_str: str, amount: int) -> str:
    """
    Indent a string by prepending ``amount`` spaces to each of its lines.

    Raises:
        ValueError: if in_str is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    return newline.join(" " * amount + row for row in in_str.split(newline))
1199 def sprintf(*args, **kwargs) -> str:
1200 """String printf, like in C"""
1203 sep = kwargs.pop("sep", None)
1205 if not isinstance(sep, str):
1206 raise TypeError("sep must be None or a string")
1208 end = kwargs.pop("end", None)
1210 if not isinstance(end, str):
1211 raise TypeError("end must be None or a string")
1214 raise TypeError("invalid keyword arguments to sprint()")
1220 for i, arg in enumerate(args):
1223 if isinstance(arg, str):
def strip_ansi_sequences(in_str: str) -> str:
    r"""Strip ANSI sequences out of strings.

    A sequence is ESC, "[", any run of digits, "+" or ";", and one
    lowercase letter.

    >>> s = '\x1b[38;5;21mblue!\x1b[m'
    >>> len(s)
    18
    >>> len(strip_ansi_sequences(s))
    5
    >>> strip_ansi_sequences(s)
    'blue!'
    """
    ansi_sequence = r'\x1b\[[\d+;]*[a-z]'
    return re.sub(ansi_sequence, '', in_str)
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that captures output written to stdout into a
    buffer without printing it.

    >>> with SprintfStdout() as buf:
    ...     print("test")
    ...     print("1, 2, 3")
    ...
    >>> print(buf(), end='')
    test
    1, 2, 3
    """

    def __init__(self) -> None:
        # Everything printed inside the "with" block ends up here.
        self.destination = io.StringIO()
        # Created on entry; declared here only for type checkers.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        """Start redirecting stdout; return a callable giving the captured text."""
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *args) -> Literal[False]:
        """Stop redirecting, rewind the buffer and let exceptions propagate."""
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        return False
def capitalize_first_letter(txt: str) -> str:
    """Capitalize the first letter of a string, leaving the rest untouched.

    An empty string is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    # Slicing (rather than indexing) keeps the empty string from raising.
    return txt[:1].upper() + txt[1:]
1289 def it_they(n: int) -> str:
1303 def is_are(n: int) -> str:
1317 def pluralize(n: int) -> str:
1323 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1326 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1335 def make_contractions(txt: str) -> str:
1336 """Glue words together to form contractions.
1338 >>> make_contractions('It is nice today.')
1341 >>> make_contractions('I can not even...')
1344 >>> make_contractions('She could not see!')
1347 >>> make_contractions('But she will not go.')
1350 >>> make_contractions('Verily, I shall not.')
1353 >>> make_contractions('No you cannot.')
1356 >>> make_contractions('I said you can not go.')
1357 "I said you can't go."
1394 ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
1398 # Special cases: can't, shan't and won't.
1399 txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
1400 txt = re.sub(r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE)
1402 r'\b(w)ill\s*(n)(o)(t)\b',
1406 flags=re.IGNORECASE,
1409 for first_list, second_list in first_second:
1410 for first in first_list:
1411 for second in second_list:
1412 # Disallow there're/where're. They're valid English
1414 if (first in ('there', 'where')) and second == 'a(re)':
1417 pattern = fr'\b({first})\s+{second}\b'
1418 if second == '(n)o(t)':
1419 replacement = r"\1\2'\3"
1421 replacement = r"\1'\2"
1422 txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)
1427 def thify(n: int) -> str:
1428 """Return the proper cardinal suffix for a number.
1439 assert is_integer_number(digit)
1451 def ngrams(txt: str, n: int):
1452 """Return the ngrams from a string.
1454 >>> [x for x in ngrams('This is a test', 2)]
1455 ['This is', 'is a', 'a test']
1459 for ngram in ngrams_presplit(words, n):
def ngrams_presplit(words: Sequence[str], n: int):
    """Return the n-grams of an already split word sequence by delegating
    to ``list_utils.ngrams``."""
    return list_utils.ngrams(words, n)
def bigrams(txt: str):
    """Shorthand for ``ngrams(txt, 2)``."""
    return ngrams(txt, 2)
def trigrams(txt: str):
    """Shorthand for ``ngrams(txt, 3)``."""
    return ngrams(txt, 3)
1478 def shuffle_columns_into_list(
1479 input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
1481 """Helper to shuffle / parse columnar data and return the results as a
1482 list. The column_specs argument is an iterable collection of
1483 numeric sequences that indicate one or more column numbers to
1486 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1487 >>> shuffle_columns_into_list(
1489 ... [ [8], [2, 3], [5, 6, 7] ],
1492 ['acl_test.py', 'scott wheel', 'Jul 9 11:34']
1497 # Column specs map input lines' columns into outputs.
1499 for spec in column_specs:
1502 hunk = hunk + delim + input_lines[n]
1503 hunk = hunk.strip(delim)
1508 def shuffle_columns_into_dict(
1509 input_lines: Sequence[str],
1510 column_specs: Iterable[Tuple[str, Iterable[int]]],
1512 ) -> Dict[str, str]:
1513 """Helper to shuffle / parse columnar data and return the results
1516 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1517 >>> shuffle_columns_into_dict(
1519 ... [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
1522 {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}
1527 # Column specs map input lines' columns into outputs.
1528 # "key", [col1, col2...]
1529 for spec in column_specs:
1532 hunk = hunk + delim + input_lines[n]
1533 hunk = hunk.strip(delim)
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Fill the ``{name}`` placeholders of a string with data from a dict.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    filled_in = txt.format(**values)
    return sprintf(filled_in, end='')
def to_ascii(x: str) -> bytes:
    """Encode as ascii bytes string.

    Args:
        x: the text to encode.  A bytes object is also accepted and is
            returned unchanged.

    Returns:
        The ASCII-encoded bytes.

    Raises:
        TypeError: if x is neither a str nor bytes.
        UnicodeEncodeError: if x is a str containing non-ASCII characters.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'

    """
    if isinstance(x, str):
        return x.encode('ascii')
    if isinstance(x, bytes):
        return x
    # TypeError is a subclass of Exception, so callers that caught the
    # previous bare Exception keep working.
    raise TypeError('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Turn txt into bytes using encoding / errors, then base64-encode
    those bytes with base64.encodebytes.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
def is_base64(txt: str) -> bool:
    """Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet).

    Leading / trailing whitespace is ignored, up to two trailing '='
    padding characters are accepted, and line breaks between
    characters are tolerated because base64.encodebytes wraps long
    output.  Only the characters are checked; the text is not decoded.

    Args:
        txt: the candidate text; bytes are accepted as well.

    Returns:
        True if every character belongs to the base64 alphabet (plus
        valid trailing padding), False otherwise -- including when txt
        contains non-ASCII characters.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64(b'aGVsbG8=\\n')    # So is trailing padding.
    True

    """
    stripped = txt.strip()
    if isinstance(stripped, str):
        try:
            data = stripped.encode('ascii')
        except UnicodeEncodeError:
            # Non-ASCII characters can never be part of base64 text.
            return False
    else:
        data = bytes(stripped)

    # '=' is only legal as padding at the very end, at most twice, and
    # only after at least one real character.
    body = data.rstrip(b'=')
    padding = len(data) - len(body)
    if padding > 2 or (padding and not body):
        return False

    letters = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = frozenset((letters + '\r\n').encode('ascii'))
    return all(byte in alphabet for byte in body)
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Decode base64 bytes with base64.decodebytes, then decode the
    resulting bytes into a str using encoding / errors.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    """
    decoded = base64.decodebytes(b64)
    return decoded.decode(encoding, errors)
def chunk(txt: str, chunk_size):
    """Chunk up a string.

    Yields consecutive slices of txt that are chunk_size characters
    long.  If the length of txt is not a multiple of chunk_size a
    warning is logged and issued via the warnings module, and the
    final chunk is shorter.  Because this is a generator, the checks
    run when iteration starts, not when chunk() is called.

    Args:
        txt: the string to split.
        chunk_size: the length of each chunk; must be at least 1.

    Raises:
        ValueError: if chunk_size is less than 1.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    """
    if chunk_size < 1:
        # 0 used to surface as ZeroDivisionError and negative sizes
        # silently produced nothing at all.
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size}')
    if len(txt) % chunk_size != 0:
        msg = f'String to chunk\'s length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for x in range(0, len(txt), chunk_size):
        yield txt[x : x + chunk_size]
def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
    """Encode txt and then chop it into bytes, rendering every byte as
    eight binary digits.  Note: only bitstrings with delimiter='' are
    interpretable by from_bitstring.

    Args:
        txt: the text to convert; bytes are accepted and used as-is.
        delimiter: text inserted between the 8-bit groups.
        encoding: the encoding used to turn a str into bytes.
        errors: the error handler passed to str.encode.

    Returns:
        The bits of every byte, most significant bit first.  For
        backward compatibility, empty input yields a single all-zero
        byte ('00000000').

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    """
    data = txt if isinstance(txt, bytes) else txt.encode(encoding, errors)
    if not data:
        return '00000000'
    # Format byte by byte rather than via one big int so that leading
    # zero bytes are not lost.
    return delimiter.join(f'{byte:08b}' for byte in data)
def is_bitstring(txt: str) -> bool:
    """Is this a bitstring?

    A bitstring is a non-empty run of the characters '0' and '1' and
    nothing else (no sign, prefix, separators or whitespace).

    Args:
        txt: the candidate text.

    Returns:
        True if txt consists solely of '0' / '1' characters.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    >>> is_bitstring('01|10')
    False

    """
    # Checked directly rather than through the binary-number regex,
    # whose [0|1] character class also lets '|' through.
    candidate = f'{txt}'
    return bool(candidate) and all(ch in '01' for ch in candidate)
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    The number of bytes is taken from the number of binary digits
    supplied (rounded up to a whole byte), so leading zero bytes are
    preserved.

    Args:
        bits: a string of binary digits, as accepted by int(bits, 2).
        encoding: the encoding used to decode the bytes.
        errors: the error handler passed to bytes.decode.

    Returns:
        The decoded text, or '\\0' if decoding produces an empty string.

    Raises:
        ValueError: if bits is not a valid base-2 number.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    """
    n = int(bits, 2)
    # Count the digits actually given, ignoring the sign, '0b' prefix
    # and '_' separators that int() tolerates.
    digits = bits.decode('ascii') if isinstance(bits, (bytes, bytearray)) else bits
    digits = digits.strip().lstrip('+-')
    if digits[:2].lower() == '0b':
        digits = digits[2:]
    byte_count = (len(digits.replace('_', '')) + 7) // 8
    return n.to_bytes(byte_count, 'big').decode(encoding, errors) or '\0'
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple for sorting purposes.

    Args:
        txt: the dotted-quad address to convert.

    Returns:
        A tuple holding the address' dot-separated parts as ints, or
        None (after logging a warning) if is_ip_v4 rejects txt.  Note
        that None does not compare with tuples, so sorting a list that
        contains a rejected address raises TypeError.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']

    """
    if not is_ip_v4(txt):
        # Library code reports through the module logger rather than
        # writing to stdout.
        logger.warning("not IP: %s", txt)
        return None
    return tuple(int(octet) for octet in txt.split('.'))
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Split a file path on '/' into a tuple of its non-empty
    components, so that parent/ancestor paths sort before
    children/descendant paths.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']

    """
    components = volume.split('/')
    # filter(None, ...) drops the empty strings that leading, trailing
    # or doubled slashes produce.
    return tuple(filter(None, components))
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Replace every character of replace_set with replacement.

    The replacements run one after another, in the order the
    characters appear in replace_set, each working on the result of
    the previous one.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'

    """
    result = in_str
    for victim in replace_set:
        result = result.replace(victim, replacement)
    return result
def replace_nth(in_str: str, source: str, target: str, nth: int) -> str:
    """Replaces the nth occurrence of a substring within a string.

    The substring is matched literally (regex metacharacters in source
    have no special meaning) and occurrences are counted left to
    right without overlapping.

    Args:
        in_str: the string to search.
        source: the substring to look for.
        target: the text that replaces the nth occurrence.
        nth: which occurrence to replace; 1 is the first.

    Returns:
        A copy of in_str with the nth occurrence of source replaced.

    Raises:
        IndexError: if nth is less than 1 or in_str contains fewer
            than nth occurrences of source.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'

    >>> replace_nth('a.b.c', '.', '-', 2)
    'a.b-c'

    """
    if nth < 1:
        raise IndexError(f'nth must be at least 1, got {nth}')
    # An empty source matches at every position; step by at least one
    # character so the search always advances.
    step = max(len(source), 1)
    start = 0
    where = -1
    for _ in range(nth):
        where = in_str.find(source, start)
        if where == -1:
            raise IndexError(f'fewer than {nth} occurrences of {source!r} found')
        start = where + step
    return in_str[:where] + target + in_str[where + len(source) :]
1727 if __name__ == '__main__':