2 # -*- coding: utf-8 -*-
4 """The MIT License (MIT)
6 Copyright (c) 2016-2020 Davide Zanotti
7 Modifications Copyright (c) 2021-2022 Scott Gasch
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
16 The above copyright notice and this permission notice shall be included in all
17 copies or substantial portions of the Software.
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 This class is based on: https://github.com/daveoncode/python-string-utils.
31 import contextlib # type: ignore
42 from itertools import zip_longest
54 from uuid import uuid4
58 logger = logging.getLogger(__name__)
# NOTE: inside a character class every character is already an alternative,
# so a "|" written there is matched literally.  The classes below therefore
# list only the characters that are really allowed (no stray "|").

# Optional sign, then digits with optional fraction and exponent, or a bare
# fraction such as ".5".
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Optional sign, "0x"/"0X" prefix, one or more hex digits.
HEX_NUMBER_RE = re.compile(r"^([+-]?)0[xX]([0-9A-Fa-f]+)$")

# Optional sign, "0o"/"0O" prefix, one or more octal digits.
OCT_NUMBER_RE = re.compile(r"^([+-]?)0[Oo]([0-7]+)$")

# Optional sign, "0b"/"0B" prefix, one or more binary digits.
BIN_NUMBER_RE = re.compile(r"^([+-]?)0[Bb]([01]+)$")
69 r"([a-z-]+://)" # scheme
70 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
72 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
73 r"(:\d{2,})?" # port number
74 r"(/[a-z\d_%+-]*)*" # folders
75 r"(\.[a-z\d_%+-]+)*" # file extension
76 r"(\?[a-z\d_+%-=]*)?" # query string
80 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
82 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
84 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
86 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
88 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
90 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
92 CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")
94 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
96 SNAKE_CASE_TEST_RE = re.compile(r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE)
98 SNAKE_CASE_TEST_DASH_RE = re.compile(r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE)
100 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
102 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
105 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
106 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
107 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
108 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
109 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
110 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
113 JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)
115 UUID_RE = re.compile(r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE)
117 UUID_HEX_OK_RE = re.compile(
118 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
122 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
124 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
# Eight colon-separated groups of up to four hexadecimal digits.  Only the
# hex letters a-f are allowed (case-insensitively); "g".."z" are not valid
# in an IPv6 group.
IP_V6_RE = re.compile(r"^([a-f\d]{0,4}:){7}[a-f\d]{0,4}$", re.IGNORECASE)

# Same shape as IP_V6_RE but unanchored, for searching inside larger text.
ANYWHERE_IP_V6_RE = re.compile(r"([a-f\d]{0,4}:){7}[a-f\d]{0,4}", re.IGNORECASE)
130 MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
132 ANYWHERE_MAC_ADDRESS_RE = re.compile(r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE)
134 WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)
136 HTML_RE = re.compile(
137 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
138 re.IGNORECASE | re.MULTILINE | re.DOTALL,
141 HTML_TAG_ONLY_RE = re.compile(
142 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
143 re.IGNORECASE | re.MULTILINE | re.DOTALL,
146 SPACES_RE = re.compile(r"\s")
148 NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)
150 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
# An ANSI escape sequence: ESC, "[", any run of non-letter parameter
# characters, then one terminating letter.  ESC is written as \x1b so that
# no raw control character has to be embedded in the source file.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Tell whether the input is None or a blank string.

    Args:
        in_str: the string to test; may be None.

    Returns:
        True if in_str is None, empty, or made only of whitespace.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty("   \t   ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    # A string that strips down to nothing counts as empty.
    return not in_str.strip()
def is_string(obj: Any) -> bool:
    """
    Report whether the given object is a str instance.

    Args:
        obj: any object.

    Returns:
        True if obj is a str (or a subclass of str), False otherwise.

    >>> is_string('test')
    True
    >>> is_string([1, 2, 3])
    False
    """
    return isinstance(obj, str)
def is_empty_string(in_str: Any) -> bool:
    """Alias of is_empty(): True for a str that is empty or only whitespace."""
    return is_empty(in_str)
def is_empty(in_str: Any) -> bool:
    """
    Check that the input is a string and is empty or only whitespace.

    Args:
        in_str: any object.

    Returns:
        True only when in_str is a str whose stripped value is ''.
        Non-string inputs yield False.

    >>> is_empty(' \t\t ')
    True
    >>> is_empty([1, 2, 3])
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """
    Check that the input is a string with some non-whitespace content.

    Args:
        in_str: any object.

    Returns:
        True only when in_str is a str that is neither '' nor all
        whitespace.  Non-string inputs yield False.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string('      ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """
    Check whether a string spells a valid number.

    Args:
        in_str: the string to test.

    Returns:
        True if in_str matches NUMBER_RE, False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]
    """
    if is_string(in_str):
        return NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """
    Check whether the given string represents an integer.

    An integer may be signed or unsigned or use a "scientific notation".
    Hex (0x...), octal (0o...) and binary (0b...) literals also count.

    Args:
        in_str: the string to test.

    Returns:
        True if the string is an integer in one of the accepted forms.

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    # Plain decimal number without a fractional part.
    if is_number(in_str) and "." not in in_str:
        return True
    # Otherwise it must be one of the prefixed literal forms.
    return (
        is_hexidecimal_integer_number(in_str)
        or is_octal_integer_number(in_str)
        or is_binary_integer_number(in_str)
    )
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Check whether a string is a hexadecimal integer literal ("0x" prefixed).

    Args:
        in_str: the string to test.

    Returns:
        True if in_str matches HEX_NUMBER_RE, False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('0x1A3E')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number('test')
    False
    >>> is_hexidecimal_integer_number(12345)  # Not a string
    Traceback (most recent call last):
    ...
    ValueError: 12345
    >>> is_hexidecimal_integer_number(101.4)
    Traceback (most recent call last):
    ...
    ValueError: 101.4
    >>> is_hexidecimal_integer_number(0x1A3E)
    Traceback (most recent call last):
    ...
    ValueError: 6718
    """
    if is_string(in_str):
        return HEX_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """
    Check whether a string is an octal integer literal ("0o" prefixed).

    Args:
        in_str: the string to test.

    Returns:
        True if in_str matches OCT_NUMBER_RE, False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    >>> is_octal_integer_number('test')
    False
    """
    if is_string(in_str):
        return OCT_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """
    Check whether a string is a binary integer literal ("0b" prefixed).

    Args:
        in_str: the string to test.

    Returns:
        True if in_str matches BIN_NUMBER_RE, False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0B10101')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    >>> is_binary_integer_number('test')
    False
    """
    if is_string(in_str):
        return BIN_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def to_int(in_str: str) -> int:
    """Returns the integral value of the string or raises on error.

    Binary (0b), octal (0o) and hexadecimal (0x) literals are parsed in
    their own base; anything else is handed to int() as base 10.

    Args:
        in_str: the string to convert.

    Returns:
        The integer value of in_str.

    Raises:
        ValueError: if in_str is not a string or cannot be parsed.

    >>> to_int('1234')
    1234
    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # (recognizer, base) pairs for the prefixed literal forms.
    prefixed_forms = (
        (is_binary_integer_number, 2),
        (is_octal_integer_number, 8),
        (is_hexidecimal_integer_number, 16),
    )
    for recognizer, base in prefixed_forms:
        if recognizer(in_str):
            return int(in_str, base)
    return int(in_str)
def is_decimal_number(in_str: str) -> bool:
    """
    Check whether the given string represents a decimal number.

    A decimal may be signed or unsigned or use a "scientific notation";
    what distinguishes it from an integer here is the presence of a ".".

    Args:
        in_str: the string to test.

    Returns:
        True if in_str is a number that contains a decimal point.

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    if not is_number(in_str):
        return False
    return "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    """
    Remove escape sequences in the input string.

    Every match of ESCAPE_SEQUENCE_RE is deleted from the text.

    Args:
        in_str: the text to clean.

    Returns:
        in_str with all matched escape sequences removed.

    >>> strip_escape_sequences('\\x1b[12;11;22mthis is a test!')
    'this is a test!'
    """
    return ESCAPE_SEQUENCE_RE.sub("", in_str)
def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """
    Add thousands separator to a numeric string.  Also handles numbers.

    Args:
        in_str: a numeric string, or a number (which is first formatted
            as a string).
        separator_char: the text inserted between digit groups.
        places: how many digits make up one group.

    Returns:
        The number with separators inserted in its integer part.

    Raises:
        ValueError: if the input is not a valid number.

    >>> add_thousands_separator('12345678')
    '12,345,678'
    >>> add_thousands_separator(12345678)
    '12,345,678'
    >>> add_thousands_separator(12345678.99)
    '12,345,678.99'
    >>> add_thousands_separator('test')
    Traceback (most recent call last):
    ...
    ValueError: test
    """
    # Accept real numbers too by turning them into their string form.
    if isinstance(in_str, numbers.Number):
        in_str = f'{in_str}'
    if not is_number(in_str):
        raise ValueError(in_str)
    return _add_thousands_separator(in_str, separator_char=separator_char, places=places)
def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """
    Insert separator_char between groups of `places` digits.

    Only the integer part is grouped; a fractional part (after ".") is
    appended unchanged.  A leading "+" or "-" is set aside before grouping
    so that it is never treated as a digit (otherwise "-123456" would come
    out as "-,123,456").

    Args:
        in_str: a numeric string (already validated by the caller).
        separator_char: the text inserted between digit groups.
        places: how many digits make up one group.

    Returns:
        The grouped number as a string.
    """
    sign = ""
    if in_str[:1] in ("+", "-"):
        sign, in_str = in_str[0], in_str[1:]
    decimal_part = ""
    if '.' in in_str:
        (in_str, decimal_part) = in_str.split('.')
    # Group from the right: reverse, take `places` chars at a time from one
    # shared iterator, join the groups, then reverse back.
    groups = [iter(in_str[::-1])] * places
    ret = separator_char.join("".join(g) for g in zip_longest(*groups, fillvalue=""))[::-1]
    if decimal_part:
        ret += '.' + decimal_part
    return sign + ret
# Full url example:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Check if a string is a valid url.

    Args:
        in_str: the value to test.
        allowed_schemes: optional list of prefixes; when given (and not
            empty) the url must also start with one of them.

    Returns:
        True if in_str is a non-blank string matching URL_RE (and, when
        requested, starting with an allowed scheme).

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('https://mysite.com')
    True
    >>> is_url('.mysite.com')
    False
    """
    if not is_full_string(in_str):
        return False

    if URL_RE.match(in_str) is None:
        return False
    if not allowed_schemes:
        return True
    return any(in_str.startswith(scheme) for scheme in allowed_schemes)
def is_email(in_str: Any) -> bool:
    """
    Check if a string is a valid email.

    Reference: https://tools.ietf.org/html/rfc3696#section-3

    Args:
        in_str: the value to test.

    Returns:
        True if in_str looks like a valid email address.

    >>> is_email('my.email@the-provider.com')
    True
    >>> is_email('@gmail.com')
    False
    """
    if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
        return False

    pieces = in_str.split("@")
    if len(pieces) != 2:
        # Not exactly one "@".  Borderline case: several "@" signs where
        # the ones in the head part are correctly escaped/quoted.
        if ESCAPED_AT_SIGN.search(in_str) is None:
            return False
        # replace the escaped "@" with "a" in the head and re-check
        return is_email(ESCAPED_AT_SIGN.sub("a", in_str))

    head, tail = pieces

    # head's size must be <= 64, tail <= 255, head must not end
    # with a dot or contain multiple consecutive dots.
    if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
        return False

    # removes escaped spaces, so that later on the test regex will
    # accept the string.
    head = head.replace("\\ ", "")
    if head.startswith('"') and head.endswith('"'):
        head = head.replace(" ", "")[1:-1]
    return EMAIL_RE.match(head + "@" + tail) is not None
501 def suffix_string_to_number(in_str: str) -> Optional[int]:
502 """Take a string like "33Gb" and convert it into a number (of bytes)
503 like 34603008. Return None if the input string is not valid.
505 >>> suffix_string_to_number('1Mb')
507 >>> suffix_string_to_number('13.1Gb')
511 def suffix_capitalize(s: str) -> str:
515 return f"{s[0].upper()}{s[1].lower()}"
516 return suffix_capitalize(s[0:1])
518 if is_string(in_str):
519 if is_integer_number(in_str):
520 return to_int(in_str)
521 suffixes = [in_str[-2:], in_str[-1:]]
522 rest = [in_str[:-2], in_str[:-1]]
523 for x in range(len(suffixes)):
525 s = suffix_capitalize(s)
526 multiplier = NUM_SUFFIXES.get(s, None)
527 if multiplier is not None:
529 if is_integer_number(r):
530 return to_int(r) * multiplier
531 if is_decimal_number(r):
532 return int(float(r) * multiplier)
536 def number_to_suffix_string(num: int) -> Optional[str]:
537 """Take a number (of bytes) and returns a string like "43.8Gb".
538 Returns none if the input is invalid.
540 >>> number_to_suffix_string(14066017894)
542 >>> number_to_suffix_string(1024 * 1024)
548 for (sfx, size) in NUM_SUFFIXES.items():
553 if suffix is not None:
554 return f"{d:.1f}{suffix}"
def is_credit_card(in_str: Any, card_type: str = None) -> bool:
    """
    Checks if a string is a valid credit card number.
    If card type is provided then it checks against that specific type only,
    otherwise any known credit card number will be accepted.

    Supported card types are the keys of CREDIT_CARDS:

    - VISA
    - MASTERCARD
    - AMERICAN_EXPRESS
    - DINERS_CLUB
    - DISCOVER
    - JCB

    Args:
        in_str: the value to test.
        card_type: optional card type name to restrict the check to.

    Returns:
        True if in_str matches the requested (or any known) card pattern.

    Raises:
        KeyError: if card_type is given but is not a known type.
    """
    if not is_full_string(in_str):
        return False

    if card_type is None:
        # No type requested: any known card pattern will do.
        return any(CREDIT_CARDS[name].match(in_str) is not None for name in CREDIT_CARDS)

    if card_type not in CREDIT_CARDS:
        raise KeyError(
            f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
        )
    return CREDIT_CARDS[card_type].match(in_str) is not None
def is_camel_case(in_str: Any) -> bool:
    """
    Checks if a string is formatted as camel case.

    A string is considered camel case when:

    - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
    - it contains both lowercase and uppercase letters
    - it does not start with a number

    Args:
        in_str: the value to test.

    Returns:
        True if in_str is a non-blank string matching CAMEL_CASE_TEST_RE.
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """
    Checks if a string is formatted as "snake case".

    A string is considered snake case when:

    - it's composed only by lowercase/uppercase letters and digits
    - it contains at least one underscore (or provided separator)
    - it does not start with a number

    Args:
        in_str: the value to test.
        separator: the word separator to look for.

    Returns:
        True if in_str is snake case with respect to separator.

    >>> is_snake_case('this_is_a_test')
    True
    >>> is_snake_case('___This_Is_A_Test_1_2_3___')
    True
    >>> is_snake_case('this-is-a-test')
    False
    >>> is_snake_case('this-is-a-test', separator='-')
    True
    """
    if not is_full_string(in_str):
        return False

    # Use the precompiled patterns for the two common separators and
    # build one from the template for anything else.
    if separator == "_":
        pattern = SNAKE_CASE_TEST_RE
    elif separator == "-":
        pattern = SNAKE_CASE_TEST_DASH_RE
    else:
        re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
        pattern = re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE)
    return pattern.match(in_str) is not None
def is_json(in_str: Any) -> bool:
    """
    Check if a string is a valid json.

    Only JSON documents whose top level is an object or an array count.

    Args:
        in_str: the value to test.

    Returns:
        True if in_str parses as a JSON object or array.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    # Cheap shape check first: must look like {...} or [...].
    if not is_full_string(in_str) or JSON_WRAPPER_RE.match(in_str) is None:
        return False
    try:
        parsed = json.loads(in_str)
    except (TypeError, ValueError, OverflowError):
        return False
    return isinstance(parsed, (dict, list))
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Check if a string is a valid UUID.

    Args:
        in_str: the value to test; it is converted with str() first.
        allow_hex: when True the dashes between groups are optional.

    Returns:
        True if the string form of in_str matches the UUID pattern.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # string casting is used to allow UUID itself as input data type
    text = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(text) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip v4.

    Args:
        in_str: the value to test.

    Returns:
        True if in_str is four dot-separated numbers, each in 0..255.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str):
        return False
    if SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # checks that each entry in the ip is in the valid range (0 to 255)
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Extracts the IPv4 chunk of a string or None.

    Args:
        in_str: the text to search.

    Returns:
        The first substring matching ANYWHERE_IP_V4_RE, or None when the
        input is not a non-blank string or nothing matches.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V4_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip_v6(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip v6.

    Args:
        in_str: the value to test.

    Returns:
        True if in_str is a non-blank string matching IP_V6_RE.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Extract IPv6 chunk or None.

    Args:
        in_str: the text to search.

    Returns:
        The first substring matching ANYWHERE_IP_V6_RE, or None when the
        input is not a non-blank string or nothing matches.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V6_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip (either v4 or v6).

    Args:
        in_str: the value to test.

    Returns:
        True if in_str passes is_ip_v6() or is_ip_v4().

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3.999')
    False
    """
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Extract the IP address or None.

    An IPv4 address is looked for first; only if none is found is the
    text searched for an IPv6 address.

    Args:
        in_str: the text to search.

    Returns:
        The extracted address, or None if neither kind is found.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4 = extract_ip_v4(in_str)
    if v4 is not None:
        return v4
    return extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """Return True if in_str is a valid MAC address false otherwise.

    Both ":" and "-" are accepted between the six hex pairs, in either
    letter case.

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the MAC address from in_str.

    The first MAC address found is returned with its ":" / "-" group
    delimiters rewritten to `separator`.

    Args:
        in_str: the text to search.
        separator: the delimiter to use between hex pairs in the result.

    Returns:
        The normalized MAC address, or None when the input is not a
        non-blank string or contains no MAC address.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'

    >>> extract_mac_address('34-29-8F-12-0D-2F', separator='.')
    '34.29.8F.12.0D.2F'
    """
    if not is_full_string(in_str):
        return None
    in_str = in_str.strip()
    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None
    mac = m.group(0)
    # str methods return new strings, so the result must be kept.  A single
    # translate pass rewrites both delimiters without re-scanning any
    # separator text that was just inserted.
    return mac.translate(str.maketrans({":": separator, "-": separator}))
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Checks if a given string is a slug (as created by `slugify()`).

    Args:
        in_str: the value to test.
        separator: the join character expected between words.

    Returns:
        True if in_str consists of lowercase letters/digits optionally
        joined by separator, False otherwise.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False
    pattern_text = "".join((r"^([a-z\d]+", re.escape(separator), r"*?)*[a-z\d]$"))
    return re.match(pattern_text, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Checks if the given string contains HTML/XML tags.

    By design, this function matches ANY type of tag, so don't expect to use it
    as an HTML validator, its goal is to detect "malicious" or undesired tags in the text.

    Args:
        in_str: the text to inspect.

    Returns:
        True if HTML_RE finds a tag anywhere in the text.

    Raises:
        ValueError: if in_str is not a string.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
def words_count(in_str: str) -> int:
    """
    Returns the number of words contained into the given string.

    This method is smart, it does consider only sequence of one or more letter and/or numbers
    as "words", so a string like this: "! @ # % ... []" will return zero!
    Moreover it is aware of punctuation, so the count for a string like "one,two,three.stop"
    will be 4 not 1 (even if there are no spaces in the string).

    Args:
        in_str: the text whose words are counted.

    Returns:
        The number of matches of WORDS_COUNT_RE in the text.

    Raises:
        ValueError: if in_str is not a string.

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if is_string(in_str):
        matches = WORDS_COUNT_RE.findall(in_str)
        return len(matches)
    raise ValueError(in_str)
860 def generate_uuid(omit_dashes: bool = False) -> str:
862 Generated an UUID string (using `uuid.uuid4()`).
864 generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
865 generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Returns a string of the specified size containing random
    characters (uppercase/lowercase ascii letters and digits).

    Args:
        size: the number of characters to generate; must be >= 1.

    Returns:
        A random string of exactly `size` characters.

    Raises:
        ValueError: if size is smaller than 1.

    generate_random_alphanumeric_string(9)  # possible output: "cx3QQbzYg"
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(size):
        picked.append(random.choice(alphabet))
    return from_char_list(picked)
def reverse(in_str: str) -> str:
    """
    Returns the string with its chars reversed.

    Args:
        in_str: the string to reverse.

    Returns:
        A new string with the characters in opposite order.

    Raises:
        ValueError: if in_str is not a string.

    >>> reverse('test')
    'tset'
    """
    if is_string(in_str):
        return "".join(reversed(in_str))
    raise ValueError(in_str)
def camel_case_to_snake_case(in_str, *, separator="_"):
    """
    Convert a camel case string into a snake case one.
    (The original string is returned if is not a valid camel case string)

    Args:
        in_str: the string to convert.
        separator: the text placed between the words of the result.

    Returns:
        The lower-cased, separator-joined form of in_str, or in_str
        unchanged if it is not camel case.

    Raises:
        ValueError: if in_str is not a string.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str

    def _append_separator(match):
        # Keep the matched chunk and put the separator right after it.
        return match.group(1) + separator

    joined = CAMEL_CASE_REPLACE_RE.sub(_append_separator, in_str)
    return joined.lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Convert a snake case string into a camel case one.
    (The original string is returned if is not a valid snake case string)

    Args:
        in_str: the string to convert.
        upper_case_first: when False the first word is lower-cased.
        separator: the word separator used in in_str.

    Returns:
        The camel case form of in_str, or in_str unchanged if it is not
        snake case.

    Raises:
        ValueError: if in_str is not a string.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str

    # Title-case every non-blank piece between separators.
    words = []
    for piece in in_str.split(separator):
        if is_full_string(piece):
            words.append(piece.title())
    if not upper_case_first:
        words[0] = words[0].lower()
    return from_char_list(words)
def to_char_list(in_str: str) -> List[str]:
    """Convert a string into a list of chars.

    Args:
        in_str: the string to split into characters.

    Returns:
        A list with one entry per character; an empty list if in_str
        is not a string.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    if is_string(in_str):
        return list(in_str)
    return []
def from_char_list(in_list: List[str]) -> str:
    """Convert a char list into a string.

    Args:
        in_list: the strings to concatenate, in order.

    Returns:
        All items of in_list joined together with nothing in between.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
def shuffle(in_str: str) -> str:
    """Return a new string containing same chars of the given one but in
    a randomized order.

    Args:
        in_str: the string to shuffle.

    Returns:
        A string with the same characters in random order.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # work on a list of chars because strings cannot be shuffled in place
    letters = to_char_list(in_str)
    random.shuffle(letters)
    return from_char_list(letters)
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Remove html code contained into the given string.

    Args:
        in_str: the text to clean.
        keep_tag_content: when True only the tags themselves are removed
            and the text between them is kept.

    Returns:
        The text with the html removed.

    Raises:
        ValueError: if in_str is not a string.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if keep_tag_content:
        pattern = HTML_TAG_ONLY_RE
    else:
        pattern = HTML_RE
    return pattern.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Force string content to be ascii-only by translating all non-ascii
    chars into the closest possible representation (eg: ó -> o, Ë ->
    E, ç -> c...).

    N.B. Some chars may be lost if impossible to translate.

    Args:
        in_str: the text to convert.

    Returns:
        An ascii-only version of in_str.

    Raises:
        ValueError: if in_str is not a string.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # NFKD decomposes accented characters into base char + combining
    # mark; encoding to ascii with "ignore" then drops whatever has no
    # ascii representation (the marks and any untranslatable chars).
    decomposed = unicodedata.normalize("NFKD", in_str)
    return decomposed.encode("ascii", "ignore").decode("utf-8")
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Converts a string into a "slug" using provided separator.
    The returned string has the following properties:

    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using provided separator
    - all chars are encoded as ascii (by using `asciify()`)

    Args:
        in_str: the text to convert.
        separator: the text placed between words.

    Returns:
        The slug built from in_str.

    Raises:
        ValueError: if in_str is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # lower-case, then turn every run of non letters/numbers into a space
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # each whitespace char becomes the join sign
    joined = SPACES_RE.sub(separator, spaced)

    # collapse runs of the join sign into a single one
    collapsed = re.sub(re.escape(separator) + r"+", separator, joined)
    return asciify(collapsed)
def to_bool(in_str: str) -> bool:
    """
    Turns a string into a boolean based on its content (CASE INSENSITIVE).

    A positive boolean (True) is returned if the string value is one
    of the following:

    - "true"
    - "1"
    - "yes"
    - "y"
    - "t"
    - "on"

    Otherwise False is returned.

    Args:
        in_str: the string to interpret.

    Returns:
        True for one of the values above (in any letter case), else False.

    Raises:
        ValueError: if in_str is not a string.

    >>> to_bool('True')
    True
    >>> to_bool('no')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    truthy_values = ("true", "1", "yes", "y", "t", "on")
    return in_str.lower() in truthy_values
1084 def to_date(in_str: str) -> Optional[datetime.date]:
1086 Parses a date string. See DateParser docs for details.
1088 import dateparse.dateparse_utils as du
1091 d = du.DateParser() # type: ignore
1094 except du.ParseException: # type: ignore
1095 msg = f'Unable to parse date {in_str}.'
1100 def valid_date(in_str: str) -> bool:
1102 True if the string represents a valid date.
1104 import dateparse.dateparse_utils as dp
1107 d = dp.DateParser() # type: ignore
1110 except dp.ParseException: # type: ignore
1111 msg = f'Unable to parse date {in_str}.'
1116 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1118 Parses a datetime string. See DateParser docs for more info.
1120 import dateparse.dateparse_utils as dp
1123 d = dp.DateParser() # type: ignore
1124 dt = d.parse(in_str)
1125 if isinstance(dt, datetime.datetime):
1128 msg = f'Unable to parse datetime {in_str}.'
1133 def valid_datetime(in_str: str) -> bool:
1135 True if the string represents a valid datetime.
1137 _ = to_datetime(in_str)
1140 msg = f'Unable to parse datetime {in_str}.'
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Squeeze runs of more than one character_to_squeeze into one.

    Args:
        in_str: the text to process.
        character_to_squeeze: the character (or string) whose
            consecutive repetitions are collapsed.

    Returns:
        in_str with every run of character_to_squeeze replaced by a
        single occurrence.

    >>> squeeze('a,,,b,,c', character_to_squeeze=',')
    'a,b,c'

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'
    """
    run_of_target = r'(' + re.escape(character_to_squeeze) + r')+'
    return re.sub(run_of_target, character_to_squeeze, in_str)
def dedent(in_str: str) -> str:
    """
    Removes tab indentation from multi line strings (inspired by analogous Scala function).

    Leading whitespace matched by MARGIN_RE is removed from every line.

    Args:
        in_str: the (possibly multi line) text to process.

    Returns:
        The text with each line's leading margin removed.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    stripped_lines = []
    for line in in_str.split('\n'):
        stripped_lines.append(MARGIN_RE.sub('', line))
    return '\n'.join(stripped_lines)
def indent(in_str: str, amount: int) -> str:
    """
    Indents string by prepending amount spaces.

    Every line of the input (split on newlines) gets the same padding.

    Args:
        in_str: the text to indent.
        amount: the number of spaces to put before each line.

    Returns:
        The indented text.

    Raises:
        ValueError: if in_str is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    padding = " " * amount
    return '\n'.join(padding + line for line in in_str.split('\n'))
1189 def sprintf(*args, **kwargs) -> str:
1190 """String printf, like in C"""
1193 sep = kwargs.pop("sep", None)
1195 if not isinstance(sep, str):
1196 raise TypeError("sep must be None or a string")
1198 end = kwargs.pop("end", None)
1200 if not isinstance(end, str):
1201 raise TypeError("end must be None or a string")
1204 raise TypeError("invalid keyword arguments to sprint()")
1210 for i, arg in enumerate(args):
1213 if isinstance(arg, str):
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that captures outputs to stdout.

    Entering the context yields a callable; calling it returns
    everything written to stdout so far while the context was active.

    with SprintfStdout() as buf:
        print("test")
    print(buf())
    """

    def __init__(self) -> None:
        # In-memory buffer that receives the redirected output.
        self.destination = io.StringIO()
        # Set in __enter__; the active stdout redirection.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        """Start redirecting stdout and return a getter for the captured text."""
        redirection = contextlib.redirect_stdout(self.destination)
        redirection.__enter__()
        self.recorder = redirection
        return lambda: self.destination.getvalue()

    def __exit__(self, *args) -> Literal[False]:
        """Stop redirecting stdout; never suppresses exceptions."""
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        return False
def capitalize_first_letter(txt: str) -> str:
    """Capitalize the first letter of a string.

    Only the first character is changed; the rest of the string is left
    exactly as it is.  An empty string is returned unchanged.

    Args:
        txt: the string to capitalize.

    Returns:
        txt with its first character upper-cased.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    # Slicing (instead of txt[0]) keeps this safe for the empty string.
    return txt[:1].upper() + txt[1:]
1259 def it_they(n: int) -> str:
1273 def is_are(n: int) -> str:
1287 def pluralize(n: int) -> str:
1293 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1296 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1305 def make_contractions(txt: str) -> str:
1306 """Glue words together to form contractions.
1308 >>> make_contractions('It is nice today.')
1311 >>> make_contractions('I can not even...')
1314 >>> make_contractions('She could not see!')
1317 >>> make_contractions('But she will not go.')
1320 >>> make_contractions('Verily, I shall not.')
1323 >>> make_contractions('No you cannot.')
1326 >>> make_contractions('I said you can not go.')
1327 "I said you can't go."
1364 ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
1368 # Special cases: can't, shan't and won't.
1369 txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
1370 txt = re.sub(r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE)
1372 r'\b(w)ill\s*(n)(o)(t)\b',
1376 flags=re.IGNORECASE,
1379 for first_list, second_list in first_second:
1380 for first in first_list:
1381 for second in second_list:
1382 # Disallow there're/where're. They're valid English
1384 if (first in ('there', 'where')) and second == 'a(re)':
1387 pattern = fr'\b({first})\s+{second}\b'
1388 if second == '(n)o(t)':
1389 replacement = r"\1\2'\3"
1391 replacement = r"\1'\2"
1392 txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)
1397 def thify(n: int) -> str:
1398 """Return the proper cardinal suffix for a number.
1409 assert is_integer_number(digit)
1421 def ngrams(txt: str, n: int):
1422 """Return the ngrams from a string.
1424 >>> [x for x in ngrams('This is a test', 2)]
1425 ['This is', 'is a', 'a test']
1429 for ngram in ngrams_presplit(words, n):
def ngrams_presplit(words: Sequence[str], n: int):
    """Return the ngrams of an already-split word sequence.

    Thin wrapper that delegates to list_utils.ngrams(words, n).
    """
    return list_utils.ngrams(words, n)
def bigrams(txt: str):
    """Return the 2-grams of txt; shorthand for ngrams(txt, 2)."""
    return ngrams(txt, 2)
def trigrams(txt: str):
    """Return the 3-grams of txt; shorthand for ngrams(txt, 3)."""
    return ngrams(txt, 3)
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.  The column_specs argument is an iterable collection of
    numeric sequences that indicate one or more column numbers to
    copy.

    Args:
        input_lines: the input columns, indexed by the numbers in the specs.
        column_specs: one sequence of column indexes per output item.
        delim: the text placed between the columns merged into one item.

    Returns:
        A list with one merged string per spec, in spec order.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim=' ',
    ... )
    ['acl_test.py', 'scott wheel', 'Jul 9 11:34']
    """
    # Column specs map input lines' columns into outputs.
    # [col1, col2...]
    merged = []
    for spec in column_specs:
        pieces = [input_lines[n] for n in spec]
        # Trim delimiter characters from both ends of the merged hunk.
        merged.append(delim.join(pieces).strip(delim))
    return merged
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim='',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: the input columns, indexed by the numbers in the specs.
        column_specs: (key, column indexes) pairs, one per output entry.
        delim: the text placed between the columns merged into one value.

    Returns:
        A dict mapping each spec's key to its merged string.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim=' ',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}
    """
    # Column specs map input lines' columns into outputs.
    # "key", [col1, col2...]
    merged = {}
    for spec in column_specs:
        pieces = [input_lines[n] for n in spec[1]]
        # Trim delimiter characters from both ends of the merged hunk.
        merged[spec[0]] = delim.join(pieces).strip(delim)
    return merged
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Interpolate a string with data from a dict.

    Args:
        txt: a str.format()-style template with named fields.
        values: the values to substitute, keyed by field name.

    Returns:
        The template with its fields filled in.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    filled = txt.format(**values)
    return sprintf(filled, end='')
def to_ascii(x: str):
    """Encode as ascii bytes string.

    Args:
        x: a str to encode, or bytes which are passed through untouched.

    Returns:
        The ascii-encoded bytes.

    Raises:
        Exception: if x is neither a str nor bytes.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(x, bytes):
        # Already bytes: nothing to encode.
        return x
    if isinstance(x, str):
        return x.encode('ascii')
    raise Exception('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode txt and then encode the bytes with a 64-character
    alphabet.  This is compatible with uudecode.

    Args:
        txt: the text to encode.
        encoding: the text encoding applied before base64 encoding.
        errors: the error handler passed to str.encode().

    Returns:
        The base64 representation as bytes (newline terminated).

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
def is_base64(txt: str) -> bool:
    """Determine whether a string is base64 encoded (with Python's standard
    base64 alphabet which is the same as what uuencode uses).

    Surrounding whitespace is ignored and up to two trailing "="
    padding characters are accepted, so that the output of to_base64()
    is recognized whatever the length of the encoded data.

    Args:
        txt: the str or bytes to test.

    Returns:
        True if every character (padding aside) is in the base64 alphabet.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64(b'aGVsbG8=\\n')    # So is "=" padding.
    True
    """
    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = set(a.encode('ascii'))
    encoded = to_ascii(txt.strip())
    # "=" only ever appears as padding at the very end, at most twice.
    payload = encoded.rstrip(b'=')
    if len(encoded) - len(payload) > 2:
        return False
    return all(char in alphabet for char in payload)
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert base64 encoded string back to normal strings.

    Args:
        b64: the base64 encoded bytes.
        encoding: the text encoding of the decoded bytes.
        errors: the error handler passed to bytes.decode().

    Returns:
        The decoded text.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    raw = base64.decodebytes(b64)
    return raw.decode(encoding, errors)
def chunk(txt: str, chunk_size):
    """Chunk up a string.

    Yields consecutive slices of txt that are chunk_size long.  If the
    length of txt is not a multiple of chunk_size a warning is logged
    and issued, and the last slice is shorter than the others.

    Args:
        txt: the string to cut up.
        chunk_size: the length of each chunk.

    Yields:
        The successive chunks of txt.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    if len(txt) % chunk_size != 0:
        msg = f'String to chunk\'s length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for x in range(0, len(txt), chunk_size):
        yield txt[x : x + chunk_size]
def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
    """Encode txt and then chop it into bytes.  Note: only bitstrings
    with delimiter='' are interpretable by from_bitstring.

    Args:
        txt: the str (or bytes) to convert; it is passed through to_ascii().
        delimiter: the text placed between the 8-bit groups.
        encoding: accepted but not used by the visible code.
        errors: accepted but not used by the visible code.

    Returns:
        The bits of the input as a string of "0"/"1" characters.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'
    """
    etxt = to_ascii(txt)
    as_int = int.from_bytes(etxt, 'big')
    # bin() yields "0b..."; drop the two-character prefix.
    bits = bin(as_int)[2:]
    # Left-pad with zeros up to the next multiple of eight bits.
    padded_width = 8 * ((len(bits) + 7) // 8)
    return delimiter.join(chunk(bits.zfill(padded_width), 8))
def is_bitstring(txt: str) -> bool:
    """Is this a bitstring?

    The text is a bitstring when, prefixed with "0b", it is a valid
    binary integer literal.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    candidate = f'0b{txt}'
    return is_binary_integer_number(candidate)
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    Args:
        bits: a string of "0"/"1" characters.
        encoding: the text encoding of the resulting bytes.
        errors: the error handler passed to bytes.decode().

    Returns:
        The decoded text; a single NUL character if the bits decode to
        an empty string.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'
    """
    n = int(bits, 2)
    byte_count = (n.bit_length() + 7) // 8
    decoded = n.to_bytes(byte_count, 'big').decode(encoding, errors)
    # An all-zero value produces no bytes at all; represent it as NUL.
    return decoded or '\0'
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple for sorting purposes.

    Args:
        txt: the IPv4 address as text.

    Returns:
        A tuple with the four numeric parts of the address, or None
        (after printing a message) if txt is not a valid IPv4 address.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if is_ip_v4(txt):
        return tuple(int(part) for part in txt.split('.'))
    print(f"not IP: {txt}")
    return None
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Chunk up a file path so that parent/ancestor paths sort before
    children/descendant paths.

    Args:
        volume: a "/"-separated path.

    Returns:
        The non-empty components of the path, in order, as a tuple.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    components = volume.split('/')
    # Empty components come from leading, trailing or doubled slashes.
    return tuple(part for part in components if part)
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Execute several replace operations in a row.

    Each character of replace_set is replaced, one after the other and
    in the order given, by replacement.

    Args:
        in_str: the text to process.
        replace_set: the characters to replace.
        replacement: the text substituted for each of them.

    Returns:
        The text after all the replacements.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
1683 if __name__ == '__main__':