2 # -*- coding: utf-8 -*-
4 """The MIT License (MIT)
6 Copyright (c) 2016-2020 Davide Zanotti
7 Modifications Copyright (c) 2021-2022 Scott Gasch
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
16 The above copyright notice and this permission notice shall be included in all
17 copies or substantial portions of the Software.
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 This class is based on: https://github.com/daveoncode/python-string-utils.
31 import contextlib # type: ignore
42 from itertools import zip_longest
54 from uuid import uuid4
58 logger = logging.getLogger(__name__)
# Decimal number: optional sign, then digits with optional fraction and
# optional exponent, or a bare fraction such as ".5".
# NOTE: inside a character class "|" is a literal pipe, not alternation, so
# the classes below list only the characters that are really allowed.
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Hexadecimal integer literal with optional sign, e.g. "-0xFF".
HEX_NUMBER_RE = re.compile(r"^([+-]?)0[xX]([0-9A-Fa-f]+)$")

# Octal integer literal with optional sign, e.g. "0o777".
OCT_NUMBER_RE = re.compile(r"^([+-]?)0[Oo]([0-7]+)$")

# Binary integer literal with optional sign, e.g. "0b1011".
BIN_NUMBER_RE = re.compile(r"^([+-]?)0[Bb]([01]+)$")
69 r"([a-z-]+://)" # scheme
70 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
72 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
73 r"(:\d{2,})?" # port number
74 r"(/[a-z\d_%+-]*)*" # folders
75 r"(\.[a-z\d_%+-]+)*" # file extension
76 r"(\?[a-z\d_+%-=]*)?" # query string
80 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
82 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
84 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
86 EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
88 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
90 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
92 CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")
94 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
96 SNAKE_CASE_TEST_RE = re.compile(r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE)
98 SNAKE_CASE_TEST_DASH_RE = re.compile(r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE)
100 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
102 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
105 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
106 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
107 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
108 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
109 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
110 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
113 JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)
115 UUID_RE = re.compile(r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE)
117 UUID_HEX_OK_RE = re.compile(
118 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
122 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
124 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
# Full (uncompressed) IPv6 address: eight colon-separated groups of up to four
# hexadecimal digits.  Only hex digits (0-9, a-f) are valid in a group.
IP_V6_RE = re.compile(r"^([a-f\d]{0,4}:){7}[a-f\d]{0,4}$", re.IGNORECASE)

# Same shape as IP_V6_RE but unanchored, for finding an address inside text.
ANYWHERE_IP_V6_RE = re.compile(r"([a-f\d]{0,4}:){7}[a-f\d]{0,4}", re.IGNORECASE)
130 MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
132 ANYWHERE_MAC_ADDRESS_RE = re.compile(r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE)
134 WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)
136 HTML_RE = re.compile(
137 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
138 re.IGNORECASE | re.MULTILINE | re.DOTALL,
141 HTML_TAG_ONLY_RE = re.compile(
142 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
143 re.IGNORECASE | re.MULTILINE | re.DOTALL,
146 SPACES_RE = re.compile(r"\s")
148 NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)
150 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
# Matches an escape sequence: ESC, "[", any run of non-letters, then one
# terminating letter.  ESC is spelled \x1b because "\e" is not an escape the
# re module understands.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Tell whether the input is None or a string with no non-whitespace content.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty("   \t   ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    stripped = in_str.strip()
    return len(stripped) == 0
def is_string(obj: Any) -> bool:
    """
    Report whether ``obj`` is an instance of ``str``.

    >>> is_string('test')
    True
    >>> is_string(123)
    False
    >>> is_string([1, 2, 3])
    False
    """
    verdict = isinstance(obj, str)
    return verdict
def is_empty_string(in_str: Any) -> bool:
    """Alias for :func:`is_empty`; returns whatever ``is_empty(in_str)`` returns."""
    result = is_empty(in_str)
    return result
def is_empty(in_str: Any) -> bool:
    """
    Tell whether the input is a string that is empty or whitespace-only.

    Non-string inputs are never considered empty.

    >>> is_empty('')
    True
    >>> is_empty('    \t\t    ')
    True
    >>> is_empty('test')
    False
    >>> is_empty([1, 2, 3])
    False
    """
    if not is_string(in_str):
        return False
    return not in_str.strip()
def is_full_string(in_str: Any) -> bool:
    """
    Tell whether the input is a string with at least one non-whitespace char.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string('      ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    if not is_string(in_str):
        return False
    return bool(in_str.strip())
def is_number(in_str: str) -> bool:
    """
    Tell whether a string matches ``NUMBER_RE`` (a decimal number).

    Raises:
        ValueError: if ``in_str`` is not a string.

    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]
    """
    if is_string(in_str):
        return NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """
    Tell whether the given string represents an integer.

    A string qualifies when it is a number without a "." in it, or when it
    is a hexadecimal, octal or binary integer literal.

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    # Plain decimal integers: a valid number with no fractional separator.
    if is_number(in_str) and "." not in in_str:
        return True
    # Otherwise accept the prefixed literal forms.
    return (
        is_hexidecimal_integer_number(in_str)
        or is_octal_integer_number(in_str)
        or is_binary_integer_number(in_str)
    )
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is a hexadecimal integer literal (``0x`` prefixed).

    Raises:
        ValueError: if ``in_str`` is not a string.

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('0x1A3E')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number('test')
    False
    >>> is_hexidecimal_integer_number(12345)  # Not a string
    Traceback (most recent call last):
    ...
    ValueError: 12345
    """
    if is_string(in_str):
        return HEX_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is an octal integer literal (``0o`` prefixed).

    Raises:
        ValueError: if ``in_str`` is not a string.

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    >>> is_octal_integer_number('test')
    False
    """
    if is_string(in_str):
        return OCT_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """
    Tell whether a string is a binary integer literal (``0b`` prefixed).

    Raises:
        ValueError: if ``in_str`` is not a string.

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0B10101')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    >>> is_binary_integer_number('test')
    False
    """
    if is_string(in_str):
        return BIN_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def to_int(in_str: str) -> int:
    """Return the integral value of the string or raise on error.

    Binary, octal and hexadecimal literals are parsed in their own base;
    anything else is handed to ``int()`` in base 10.

    Raises:
        ValueError: if ``in_str`` is not a string or cannot be parsed.

    >>> to_int('1234')
    1234
    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # Try each prefixed-literal detector in turn, paired with its radix.
    detectors = (
        (is_binary_integer_number, 2),
        (is_octal_integer_number, 8),
        (is_hexidecimal_integer_number, 16),
    )
    for looks_like, radix in detectors:
        if looks_like(in_str):
            return int(in_str, radix)
    return int(in_str)
def is_decimal_number(in_str: str) -> bool:
    """
    Tell whether the given string is a number that contains a "." in it.

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    # is_number() goes first so that non-string input still raises.
    if not is_number(in_str):
        return False
    return "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    r"""
    Return ``in_str`` with every match of ``ESCAPE_SEQUENCE_RE`` removed.

    >>> strip_escape_sequences('\x1b[12;11;22mthis is a test!')
    'this is a test!'
    """
    return ESCAPE_SEQUENCE_RE.sub("", in_str)
403 def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
405 Add thousands separator to a numeric string. Also handles numbers.
407 >>> add_thousands_separator('12345678')
409 >>> add_thousands_separator(12345678)
411 >>> add_thousands_separator(12345678.99)
413 >>> add_thousands_separator('test')
414 Traceback (most recent call last):
419 if isinstance(in_str, numbers.Number):
421 if is_number(in_str):
422 return _add_thousands_separator(in_str, separator_char=separator_char, places=places)
423 raise ValueError(in_str)
def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """Insert ``separator_char`` every ``places`` digits of the integer part.

    Args:
        in_str: a numeric string, optionally signed, optionally with a
            fractional part after a ".".
        separator_char: the text inserted between digit groups.
        places: the number of digits per group.

    Returns:
        The grouped string.  A leading sign and the fractional part are
        carried over untouched; only the integer digits are grouped.
    """
    # Peel off a leading sign so it is not counted as a digit when grouping
    # (otherwise '-123456' would come out as '-,123,456').
    sign = ""
    if in_str[:1] in ("+", "-"):
        sign, in_str = in_str[0], in_str[1:]
    integer_part, _, decimal_part = in_str.partition('.')
    # Group from the right: reverse, chunk into `places`-sized pieces by
    # zipping one shared iterator with itself, join, then reverse back.
    tmp = [iter(integer_part[::-1])] * places
    ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
    if decimal_part:
        ret += '.' + decimal_part
    return sign + ret
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Check if a string is a valid url.

    When ``allowed_schemes`` is given and non-empty, the string must also
    start with one of the listed schemes.

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('https://mysite.com')
    True
    >>> is_url('.mysite.com')
    False
    """
    if not is_full_string(in_str):
        return False
    if URL_RE.match(in_str) is None:
        return False
    if not allowed_schemes:
        return True
    return any(in_str.startswith(scheme) for scheme in allowed_schemes)
461 def is_email(in_str: Any) -> bool:
463 Check if a string is a valid email.
465 Reference: https://tools.ietf.org/html/rfc3696#section-3
469 >>> is_email('@gmail.com')
472 if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
476 # we expect 2 tokens, one before "@" and one after, otherwise
477 # we have an exception and the email is not valid.
478 head, tail = in_str.split("@")
480 # head's size must be <= 64, tail <= 255, head must not start
481 # with a dot or contain multiple consecutive dots.
482 if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
485 # removes escaped spaces, so that later on the test regex will
487 head = head.replace("\\ ", "")
488 if head.startswith('"') and head.endswith('"'):
489 head = head.replace(" ", "")[1:-1]
490 return EMAIL_RE.match(head + "@" + tail) is not None
493 # borderline case in which we have multiple "@" signs but the
494 # head part is correctly escaped.
495 if ESCAPED_AT_SIGN.search(in_str) is not None:
496 # replace "@" with "a" in the head
497 return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
501 def suffix_string_to_number(in_str: str) -> Optional[int]:
502 """Take a string like "33Gb" and convert it into a number (of bytes)
503 like 34603008. Return None if the input string is not valid.
505 >>> suffix_string_to_number('1Mb')
507 >>> suffix_string_to_number('13.1Gb')
511 def suffix_capitalize(s: str) -> str:
515 return f"{s[0].upper()}{s[1].lower()}"
516 return suffix_capitalize(s[0:1])
518 if is_string(in_str):
519 if is_integer_number(in_str):
520 return to_int(in_str)
521 suffixes = [in_str[-2:], in_str[-1:]]
522 rest = [in_str[:-2], in_str[:-1]]
523 for x in range(len(suffixes)):
525 s = suffix_capitalize(s)
526 multiplier = NUM_SUFFIXES.get(s, None)
527 if multiplier is not None:
529 if is_integer_number(r):
530 return to_int(r) * multiplier
531 if is_decimal_number(r):
532 return int(float(r) * multiplier)
536 def number_to_suffix_string(num: int) -> Optional[str]:
537 """Take a number (of bytes) and returns a string like "43.8Gb".
538 Returns none if the input is invalid.
540 >>> number_to_suffix_string(14066017894)
542 >>> number_to_suffix_string(1024 * 1024)
548 for (sfx, size) in NUM_SUFFIXES.items():
553 if suffix is not None:
554 return f"{d:.1f}{suffix}"
559 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
561 Checks if a string is a valid credit card number.
562 If card type is provided then it checks against that specific type only,
563 otherwise any known credit card number will be accepted.
565 Supported card types are the following:
574 if not is_full_string(in_str):
577 if card_type is not None:
578 if card_type not in CREDIT_CARDS:
580 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
582 return CREDIT_CARDS[card_type].match(in_str) is not None
583 for c in CREDIT_CARDS:
584 if CREDIT_CARDS[c].match(in_str) is not None:
def is_camel_case(in_str: Any) -> bool:
    """
    Tell whether a string is formatted as camel case.

    A string is considered camel case when:

    - it is composed only of letters ([a-zA-Z]) and optionally digits ([0-9])
    - it contains both lowercase and uppercase letters
    - it does not start with a number
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """
    Tell whether a string is formatted as "snake case".

    A string is considered snake case when:

    - it is composed only of lowercase/uppercase letters and digits
    - it contains at least one underscore (or provided separator)
    - it does not start with a number

    Args:
        in_str: the value to test; non-strings and blank strings are False.
        separator: the word separator to look for (default "_").

    >>> is_snake_case('this_is_a_test')
    True
    >>> is_snake_case('___This_Is_A_Test_1_2_3___')
    True
    >>> is_snake_case('this-is-a-test')
    False
    >>> is_snake_case('this-is-a-test', separator='-')
    True
    """
    if not is_full_string(in_str):
        return False
    precompiled = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
    pattern = precompiled.get(separator)
    if pattern is None:
        # Custom separator: build the pattern only when it is actually needed.
        # The trailing "$" mirrors the precompiled patterns so that trailing
        # characters outside the allowed set are rejected here as well.
        re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)$"
        pattern = re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE)
    return pattern.match(in_str) is not None
def is_json(in_str: Any) -> bool:
    """
    Check if a string is valid JSON whose top level is an object or array.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    # Cheap shape check first: must look like {...} or [...].
    if not is_full_string(in_str) or JSON_WRAPPER_RE.match(in_str) is None:
        return False
    try:
        parsed = json.loads(in_str)
    except (TypeError, ValueError, OverflowError):
        return False
    return isinstance(parsed, (dict, list))
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Check if a string is a valid UUID.

    With ``allow_hex=True`` the dashes between groups become optional.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # Cast to str so that a UUID object itself can be passed in.
    candidate = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(candidate) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Tell whether a string is a valid IPv4 address.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('nope')
    False
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
        return False
    # The regex only checks the shape; each octet must also be in 0..255.
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Return the first IPv4-shaped chunk found in the string, or None.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V4_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip_v6(in_str: Any) -> bool:
    """
    Tell whether a string matches ``IP_V6_RE`` (a full IPv6 address).

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')  # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Return the first IPv6-shaped chunk found in the string, or None.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V6_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip(in_str: Any) -> bool:
    """
    Tell whether a string is a valid IP address (either v4 or v6).

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3')
    False
    >>> is_ip('1.2.3.999')
    False
    """
    # v6 is tried first, then v4.
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Return the first IP address found in the string, or None.

    An IPv4 match is preferred; IPv6 is tried only when no IPv4 is found.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4 = extract_ip_v4(in_str)
    if v4 is not None:
        return v4
    return extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """Return True if in_str is a valid MAC address, False otherwise.

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Extract the first MAC address found in in_str.

    Args:
        in_str: the text to search; non-strings and blank strings yield None.
        separator: the text placed between octets in the returned address,
            replacing whichever of ":" or "-" the input used.

    Returns:
        The MAC address with its octets joined by ``separator``, or None
        when no MAC address is found.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'
    >>> extract_mac_address('at d8-5d-e2-34-54-86 on em0', separator=':')
    'd8:5d:e2:34:54:86'
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None
    # Strings are immutable, so the normalized value must be captured; a
    # single pass avoids re-replacing characters the separator itself holds.
    return re.sub(r"[:-]", lambda _match: separator, m.group(0))
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Tell whether a given string is a slug (as created by `slugify()`).

    A slug starts and ends with a lowercase letter or digit and otherwise
    contains only lowercase letters, digits and the separator.

    Args:
        in_str: the value to test; non-strings are never slugs.
        separator: the word separator used by the slug (default "-").

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    # Empty and whitespace-only strings cannot match the pattern below, so a
    # plain type check is all that is needed up front.
    if not isinstance(in_str, str):
        return False
    sep = re.escape(separator)
    # One or more alphanumeric runs joined by one or more separators.  Written
    # without nested open-ended quantifiers so a long near-miss input cannot
    # trigger exponential backtracking.
    rex = r"^[a-z\d]+(?:(?:" + sep + r")+[a-z\d]+)*$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Tell whether the given string contains HTML/XML tags.

    By design this matches ANY kind of tag; it is not an HTML validator.
    Its goal is to detect "malicious" or undesired tags in the text.

    Raises:
        ValueError: if ``in_str`` is not a string.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
def words_count(in_str: str) -> int:
    """
    Return the number of words contained in the given string.

    Only runs of one or more letters and/or digits count as "words", so a
    string like "! @ # % ... []" yields zero.  Punctuation also separates
    words, so "one,two,three.stop" counts as 4 even without any spaces.

    Raises:
        ValueError: if ``in_str`` is not a string.

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if is_string(in_str):
        words = WORDS_COUNT_RE.findall(in_str)
        return len(words)
    raise ValueError(in_str)
860 def generate_uuid(omit_dashes: bool = False) -> str:
862 Generated an UUID string (using `uuid.uuid4()`).
864 generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
865 generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Return a string of the specified size made of random characters
    (uppercase/lowercase ASCII letters and digits).

    Raises:
        ValueError: if ``size`` is less than 1.

    generate_random_alphanumeric_string(9)  # possible output: "cx3QQbzYg"
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    picks = [random.choice(alphabet) for _ in range(size)]
    return from_char_list(picks)
def reverse(in_str: str) -> str:
    """
    Return the string with its chars in reverse order.

    Raises:
        ValueError: if ``in_str`` is not a string.

    >>> reverse('test')
    'tset'
    """
    if is_string(in_str):
        return "".join(reversed(in_str))
    raise ValueError(in_str)
def camel_case_to_snake_case(in_str, *, separator="_"):
    """
    Convert a camel case string into a snake case one.
    (The original string is returned if it is not a valid camel case string.)

    Raises:
        ValueError: if ``in_str`` is not a string.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str

    def _append_separator(match):
        # Keep the matched chunk and put the separator right after it.
        return match.group(1) + separator

    separated = CAMEL_CASE_REPLACE_RE.sub(_append_separator, in_str)
    return separated.lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Convert a snake case string into a camel case one.
    (The original string is returned if it is not a valid snake case string.)

    Raises:
        ValueError: if ``in_str`` is not a string.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str
    # Title-case every non-blank piece between separators.
    words = [piece.title() for piece in in_str.split(separator) if is_full_string(piece)]
    if not upper_case_first:
        words[0] = words[0].lower()
    return from_char_list(words)
941 def to_char_list(in_str: str) -> List[str]:
942 """Convert a string into a list of chars.
944 >>> to_char_list('test')
947 if not is_string(in_str):
def from_char_list(in_list: List[str]) -> str:
    """Concatenate the strings of a list into a single string.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
def shuffle(in_str: str) -> str:
    """Return a new string containing the same chars as the given one but
    in a randomized order.

    Raises:
        ValueError: if ``in_str`` is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Shuffle works in place, so go through a mutable list of chars.
    letters = to_char_list(in_str)
    random.shuffle(letters)
    return from_char_list(letters)
def scramble(in_str: str) -> str:
    """Alias for :func:`shuffle`; returns whatever ``shuffle(in_str)`` returns."""
    shuffled = shuffle(in_str)
    return shuffled
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Remove html code contained in the given string.

    Args:
        in_str: the text to clean.
        keep_tag_content: when True only the tags are removed (via
            ``HTML_TAG_ONLY_RE``); otherwise ``HTML_RE`` is used, which
            also removes what the tags enclose.

    Raises:
        ValueError: if ``in_str`` is not a string.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if keep_tag_content:
        pattern = HTML_TAG_ONLY_RE
    else:
        pattern = HTML_RE
    return pattern.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Force string content to be ascii-only by translating all non-ascii
    chars into their closest possible representation (eg: ó -> o).

    N.B. Some chars may be lost if impossible to translate.

    Raises:
        ValueError: if ``in_str`` is not a string.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # NFKD decomposes accented chars into base char + combining marks; the
    # "ignore" encode then drops everything that is not plain ascii, and
    # the surviving bytes are decoded back into a str.
    decomposed = unicodedata.normalize("NFKD", in_str)
    return decomposed.encode("ascii", "ignore").decode("utf-8")
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Convert a string into a "slug" using the provided separator.
    The returned string has the following properties:

    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using the provided separator
    - all chars are encoded as ascii (by using `asciify()`)

    Raises:
        ValueError: if ``in_str`` is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Anything that is not a letter or digit becomes a space; trim the ends.
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # Each whitespace char becomes one separator...
    joined = SPACES_RE.sub(separator, spaced)

    # ...and runs of separators collapse into a single one.
    collapsed = re.sub(re.escape(separator) + r"+", separator, joined)
    return asciify(collapsed)
def to_bool(in_str: str) -> bool:
    """
    Turn a string into a boolean based on its content (CASE INSENSITIVE).

    True is returned when the lowercased string is one of "true", "1",
    "yes", "y", "t" or "on".  Otherwise False is returned.

    Raises:
        ValueError: if ``in_str`` is not a string.

    >>> to_bool('True')
    True
    >>> to_bool('on')
    True
    >>> to_bool('nope')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    truthy = ("true", "1", "yes", "y", "t", "on")
    return in_str.lower() in truthy
1088 def to_date(in_str: str) -> Optional[datetime.date]:
1090 Parses a date string. See DateParser docs for details.
1092 import dateparse.dateparse_utils as du
1095 d = du.DateParser() # type: ignore
1098 except du.ParseException: # type: ignore
1099 msg = f'Unable to parse date {in_str}.'
1104 def valid_date(in_str: str) -> bool:
1106 True if the string represents a valid date.
1108 import dateparse.dateparse_utils as dp
1111 d = dp.DateParser() # type: ignore
1114 except dp.ParseException: # type: ignore
1115 msg = f'Unable to parse date {in_str}.'
1120 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1122 Parses a datetime string. See DateParser docs for more info.
1124 import dateparse.dateparse_utils as dp
1127 d = dp.DateParser() # type: ignore
1128 dt = d.parse(in_str)
1129 if isinstance(dt, datetime.datetime):
1132 msg = f'Unable to parse datetime {in_str}.'
1137 def valid_datetime(in_str: str) -> bool:
1139 True if the string represents a valid datetime.
1141 _ = to_datetime(in_str)
1144 msg = f'Unable to parse datetime {in_str}.'
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Squeeze runs of more than one character_to_squeeze into one.

    Args:
        in_str: the text to process.
        character_to_squeeze: the character (or multi-character token)
            whose consecutive repetitions are collapsed into one occurrence.

    Returns:
        ``in_str`` with each run of the token replaced by a single token.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '
    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'
    """
    if not character_to_squeeze:
        # Nothing to squeeze; an empty token can only match empty strings.
        return in_str
    pattern = r'(?:' + re.escape(character_to_squeeze) + r')+'
    # A callable replacement is inserted literally.  Passing the token as a
    # replacement *string* would make re.sub interpret backslashes in it as
    # template escapes (and fail for a token such as a lone backslash).
    return re.sub(pattern, lambda _match: character_to_squeeze, in_str)
def dedent(in_str: str) -> str:
    """
    Remove the leading margin from every line of a multi-line string
    (inspired by the analogous Scala function).

    The margin is whatever ``MARGIN_RE`` matches at the start of a line.

    Raises:
        ValueError: if ``in_str`` is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    stripped_lines = []
    for line in in_str.split(newline):
        stripped_lines.append(MARGIN_RE.sub('', line))
    return newline.join(stripped_lines)
def indent(in_str: str, amount: int) -> str:
    """
    Indent a string by prepending ``amount`` spaces to each of its lines.

    Raises:
        ValueError: if ``in_str`` is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    padding = " " * amount
    return newline.join(padding + line for line in in_str.split(newline))
1193 def sprintf(*args, **kwargs) -> str:
1194 """String printf, like in C"""
1197 sep = kwargs.pop("sep", None)
1199 if not isinstance(sep, str):
1200 raise TypeError("sep must be None or a string")
1202 end = kwargs.pop("end", None)
1204 if not isinstance(end, str):
1205 raise TypeError("end must be None or a string")
1208 raise TypeError("invalid keyword arguments to sprint()")
1214 for i, arg in enumerate(args):
1217 if isinstance(arg, str):
def strip_ansi_sequences(in_str: str) -> str:
    r"""Strip ANSI sequences out of strings.

    A sequence is ESC, "[", any run of digits, "+" or ";", and one
    terminating lowercase letter.

    >>> s = '\x1b[38;5;21mblue!\x1b[m'
    >>> len(s)
    18
    >>> len(strip_ansi_sequences(s))
    5
    >>> strip_ansi_sequences(s)
    'blue!'
    """
    ansi_pattern = r'\x1b\[[\d+;]*[a-z]'
    cleaned = re.sub(ansi_pattern, '', in_str)
    return cleaned
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that captures outputs to stdout.

    Entering the context yields a zero-argument callable that returns
    everything written to stdout so far::

        with SprintfStdout() as buf:
            print("test")
        print(buf(), end='')
    """

    def __init__(self) -> None:
        # In-memory sink that stdout is redirected into.
        self.destination = io.StringIO()
        # The redirect helper; created on __enter__.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        """Start redirecting stdout and return a getter for the captured text."""
        redirector = contextlib.redirect_stdout(self.destination)
        self.recorder = redirector
        redirector.__enter__()
        return self.destination.getvalue

    def __exit__(self, *args) -> Literal[False]:
        """Stop redirecting, rewind the buffer and never suppress exceptions."""
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        return False
def capitalize_first_letter(txt: str) -> str:
    """Capitalize the first letter of a string, leaving the rest untouched.

    An empty string is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    # Slicing (rather than indexing) keeps this safe for the empty string.
    return txt[:1].upper() + txt[1:]
1279 def it_they(n: int) -> str:
1293 def is_are(n: int) -> str:
1307 def pluralize(n: int) -> str:
1313 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1316 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
1325 def make_contractions(txt: str) -> str:
1326 """Glue words together to form contractions.
1328 >>> make_contractions('It is nice today.')
1331 >>> make_contractions('I can not even...')
1334 >>> make_contractions('She could not see!')
1337 >>> make_contractions('But she will not go.')
1340 >>> make_contractions('Verily, I shall not.')
1343 >>> make_contractions('No you cannot.')
1346 >>> make_contractions('I said you can not go.')
1347 "I said you can't go."
1384 ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
1388 # Special cases: can't, shan't and won't.
1389 txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
1390 txt = re.sub(r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE)
1392 r'\b(w)ill\s*(n)(o)(t)\b',
1396 flags=re.IGNORECASE,
1399 for first_list, second_list in first_second:
1400 for first in first_list:
1401 for second in second_list:
1402 # Disallow there're/where're. They're valid English
1404 if (first in ('there', 'where')) and second == 'a(re)':
1407 pattern = fr'\b({first})\s+{second}\b'
1408 if second == '(n)o(t)':
1409 replacement = r"\1\2'\3"
1411 replacement = r"\1'\2"
1412 txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)
1417 def thify(n: int) -> str:
1418 """Return the proper cardinal suffix for a number.
1429 assert is_integer_number(digit)
1441 def ngrams(txt: str, n: int):
1442 """Return the ngrams from a string.
1444 >>> [x for x in ngrams('This is a test', 2)]
1445 ['This is', 'is a', 'a test']
1449 for ngram in ngrams_presplit(words, n):
def ngrams_presplit(words: Sequence[str], n: int):
    """Return the n-grams of an already-split word sequence.

    Delegates to ``list_utils.ngrams(words, n)``.
    """
    grams = list_utils.ngrams(words, n)
    return grams
def bigrams(txt: str):
    """Return the 2-grams of ``txt``; shorthand for ``ngrams(txt, 2)``."""
    pair_size = 2
    return ngrams(txt, pair_size)
def trigrams(txt: str):
    """Return the 3-grams of ``txt``; shorthand for ``ngrams(txt, 3)``."""
    triple_size = 3
    return ngrams(txt, triple_size)
1468 def shuffle_columns_into_list(
1469 input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
1471 """Helper to shuffle / parse columnar data and return the results as a
1472 list. The column_specs argument is an iterable collection of
1473 numeric sequences that indicate one or more column numbers to
1476 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1477 >>> shuffle_columns_into_list(
1479 ... [ [8], [2, 3], [5, 6, 7] ],
1482 ['acl_test.py', 'scott wheel', 'Jul 9 11:34']
1487 # Column specs map input lines' columns into outputs.
1489 for spec in column_specs:
1492 hunk = hunk + delim + input_lines[n]
1493 hunk = hunk.strip(delim)
1498 def shuffle_columns_into_dict(
1499 input_lines: Sequence[str],
1500 column_specs: Iterable[Tuple[str, Iterable[int]]],
1502 ) -> Dict[str, str]:
1503 """Helper to shuffle / parse columnar data and return the results
1506 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
1507 >>> shuffle_columns_into_dict(
1509 ... [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
1512 {'filename': 'acl_test.py', 'owner': 'scott wheel', 'mtime': 'Jul 9 11:34'}
1517 # Column specs map input lines' columns into outputs.
1518 # "key", [col1, col2...]
1519 for spec in column_specs:
1522 hunk = hunk + delim + input_lines[n]
1523 hunk = hunk.strip(delim)
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """Interpolate a string with data from a dict.

    ``txt`` is formatted with ``str.format`` using ``values`` as keyword
    arguments, and the result is passed through ``sprintf`` with ``end=''``.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    filled_in = txt.format(**values)
    return sprintf(filled_in, end='')
def to_ascii(x: str) -> bytes:
    """Encode as ascii bytes string.

    Args:
        x: the text to encode.  A bytes object is also accepted and is
            returned unchanged.

    Returns:
        The ASCII-encoded bytes.

    Raises:
        TypeError: x is neither a str nor a bytes object.
        UnicodeEncodeError: x is a str containing non-ASCII characters.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'

    """
    if isinstance(x, bytes):
        return x
    if isinstance(x, str):
        return x.encode('ascii')
    # TypeError is a subclass of Exception, so callers that caught the
    # generic Exception raised here previously keep working.
    raise TypeError('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Turn txt into bytes using the given encoding / error handler
    and then base64-encode those bytes with base64.encodebytes.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'

    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
def is_base64(txt: str) -> bool:
    """Determine whether a string could be base64 encoded data, i.e.
    whether (ignoring surrounding whitespace) it is made up only of
    characters from the standard base64 alphabet (A-Z, a-z, 0-9, '+'
    and '/'), optionally followed by '=' padding.

    Args:
        txt: the str (or bytes) to examine.

    Returns:
        True if every character is in the base64 alphabet, False
        otherwise.  Up to two trailing '=' padding characters are
        accepted when they bring the length to a multiple of four.
        Text containing non-ASCII characters is never base64.

    Raises:
        TypeError: txt is neither a str nor a bytes object.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64(b'aGVsbG8=\\n')    # So is '=' padding.
    True

    """
    stripped = txt.strip()
    if isinstance(stripped, str):
        try:
            raw = stripped.encode('ascii')
        except UnicodeEncodeError:
            # Anything outside ASCII is outside the base64 alphabet too.
            return False
    elif isinstance(stripped, bytes):
        raw = stripped
    else:
        raise TypeError('is_base64 works with strings and bytes')

    # Encoders pad the final group with one or two '='; only accept
    # padding that actually completes a four character group.
    body = raw.rstrip(b'=')
    padding = len(raw) - len(body)
    if padding and (padding > 2 or len(raw) % 4 != 0):
        return False

    # Iterating bytes yields ints, so the alphabet is a set of ints.
    alphabet = frozenset((string.ascii_letters + string.digits + '+/').encode('ascii'))
    return all(byte in alphabet for byte in body)
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Decode base64 bytes with base64.decodebytes and then decode the
    resulting raw bytes into a str using the given encoding / error
    handler.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'

    """
    raw = base64.decodebytes(b64)
    return raw.decode(encoding, errors)
def chunk(txt: str, chunk_size):
    """Chunk up a string.

    This is a generator: it yields consecutive slices of txt that are
    chunk_size long.  If len(txt) is not a multiple of chunk_size the
    final slice is shorter and a warning is emitted (both logged and
    via the warnings module) when iteration starts.

    Args:
        txt: the string to slice up.
        chunk_size: the length of each slice.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'

    """
    if len(txt) % chunk_size != 0:
        msg = (
            f'String to chunk\'s length ({len(txt)}) is not an even '
            f'multiple of chunk_size ({chunk_size})'
        )
        logger.warning(msg)
        # stacklevel=2 makes the warning point at the consumer rather
        # than at this helper.
        warnings.warn(msg, stacklevel=2)
    for start in range(0, len(txt), chunk_size):
        yield txt[start : start + chunk_size]
def to_bitstring(txt: str, *, delimiter='', encoding='utf-8', errors='surrogatepass') -> str:
    """Encode txt and then chop it into bytes.  Note: only bitstrings
    with delimiter='' are interpretable by from_bitstring.

    Args:
        txt: the text to convert.  A bytes object is also accepted and
            is used as-is.
        delimiter: text placed between each group of eight bits.
        encoding: the encoding used to turn a str into bytes.
        errors: the error handler used while encoding a str.

    Returns:
        Eight '0'/'1' characters per byte, most significant bit first,
        with groups separated by delimiter.  Empty input yields a
        single all-zero byte ('00000000').

    Raises:
        TypeError: txt is neither a str nor a bytes object.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'

    >>> to_bitstring('\\xe9')    # non-ASCII text honors encoding
    '1100001110101001'

    """
    if isinstance(txt, bytes):
        data = txt
    elif isinstance(txt, str):
        data = txt.encode(encoding, errors)
    else:
        raise TypeError('to_bitstring works with strings and bytes')

    # Pad to the full byte count (not just to the integer's own bit
    # length) so that leading zero bytes are not lost.  The max() keeps
    # empty input rendering as one zero byte.
    width = 8 * max(len(data), 1)
    bits = format(int.from_bytes(data, 'big'), 'b').zfill(width)
    return delimiter.join(bits[i : i + 8] for i in range(0, width, 8))
def is_bitstring(txt: str) -> bool:
    """Is this a bitstring?

    The answer is whatever is_binary_integer_number says about txt
    once it has been given a '0b' prefix.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False

    """
    candidate = f'0b{txt}'
    return is_binary_integer_number(candidate)
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert from bitstring back to bytes then decode into a str.

    Args:
        bits: the binary digits to convert (parsed with int(bits, 2)).
        encoding: the encoding used to turn the bytes into a str.
        errors: the error handler used while decoding.

    Returns:
        The decoded text.  If decoding produces an empty string, a
        single NUL character is returned instead.

    Raises:
        ValueError: bits cannot be parsed as a base-2 integer.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'

    Whole leading zero bytes are kept:

    >>> from_bitstring('0000000001100001')
    '\\x00a'

    """
    n = int(bits, 2)
    n_bytes = (n.bit_length() + 7) // 8
    if set(bits) <= {'0', '1'}:
        # For a plain run of binary digits, every complete group of
        # eight digits is one byte even when it is all zeros; sizing by
        # bit_length() alone would silently drop leading NUL bytes.
        n_bytes = max(n_bytes, len(bits) // 8)
    return n.to_bytes(n_bytes, 'big').decode(encoding, errors) or '\0'
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple of ints so that addresses
    sort numerically rather than lexically.  When is_ip_v4 rejects
    txt, a "not IP" message is printed and None is returned.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']

    """
    if is_ip_v4(txt):
        return tuple(int(octet) for octet in txt.split('.'))
    print(f"not IP: {txt}")
    return None
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Split a file path on '/' into a tuple of its non-empty
    components.  Used as a sort key, this places parent/ancestor
    paths ahead of their children/descendants.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']

    """
    # Leading, trailing and doubled slashes produce empty pieces; drop them.
    components = [piece for piece in volume.split('/') if piece]
    return tuple(components)
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """Replace each character of replace_set, one after the other,
    with replacement and return the resulting string.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'

    """
    result = in_str
    # The passes are sequential: a later pass sees (and may rewrite)
    # text inserted by an earlier one.
    for victim in replace_set:
        result = result.replace(victim, replacement)
    return result
def replace_nth(in_str: str, source: str, target: str, nth: int):
    """Replaces the nth occurrence of a substring within a string.

    Args:
        in_str: the string to operate on.
        source: the literal substring to look for; it is never
            interpreted as a regular expression.
        target: the text that replaces the nth occurrence of source.
        nth: which (non-overlapping) occurrence to replace, counting
            from 1.  NOTE: non-positive values are not validated.

    Returns:
        A copy of in_str with the nth occurrence of source replaced.

    Raises:
        IndexError: in_str has fewer than nth occurrences of source.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'

    >>> replace_nth('a.b.c', '.', '-', 2)
    'a.b-c'

    """
    # Escape source so that regex metacharacters in it ('.', '*', ...)
    # are located literally, consistent with the literal replacement.
    starts = [m.start() for m in re.finditer(re.escape(source), in_str)]
    where = starts[nth - 1]
    return in_str[:where] + target + in_str[where + len(source) :]
1717 if __name__ == '__main__':