2 # -*- coding: utf-8 -*-
4 """The MIT License (MIT)
6 Copyright (c) 2016-2020 Davide Zanotti
8 Modifications Copyright (c) 2021-2022 Scott Gasch
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
17 The above copyright notice and this permission notice shall be included in all
18 copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 This class is based on: https://github.com/daveoncode/python-string-utils.
29 See NOTICE in the root of this module for a detailed enumeration of what
30 work is Davide's and what work was added by Scott.
34 import contextlib # type: ignore
45 from itertools import zip_longest
57 from uuid import uuid4
59 from pyutils import list_utils
61 logger = logging.getLogger(__name__)
# Matches signed decimal ints/floats with an optional (unsigned) exponent.
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Matches signed hex / octal / binary integer literals (0x / 0o / 0b).
# NOTE: the previous character classes contained a literal '|' (e.g.
# "[+|-]") which wrongly accepted strings like "|0xFF"; '|' has no
# alternation meaning inside [...].
HEX_NUMBER_RE = re.compile(r"^([+\-]?)0[xX]([0-9A-Fa-f]+)$")
OCT_NUMBER_RE = re.compile(r"^([+\-]?)0[Oo]([0-7]+)$")
BIN_NUMBER_RE = re.compile(r"^([+\-]?)0[Bb]([01]+)$")
72 r"([a-z-]+://)" # scheme
73 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
75 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
76 r"(:\d{2,})?" # port number
77 r"(/[a-z\d_%+-]*)*" # folders
78 r"(\.[a-z\d_%+-]+)*" # file extension
79 r"(\?[a-z\d_+%-=]*)?" # query string
83 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
85 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
87 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
90 r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
93 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
95 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
97 CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")
99 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
101 SNAKE_CASE_TEST_RE = re.compile(
102 r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
105 SNAKE_CASE_TEST_DASH_RE = re.compile(
106 r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
109 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
111 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
114 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
115 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
116 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
117 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
118 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
119 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
122 JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)
124 UUID_RE = re.compile(
125 r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
128 UUID_HEX_OK_RE = re.compile(
129 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
133 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
135 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
137 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
139 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
141 MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
143 ANYWHERE_MAC_ADDRESS_RE = re.compile(
144 r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
147 WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)
149 HTML_RE = re.compile(
150 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
151 re.IGNORECASE | re.MULTILINE | re.DOTALL,
154 HTML_TAG_ONLY_RE = re.compile(
155 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
156 re.IGNORECASE | re.MULTILINE | re.DOTALL,
159 SPACES_RE = re.compile(r"\s")
161 NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)
163 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
165 ESCAPE_SEQUENCE_RE = re.compile(r"
\e\[[^A-Za-z]*[A-Za-z]")
216 scales = ["hundred", "thousand", "million", "billion", "trillion"]
219 NUM_WORDS["and"] = (1, 0)
220 for i, word in enumerate(units):
221 NUM_WORDS[word] = (1, i)
222 for i, word in enumerate(tens):
223 NUM_WORDS[word] = (1, i * 10)
224 for i, word in enumerate(scales):
225 NUM_WORDS[word] = (10 ** (i * 3 or 2), 0)
226 NUM_WORDS['score'] = (20, 0)
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the input string is either None or an empty string
        (i.e. contains at most whitespace), False otherwise.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty("   \\t   ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    return in_str is None or len(in_str.strip()) == 0
def is_string(obj: Any) -> bool:
    """
    Args:
        obj: the object to test

    Returns:
        True if the object is a string and False otherwise.

    >>> is_string('test')
    True
    >>> is_string([1, 2, 3])
    False
    """
    return isinstance(obj, str)
def is_empty_string(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is empty and False otherwise.  An alias
        for :meth:`is_empty`.
    """
    return is_empty(in_str)
def is_empty(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is empty (or all whitespace) and False
        otherwise.  Non-string inputs are never considered empty.

    >>> is_empty(' \\t\\t ')
    True
    >>> is_empty([1, 2, 3])
    False
    """
    return is_string(in_str) and in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """
    Args:
        in_str: the object to test

    Returns:
        True if the object is a string and is not empty ('') and
        is not only composed of whitespace.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string('   ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    return is_string(in_str) and in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string contains a valid numeric value and
        False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return NUMBER_RE.match(in_str) is not None
def is_integer_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string contains a valid (signed or unsigned,
        decimal, hex, or octal, regular or scientific) integral
        expression and False otherwise.

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    return (
        (is_number(in_str) and "." not in in_str)
        or is_hexidecimal_integer_number(in_str)
        or is_octal_integer_number(in_str)
        or is_binary_integer_number(in_str)
    )
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is a hex integer number and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('0x1A3E')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number('test')
    False
    >>> is_hexidecimal_integer_number(12345)  # Not a string
    Traceback (most recent call last):
    ...
    ValueError: 12345
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return HEX_NUMBER_RE.match(in_str) is not None
def is_octal_integer_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is a valid octal integral number and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    >>> is_octal_integer_number('test')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return OCT_NUMBER_RE.match(in_str) is not None
def is_binary_integer_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string contains a binary integral number and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0B10101')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    >>> is_binary_integer_number('test')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return BIN_NUMBER_RE.match(in_str) is not None
def to_int(in_str: str) -> int:
    """
    Args:
        in_str: the string to convert

    Returns:
        The integral value of the string.

    Raises:
        ValueError: if in_str isn't a string or can't be converted.

    >>> to_int('0x12345')
    74565
    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # Dispatch on the literal's prefix so int() is given the right base.
    if is_binary_integer_number(in_str):
        return int(in_str, 2)
    if is_octal_integer_number(in_str):
        return int(in_str, 8)
    if is_hexidecimal_integer_number(in_str):
        return int(in_str, 16)
    return int(in_str)
def number_string_to_integer(in_str: str) -> int:
    """Convert a string containing a written-out number into an int.

    Args:
        in_str: the string containing a written-out number (may mix
            in numeric digits, e.g. "four-score and 7")

    Returns:
        The integer value of the written-out number.

    Raises:
        ValueError: on a word that is neither a known number word nor
            an integral numeric token.

    >>> number_string_to_integer("one hundred fifty two")
    152
    >>> number_string_to_integer("ten billion two hundred million fifty four thousand three")
    10200054003
    >>> number_string_to_integer("four-score and 7")
    87
    >>> number_string_to_integer("fifty xyzzy three")
    Traceback (most recent call last):
    ...
    ValueError: Unknown word: xyzzy
    """
    if isinstance(in_str, int):
        return int(in_str)

    current = result = 0
    in_str = in_str.replace('-', ' ')
    for word in in_str.split():
        if word not in NUM_WORDS:
            # Allow literal integers (e.g. "7") to appear among words.
            if is_integer_number(word):
                current += int(word)
                continue
            else:
                raise ValueError("Unknown word: " + word)
        scale, increment = NUM_WORDS[word]
        current = current * scale + increment
        # A large scale word ("thousand"+) closes out the current group.
        if scale > 100:
            result += current
            current = 0
    return result + current
def is_decimal_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check

    Returns:
        True if the given string represents a decimal or False
        otherwise.  A decimal may be signed or unsigned or use
        a "scientific notation".

    .. note::
        We do not consider integers without a decimal point
        to be decimals; they return False (see example).

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    return is_number(in_str) and "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    """
    Args:
        in_str: the string to strip of escape sequences.

    Returns:
        in_str with escape sequences removed.

    .. note::
        What is considered to be an "escape sequence" is defined
        by a regular expression.  While this gets common ones,
        there may exist valid sequences that it doesn't match.
    """
    in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
    return in_str
def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """
    Args:
        in_str: string or number to which to add thousands separator(s)
        separator_char: the separator character to add (defaults to comma)
        places: add a separator every N places (defaults to three)

    Returns:
        A numeric string with thousands separators added appropriately.

    Raises:
        ValueError: if the input is neither a number nor a numeric string.

    >>> add_thousands_separator('12345678')
    '12,345,678'
    >>> add_thousands_separator(12345678)
    '12,345,678'
    >>> add_thousands_separator(12345678.99)
    '12,345,678.99'
    >>> add_thousands_separator('test')
    Traceback (most recent call last):
    ...
    ValueError: test
    """
    # Stringify real numbers first so one code path handles both.
    if isinstance(in_str, numbers.Number):
        in_str = f'{in_str}'
    if is_number(in_str):
        return _add_thousands_separator(
            in_str, separator_char=separator_char, places=places
        )
    raise ValueError(in_str)
600 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
603 (in_str, decimal_part) = in_str.split('.')
604 tmp = [iter(in_str[::-1])] * places
605 ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
606 if len(decimal_part) > 0:
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Args:
        in_str: the string to test
        allowed_schemes: an optional list of allowed schemes (e.g.
            ['http', 'https', 'ftp'].  If passed, only URLs that
            begin with the one of the schemes passed will be considered
            to be valid.  Otherwise, any scheme:// will be considered
            valid.

    Returns:
        True if in_str contains a valid URL and False otherwise.

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('https://mysite.com')
    True
    >>> is_url('.mysite.com')
    False
    """
    if not is_full_string(in_str):
        return False
    valid = URL_RE.match(in_str) is not None
    if allowed_schemes:
        return valid and any([in_str.startswith(s) for s in allowed_schemes])
    return valid
def is_email(in_str: Any) -> bool:
    """
    Args:
        in_str: the email address to check

    Returns: True if the in_str contains a valid email (as defined by
        https://tools.ietf.org/html/rfc3696#section-3) or False
        otherwise.

    >>> is_email('[email protected]')
    True
    >>> is_email('@gmail.com')
    False
    """
    if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
        return False

    try:
        # we expect 2 tokens, one before "@" and one after, otherwise
        # we have an exception and the email is not valid.
        head, tail = in_str.split("@")

        # head's size must be <= 64, tail <= 255, head must not start
        # with a dot or contain multiple consecutive dots.
        if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
            return False

        # removes escaped spaces, so that later on the test regex will
        # accept the string.
        head = head.replace("\\ ", "")
        if head.startswith('"') and head.endswith('"'):
            head = head.replace(" ", "")[1:-1]
        return EMAIL_RE.match(head + "@" + tail) is not None

    except ValueError:
        # borderline case in which we have multiple "@" signs but the
        # head part is correctly escaped.
        if ESCAPED_AT_SIGN.search(in_str) is not None:
            # replace "@" with "a" in the head
            return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
        return False
def suffix_string_to_number(in_str: str) -> Optional[int]:
    """Takes a string like "33Gb" and converts it into a number (of bytes)
    like 34603008.

    Args:
        in_str: the string with a suffix to be interpreted and removed.

    Returns:
        An integer number of bytes or None to indicate an error.

    >>> suffix_string_to_number('1Mb')
    1048576
    >>> suffix_string_to_number('13.1Gb')
    14066017894
    """

    def suffix_capitalize(s: str) -> str:
        # Normalize e.g. "gb" / "GB" -> "Gb" to match NUM_SUFFIXES keys.
        if len(s) == 1:
            return s.upper()
        elif len(s) == 2:
            return f"{s[0].upper()}{s[1].lower()}"
        return suffix_capitalize(s[0:1])

    if is_string(in_str):
        if is_integer_number(in_str):
            return to_int(in_str)
        # Try a two-character suffix ("Gb") before a one-character one ("G").
        suffixes = [in_str[-2:], in_str[-1:]]
        rest = [in_str[:-2], in_str[:-1]]
        for x in range(len(suffixes)):
            s = suffixes[x]
            s = suffix_capitalize(s)
            multiplier = NUM_SUFFIXES.get(s, None)
            if multiplier is not None:
                r = rest[x]
                if is_integer_number(r):
                    return to_int(r) * multiplier
                if is_decimal_number(r):
                    return int(float(r) * multiplier)
    return None
def number_to_suffix_string(num: int) -> Optional[str]:
    """Take a number (of bytes) and returns a string like "43.8Gb".

    Args:
        num: an integer number of bytes

    Returns:
        A string with a suffix representing num bytes concisely or
        None to indicate an error.

    >>> number_to_suffix_string(14066017894)
    '13.1Gb'
    >>> number_to_suffix_string(1024 * 1024)
    '1.0Mb'
    """
    d = 0.0
    suffix = None
    # NUM_SUFFIXES is assumed to be ordered largest-first so the first
    # matching size gives the most concise representation.
    for (sfx, size) in NUM_SUFFIXES.items():
        if num >= size:
            d = num / size
            suffix = sfx
            break
    if suffix is not None:
        return f"{d:.1f}{suffix}"
    else:
        return None
def is_credit_card(in_str: Any, card_type: str = None) -> bool:
    """
    Args:
        in_str: a string to check
        card_type: if provided, contains the card type to validate
            with.  Otherwise, all known credit card number types will
            be accepted.

            Supported card types are the following:
            * VISA
            * MASTERCARD
            * AMERICAN_EXPRESS
            * DINERS_CLUB
            * DISCOVER
            * JCB

    Returns:
        True if in_str is a valid credit card number.

    Raises:
        KeyError: if card_type is provided but unrecognized.
    """
    if not is_full_string(in_str):
        return False

    if card_type is not None:
        if card_type not in CREDIT_CARDS:
            raise KeyError(
                f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
            )
        return CREDIT_CARDS[card_type].match(in_str) is not None
    for c in CREDIT_CARDS:
        if CREDIT_CARDS[c].match(in_str) is not None:
            return True
    return False
def is_camel_case(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is formatted as camel case and False otherwise.
        A string is considered camel case when:

        * it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
        * it contains both lowercase and uppercase letters
        * it does not start with a number
    """
    return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """
    Args:
        in_str: the string to test
        separator: the snake case separator character (default '_')

    Returns: True if the string is snake case and False otherwise.  A
        string is considered snake case when:

        * it's composed only by lowercase/uppercase letters and digits
        * it contains at least one underscore (or provided separator)
        * it does not start with a number

    >>> is_snake_case('this_is_a_test')
    True
    >>> is_snake_case('___This_Is_A_Test_1_2_3___')
    True
    >>> is_snake_case('this-is-a-test')
    False
    >>> is_snake_case('this-is-a-test', separator='-')
    True
    """
    if is_full_string(in_str):
        # Use precompiled patterns for the common separators; build one
        # on the fly for anything else.
        re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
        re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
        r = re_map.get(
            separator,
            re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
        )
        return r.match(in_str) is not None
    return False
def is_json(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the in_str contains valid JSON and False otherwise.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    # Cheap regex pre-check before paying for a full json.loads().
    if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
        try:
            return isinstance(json.loads(in_str), (dict, list))
        except (TypeError, ValueError, OverflowError):
            pass
    return False
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Args:
        in_str: the string to test
        allow_hex: should we allow hexidecimal (undashed) UUIDs?

    Returns:
        True if the in_str contains a valid UUID and False otherwise.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # string casting is used to allow UUID itself as input data type
    s = str(in_str)
    if allow_hex:
        return UUID_HEX_OK_RE.match(s) is not None
    return UUID_RE.match(s) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str contains a valid IPv4 address and False otherwise.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # checks that each entry in the ip is in the valid range (0 to 255)
    for token in in_str.split("."):
        if not 0 <= int(token) <= 255:
            return False
    return True
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string to extract an IPv4 address from.

    Returns:
        The first extracted IPv4 address from in_str or None if
        none were found or an error occurred.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_IP_V4_RE.search(in_str)
    if m is not None:
        return m.group(0)
    return None
def is_ip_v6(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IPv6 address and False otherwise.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')  # invalid "?"
    False
    """
    return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract an IPv6 address.

    Returns:
        The first IPv6 address found in in_str or None if no address
        was found or an error occurred.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_IP_V6_RE.search(in_str)
    if m is not None:
        return m.group(0)
    return None
def is_ip(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IP address (either IPv4 or
        IPv6) and False otherwise.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3.999')
    False
    """
    return is_ip_v6(in_str) or is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract in IP address.

    Returns:
        The first IP address (IPv4 or IPv6) found in in_str or
        None to indicate none found or an error condition.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    """
    # Prefer an IPv4 match; fall back to IPv6.
    ip = extract_ip_v4(in_str)
    if ip is None:
        ip = extract_ip_v6(in_str)
    return ip
def is_mac_address(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str is a valid MAC address False otherwise.

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract a MAC address.
        separator: the separator character to use in the returned address.

    Returns:
        The first MAC address found in in_str or None to indicate no
        match or an error.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'
    """
    if not is_full_string(in_str):
        return None
    in_str = in_str.strip()
    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is not None:
        mac = m.group(0)
        # BUGFIX: str.replace returns a new string; the original code
        # discarded the result so the separator argument had no effect.
        mac = mac.replace(":", separator)
        mac = mac.replace("-", separator)
        return mac
    return None
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Args:
        in_str: string to test
        separator: the slug character to allow

    Returns:
        True if in_str is a slug string and False otherwise.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False
    rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check for tags in

    Returns:
        True if the given string contains HTML/XML tags and False
        otherwise.

    Raises:
        ValueError: if in_str is not a string.

    .. warning::
        By design, this function matches ANY type of tag, so don't expect
        to use it as an HTML validator.  It's a quick sanity check at
        best.  See something like BeautifulSoup for a more full-featuered
        HTML parser.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return HTML_RE.search(in_str) is not None
def words_count(in_str: str) -> int:
    """
    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    Raises:
        ValueError: if in_str is not a string.

    .. note::
        This method is "smart" in that it does consider only sequences
        of one or more letter and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return len(WORDS_COUNT_RE.findall(in_str))
def word_count(in_str: str) -> int:
    """
    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.  An alias
        for :meth:`words_count`; see its notes on what counts as a word.

    >>> word_count('hello world')
    2
    >>> word_count('one,two,three.stop')
    4
    """
    return words_count(in_str)
def generate_uuid(omit_dashes: bool = False) -> str:
    """
    Args:
        omit_dashes: should we omit the dashes in the generated UUID?

    Returns:
        A generated UUID string (using `uuid.uuid4()`) with or without
        dashes per the omit_dashes arg.

    generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
    generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
    """
    uuid = str(uuid4())
    if omit_dashes:
        return uuid.replace("-", "")
    return uuid
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Args:
        size: number of characters to generate

    Returns:
        A string of the specified size containing random characters
        (uppercase/lowercase ascii letters and digits).

    Raises:
        ValueError: if size < 1.
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    chars = string.ascii_letters + string.digits
    buffer = [random.choice(chars) for _ in range(size)]
    return from_char_list(buffer)
def reverse(in_str: str) -> str:
    """
    Args:
        in_str: the string to reverse

    Returns:
        The reversed (character by character) string.

    Raises:
        ValueError: if in_str is not a string.

    >>> reverse('test')
    'tset'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return in_str[::-1]
def camel_case_to_snake_case(in_str, *, separator="_"):
    """
    Args:
        in_str: the camel case string to convert
        separator: the snake case separator to emit (default '_')

    Returns:
        A snake case string equivalent to the camel case input or the
        original string if it is not a valid camel case string or some
        other error occurs.

    Raises:
        ValueError: if in_str is not a string.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str
    return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Args:
        in_str: the snake case string to convert
        upper_case_first: capitalize the first token?
        separator: the snake case separator in the input (default '_')

    Returns:
        A camel case string that is equivalent to the snake case string
        provided or the original string back again if it is not valid
        snake case or another error occurs.

    Raises:
        ValueError: if in_str is not a string.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str
    tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
    if not upper_case_first:
        tokens[0] = tokens[0].lower()
    return from_char_list(tokens)
def to_char_list(in_str: str) -> List[str]:
    """
    Args:
        in_str: the string to split into a char list

    Returns:
        A list of strings of length one each; an empty list for
        non-string input.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    if not is_string(in_str):
        return []
    return list(in_str)
def from_char_list(in_list: List[str]) -> str:
    """
    Args:
        in_list: A list of characters to convert into a string.

    Returns:
        The string resulting from gluing the characters in in_list
        together.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    return "".join(in_list)
def shuffle(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing same chars of the given one but in
        a randomized order.  Note that in rare cases this could result
        in the same original string as no check is done.  Returns
        None to indicate error conditions.

    >>> random.seed(22)
    >>> shuffle('awesome')
    'meosaew'
    """
    if not is_string(in_str):
        return None
    chars = to_char_list(in_str)
    random.shuffle(chars)
    return from_char_list(chars)
def scramble(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing same chars of the given one but in
        a randomized order.  Note that in rare cases this could result
        in the same original string as no check is done.  Returns
        None to indicate error conditions.  An alias for :meth:`shuffle`.
    """
    return shuffle(in_str)
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Args:
        in_str: the string to strip tags from
        keep_tag_content: should we keep the inner contents of tags?

    Returns:
        A string with all HTML tags removed (optionally with tag contents
        preserved).

    Raises:
        ValueError: if in_str is not a string.

    .. note::
        This method uses simple regular expressions to strip tags and is
        not a full fledged HTML parser by any means.  Consider using
        something like BeautifulSoup if your needs are more than this
        simple code can fulfill.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
    return r.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Args:
        in_str: the string to asciify.

    Returns:
        An output string roughly equivalent to the original string
        where all content to are ascii-only.  This is accomplished
        by translating all non-ascii chars into their closest possible
        ASCII representation (eg: ó -> o, Ë -> E, ç -> c...).

    Raises:
        ValueError: if in_str is not a string.

    .. warning::
        Some chars may be lost if impossible to translate.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # "NFKD" is the algorithm which is able to successfully translate
    # the most of non-ascii chars.
    normalized = unicodedata.normalize("NFKD", in_str)

    # encode string forcing ascii and ignore any errors
    # (unrepresentable chars will be stripped out)
    ascii_bytes = normalized.encode("ascii", "ignore")

    # turns encoded bytes into an utf-8 string
    return ascii_bytes.decode("utf-8")
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Args:
        in_str: the string to slugify
        separator: the character to use during sligification (default
            is a dash)

    Returns:
        The converted string.  The returned string has the following properties:

        * it has no spaces
        * all letters are in lower case
        * all punctuation signs and non alphanumeric chars are removed
        * words are divided using provided separator
        * all chars are encoded as ascii (by using :meth:`asciify`)

    Raises:
        ValueError: if in_str is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # replace spaces with join sign
    out = SPACES_RE.sub(separator, out)

    # normalize joins (remove duplicates)
    out = re.sub(re.escape(separator) + r"+", separator, out)
    return asciify(out)
def to_bool(in_str: str) -> bool:
    """
    Args:
        in_str: the string to convert to boolean

    Returns:
        A boolean equivalent of the original string based on its contents.
        All conversion is case insensitive.  A positive boolean (True) is
        returned if the string value is any of the following:

        * "true"
        * "t"
        * "1"
        * "yes"
        * "y"
        * "on"

        Otherwise False is returned.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return in_str.lower() in ("true", "1", "yes", "y", "t", "on")
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Args:
        in_str: the string to convert into a date

    Returns:
        The datetime.date the string contained or None to indicate
        an error.  This parser is relatively clever; see
        :class:`datetimez.dateparse_utils` docs for details.

    >>> to_date('9/11/2001')
    datetime.date(2001, 9, 11)
    >>> to_date('xyzzy')
    """
    import pyutils.datetimez.dateparse_utils as du

    try:
        d = du.DateParser()  # type: ignore
        d.parse(in_str)
        return d.get_date()
    except du.ParseException:  # type: ignore
        msg = f'Unable to parse date {in_str}.'
        logger.warning(msg)
    return None
def extract_date(in_str: Any) -> Optional[datetime.datetime]:
    """Finds and extracts a date from the string, if possible.

    Args:
        in_str: the string to extract a date from

    Returns:
        a datetime if date was found, otherwise None

    >>> extract_date("filename.txt dec 13, 2022")
    datetime.datetime(2022, 12, 13, 0, 0)

    >>> extract_date("Dear Santa, please get me a pony.")
    """
    import itertools

    import pyutils.datetimez.dateparse_utils as du

    d = du.DateParser()  # type: ignore
    chunks = in_str.split()
    # Try longer n-grams first so multi-word dates win over sub-phrases.
    for ngram in itertools.chain(
        list_utils.ngrams(chunks, 5),
        list_utils.ngrams(chunks, 4),
        list_utils.ngrams(chunks, 3),
        list_utils.ngrams(chunks, 2),
        list_utils.ngrams(chunks, 1),
    ):
        try:
            expr = " ".join(ngram)
            logger.debug(f"Trying {expr}")
            if d.parse(expr):
                return d.get_datetime()
        except du.ParseException:  # type: ignore
            pass
    return None
def is_valid_date(in_str: str) -> bool:
    """Check whether a string contains a parseable date.

    Args:
        in_str: the string to check

    Returns:
        True if the string represents a valid date that we can recognize
        and False otherwise.  This parser is relatively clever; see
        :class:`datetimez.dateparse_utils` docs for details.

    >>> is_valid_date('1/2/2022')
    True
    >>> is_valid_date('christmas')
    True
    >>> is_valid_date('next wednesday')
    True
    >>> is_valid_date('xyzzy')
    False
    """
    # Imported lazily to avoid a circular dependency at module load time.
    import pyutils.datetimez.dateparse_utils as dp

    try:
        d = dp.DateParser()  # type: ignore
        _ = d.parse(in_str)
        return True
    except dp.ParseException:  # type: ignore
        msg = f'Unable to parse date {in_str}.'
        logger.warning(msg)
    return False
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """Parse a string into a datetime.

    Args:
        in_str: string to parse into a datetime

    Returns:
        A python datetime parsed from in_str or None to indicate
        an error.  This parser is relatively clever; see
        :class:`datetimez.dateparse_utils` docs for details.

    >>> to_datetime('7/20/1969 02:56 GMT')
    datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
    """
    # Imported lazily to avoid a circular dependency at module load time.
    import pyutils.datetimez.dateparse_utils as dp

    try:
        d = dp.DateParser()  # type: ignore
        dt = d.parse(in_str)
        # The parser may return a bare date; only accept full datetimes.
        if isinstance(dt, datetime.datetime):
            return dt
    except dp.ParseException:  # type: ignore
        msg = f'Unable to parse datetime {in_str}.'
        logger.warning(msg)
    return None
def valid_datetime(in_str: str) -> bool:
    """Check whether a string contains a parseable datetime.

    Args:
        in_str: the string to check

    Returns:
        True if in_str contains a valid datetime and False otherwise.
        This parser is relatively clever; see
        :class:`datetimez.dateparse_utils` docs for details.

    >>> valid_datetime('next wednesday at noon')
    True
    >>> valid_datetime('3 weeks ago at midnight')
    True
    >>> valid_datetime('next easter at 5:00 am')
    True
    >>> valid_datetime('sometime soon')
    False
    """
    _ = to_datetime(in_str)
    if _ is not None:
        return True
    msg = f'Unable to parse datetime {in_str}.'
    logger.warning(msg)
    return False
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """Collapse runs of a repeated (sub)string down to one occurrence.

    Args:
        in_str: the string to squeeze
        character_to_squeeze: the character to remove runs of
            more than one in a row (default = space)

    Returns:
        A "squeezed string" where runs of more than one
        character_to_squeeze are collapsed into one.

    >>> squeeze('this   is  a        test')
    'this is a test'
    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'
    """
    # re.escape() so that multi-char or regex-special squeeze targets
    # are treated literally.
    return re.sub(
        r'(' + re.escape(character_to_squeeze) + r')+',
        character_to_squeeze,
        in_str,
    )
def dedent(in_str: str) -> Optional[str]:
    """Remove leading indentation (margin) from every line.

    Args:
        in_str: the string to dedent

    Returns:
        A string with tab indentation removed or None on error
        (i.e. when in_str is not a string).

    .. note::

        Inspired by analogous Scala function.

    >>> dedent('\t\ttest\\n\t\ting')
    'test\\ning'
    """
    if not is_string(in_str):
        return None
    line_separator = '\n'
    # MARGIN_RE strips the leading whitespace margin from each line.
    lines = [MARGIN_RE.sub('', line) for line in in_str.split(line_separator)]
    return line_separator.join(lines)
def indent(in_str: str, amount: int) -> str:
    """Indent every line of a string by a number of spaces.

    Args:
        in_str: the string to indent
        amount: count of spaces to indent each line by

    Returns:
        An indented string created by prepending amount spaces
        to every line.

    Raises:
        ValueError: if in_str is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    line_separator = '\n'
    lines = [" " * amount + line for line in in_str.split(line_separator)]
    return line_separator.join(lines)
def sprintf(*args, **kwargs) -> str:
    """Format arguments into a string like print() would emit them.

    This function uses the same syntax as the builtin print
    function except, rather than writing to stdout, it returns the
    text it would have written.

    Args:
        args: the positional values to interpolate, like print's
        kwargs: only 'sep' and 'end' are honored, like print's

    Returns:
        An interpolated string capturing print output, like man(3)
        sprintf.

    Raises:
        TypeError: if sep/end are not strings or an unknown keyword
            argument is passed.

    >>> sprintf('this', 'is', 'a', 'test', end='')
    'this is a test'
    """
    ret = ""

    sep = kwargs.pop("sep", None)
    if sep is not None:
        if not isinstance(sep, str):
            raise TypeError("sep must be None or a string")
    else:
        sep = " "  # print()'s default separator

    end = kwargs.pop("end", None)
    if end is not None:
        if not isinstance(end, str):
            raise TypeError("end must be None or a string")
    else:
        end = "\n"  # print()'s default line terminator

    if kwargs:
        raise TypeError("invalid keyword arguments to sprint()")

    for i, arg in enumerate(args):
        if i:
            ret += sep
        if isinstance(arg, str):
            ret += arg
        else:
            ret += str(arg)
    ret += end
    return ret
def strip_ansi_sequences(in_str: str) -> str:
    """Remove ANSI escape sequences from a string.

    Args:
        in_str: the string to strip

    Returns:
        in_str with recognized ANSI escape sequences removed.

    .. warning::
        This method works by using a regular expression.
        It works for all ANSI escape sequences I've tested with but
        may miss some; caveat emptor.

    >>> strip_ansi_sequences('\\x1b[38;5;21mblue!\\x1b[m')
    'blue!'
    """
    # CSI sequences: ESC '[' then digits/'+'/';' then one final letter.
    return re.sub(r'\x1b\[[\d+;]*[a-z]', '', in_str)
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that captures outputs to stdout to a buffer
    without printing them.

    >>> with SprintfStdout() as buf:
    ...     print("test")
    ...     print("1, 2, 3")
    ...
    >>> print(buf(), end='')
    test
    1, 2, 3
    """

    def __init__(self) -> None:
        # All captured output accumulates here.
        self.destination = io.StringIO()
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()
        # Callers invoke the returned lambda to read captured text.
        return lambda: self.destination.getvalue()

    def __exit__(self, *args) -> Literal[False]:
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Never swallow exceptions raised inside the with-block.
        return False
def capitalize_first_letter(in_str: str) -> str:
    """Capitalize the first letter of a string, leaving the rest alone.

    Args:
        in_str: the string to capitalize

    Returns:
        in_str with the first character capitalized; the empty string
        is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    """
    if not in_str:
        # Guard: indexing in_str[0] on '' would raise IndexError.
        return in_str
    return in_str[0].upper() + in_str[1:]
def it_they(n: int) -> str:
    """Choose the correct English pronoun for a count.

    Args:
        n: how many of them are there?

    Returns:
        'it' if n is one or 'they' otherwise.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> it_they(1)
    'it'
    >>> it_they(2)
    'they'
    """
    if n == 1:
        return "it"
    return "they"
def is_are(n: int) -> str:
    """Choose the correct English verb form for a count.

    Args:
        n: how many of them are there?

    Returns:
        'is' if n is one or 'are' otherwise.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'
    """
    if n == 1:
        return "is"
    return "are"
def pluralize(n: int) -> str:
    """Choose the correct English plural suffix for a count.

    Args:
        n: how many of them are there?

    Returns:
        's' if n is not one otherwise ''.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.
    """
    if n == 1:
        return ""
    return "s"
1866 def make_contractions(txt: str) -> str:
1867 """This code glues words in txt together to form (English)
1871 txt: the input text to be contractionized.
1874 Output text identical to original input except for any
1875 recognized contractions are formed.
1878 The order in which we create contractions is defined by the
1879 implementation and what I thought made more sense when writing
1882 >>> make_contractions('It is nice today.')
1885 >>> make_contractions('I can not even...')
1888 >>> make_contractions('She could not see!')
1891 >>> make_contractions('But she will not go.')
1894 >>> make_contractions('Verily, I shall not.')
1897 >>> make_contractions('No you cannot.')
1900 >>> make_contractions('I said you can not go.')
1901 "I said you can't go."
1937 ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
1941 # Special cases: can't, shan't and won't.
1942 txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
1944 r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE
1947 r'\b(w)ill\s*(n)(o)(t)\b',
1951 flags=re.IGNORECASE,
1954 for first_list, second_list in first_second:
1955 for first in first_list:
1956 for second in second_list:
1957 # Disallow there're/where're. They're valid English
1959 if (first in ('there', 'where')) and second == 'a(re)':
1962 pattern = fr'\b({first})\s+{second}\b'
1963 if second == '(n)o(t)':
1964 replacement = r"\1\2'\3"
1966 replacement = r"\1'\2"
1967 txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)
def thify(n: int) -> str:
    """Produce the English ordinal suffix for a number.

    Args:
        n: how many of them are there?

    Returns:
        The proper cardinal suffix for a number.

    Suggested usage::

        attempt_count = 0
        while True:
            attempt_count += 1
            if try_the_thing():
                break
            print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    """
    digit = str(n)
    assert is_integer_number(digit)
    # The suffix is determined by the final digit only.  NOTE(review):
    # like the original, 11/12/13 map to st/nd/rd, not 'th'.
    digit = digit[-1:]
    if digit == "1":
        return "st"
    elif digit == "2":
        return "nd"
    elif digit == "3":
        return "rd"
    else:
        return "th"
def ngrams(txt: str, n: int):
    """Generate the word-level ngrams of a string.

    Args:
        txt: the string to create ngrams using
        n: how many words per ngram created?

    Returns:
        Generates the ngrams from the input string.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']
    """
    words = txt.split()
    for ngram in ngrams_presplit(words, n):
        ret = ' '.join(ngram)
        yield ret
def ngrams_presplit(words: Sequence[str], n: int):
    """
    Same as :meth:`ngrams` but with the string pre-split by the caller.

    Args:
        words: the pre-split words to form ngrams from
        n: how many words per ngram created?
    """
    return list_utils.ngrams(words, n)
def bigrams(txt: str):
    """Generates the bigrams (n=2) of the given string.

    >>> [x for x in bigrams('this is a test')]
    ['this is', 'is a', 'a test']
    """
    return ngrams(txt, 2)
def trigrams(txt: str):
    """Generates the trigrams (n=3) of the given string."""
    return ngrams(txt, 3)
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
) -> List[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.

    Args:
        input_lines: A sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: an iterable collection of numeric sequences that
            indicate one or more column numbers to copy to form the Nth
            position in the output list.  See example below.
        delim: for column_specs that indicate we should copy more than
            one column from the input into this position, use delim to
            separate source data.  Defaults to ''.

    Returns:
        A list of string created by following the instructions set forth
        in column_specs.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim='!',
    ... )
    ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']
    """
    out = []

    # Column specs map input lines' columns into outputs.
    # [col1, col2...]
    for spec in column_specs:
        hunk = ''
        for n in spec:
            hunk = hunk + delim + input_lines[n]
        # Drop the leading delimiter glued on by the loop above.
        hunk = hunk.strip(delim)
        out.append(hunk)
    return out
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim='',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: a sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: instructions for what dictionary keys to apply
            to individual or compound input column data.  See example
            below.
        delim: when forming compound output data by gluing more than
            one input column together, use this character to separate
            the source data.  Defaults to ''.

    Returns:
        A dict formed by applying the column_specs instructions.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim='!',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}
    """
    out = {}

    # Column specs map input lines' columns into outputs.
    # "key", [col1, col2...]
    for spec in column_specs:
        hunk = ''
        for n in spec[1]:
            hunk = hunk + delim + input_lines[n]
        # Drop the leading delimiter glued on by the loop above.
        hunk = hunk.strip(delim)
        out[spec[0]] = hunk
    return out
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """
    Interpolate a string with data from a dict.

    Args:
        txt: the mad libs template
        values: what you and your kids chose for each category.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    return sprintf(txt.format(**values), end='')
def to_ascii(txt: str):
    """Encode a string (or pass through bytes) as ASCII.

    Args:
        txt: the input data to encode

    Returns:
        txt encoded as an ASCII byte string.

    Raises:
        Exception: if txt is neither str nor bytes.
        UnicodeEncodeError: if txt contains non-ASCII characters.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(txt, str):
        return txt.encode('ascii')
    if isinstance(txt, bytes):
        return txt
    raise Exception('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode a string as base64 bytes.

    Args:
        txt: the input data to encode
        encoding: the encoding used to turn txt into bytes first
        errors: how to handle encoding errors

    Returns:
        txt encoded with a 64-character alphabet.  Similar to and
        compatible with uuencode/uudecode.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    return base64.encodebytes(txt.encode(encoding, errors))
def is_base64(txt: str) -> bool:
    """Check whether a string contains only base64 alphabet characters.

    Args:
        txt: the string to check

    Returns:
        True if txt is a valid base64 encoded string.  This assumes
        txt was encoded with Python's standard base64 alphabet which
        is the same as what uuencode/uudecode uses).

    >>> is_base64('test')      # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    """
    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = set(a.encode('ascii'))
    # strip() discards the trailing newline that encodebytes() appends.
    for char in to_ascii(txt.strip()):
        if char not in alphabet:
            return False
    return True
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Decode base64 bytes back into a string.

    Args:
        b64: bytestring of 64-bit encoded data to decode / convert.
        encoding: the encoding of the decoded bytes
        errors: how to handle decoding errors

    Returns:
        The decoded form of b64 as a normal python string.  Similar to
        and compatible with uuencode / uudecode.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    return base64.decodebytes(b64).decode(encoding, errors)
def chunk(txt: str, chunk_size: int):
    """Cut a string into evenly-sized chunks.

    Args:
        txt: a string to be chunked into evenly spaced pieces.
        chunk_size: the size of each chunk to make

    Returns:
        Generates the original string chunked into evenly spaced
        pieces; the final piece is shorter when len(txt) is not an
        even multiple of chunk_size (a warning is issued).

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    if len(txt) % chunk_size != 0:
        # Fixed: message previously had an unbalanced paren after len(txt).
        msg = f'String to chunk\'s length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for x in range(0, len(txt), chunk_size):
        yield txt[x : x + chunk_size]
def to_bitstring(txt: str, *, delimiter='') -> str:
    """Convert text into a string of its ASCII bits.

    Args:
        txt: the string to convert into a bitstring
        delimiter: character to insert between adjacent bytes.  Note that
            only bitstrings with delimiter='' are interpretable by
            :meth:`from_bitstring`.

    Returns:
        txt converted to ascii/binary and then chopped into bytes.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'
    """
    etxt = to_ascii(txt)
    bits = bin(int.from_bytes(etxt, 'big'))
    bits = bits[2:]  # drop the '0b' prefix that bin() adds
    # zfill pads on the left up to the next whole byte boundary.
    return delimiter.join(chunk(bits.zfill(8 * ((len(bits) + 7) // 8)), 8))
def is_bitstring(txt: str) -> bool:
    """Check whether a string looks like a bitstring.

    Args:
        txt: the string to check

    Returns:
        True if txt is a recognized bitstring and False otherwise.
        Note that if delimiter is non empty this code will not
        recognize the bitstring.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    # Piggyback on the binary-integer-literal recognizer.
    return is_binary_integer_number(f'0b{txt}')
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert a bitstring back into a python string.

    Args:
        bits: the bitstring to convert back into a python string
        encoding: the encoding to use
        errors: how to handle decoding errors

    Returns:
        The regular python string represented by bits.  Note that this
        code does not work with to_bitstring when delimiter is non-empty.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'
    """
    n = int(bits, 2)
    # An all-zero bitstring round-trips to NUL rather than ''.
    return n.to_bytes((n.bit_length() + 7) // 8, 'big').decode(encoding, errors) or '\0'
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Produce a sort key that orders IPv4 addresses numerically.

    Args:
        txt: an IP address to chunk up for sorting purposes

    Returns:
        A tuple of IP components arranged such that the sorting of
        IP addresses using a normal comparator will do something sane
        and desirable, or None when txt is not an IPv4 address.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if not is_ip_v4(txt):
        # NOTE(review): print() in a library is questionable; kept for
        # behavior compatibility -- consider logger.warning instead.
        print(f"not IP: {txt}")
        return None
    return tuple(int(x) for x in txt.split('.'))
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Produce a sort key that orders paths ancestors-first.

    Args:
        volume: the string to chunk up for sorting purposes

    Returns:
        A tuple of volume's components such that the sorting of
        volumes using a normal comparator will do something sane
        and desirable.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    # Empty components (leading/trailing/duplicate slashes) are dropped.
    return tuple(x for x in volume.split('/') if len(x) > 0)
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """
    Execute several replace operations in a row.

    Args:
        in_str: the string in which to replace characters
        replace_set: the set of target characters to replace
        replacement: the character to replace any member of replace_set
            with

    Returns:
        The string with replacements executed.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    for char in replace_set:
        in_str = in_str.replace(char, replacement)
    return in_str
def replace_nth(in_str: str, source: str, target: str, nth: int):
    """
    Replaces the nth occurrance of a substring within a string.

    Args:
        in_str: the string in which to run the replacement
        source: the substring to replace.  NOTE(review): this is passed
            to re.finditer un-escaped, so regex metacharacters in source
            are interpreted as a pattern -- confirm callers expect that.
        target: the replacement text
        nth: which occurrance of source to replace?  (1-based)

    Raises:
        IndexError: if there are fewer than nth occurrances of source.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'
    """
    where = [m.start() for m in re.finditer(source, in_str)][nth - 1]
    before = in_str[:where]
    after = in_str[where:]
    after = after.replace(source, target, 1)
    return before + after
2394 if __name__ == '__main__':