2 # -*- coding: utf-8 -*-
4 """The MIT License (MIT)
6 Copyright (c) 2016-2020 Davide Zanotti
8 Modifications Copyright (c) 2021-2022 Scott Gasch
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
17 The above copyright notice and this permission notice shall be included in all
18 copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 This class is based on: https://github.com/daveoncode/python-string-utils.
29 See NOTICE in the root of this module for a detailed enumeration of what
30 work is Davide's and what work was added by Scott.
34 import contextlib # type: ignore
45 from itertools import zip_longest
57 from uuid import uuid4
59 from pyutils import list_utils
61 logger = logging.getLogger(__name__)
# A number: optional sign, then digits with optional fractional part and
# optional exponent, or a bare fractional part like ".5".  (Fixed: the
# exponent class was [e|E], which also matched a literal '|'; it now also
# permits a signed exponent like 1e-5.)
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE][+\-]?\d+)?|\.\d+)$")

# Hex / octal / binary integers: optional sign, 0x/0o/0b prefix, digits.
# (Fixed: the original classes [+|-], [x|X], [O|o], [B|b], [0|1] all
# erroneously included the literal '|' character.)
HEX_NUMBER_RE = re.compile(r"^([+\-]?)0[xX]([0-9A-Fa-f]+)$")

OCT_NUMBER_RE = re.compile(r"^([+\-]?)0[oO]([0-7]+)$")

BIN_NUMBER_RE = re.compile(r"^([+\-]?)0[bB]([01]+)$")
# Pieces of a URL: scheme, optional credentials, optional www, a domain /
# IPv4 literal / localhost, optional port, path, extension, query string
# and fragment.  (The assignment line and the www/hash fragments were
# missing from the mangled source; restored here.)
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

# Matches when the entire string is a single URL.
URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

# Finds URLs embedded anywhere within a larger string.
URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
87 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
90 r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
93 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
95 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
# A camel case identifier: letters/digits only, containing at least one
# lower->upper or upper->lower transition, not starting with a digit.
CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

# Captures the character(s) before each uppercase letter so a separator
# can be inserted between them during camel -> snake conversion.
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

# snake_case with underscores...  (missing closing parens restored)
SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

# ...and the dash-separated variant.
SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

# Capture separator + following char for snake -> camel conversion.
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
# Per-issuer credit card number patterns, keyed by card type name.
# (The dict's opening/closing braces were missing from the mangled
# source; restored.)
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}
# Loose sanity check that a string is wrapped in JSON braces/brackets.
JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)

# Canonical dashed UUID form...
UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# ...and a variant that also accepts the undashed 32-hex-char form.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# Four dot-separated 1-3 digit groups; octet range is NOT checked here.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# Full (non-abbreviated) IPv6: eight colon-separated hex groups.
IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)

# Six colon- or dash-separated hex octets.
MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)

ANYWHERE_MAC_ADDRESS_RE = re.compile(
    r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)

# A "word" is a run of letters/digits plus any surrounding punctuation.
WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)

# HTML-ish tag pairs with content, comments, and doctypes...
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# ...and the tags alone, without their enclosed content.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Any single whitespace character.
SPACES_RE = re.compile(r"\s")

# Anything that is not a letter or a number (including underscores).
NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Leading horizontal whitespace (indentation) at the start of a line.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")

# ANSI terminal escape sequences, e.g. "\x1b[01;32m".  (The source
# previously embedded a literal ESC control character; replaced with the
# equivalent \x1b escape so the file is printable and portable.)
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
# Written-out number vocabulary.  NUM_WORDS maps a word to a
# (scale, increment) pair; a number is accumulated word-by-word as
# current = current * scale + increment.  (UNIT_WORDS, TENS_WORDS,
# NUM_WORDS' initializer and NUM_SUFFIXES were missing from the mangled
# source; reconstructed here — TODO confirm against upstream pyutils.)
UNIT_WORDS = [
    "zero", "one", "two", "three", "four", "five", "six", "seven",
    "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen",
    "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
]
TENS_WORDS = [
    "", "ten", "twenty", "thirty", "forty", "fifty", "sixty", "seventy",
    "eighty", "ninety",
]
scales = ["hundred", "thousand", "million", "billion", "trillion", "quadrillion"]

NUM_WORDS = {}
NUM_WORDS["and"] = (1, 0)  # glue word; contributes nothing
for i, word in enumerate(UNIT_WORDS):
    NUM_WORDS[word] = (1, i)
for i, word in enumerate(TENS_WORDS):
    NUM_WORDS[word] = (1, i * 10)
for i, word in enumerate(scales):
    if word == "hundred":
        NUM_WORDS[word] = (100, 0)
    else:
        # thousand=10**3, million=10**6, billion=10**9, ...
        NUM_WORDS[word] = (10 ** (i * 3), 0)
NUM_WORDS['score'] = (20, 0)

# Binary byte-size suffixes ("Mb" or "M" => 2**20), largest first so
# number_to_suffix_string can take the first suffix that fits.
NUM_SUFFIXES = {
    "Pb": 0x4000000000000,
    "P": 0x4000000000000,
    "Tb": 0x10000000000,
    "T": 0x10000000000,
    "Gb": 0x40000000,
    "G": 0x40000000,
    "Mb": 0x100000,
    "M": 0x100000,
    "Kb": 0x400,
    "K": 0x400,
}
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the input string is either None or contains nothing but
        whitespace, False otherwise.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty(" \t ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    return in_str.strip() == ""
def is_string(obj: Any) -> bool:
    """
    Args:
        obj: the object to test

    Returns:
        True if the object is a string and False otherwise.

    >>> is_string('test')
    True
    >>> is_string([1, 2, 3])
    False
    """
    if isinstance(obj, str):
        return True
    return False
def is_empty_string(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is empty and False otherwise.

    See also :meth:`is_empty`, for which this is an alias.
    """
    return is_empty(in_str)
def is_empty(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str is a string containing only whitespace (or
        nothing at all); False for non-strings and non-empty strings.

    >>> is_empty(' \t\t ')
    True
    >>> is_empty([1, 2, 3])
    False
    """
    # Non-strings are never "empty strings" by this definition.
    return isinstance(in_str, str) and not in_str.strip()
def is_full_string(in_str: Any) -> bool:
    """
    Args:
        in_str: the object to test

    Returns:
        True if the object is a string, is not empty ('') and is not
        composed solely of whitespace.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string(' ')
    False
    >>> is_full_string(100.999)
    False
    """
    if not isinstance(in_str, str):
        return False
    return in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string contains a valid numeric value and False
        otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return bool(NUMBER_RE.match(in_str))
def is_integer_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string contains a valid (signed or unsigned;
        decimal, hex, octal, or binary) integral expression and False
        otherwise.

    Raises:
        ValueError: if in_str is not a string (via is_number).

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    # (The "return (" line was missing from the mangled source; restored.)
    return (
        (is_number(in_str) and "." not in in_str)
        or is_hexidecimal_integer_number(in_str)
        or is_octal_integer_number(in_str)
        or is_binary_integer_number(in_str)
    )
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is a hex integer number (with 0x/0X prefix,
        optionally signed) and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_hexidecimal_integer_number('0x1A3E')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return bool(HEX_NUMBER_RE.match(in_str))
def is_octal_integer_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is a valid octal integral number (with
        0o/0O prefix, optionally signed) and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return bool(OCT_NUMBER_RE.match(in_str))
def is_binary_integer_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string contains a binary integral number (with
        0b/0B prefix, optionally signed) and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('0b10102')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return bool(BIN_NUMBER_RE.match(in_str))
def to_int(in_str: str) -> int:
    """
    Args:
        in_str: the string to convert

    Returns:
        The integral value of the string, understanding binary (0b),
        octal (0o), hex (0x) and plain decimal forms.

    Raises:
        ValueError: if in_str is not a string or cannot be parsed.

    >>> to_int('1234')
    1234
    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if is_binary_integer_number(in_str):
        return int(in_str, 2)
    if is_octal_integer_number(in_str):
        return int(in_str, 8)
    if is_hexidecimal_integer_number(in_str):
        return int(in_str, 16)
    # Plain decimal fallback (was missing from the mangled source);
    # int() itself raises ValueError for garbage input.
    return int(in_str)
def number_string_to_integer(in_str: str) -> int:
    """Convert a string containing a written-out number into an int.

    Args:
        in_str: the written-out number (an int is passed through as-is)

    Returns:
        The integer value of the written-out number.

    Raises:
        ValueError: on an unrecognized word.

    >>> number_string_to_integer("one hundred fifty two")
    152
    >>> number_string_to_integer("fifty xyzzy three")
    Traceback (most recent call last):
    ...
    ValueError: Unknown word: xyzzy
    """
    if isinstance(in_str, int):  # was: type(in_str) == int
        return in_str

    current = result = 0
    in_str = in_str.replace('-', ' ')
    for word in in_str.split():
        if word not in NUM_WORDS:
            if is_integer_number(word):
                # Allow literal digits mixed in, e.g. "four-score and 7".
                current += int(word)
                continue
            raise ValueError("Unknown word: " + word)
        scale, increment = NUM_WORDS[word]
        current = current * scale + increment
        if scale > 100:
            # Completed a thousand/million/... chunk; bank it.
            result += current
            current = 0
    return result + current
def integer_to_number_string(num: int) -> str:
    """
    Opposite of number_string_to_integer; convert a number to a written
    out string form.

    Args:
        num: the (non-negative) number to convert

    Returns:
        The written-out English form of num.

    >>> integer_to_number_string(9)
    'nine'
    >>> integer_to_number_string(42)
    'forty two'
    """
    if num < 20:
        return UNIT_WORDS[num]
    if num < 100:
        ret = TENS_WORDS[num // 10]
        leftover = num % 10
        if leftover != 0:
            ret += ' ' + UNIT_WORDS[leftover]
        return ret

    # If num > 100 go find the highest chunk and convert that, then recursively
    # convert the rest.  NUM_WORDS contains items like 'thousand' -> (1000, 0).
    # The second item in the tuple is an increment that can be ignored; the
    # first is the numeric "scale" of the entry.  So find the greatest entry in
    # NUM_WORDS still less than num.  For 123,456 it would be thousand.  Then
    # pull out the 123, convert it, and append "thousand".  Then do the rest.
    scales = {}
    for name, val in NUM_WORDS.items():
        if 1 < val[0] <= num:
            scales[name] = val[0]
    scale = max(scales.items(), key=lambda _: _[1])

    # scale[1] = numeric magnitude (e.g. 1000)
    # scale[0] = name (e.g. "thousand")
    ret = integer_to_number_string(num // scale[1]) + ' ' + scale[0]
    leftover = num % scale[1]
    if leftover != 0:
        ret += ' ' + integer_to_number_string(leftover)
    return ret
def is_decimal_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check

    Returns:
        True if the given string represents a decimal and False
        otherwise.  A decimal may be signed or unsigned and may use
        scientific notation.

    .. note::
        Integers without a decimal point are not considered decimals
        and return False (see example).

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    # Test is_number first so non-string input raises, as before.
    if not is_number(in_str):
        return False
    return "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    """
    Args:
        in_str: the string to strip of escape sequences.

    Returns:
        in_str with ANSI escape sequences removed.  (The return
        statement was missing from the mangled source; restored.)

    .. note::
        What is considered to be an "escape sequence" is defined by the
        ESCAPE_SEQUENCE_RE regular expression.  While this gets common
        ones, there may exist valid sequences that it doesn't match.
    """
    return ESCAPE_SEQUENCE_RE.sub("", in_str)
def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """
    Args:
        in_str: string or number to which to add thousands separator(s)
        separator_char: the separator character to add (defaults to comma)
        places: add a separator every N places (defaults to three)

    Returns:
        A numeric string with thousands separators added appropriately.

    Raises:
        ValueError: if in_str is neither a number nor a numeric string.

    >>> add_thousands_separator('12345678')
    '12,345,678'
    >>> add_thousands_separator(12345678)
    '12,345,678'
    """
    if isinstance(in_str, numbers.Number):
        # Accept real numbers too; stringify before formatting.
        in_str = f'{in_str}'
    if is_number(in_str):
        return _add_thousands_separator(
            in_str, separator_char=separator_char, places=places
        )
    raise ValueError(in_str)
649 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
652 (in_str, decimal_part) = in_str.split('.')
653 tmp = [iter(in_str[::-1])] * places
654 ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
655 if len(decimal_part) > 0:
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Args:
        in_str: the string to test
        allowed_schemes: an optional list of allowed schemes (e.g.
            ['http', 'https', 'ftp'].  If passed, only URLs that begin
            with one of the schemes passed will be considered valid.
            Otherwise, any scheme:// is considered valid.

    Returns:
        True if in_str contains a valid URL and False otherwise.

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('.mysite.com')
    False
    """
    if not is_full_string(in_str):
        return False
    valid = URL_RE.match(in_str) is not None
    if allowed_schemes:
        return valid and any([in_str.startswith(s) for s in allowed_schemes])
    return valid
def is_email(in_str: Any) -> bool:
    """
    Args:
        in_str: the email address to check

    Returns:
        True if in_str contains a valid email (as defined by
        https://tools.ietf.org/html/rfc3696#section-3) or False
        otherwise.

    >>> is_email('[email protected]')
    True
    >>> is_email('@gmail.com')
    False
    """
    if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
        return False

    try:
        # we expect 2 tokens, one before "@" and one after, otherwise
        # we have an exception and the email is not valid.
        head, tail = in_str.split("@")

        # head's size must be <= 64, tail <= 255, head must not start
        # with a dot or contain multiple consecutive dots.
        if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
            return False

        # removes escaped spaces, so that later on the test regex will
        # accept the string.
        head = head.replace("\\ ", "")
        if head.startswith('"') and head.endswith('"'):
            head = head.replace(" ", "")[1:-1]
        return EMAIL_RE.match(head + "@" + tail) is not None

    except ValueError:
        # borderline case in which we have multiple "@" signs but the
        # head part is correctly escaped.
        if ESCAPED_AT_SIGN.search(in_str) is not None:
            # replace "@" with "a" in the head
            return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
        return False
def suffix_string_to_number(in_str: str) -> Optional[int]:
    """Takes a string like "33Gb" and converts it into a number (of bytes).

    Args:
        in_str: the string with a suffix to be interpreted and removed.

    Returns:
        An integer number of bytes or None to indicate an error.

    >>> suffix_string_to_number('1Mb')
    1048576
    >>> suffix_string_to_number('13.1Gb')
    14066017894
    """

    def suffix_capitalize(s: str) -> str:
        # Normalize a suffix to the "Xb" capitalization NUM_SUFFIXES uses.
        if len(s) == 0:
            return s
        if len(s) == 1:
            return s.upper()
        if len(s) == 2:
            return f"{s[0].upper()}{s[1].lower()}"
        return suffix_capitalize(s[0:1])

    if is_string(in_str):
        if is_integer_number(in_str):
            return to_int(in_str)
        # Try a two-char suffix ("Gb") first, then a one-char one ("G").
        suffixes = [in_str[-2:], in_str[-1:]]
        rest = [in_str[:-2], in_str[:-1]]
        for x in range(len(suffixes)):
            s = suffix_capitalize(suffixes[x])
            multiplier = NUM_SUFFIXES.get(s, None)
            if multiplier is not None:
                r = rest[x]
                if is_integer_number(r):
                    return to_int(r) * multiplier
                if is_decimal_number(r):
                    return int(float(r) * multiplier)
    return None
def number_to_suffix_string(num: int) -> Optional[str]:
    """Take a number (of bytes) and returns a string like "43.8Gb".

    Args:
        num: an integer number of bytes

    Returns:
        A string with a suffix representing num bytes concisely or
        the plain number as a string if no suffix fits.

    >>> number_to_suffix_string(14066017894)
    '13.1Gb'
    >>> number_to_suffix_string(1024 * 1024)
    '1.0Mb'
    """
    d = 0.0
    suffix = None
    # NUM_SUFFIXES is ordered largest scale first; use the first that fits.
    for (sfx, size) in NUM_SUFFIXES.items():
        if num >= size:
            d = num / size
            suffix = sfx
            break
    if suffix is not None:
        return f"{d:.1f}{suffix}"
    return f"{num:d}"
def is_credit_card(in_str: Any, card_type: Optional[str] = None) -> bool:
    """
    Args:
        in_str: a string to check
        card_type: if provided, the card type to validate against
            (one of the CREDIT_CARDS keys, e.g. "VISA").  Otherwise all
            known credit card number types are tried.

    Returns:
        True if in_str is a valid credit card number.

    Raises:
        KeyError: if card_type is provided but unknown.
    """
    if not is_full_string(in_str):
        return False

    if card_type is not None:
        if card_type not in CREDIT_CARDS:
            raise KeyError(
                f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
            )
        return CREDIT_CARDS[card_type].match(in_str) is not None
    for c in CREDIT_CARDS:
        if CREDIT_CARDS[c].match(in_str) is not None:
            return True
    return False
def is_camel_case(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is formatted as camel case and False
        otherwise.  A string is considered camel case when:

        * it's composed only of letters ([a-zA-Z]) and optionally numbers
        * it contains both lowercase and uppercase letters
        * it does not start with a number
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """
    Args:
        in_str: the string to test
        separator: the separator character (default underscore)

    Returns:
        True if the string is snake case and False otherwise.  A string
        is considered snake case when:

        * it's composed only of lowercase/uppercase letters and digits
        * it contains at least one underscore (or provided separator)
        * it does not start with a number

    >>> is_snake_case('this_is_a_test')
    True
    >>> is_snake_case('this-is-a-test')
    False
    >>> is_snake_case('this-is-a-test', separator='-')
    True
    """
    if is_full_string(in_str):
        re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
        re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
        # Use a precompiled pattern for the common separators; build one
        # on the fly for anything else.
        r = re_map.get(
            separator,
            re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
        )
        return r.match(in_str) is not None
    return False
def is_json(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str contains valid JSON and False otherwise.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    # Cheap regex pre-check before paying for a full parse.
    if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
        try:
            return isinstance(json.loads(in_str), (dict, list))
        except (TypeError, ValueError, OverflowError):
            pass
    return False
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Args:
        in_str: the string to test
        allow_hex: accept the undashed 32-hex-char form too?

    Returns:
        True if in_str contains a valid UUID and False otherwise.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # string casting is used to allow UUID itself as input data type
    s = str(in_str)
    if allow_hex:
        return UUID_HEX_OK_RE.match(s) is not None
    return UUID_RE.match(s) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str contains a valid IPv4 address and False otherwise.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # The regex only checks shape; verify each octet is in [0, 255].
    for token in in_str.split("."):
        if not 0 <= int(token) <= 255:
            return False
    return True
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string to extract an IPv4 address from.

    Returns:
        The first extracted IPv4 address from in_str or None if none
        were found or an error occurred.

    >>> extract_ip_v4(' The secret IP address: 127.0.0.1 (use it wisely) ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.') is None
    True
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_IP_V4_RE.search(in_str)
    if m is not None:
        return m.group(0)
    return None
def is_ip_v6(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid (full, non-abbreviated) IPv6
        address and False otherwise.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')  # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract an IPv6 address.

    Returns:
        The first IPv6 address found in in_str or None if no address
        was found or an error occurred.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)") is None
    True
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_IP_V6_RE.search(in_str)
    if m is not None:
        return m.group(0)
    return None
def is_ip(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IP address (either IPv4 or
        IPv6) and False otherwise.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('1.2.3.999')
    False
    """
    # Same check order as before: IPv6 first, then IPv4.
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract an IP address.

    Returns:
        The first IP address (IPv4 or IPv6) found in in_str or None to
        indicate none found or an error condition.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('1.2.3') is None
    True
    """
    ip = extract_ip_v4(in_str)
    if ip is None:
        ip = extract_ip_v6(in_str)
    return ip
def is_mac_address(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str is a valid MAC address, False otherwise.

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract a MAC address.
        separator: the separator to use between octets in the result.

    Returns:
        The first MAC address found in in_str, normalized to use
        `separator`, or None to indicate none found or an error.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is not None:
        mac = m.group(0)
        # Bug fix: str.replace returns a new string; the original code
        # discarded these results so `separator` had no effect.
        mac = mac.replace(":", separator)
        mac = mac.replace("-", separator)
        return mac
    return None
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Args:
        in_str: string to test
        separator: the slug separator character to expect

    Returns:
        True if in_str is a slug string and False otherwise.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False
    rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check for tags in

    Returns:
        True if the given string contains HTML/XML tags and False
        otherwise.

    Raises:
        ValueError: if in_str is not a string.

    .. warning::
        By design, this function matches ANY type of tag, so don't
        expect to use it as an HTML validator.  It's a quick sanity
        check at best.  See something like BeautifulSoup for a more
        full-featured option.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return bool(HTML_RE.search(in_str))
def words_count(in_str: str) -> int:
    """
    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    Raises:
        ValueError: if in_str is not a string.

    .. note::
        Only sequences of one or more letters and/or numbers count as
        "words", so "! @ # % ... []" yields zero.  Punctuation splits
        words: "one,two,three.stop" counts as 4.

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # finditer avoids materializing the match strings just to count them.
    return sum(1 for _ in WORDS_COUNT_RE.finditer(in_str))
def word_count(in_str: str) -> int:
    """
    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string; alias for
        :meth:`words_count` (see its notes on what counts as a word).

    >>> word_count('hello world')
    2
    >>> word_count('one,two,three.stop')
    4
    """
    return words_count(in_str)
def generate_uuid(omit_dashes: bool = False) -> str:
    """
    Args:
        omit_dashes: should we omit the dashes in the generated UUID?

    Returns:
        A generated UUID string (using `uuid.uuid4()`) with or without
        dashes per the omit_dashes arg.

    generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
    generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
    """
    uid = uuid4()
    if omit_dashes:
        return uid.hex
    return str(uid)
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Args:
        size: number of characters to generate

    Returns:
        A string of the specified size containing random characters
        (uppercase/lowercase ascii letters and digits).

    Raises:
        ValueError: if size < 1.

    .. note::
        Uses `random`, which is not cryptographically secure; use the
        `secrets` module for security-sensitive tokens.
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    chars = string.ascii_letters + string.digits
    # join directly rather than building an intermediate char list.
    return "".join(random.choice(chars) for _ in range(size))
def reverse(in_str: str) -> str:
    """
    Args:
        in_str: the string to reverse

    Returns:
        The reversed (character by character) string.

    Raises:
        ValueError: if in_str is not a string.

    >>> reverse('test')
    'tset'
    """
    if not isinstance(in_str, str):
        raise ValueError(in_str)
    # (The return statement was missing from the mangled source; restored.)
    return in_str[::-1]
def camel_case_to_snake_case(in_str, *, separator="_"):
    """
    Args:
        in_str: the camel case string to convert
        separator: the snake-case separator to insert

    Returns:
        A snake case string equivalent to the camel case input, or the
        original string unmodified if it is not valid camel case.

    Raises:
        ValueError: if in_str is not a string.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        # Not camel case: pass through unchanged (line restored).
        return in_str
    return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Args:
        in_str: the snake case string to convert
        upper_case_first: should the first letter be capitalized?
        separator: the snake-case separator to expect

    Returns:
        A camel case string equivalent to the snake case string
        provided, or the original string back again if it is not valid
        snake case.

    Raises:
        ValueError: if in_str is not a string.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        # Not snake case: pass through unchanged (line restored).
        return in_str
    tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
    if not upper_case_first:
        tokens[0] = tokens[0].lower()
    return from_char_list(tokens)
def to_char_list(in_str: str) -> List[str]:
    """
    Args:
        in_str: the string to split into a char list

    Returns:
        A list of strings of length one each; an empty list for
        non-string input.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    if not is_string(in_str):
        # NOTE(review): the non-string branch was missing from the
        # mangled source; upstream returns [] here — confirm.
        return []
    return list(in_str)
def from_char_list(in_list: List[str]) -> str:
    """
    Args:
        in_list: A list of characters to convert into a string.

    Returns:
        The string resulting from gluing the characters in in_list
        together.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
def shuffle(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing the same chars as the given one but in
        a randomized order.  Note that in rare cases this could result
        in the same original string as no check is done.  Returns None
        for non-string input to indicate an error condition.
    """
    if not isinstance(in_str, str):
        # (This early return was missing from the mangled source.)
        return None
    chars = list(in_str)
    random.shuffle(chars)
    return "".join(chars)
def scramble(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing the same chars as the given one but in
        a randomized order (or None on error).  Alias for
        :meth:`shuffle`.
    """
    return shuffle(in_str)
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Args:
        in_str: the string to strip tags from
        keep_tag_content: should we keep the inner contents of tags?

    Returns:
        A string with all HTML tags removed (optionally with tag
        contents preserved).

    Raises:
        ValueError: if in_str is not a string.

    .. note::
        This method uses simple regular expressions to strip tags and
        is not a full fledged HTML parser by any means.  Consider
        something like BeautifulSoup if your needs are more than this
        simple code can fulfill.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if keep_tag_content:
        regex = HTML_TAG_ONLY_RE
    else:
        regex = HTML_RE
    return regex.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Args:
        in_str: the string to asciify.

    Returns:
        An output string roughly equivalent to the original string
        where all content is ascii-only.  This is accomplished by
        translating all non-ascii chars into their closest possible
        ASCII representation (eg: ó -> o, Ë -> E, ç -> c...).

    .. warning::
        Some chars may be lost if impossible to translate.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not isinstance(in_str, str):
        raise ValueError(in_str)
    # NFKD decomposes accented characters into a base char plus
    # combining marks; the marks are then dropped by the ascii encode
    # with errors="ignore".
    decomposed = unicodedata.normalize("NFKD", in_str)
    ascii_bytes = decomposed.encode("ascii", "ignore")
    return ascii_bytes.decode("utf-8")
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Args:
        in_str: the string to slugify
        separator: the character to use during slugification (default
            is a dash)

    Returns:
        The converted string.  The returned string has the following
        properties:

        * all letters are in lower case
        * all punctuation signs and non alphanumeric chars are removed
        * words are divided using provided separator
        * all chars are encoded as ascii (by using :meth:`asciify`)

    Raises:
        ValueError: if in_str is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # replace spaces with join sign
    out = SPACES_RE.sub(separator, out)

    # normalize joins (remove duplicates)
    out = re.sub(re.escape(separator) + r"+", separator, out)

    # (This final asciify/return was missing from the mangled source.)
    return asciify(out)
def to_bool(in_str: str) -> bool:
    """
    Args:
        in_str: the string to convert to boolean

    Returns:
        A boolean equivalent of the original string based on its
        contents.  All conversion is case insensitive.  True is
        returned for any of: "true", "1", "yes", "y", "t", "on";
        otherwise False is returned.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not isinstance(in_str, str):
        raise ValueError(in_str)
    truthy = ("true", "1", "yes", "y", "t", "on")
    return in_str.lower() in truthy
def to_date(in_str: str) -> Optional[datetime.date]:
    """Parses a string into a datetime.date.

    Args:
        in_str: the string to convert into a date

    Returns:
        The datetime.date the string contained or None to indicate
        an error.  This parser is relatively clever; see
        :class:`datetimez.dateparse_utils` docs for details.

    >>> to_date('9/11/2001')
    datetime.date(2001, 9, 11)
    >>> to_date('xyzzy')
    """
    # Imported lazily to avoid a circular dependency with dateparse_utils.
    import pyutils.datetimez.dateparse_utils as du

    try:
        d = du.DateParser()  # type: ignore
        d.parse(in_str)
        return d.get_date()
    except du.ParseException:  # type: ignore
        msg = f'Unable to parse date {in_str}.'
        logger.warning(msg)
    return None
def extract_date(in_str: Any) -> Optional[datetime.datetime]:
    """Finds and extracts a date from the string, if possible.

    Args:
        in_str: the string to extract a date from

    Returns:
        a datetime if date was found, otherwise None

    >>> extract_date("filename.txt dec 13, 2022")
    datetime.datetime(2022, 12, 13, 0, 0)

    >>> extract_date("Dear Santa, please get me a pony.")

    """
    # Imported lazily to avoid a circular dependency with dateparse_utils.
    import itertools

    import pyutils.datetimez.dateparse_utils as du

    d = du.DateParser()  # type: ignore
    chunks = in_str.split()
    # Try longer ngrams first so that e.g. "dec 13, 2022" wins over "2022".
    for ngram in itertools.chain(
        list_utils.ngrams(chunks, 5),
        list_utils.ngrams(chunks, 4),
        list_utils.ngrams(chunks, 3),
        list_utils.ngrams(chunks, 2),
    ):
        try:
            expr = " ".join(ngram)
            logger.debug("Trying %s", expr)
            d.parse(expr)
            return d.get_datetime()
        except du.ParseException:  # type: ignore
            pass
    return None
def is_valid_date(in_str: str) -> bool:
    """Checks whether a string contains a parseable date.

    Args:
        in_str: the string to check

    Returns:
        True if the string represents a valid date that we can recognize
        and False otherwise.  This parser is relatively clever; see
        :class:`datetimez.dateparse_utils` docs for details.

    >>> is_valid_date('1/2/2022')
    True
    >>> is_valid_date('christmas')
    True
    >>> is_valid_date('next wednesday')
    True
    >>> is_valid_date('xyzzy')
    False
    """
    # Imported lazily to avoid a circular dependency with dateparse_utils.
    import pyutils.datetimez.dateparse_utils as dp

    try:
        d = dp.DateParser()  # type: ignore
        _ = d.parse(in_str)
        return True
    except dp.ParseException:  # type: ignore
        msg = f'Unable to parse date {in_str}.'
        logger.warning(msg)
    return False
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """Parses a string into a datetime.datetime.

    Args:
        in_str: string to parse into a datetime

    Returns:
        A python datetime parsed from in_str or None to indicate
        an error.  This parser is relatively clever; see
        :class:`datetimez.dateparse_utils` docs for details.

    >>> to_datetime('7/20/1969 02:56 GMT')
    datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
    """
    # Imported lazily to avoid a circular dependency with dateparse_utils.
    import pyutils.datetimez.dateparse_utils as dp

    try:
        d = dp.DateParser()  # type: ignore
        dt = d.parse(in_str)
        if isinstance(dt, datetime.datetime):
            return dt
    except dp.ParseException:  # type: ignore
        msg = f'Unable to parse datetime {in_str}.'
        logger.warning(msg)
    return None
def valid_datetime(in_str: str) -> bool:
    """Checks whether a string contains a parseable datetime.

    Args:
        in_str: the string to check

    Returns:
        True if in_str contains a valid datetime and False otherwise.
        This parser is relatively clever; see
        :class:`datetimez.dateparse_utils` docs for details.

    >>> valid_datetime('next wednesday at noon')
    True
    >>> valid_datetime('3 weeks ago at midnight')
    True
    >>> valid_datetime('next easter at 5:00 am')
    True
    >>> valid_datetime('sometime soon')
    False
    """
    _ = to_datetime(in_str)
    if _ is not None:
        return True
    msg = f'Unable to parse datetime {in_str}.'
    logger.warning(msg)
    return False
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """Removes runs of more than one character_to_squeeze in a row.

    Args:
        in_str: the string to squeeze
        character_to_squeeze: the character to remove runs of
            more than one in a row (default = space)

    Returns:
        A "squeezed string" where runs of more than one
        character_to_squeeze are collapsed into one.

    >>> squeeze('hello  world')
    'hello world'
    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'
    """
    # re.escape so multi-char (or regex-special) squeeze targets work.
    return re.sub(
        r'(' + re.escape(character_to_squeeze) + r')+',
        character_to_squeeze,
        in_str,
    )
def dedent(in_str: str) -> Optional[str]:
    """Removes the leading margin (tab indentation) from each line of
    a multi-line string.

    Args:
        in_str: the string to dedent

    Returns:
        A string with tab indentation removed or None on error.

    .. note::

        Inspired by analogous Scala function.
    """
    if not is_string(in_str):
        return None
    line_separator = '\n'
    # MARGIN_RE is a module-level pattern matching the leading margin
    # to strip from each line.
    lines = [MARGIN_RE.sub('', line) for line in in_str.split(line_separator)]
    return line_separator.join(lines)
def indent(in_str: str, amount: int) -> str:
    """Pads each line of a string on the left with spaces.

    Args:
        in_str: the string to indent
        amount: count of spaces to indent each line by

    Returns:
        An indented string created by prepending amount spaces.

    Raises:
        ValueError: if in_str is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    prefix = " " * amount
    return '\n'.join(prefix + line for line in in_str.split('\n'))
def sprintf(*args, **kwargs) -> str:
    """String printf.  Format and return a string instead of writing
    it to a stream.

    This function uses the same syntax as the builtin print
    function; the "sep" keyword argument separates adjacent arguments
    (default " ") and "end" is appended to the result (default "\\n").

    Returns:
        An interpolated string capturing print output, like man(3)
        sprintf.

    Raises:
        TypeError: if sep/end are neither None nor strings, or if an
            unrecognized keyword argument is passed.

    >>> sprintf('hello', 'world')
    'hello world\\n'
    """
    ret = ""

    sep = kwargs.pop("sep", None)
    if sep is not None:
        if not isinstance(sep, str):
            raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None:
        if not isinstance(end, str):
            raise TypeError("end must be None or a string")

    if kwargs:
        raise TypeError("invalid keyword arguments to sprint()")

    # Mirror print()'s defaults.
    if sep is None:
        sep = " "
    if end is None:
        end = "\n"
    for i, arg in enumerate(args):
        if i:
            ret += sep
        if isinstance(arg, str):
            ret += arg
        else:
            ret += str(arg)
    ret += end
    return ret
def strip_ansi_sequences(in_str: str) -> str:
    """Strips ANSI escape sequences out of a string.

    Args:
        in_str: the string to strip

    Returns:
        in_str with recognized ANSI escape sequences removed.

    .. warning::
        This method works by using a regular expression.
        It works for all ANSI escape sequences I've tested with but
        may miss some; caveat emptor.
    """
    # CSI sequences end in a single "final byte" which may be upper or
    # lower case (e.g. \x1b[2K erase-line, \x1b[1A cursor-up), so the
    # final character class must include A-Z as well as a-z.
    return re.sub(r'\x1b\[[\d+;]*[a-zA-Z]', '', in_str)
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that captures outputs to stdout to a buffer
    without printing them.

    >>> with SprintfStdout() as buf:
    ...     print("1, 2, 3")
    ...
    >>> print(buf(), end='')
    1, 2, 3
    """

    def __init__(self) -> None:
        # Everything printed inside the context ends up here.
        self.destination = io.StringIO()
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()
        # Hand the caller a closure that reads the captured output.
        return lambda: self.destination.getvalue()

    def __exit__(self, *args) -> Literal[False]:
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Never suppress exceptions raised inside the context.
        return False
def capitalize_first_letter(in_str: str) -> str:
    """Capitalizes the first letter of a string.

    Args:
        in_str: the string to capitalize

    Returns:
        in_str with the first character capitalized.  An empty string
        is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    """
    # Guard the empty string; in_str[0] would raise IndexError.
    if not in_str:
        return in_str
    return in_str[0].upper() + in_str[1:]
def it_they(n: int) -> str:
    """Returns the appropriate pronoun for a count.

    Args:
        n: how many of them are there?

    Returns:
        'it' if n is one or 'they' otherwise.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')
    """
    if n == 1:
        return "it"
    return "they"
def is_are(n: int) -> str:
    """Returns the appropriate verb form for a count.

    Args:
        n: how many of them are there?

    Returns:
        'is' if n is one or 'are' otherwise.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')
    """
    if n == 1:
        return "is"
    return "are"
def pluralize(n: int) -> str:
    """Returns a plural suffix for a count.

    Args:
        n: how many of them are there?

    Returns:
        '' if n is exactly one, otherwise 's'.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.
    """
    if n == 1:
        return ''
    return 's'
def make_contractions(txt: str) -> str:
    """This code glues words in txt together to form (English)
    contractions.

    Args:
        txt: the input text to be contractionized.

    Returns:
        Output text identical to original input except for any
        recognized contractions are formed.

    .. note::
        The order in which we create contractions is defined by the
        implementation and what I thought made more sense when writing
        this code.

    >>> make_contractions('It is nice today.')
    "It's nice today."

    >>> make_contractions('I can not even...')
    "I can't even..."

    >>> make_contractions('She could not see!')
    "She couldn't see!"

    >>> make_contractions('But she will not go.')
    "But she won't go."

    >>> make_contractions('Verily, I shall not.')
    "Verily, I shan't."

    >>> make_contractions('No you cannot.')
    "No you can't."

    >>> make_contractions('I said you can not go.')
    "I said you can't go."
    """
    # Pairs of (list of first words, list of second-word patterns).
    # Parenthesized letters in the second word are the ones that
    # survive in the contraction.
    first_second = [
        (
            [
                'are',
                'could',
                'did',
                'has',
                'have',
                'is',
                'must',
                'should',
                'was',
                'were',
                'would',
            ],
            ['(n)o(t)'],
        ),
        (
            [
                "I",
                "you",
                "he",
                "she",
                "it",
                "we",
                "they",
                "how",
                "why",
                "when",
                "where",
                "there",
                "what",
                "who",
                "which",
            ],
            ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
        ),
    ]

    # Special cases: can't, shan't and won't.
    txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
    txt = re.sub(
        r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE
    )
    txt = re.sub(
        r'\b(w)ill\s*(n)(o)(t)\b',
        r"\1\3\2'\4",
        txt,
        count=0,
        flags=re.IGNORECASE,
    )

    for first_list, second_list in first_second:
        for first in first_list:
            for second in second_list:
                # Disallow there're/where're.  They're valid English
                # but sound weird.
                if (first in ('there', 'where')) and second == 'a(re)':
                    continue

                pattern = fr'\b({first})\s+{second}\b'
                if second == '(n)o(t)':
                    replacement = r"\1\2'\3"
                else:
                    replacement = r"\1'\2"
                txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)
    return txt
def thify(n: int) -> str:
    """Returns the proper ordinal suffix for a number.

    Args:
        n: how many of them are there?

    Returns:
        The proper ordinal suffix for a number: 'st', 'nd', 'rd' or 'th'.

    Suggested usage::

        attempt_count = 0
        while True:
            attempt_count += 1
            if try_the_thing():
                break
            print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')
    """
    digit = str(n)
    # 11th, 12th and 13th are special: the teens always take 'th'.
    if digit[-2:] in ('11', '12', '13'):
        return 'th'
    last = digit[-1:]
    if last == '1':
        return 'st'
    if last == '2':
        return 'nd'
    if last == '3':
        return 'rd'
    return 'th'
def ngrams(txt: str, n: int):
    """Generates the ngrams in a (whitespace-delimited) string.

    Args:
        txt: the string to create ngrams using
        n: how many words per ngram created?

    Returns:
        Generates the ngrams from the input string.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']
    """
    words = txt.split()
    for ngram in ngrams_presplit(words, n):
        yield ' '.join(ngram)
def ngrams_presplit(words: Sequence[str], n: int):
    """
    Same as :meth:`ngrams` but with the string pre-split.

    Args:
        words: the pre-split sequence of words to compose ngrams from
        n: how many words per ngram

    Returns:
        The ngrams of words, as produced by :meth:`list_utils.ngrams`.
    """
    return list_utils.ngrams(words, n)
def bigrams(txt: str):
    """Generates the bigrams (n=2) of the given string.

    Args:
        txt: the string to generate bigrams from

    Returns:
        The ngrams of txt with n=2; see :meth:`ngrams`.

    >>> [x for x in bigrams('this is a test')]
    ['this is', 'is a', 'a test']
    """
    return ngrams(txt, 2)
def trigrams(txt: str):
    """Generates the trigrams (n=3) of the given string.

    Args:
        txt: the string to generate trigrams from

    Returns:
        The ngrams of txt with n=3; see :meth:`ngrams`.
    """
    return ngrams(txt, 3)
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
) -> List[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.

    Args:
        input_lines: A sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: an iterable collection of numeric sequences that
            indicate one or more column numbers to copy to form the Nth
            position in the output list.  See example below.
        delim: for column_specs that indicate we should copy more than
            one column from the input into this position, use delim to
            separate source data.  Defaults to ''.

    Returns:
        A list of string created by following the instructions set forth
        in column_specs.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim='!',
    ... )
    ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']
    """
    out = []

    # Column specs map input lines' columns into outputs.
    # [col1, col2...]
    for spec in column_specs:
        hunk = ''
        for n in spec:
            hunk = hunk + delim + input_lines[n]
        # Drop the leading delimiter glued on by the loop above.
        hunk = hunk.strip(delim)
        out.append(hunk)
    return out
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim='',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: a sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: instructions for what dictionary keys to apply
            to individual or compound input column data.  See example
            below.
        delim: when forming compound output data by gluing more than
            one input column together, use this character to separate
            the source data.  Defaults to ''.

    Returns:
        A dict formed by applying the column_specs instructions.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim='!',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}
    """
    out = {}

    # Column specs map input lines' columns into outputs.
    # "key", [col1, col2...]
    for spec in column_specs:
        hunk = ''
        for n in spec[1]:
            hunk = hunk + delim + input_lines[n]
        # Drop the leading delimiter glued on by the loop above.
        hunk = hunk.strip(delim)
        out[spec[0]] = hunk
    return out
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """
    Interpolate a string with data from a dict.

    Args:
        txt: the mad libs template
        values: what you and your kids chose for each category.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    interpolated = txt.format(**values)
    return sprintf(interpolated, end='')
def to_ascii(txt: str):
    """Encodes a string as ASCII bytes; passes bytes through untouched.

    Args:
        txt: the input data to encode

    Returns:
        txt encoded as an ASCII byte string.

    Raises:
        TypeError: if txt is neither a str nor bytes.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(txt, str):
        return txt.encode('ascii')
    if isinstance(txt, bytes):
        return txt
    # TypeError is a subclass of Exception so existing callers that
    # caught Exception still work.
    raise TypeError('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encodes a string in base64.

    Args:
        txt: the input data to encode

    Returns:
        txt encoded with a 64-character alphabet.  Similar to and compatible
        with uuencode/uudecode.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
def is_base64(txt: str) -> bool:
    """Checks whether a string is made only of base64 alphabet characters.

    Args:
        txt: the string to check

    Returns:
        True if txt is a valid base64 encoded string.  This assumes
        txt was encoded with Python's standard base64 alphabet which
        is the same as what uuencode/uudecode uses).

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    """
    # '=' is included because base64 output is padded with it;
    # without it, is_base64(to_base64(x)) could be False.
    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/='
    alphabet = set(a.encode('ascii'))
    for char in to_ascii(txt.strip()):
        if char not in alphabet:
            return False
    return True
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Decodes base64-encoded data back into a string.

    Args:
        b64: bytestring of 64-bit encoded data to decode / convert.

    Returns:
        The decoded form of b64 as a normal python string.  Similar to
        and compatible with uuencode / uudecode.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    decoded = base64.decodebytes(b64)
    return decoded.decode(encoding, errors)
def chunk(txt: str, chunk_size: int):
    """Chunks a string into sub-strings of chunk_size characters.

    Args:
        txt: a string to be chunked into evenly spaced pieces.
        chunk_size: the size of each chunk to make

    Returns:
        The original string chunked into evenly spaced pieces.  If the
        length of txt is not an even multiple of chunk_size a warning
        is issued and the final chunk is shorter than the rest.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    if len(txt) % chunk_size != 0:
        # NB: balanced parens in this message (was missing a ')').
        msg = f'String to chunk\'s length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for x in range(0, len(txt), chunk_size):
        yield txt[x : x + chunk_size]
def to_bitstring(txt: str, *, delimiter='') -> str:
    """Converts text into a bitstring ('0'/'1' characters) representation.

    Args:
        txt: the string to convert into a bitstring
        delimiter: character to insert between adjacent bytes.  Note that
            only bitstrings with delimiter='' are interpretable by
            :meth:`from_bitstring`.

    Returns:
        txt converted to ascii/binary and then chopped into bytes.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'
    """
    etxt = to_ascii(txt)
    bits = bin(int.from_bytes(etxt, 'big'))
    # Drop the '0b' prefix added by bin().
    bits = bits[2:]
    # Left-pad to a multiple of 8 so leading zero bits are preserved.
    return delimiter.join(chunk(bits.zfill(8 * ((len(bits) + 7) // 8)), 8))
def is_bitstring(txt: str) -> bool:
    """Checks whether txt is a bitstring, e.g. the output of
    :meth:`to_bitstring` with delimiter=''.

    Args:
        txt: the string to check

    Returns:
        True if txt is a recognized bitstring and False otherwise.
        Note that if delimiter is non empty this code will not
        recognize the bitstring.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    return is_binary_integer_number(f'0b{txt}')
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Converts a bitstring back into a python string.

    Args:
        bits: the bitstring to convert back into a python string
        encoding: the encoding to use

    Returns:
        The regular python string represented by bits.  Note that this
        code does not work with to_bitstring when delimiter is non-empty.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'
    """
    n = int(bits, 2)
    # An all-zeros bitstring yields n == 0 (zero bytes); map that to NUL.
    return n.to_bytes((n.bit_length() + 7) // 8, 'big').decode(encoding, errors) or '\0'
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Chunks up an IPv4 address for use as a sort key.

    Args:
        txt: an IP address to chunk up for sorting purposes

    Returns:
        A tuple of IP components arranged such that the sorting of
        IP addresses using a normal comparator will do something sane
        and desirable, or None if txt is not an IPv4 address.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if not is_ip_v4(txt):
        # Log instead of print()ing to stdout, like the rest of this module.
        logger.warning("not IP: %s", txt)
        return None
    return tuple(int(x) for x in txt.split('.'))
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Chunks up a file path for use as a sort key.

    Args:
        volume: the string to chunk up for sorting purposes

    Returns:
        A tuple of volume's components such that the sorting of
        volumes using a normal comparator will do something sane
        and desirable.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    components = volume.split('/')
    # Empty components (leading/trailing/duplicated slashes) are dropped.
    return tuple(part for part in components if part)
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """
    Execute several replace operations in a row.

    Args:
        in_str: the string in which to replace characters
        replace_set: the set of target characters to replace
        replacement: the character to replace any member of replace_set
            with

    Returns:
        The string with replacements executed.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    out = in_str
    # Replace each target character in turn, in replace_set order.
    for target in replace_set:
        out = out.replace(target, replacement)
    return out
def replace_nth(in_str: str, source: str, target: str, nth: int):
    """
    Replaces the nth occurrance of a substring within a string.

    Args:
        in_str: the string in which to run the replacement
        source: the substring to replace
        target: the replacement text
        nth: which occurrance of source to replace?  (1-based)

    Raises:
        IndexError: if in_str contains fewer than nth occurrances of source.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'
    """
    # re.escape so that source is matched literally, consistent with
    # the literal str.replace() below (e.g. source='.' must not match
    # every character).
    where = [m.start() for m in re.finditer(re.escape(source), in_str)][nth - 1]
    before = in_str[:where]
    after = in_str[where:]
    after = after.replace(source, target, 1)
    return before + after
2443 if __name__ == '__main__':