2 # -*- coding: utf-8 -*-
4 """The MIT License (MIT)
6 Copyright (c) 2016-2020 Davide Zanotti
7 Modifications Copyright (c) 2021-2022 Scott Gasch
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
16 The above copyright notice and this permission notice shall be included in all
17 copies or substantial portions of the Software.
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
This class is based on: https://github.com/daveoncode/python-string-utils.
See NOTICE in the root of this module for a detailed enumeration of what
work is Davide's and what work was added by Scott.
"""
import contextlib  # type: ignore
import datetime
import itertools
import json
import logging
import numbers
import random
import re
import string
import unicodedata
from itertools import zip_longest
from typing import Any, Dict, List, Optional
from uuid import uuid4

from pyutils import list_utils
# Module-level logger; configured by the application, not here.
logger = logging.getLogger(__name__)

# NOTE: the original character classes [+|-], [e|E], [x|X], [O|o], [B|b]
# and [0|1] erroneously included a literal '|' (inside [] it is not an
# alternation operator), so strings like "|0xFF" validated.  Fixed below.
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

HEX_NUMBER_RE = re.compile(r"^([+\-]?)0[xX]([0-9A-Fa-f]+)$")

OCT_NUMBER_RE = re.compile(r"^([+\-]?)0[Oo]([0-7]+)$")

BIN_NUMBER_RE = re.compile(r"^([+\-]?)0[Bb]([01]+)$")

URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)

ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

EMAILS_RAW_STRING = (
    r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
)

EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))

EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))

CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")

# Per-issuer credit card number validators (prefix + length checks only;
# no Luhn checksum).
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}

JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)

UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
)

# Like UUID_RE but each dash is optional, so bare 32-char hex also matches.
UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# "Shallow" because it only checks shape; octet range is checked in code.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)

MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)

ANYWHERE_MAC_ADDRESS_RE = re.compile(
    r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
)

WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)

HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

SPACES_RE = re.compile(r"\s")

NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Leading horizontal whitespace only (not newlines).
MARGIN_RE = re.compile(r"^[^\S\r\n]+")

# ANSI escape sequences: ESC '[' ... final-letter.  The source contained a
# raw ESC byte that got mangled; \x1b is the portable spelling.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
# Vocabulary for number_string_to_integer().  Each word maps to a
# (scale, increment) pair: running total = total * scale + increment.
units = [
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen",
]
tens = [
    "", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy",
    "eighty", "ninety",
]
scales = ["hundred", "thousand", "million", "billion", "trillion"]

NUM_WORDS: Dict[str, tuple] = {}
NUM_WORDS["and"] = (1, 0)  # "one hundred AND two" -- no-op connector
for i, word in enumerate(units):
    NUM_WORDS[word] = (1, i)
for i, word in enumerate(tens):
    NUM_WORDS[word] = (1, i * 10)
for i, word in enumerate(scales):
    # "hundred" multiplies by 100 (i == 0, so `i * 3 or 2` yields 2);
    # the rest multiply by 10**3, 10**6, ...
    NUM_WORDS[word] = (10 ** (i * 3 or 2), 0)
NUM_WORDS['score'] = (20, 0)
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the input string is either None or an empty string
        (or all whitespace) and False otherwise.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty("   \t   ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    return in_str is None or len(in_str.strip()) == 0
def is_string(obj: Any) -> bool:
    """
    Args:
        obj: the object to test

    Returns:
        True if the object is a string and False otherwise.

    >>> is_string('test')
    True
    >>> is_string(100)
    False
    >>> is_string([1, 2, 3])
    False
    """
    return isinstance(obj, str)
def is_empty_string(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is empty and False otherwise.

    See also :meth:`is_empty`, for which this is an alias.
    """
    return is_empty(in_str)
def is_empty(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is empty (or all whitespace) and False
        otherwise.  Non-strings are never "empty".

    >>> is_empty('')
    True
    >>> is_empty('   \t\t   ')
    True
    >>> is_empty('test')
    False
    >>> is_empty(100.88)
    False
    >>> is_empty([1, 2, 3])
    False
    """
    return is_string(in_str) and in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """
    Args:
        in_str: the object to test

    Returns:
        True if the object is a string and is not empty ('') and
        is not only composed of whitespace.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string('      ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    return is_string(in_str) and in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string contains a valid numeric value and
        False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number("99")
    True
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return NUMBER_RE.match(in_str) is not None
def is_integer_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string contains a valid (signed or unsigned,
        decimal, hex, or octal, regular or scientific) integral
        expression and False otherwise.

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    return (
        (is_number(in_str) and "." not in in_str)
        or is_hexidecimal_integer_number(in_str)
        or is_octal_integer_number(in_str)
        or is_binary_integer_number(in_str)
    )
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is a hex integer number and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('0x1A3E')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number('test')
    False
    >>> is_hexidecimal_integer_number(12345)  # Not a string
    Traceback (most recent call last):
    ...
    ValueError: 12345
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return HEX_NUMBER_RE.match(in_str) is not None
def is_octal_integer_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is a valid octal integral number and False
        otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    >>> is_octal_integer_number('test')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return OCT_NUMBER_RE.match(in_str) is not None
def is_binary_integer_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string contains a binary integral number and False
        otherwise.

    Raises:
        ValueError: if in_str is not a string.

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0B10101')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    >>> is_binary_integer_number('test')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return BIN_NUMBER_RE.match(in_str) is not None
def to_int(in_str: str) -> int:
    """
    Args:
        in_str: the string to convert

    Returns:
        The integral value of the string or raises on error.

    Raises:
        ValueError: if in_str is not a string or does not parse.

    >>> to_int('1234')
    1234
    >>> to_int('0x1234')
    4660
    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # Dispatch on prefix so each base parses with the right radix;
    # anything else falls through to plain base-10 parsing.
    if is_binary_integer_number(in_str):
        return int(in_str, 2)
    if is_octal_integer_number(in_str):
        return int(in_str, 8)
    if is_hexidecimal_integer_number(in_str):
        return int(in_str, 16)
    return int(in_str)
def number_string_to_integer(in_str: str) -> int:
    """Convert a string containing a written-out number into an int.

    Args:
        in_str: the string containing the spelled-out number.

    Returns:
        The integer value of the number described by in_str.

    Raises:
        ValueError: if in_str contains a word that is neither a known
            number word nor a numeric literal.

    >>> number_string_to_integer("one hundred fifty two")
    152
    >>> number_string_to_integer("ten billion two hundred million fifty four thousand three")
    10200054003
    >>> number_string_to_integer("four-score and 7")
    87
    >>> number_string_to_integer("fifty xyzzy three")
    Traceback (most recent call last):
    ...
    ValueError: Unknown word: xyzzy
    """
    if isinstance(in_str, int):
        return int(in_str)

    current = result = 0
    in_str = in_str.replace('-', ' ')  # e.g. "four-score" -> "four score"
    for word in in_str.split():
        if word not in NUM_WORDS:
            # Allow embedded numeric literals like "7".
            if is_integer_number(word):
                current += int(word)
                continue
            else:
                raise ValueError("Unknown word: " + word)
        scale, increment = NUM_WORDS[word]
        current = current * scale + increment
        if scale > 100:
            # A big scale word (thousand, million, ...) closes out the
            # current group; bank it and start a new one.
            result += current
            current = 0
    return result + current
def is_decimal_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check

    Returns:
        True if the given string represents a decimal or False
        otherwise.  A decimal may be signed or unsigned or use
        a "scientific notation".

    .. note::
        We do not consider integers without a decimal point
        to be decimals; they return False (see example).

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    return is_number(in_str) and "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    """
    Args:
        in_str: the string to strip of escape sequences.

    Returns:
        in_str with ANSI escape sequences (e.g. "\\x1b[12;11;22m") removed.

    .. note::
        What is considered to be an "escape sequence" is defined
        by a regular expression.  While this gets common ones,
        there may exist valid sequences that it doesn't match.
    """
    in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
    return in_str
def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """
    Args:
        in_str: string or number to which to add thousands separator(s)
        separator_char: the separator character to add (defaults to comma)
        places: add a separator every N places (defaults to three)

    Returns:
        A numeric string with thousands separators added appropriately.

    Raises:
        ValueError: if in_str is neither a number nor a numeric string.

    >>> add_thousands_separator('12345678')
    '12,345,678'
    >>> add_thousands_separator(12345678)
    '12,345,678'
    >>> add_thousands_separator(12345678.99)
    '12,345,678.99'
    >>> add_thousands_separator('test')
    Traceback (most recent call last):
    ...
    ValueError: test
    """
    if isinstance(in_str, numbers.Number):
        in_str = f'{in_str}'
    if is_number(in_str):
        return _add_thousands_separator(
            in_str, separator_char=separator_char, places=places
        )
    raise ValueError(in_str)
599 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
602 (in_str, decimal_part) = in_str.split('.')
603 tmp = [iter(in_str[::-1])] * places
604 ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
605 if len(decimal_part) > 0:
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Args:
        in_str: the string to test
        allowed_schemes: an optional list of allowed schemes (e.g.
            ['http', 'https', 'ftp'].  If passed, only URLs that
            begin with the one of the schemes passed will be considered
            to be valid.  Otherwise, any scheme:// will be considered
            valid.

    Returns:
        True if in_str contains a valid URL and False otherwise.

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('https://mysite.com')
    True
    >>> is_url('.mysite.com')
    False
    """
    if not is_full_string(in_str):
        return False
    valid = URL_RE.match(in_str) is not None
    if allowed_schemes:
        return valid and any([in_str.startswith(s) for s in allowed_schemes])
    return valid
def is_email(in_str: Any) -> bool:
    """
    Args:
        in_str: the email address to check

    Returns: True if the in_str contains a valid email (as defined by
        https://tools.ietf.org/html/rfc3696#section-3) or False
        otherwise.

    >>> is_email('my.email@the-provider.com')
    True
    >>> is_email('@gmail.com')
    False
    """
    # RFC 3696: total address <= 320 chars; may not begin with a dot.
    if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
        return False

    try:
        # we expect 2 tokens, one before "@" and one after, otherwise
        # we have an exception and the email is not valid.
        head, tail = in_str.split("@")

        # head's size must be <= 64, tail <= 255, head must not start
        # with a dot or contain multiple consecutive dots.
        if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
            return False

        # removes escaped spaces, so that later on the test regex will
        # accept the string.
        head = head.replace("\\ ", "")
        if head.startswith('"') and head.endswith('"'):
            head = head.replace(" ", "")[1:-1]
        return EMAIL_RE.match(head + "@" + tail) is not None

    except ValueError:
        # borderline case in which we have multiple "@" signs but the
        # head part is correctly escaped.
        if ESCAPED_AT_SIGN.search(in_str) is not None:
            # replace "@" with "a" in the head
            return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
        return False
def suffix_string_to_number(in_str: str) -> Optional[int]:
    """Takes a string like "33Gb" and converts it into a number (of bytes).

    Args:
        in_str: the string with a suffix to be interpreted and removed.

    Returns:
        An integer number of bytes or None to indicate an error.

    >>> suffix_string_to_number('1Mb')
    1048576
    >>> suffix_string_to_number('13.1Gb')
    14066017894
    """

    def suffix_capitalize(s: str) -> str:
        # Normalize a suffix to the "Xy" form used as NUM_SUFFIXES keys.
        if len(s) == 1:
            return s.upper()
        elif len(s) == 2:
            return f"{s[0].upper()}{s[1].lower()}"
        return suffix_capitalize(s[0:1])

    if is_string(in_str):
        if is_integer_number(in_str):
            return to_int(in_str)
        # Try a two-char suffix ("Gb") first, then a one-char one ("G").
        suffixes = [in_str[-2:], in_str[-1:]]
        rest = [in_str[:-2], in_str[:-1]]
        for suffix, r in zip(suffixes, rest):
            # NUM_SUFFIXES presumably maps e.g. 'Mb' -> 1024**2; defined
            # elsewhere in this module.
            multiplier = NUM_SUFFIXES.get(suffix_capitalize(suffix), None)
            if multiplier is not None:
                if is_integer_number(r):
                    return to_int(r) * multiplier
                if is_decimal_number(r):
                    return int(float(r) * multiplier)
    return None
def number_to_suffix_string(num: int) -> Optional[str]:
    """Take a number (of bytes) and returns a string like "43.8Gb".

    Args:
        num: an integer number of bytes

    Returns:
        A string with a suffix representing num bytes concisely or
        None to indicate an error.

    >>> number_to_suffix_string(14066017894)
    '13.1Gb'
    >>> number_to_suffix_string(1024 * 1024)
    '1.0Mb'
    """
    d = 0.0
    suffix = None
    # NUM_SUFFIXES is assumed to be ordered largest-first; pick the
    # first suffix whose size num meets or exceeds.
    for (sfx, size) in NUM_SUFFIXES.items():
        if num >= size:
            d = num / size
            suffix = sfx
            break
    if suffix is not None:
        return f"{d:.1f}{suffix}"
    return f"{num:d}"
def is_credit_card(in_str: Any, card_type: Optional[str] = None) -> bool:
    """
    Args:
        in_str: a string to check
        card_type: if provided, contains the card type to validate
            with.  Otherwise, all known credit card number types will
            be checked.

    Supported card types are the following:

        * VISA
        * MASTERCARD
        * AMERICAN_EXPRESS
        * DINERS_CLUB
        * DISCOVER
        * JCB

    Returns:
        True if in_str is a valid credit card number.

    Raises:
        KeyError: if card_type is provided but unknown.
    """
    if not is_full_string(in_str):
        return False

    if card_type is not None:
        if card_type not in CREDIT_CARDS:
            raise KeyError(
                f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
            )
        return CREDIT_CARDS[card_type].match(in_str) is not None
    for c in CREDIT_CARDS:
        if CREDIT_CARDS[c].match(in_str) is not None:
            return True
    return False
def is_camel_case(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is formatted as camel case and False otherwise.
        A string is considered camel case when:

        * it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
        * it contains both lowercase and uppercase letters
        * it does not start with a number
    """
    return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """
    Args:
        in_str: the string to test
        separator: the snake-case separator character (default '_')

    Returns: True if the string is snake case and False otherwise.  A
        string is considered snake case when:

        * it's composed only by lowercase/uppercase letters and digits
        * it contains at least one underscore (or provided separator)
        * it does not start with a number

    >>> is_snake_case('this_is_a_test')
    True
    >>> is_snake_case('___This_Is_A_Test_1_2_3___')
    True
    >>> is_snake_case('this-is-a-test')
    False
    >>> is_snake_case('this-is-a-test', separator='-')
    True
    """
    if is_full_string(in_str):
        # Precompiled patterns for the common separators; build one on
        # the fly for anything else.
        re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
        re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
        r = re_map.get(
            separator,
            re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
        )
        return r.match(in_str) is not None
    return False
def is_json(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the in_str contains valid JSON and False otherwise.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    # Cheap shape check with a regex before paying for a full parse.
    if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
        try:
            return isinstance(json.loads(in_str), (dict, list))
        except (TypeError, ValueError, OverflowError):
            pass
    return False
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Args:
        in_str: the string to test
        allow_hex: accept bare 32-char hex (no dashes) as a UUID too?

    Returns:
        True if the in_str contains a valid UUID and False otherwise.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # string casting is used to allow UUID itself as input data type
    s = str(in_str)
    if allow_hex:
        return UUID_HEX_OK_RE.match(s) is not None
    return UUID_RE.match(s) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str contains a valid IPv4 address and False otherwise.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # checks that each entry in the ip is in the valid range (0 to 255)
    for token in in_str.split("."):
        if not 0 <= int(token) <= 255:
            return False
    return True
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string to extract an IPv4 address from.

    Returns:
        The first extracted IPv4 address from in_str or None if
        none were found or an error occurred.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_IP_V4_RE.search(in_str)
    if m is not None:
        return m.group(0)
    return None
def is_ip_v6(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IPv6 address and False otherwise.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')  # invalid "?"
    False
    """
    return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract an IPv6 address.

    Returns:
        The first IPv6 address found in in_str or None if no address
        was found or an error occurred.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_IP_V6_RE.search(in_str)
    if m is not None:
        return m.group(0)
    return None
def is_ip(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IP address (either IPv4 or
        IPv6) and False otherwise.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3.999')
    False
    """
    return is_ip_v6(in_str) or is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract in IP address.

    Returns:
        The first IP address (IPv4 or IPv6) found in in_str or
        None to indicate none found or an error condition.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    ip = extract_ip_v4(in_str)
    if ip is None:
        ip = extract_ip_v6(in_str)
    return ip
def is_mac_address(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str is a valid MAC address False otherwise.

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract a MAC address.
        separator: the separator to use in the returned address.

    Returns:
        The first MAC address found in in_str or None to indicate no
        match or an error.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'
    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is not None:
        mac = m.group(0)
        # Bug fix: str.replace returns a new string; the original code
        # discarded the result, so the separator argument had no effect.
        mac = mac.replace(":", separator)
        mac = mac.replace("-", separator)
        return mac
    return None
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Args:
        in_str: string to test
        separator: the slug character to expect

    Returns:
        True if in_str is a slug string and False otherwise.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False
    rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check for tags in

    Returns:
        True if the given string contains HTML/XML tags and False
        otherwise.

    Raises:
        ValueError: if in_str is not a string.

    .. warning::
        By design, this function matches ANY type of tag, so don't expect
        to use it as an HTML validator.  It's a quick sanity check at
        best.  See something like BeautifulSoup for a more full-featured
        HTML parser.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return HTML_RE.search(in_str) is not None
def words_count(in_str: str) -> int:
    """
    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    Raises:
        ValueError: if in_str is not a string.

    .. note::
        This method is "smart" in that it does consider only sequences
        of one or more letter and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return len(WORDS_COUNT_RE.findall(in_str))
def word_count(in_str: str) -> int:
    """
    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.  This is an
        alias of :meth:`words_count`; see it for details and caveats.

    >>> word_count('hello world')
    2
    >>> word_count('one,two,three.stop')
    4
    """
    return words_count(in_str)
def generate_uuid(omit_dashes: bool = False) -> str:
    """
    Args:
        omit_dashes: should we omit the dashes in the generated UUID?

    Returns:
        A generated UUID string (using `uuid.uuid4()`) with or without
        dashes per the omit_dashes arg.

    generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
    generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
    """
    uid = uuid4()
    if omit_dashes:
        return uid.hex
    return str(uid)
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Args:
        size: number of characters to generate

    Returns:
        A string of the specified size containing random characters
        (uppercase/lowercase ascii letters and digits).

    Raises:
        ValueError: if size < 1.

    .. warning::
        Uses the `random` module and is therefore NOT suitable for
        security-sensitive tokens; use the `secrets` module for those.
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    chars = string.ascii_letters + string.digits
    return "".join(random.choice(chars) for _ in range(size))
def reverse(in_str: str) -> str:
    """
    Args:
        in_str: the string to reverse

    Returns:
        The reversed (character by character) string.

    Raises:
        ValueError: if in_str is not a string.

    >>> reverse('test')
    'tset'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return in_str[::-1]
def camel_case_to_snake_case(in_str, *, separator="_"):
    """
    Args:
        in_str: the camel case string to convert
        separator: the snake-case separator to insert (default '_')

    Returns:
        A snake case string equivalent to the camel case input or the
        original string if it is not a valid camel case string or some
        other error occurs.

    Raises:
        ValueError: if in_str is not a string.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        # Not camel case: hand the input back unchanged.
        return in_str
    return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Args:
        in_str: the snake case string to convert
        upper_case_first: capitalize the first token too?
        separator: the snake-case separator to expect (default '_')

    Returns:
        A camel case string that is equivalent to the snake case string
        provided or the original string back again if it is not valid
        snake case or another error occurs.

    Raises:
        ValueError: if in_str is not a string.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str
    tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
    if not upper_case_first:
        tokens[0] = tokens[0].lower()
    return from_char_list(tokens)
def to_char_list(in_str: str) -> List[str]:
    """
    Args:
        in_str: the string to split into a char list

    Returns:
        A list of strings of length one each.

    Raises:
        ValueError: if in_str is not a string.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return list(in_str)
def from_char_list(in_list: List[str]) -> str:
    """
    Args:
        in_list: A list of characters to convert into a string.

    Returns:
        The string resulting from gluing the characters in in_list
        together.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    return "".join(in_list)
def shuffle(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing same chars of the given one but in
        a randomized order.  Note that in rare cases this could result
        in the same original string as no check is done.  Returns
        None to indicate error conditions.
    """
    if not is_string(in_str):
        return None
    chars = to_char_list(in_str)
    random.shuffle(chars)
    return from_char_list(chars)
def scramble(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing same chars of the given one but in
        a randomized order.  Note that in rare cases this could result
        in the same original string as no check is done.  Returns
        None to indicate error conditions.

    See also :meth:`shuffle`, for which this is an alias.
    """
    return shuffle(in_str)
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Args:
        in_str: the string to strip tags from
        keep_tag_content: should we keep the inner contents of tags?

    Returns:
        A string with all HTML tags removed (optionally with tag contents
        preserved).

    Raises:
        ValueError: if in_str is not a string.

    .. note::
        This method uses simple regular expressions to strip tags and is
        not a full fledged HTML parser by any means.  Consider using
        something like BeautifulSoup if your needs are more than this
        simple code can fulfill.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # HTML_TAG_ONLY_RE strips only the tags themselves; HTML_RE also
    # consumes the enclosed content.
    r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
    return r.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Args:
        in_str: the string to asciify.

    Returns:
        An output string roughly equivalent to the original string
        where all content is ascii-only.  This is accomplished
        by translating all non-ascii chars into their closest possible
        ASCII representation (eg: ó -> o, Ë -> E, ç -> c...).

    Raises:
        ValueError: if in_str is not a string.

    .. warning::
        Some chars may be lost if impossible to translate.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # "NFKD" is the algorithm which is able to successfully translate
    # the most of non-ascii chars: it decomposes each char into a base
    # character plus combining marks.
    normalized = unicodedata.normalize("NFKD", in_str)

    # encode string forcing ascii and ignore any errors
    # (unrepresentable chars, e.g. the combining marks, are stripped out)
    ascii_bytes = normalized.encode("ascii", "ignore")

    # turns encoded bytes into an utf-8 string
    return ascii_bytes.decode("utf-8")
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Args:
        in_str: the string to slugify
        separator: the character to use during slugification (default
            is a dash)

    Returns:
        The converted string.  The returned string has the following properties:

        * it has no spaces
        * all letters are in lower case
        * all punctuation signs and non alphanumeric chars are removed
        * words are divided using provided separator
        * all chars are encoded as ascii (by using :meth:`asciify`)

    Raises:
        ValueError: if in_str is not a string.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # replace spaces with join sign
    out = SPACES_RE.sub(separator, out)

    # normalize joins (remove duplicates)
    out = re.sub(re.escape(separator) + r"+", separator, out)
    return asciify(out)
def to_bool(in_str: str) -> bool:
    """
    Args:
        in_str: the string to convert to boolean

    Returns:
        A boolean equivalent of the original string based on its contents.
        All conversion is case insensitive.  A positive boolean (True) is
        returned if the string value is any of the following:

        * "true"
        * "t"
        * "1"
        * "yes"
        * "y"
        * "on"

        Otherwise False is returned.

    Raises:
        ValueError: if in_str is not a string.

    >>> to_bool('True')
    True
    >>> to_bool('no')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return in_str.lower() in ("true", "1", "yes", "y", "t", "on")
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Args:
        in_str: the string to convert into a date

    Returns:
        The datetime.date the string contained or None to indicate
        an error.  This parser is relatively clever; see
        :class:`datetimez.dateparse_utils` docs for details.

    >>> to_date('9/11/2001')
    datetime.date(2001, 9, 11)
    >>> to_date('xyzzy')
    """
    # Imported locally to avoid a circular module dependency.
    import pyutils.datetimez.dateparse_utils as du

    try:
        d = du.DateParser()  # type: ignore
        d.parse(in_str)
        return d.get_date()
    except du.ParseException:  # type: ignore
        # Lazy %-style logging; avoids formatting when disabled.
        logger.warning('Unable to parse date %s.', in_str)
    return None
def extract_date(in_str: Any) -> Optional[datetime.datetime]:
    """Finds and extracts a date from the string, if possible.

    Args:
        in_str: the string to extract a date from

    Returns:
        a datetime if date was found, otherwise None

    >>> extract_date("filename.txt    dec 13, 2022")
    datetime.datetime(2022, 12, 13, 0, 0)

    >>> extract_date("Dear Santa, please get me a pony.")
    """
    # Imported locally to avoid a circular module dependency.
    import pyutils.datetimez.dateparse_utils as du

    d = du.DateParser()  # type: ignore
    chunks = in_str.split()
    # Try progressively shorter word n-grams until one parses as a date.
    for ngram in itertools.chain(
        list_utils.ngrams(chunks, 5),
        list_utils.ngrams(chunks, 4),
        list_utils.ngrams(chunks, 3),
        list_utils.ngrams(chunks, 2),
        list_utils.ngrams(chunks, 1),
    ):
        try:
            expr = " ".join(ngram)
            logger.debug("Trying %s", expr)
            res = d.parse(expr)
            if res is not None:
                return d.get_datetime()
        except du.ParseException:  # type: ignore
            pass
    return None
def is_valid_date(in_str: str) -> bool:
    """Checks whether a string contains a date we can recognize.

    Args:
        in_str: the string to check

    Returns:
        True if the string represents a valid date that we can recognize
        and False otherwise.  This parser is relatively clever; see
        :class:`datetimez.dateparse_utils` docs for details.

    >>> is_valid_date('1/2/2022')
    True
    >>> is_valid_date('christmas')
    True
    >>> is_valid_date('next wednesday')
    True
    >>> is_valid_date('xyzzy')
    False
    """
    import pyutils.datetimez.dateparse_utils as dp

    try:
        d = dp.DateParser()  # type: ignore
        _ = d.parse(in_str)
        return True
    except dp.ParseException:  # type: ignore
        msg = f'Unable to parse date {in_str}.'
        logger.warning(msg)
    return False
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """Parses a string into a datetime.datetime.

    Args:
        in_str: string to parse into a datetime

    Returns:
        A python datetime parsed from in_str or None to indicate
        an error.  This parser is relatively clever; see
        :class:`datetimez.dateparse_utils` docs for details.

    >>> to_datetime('7/20/1969 02:56 GMT')
    datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
    """
    import pyutils.datetimez.dateparse_utils as dp

    try:
        d = dp.DateParser()  # type: ignore
        dt = d.parse(in_str)
        # Only accept a full datetime; a bare date is not good enough.
        if isinstance(dt, datetime.datetime):
            return dt
    except dp.ParseException:  # type: ignore
        pass
    msg = f'Unable to parse datetime {in_str}.'
    logger.warning(msg)
    return None
def valid_datetime(in_str: str) -> bool:
    """Checks whether a string contains a parseable datetime.

    Args:
        in_str: the string to check

    Returns:
        True if in_str contains a valid datetime and False otherwise.
        This parser is relatively clever; see
        :class:`datetimez.dateparse_utils` docs for details.

    >>> valid_datetime('next wednesday at noon')
    True
    >>> valid_datetime('3 weeks ago at midnight')
    True
    >>> valid_datetime('next easter at 5:00 am')
    True
    >>> valid_datetime('sometime soon')
    False
    """
    if to_datetime(in_str) is not None:
        return True
    msg = f'Unable to parse datetime {in_str}.'
    logger.warning(msg)
    return False
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """Collapse runs of a repeated (sub)string down to a single copy.

    Args:
        in_str: the string to squeeze
        character_to_squeeze: the character to remove runs of
            more than one in a row (default = space)

    Returns:
        A "squeezed string" where runs of more than one
        character_to_squeeze are collapsed into one.

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'
    """
    # Escape so the squeeze target is matched literally, not as a regex.
    run_pattern = r'(' + re.escape(character_to_squeeze) + r')+'
    return re.sub(run_pattern, character_to_squeeze, in_str)
def dedent(in_str: str) -> Optional[str]:
    """Removes leading indentation from each line of in_str.

    Args:
        in_str: the string to dedent

    Returns:
        A string with tab indentation removed or None on error.

    .. note::

        Inspired by analogous Scala function.

    >>> dedent('\t\ttest\\n\t\ting')
    'test\\ning'
    """
    if not is_string(in_str):
        return None
    # Strip the margin from every line and reassemble.
    return '\n'.join(MARGIN_RE.sub('', line) for line in in_str.split('\n'))
def indent(in_str: str, amount: int) -> str:
    """Prepends leading spaces to every line of a string.

    Args:
        in_str: the string to indent
        amount: count of spaces to indent each line by

    Returns:
        An indented string created by prepending amount spaces
        to each line.

    Raises:
        ValueError: if the input is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    margin = " " * amount
    return '\n'.join(margin + line for line in in_str.split('\n'))
def sprintf(*args, **kwargs) -> str:
    """Render arguments into a string, print-style.

    This function uses the same syntax as the builtin print
    function except that, instead of emitting output, it returns
    the text that print would have produced.

    Args:
        *args: the values to render
        **kwargs: optional `sep` and `end` strings with the same
            semantics as in the builtin print.

    Returns:
        An interpolated string capturing print output, like man(3)
        sprintf.

    Raises:
        TypeError: if sep/end are not strings or if any other keyword
            argument is passed.

    >>> sprintf('hello', 'world')
    'hello world\\n'
    """
    sep = kwargs.pop("sep", None)
    if sep is not None:
        if not isinstance(sep, str):
            raise TypeError("sep must be None or a string")
    else:
        sep = " "
    end = kwargs.pop("end", None)
    if end is not None:
        if not isinstance(end, str):
            raise TypeError("end must be None or a string")
    else:
        end = "\n"
    if kwargs:
        raise TypeError("invalid keyword arguments to sprint()")
    # Stringify non-str arguments exactly as print would.
    pieces = [arg if isinstance(arg, str) else str(arg) for arg in args]
    return sep.join(pieces) + end
def strip_ansi_sequences(in_str: str) -> str:
    """Removes ANSI escape sequences from a string.

    Args:
        in_str: the string to strip

    Returns:
        in_str with recognized ANSI escape sequences removed.

    .. warning::
        This method works by using a regular expression.
        It works for all ANSI escape sequences I've tested with but
        may miss some; caveat emptor.

    >>> import ansi as a
    >>> s = a.fg('blue') + 'blue!' + a.reset()
    >>> len(s)  # '\\x1b[38;5;21mblue!\\x1b[m'
    18
    >>> len(strip_ansi_sequences(s))
    5
    >>> strip_ansi_sequences(s)
    'blue!'
    """
    # ESC '[' <params> <final lowercase letter>, e.g. '\x1b[38;5;21m'.
    ansi_pattern = r'\x1b\[[\d+;]*[a-z]'
    return re.sub(ansi_pattern, '', in_str)
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that captures outputs to stdout into a buffer
    without printing them.

    >>> with SprintfStdout() as buf:
    ...     print("1, 2, 3")
    ...
    >>> print(buf(), end='')
    1, 2, 3
    """

    def __init__(self) -> None:
        # Everything printed inside the context lands in this buffer.
        self.destination = io.StringIO()
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()
        # Hand back a thunk the caller can invoke to read the capture.
        return self.destination.getvalue

    def __exit__(self, *args) -> Literal[False]:
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Never swallow exceptions raised inside the context.
        return False
def capitalize_first_letter(in_str: str) -> str:
    """Capitalize the first letter of a string, leaving the rest alone.

    Args:
        in_str: the string to capitalize

    Returns:
        in_str with the first character capitalized; the empty string
        is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    # Bug fix: in_str[:1] is safe on the empty string whereas the
    # previous in_str[0] raised IndexError.
    return in_str[:1].upper() + in_str[1:]
def it_they(n: int) -> str:
    """Choose the English pronoun appropriate for n things.

    Args:
        n: how many of them are there?

    Returns:
        'it' if n is one or 'they' otherwise.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> it_they(1)
    'it'
    >>> it_they(2)
    'they'
    """
    return 'it' if n == 1 else 'they'
def is_are(n: int) -> str:
    """Choose the English verb form appropriate for n things.

    Args:
        n: how many of them are there?

    Returns:
        'is' if n is one or 'are' otherwise.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'
    """
    return 'is' if n == 1 else 'are'
def pluralize(n: int) -> str:
    """Choose the English plural suffix appropriate for n things.

    Args:
        n: how many of them are there?

    Returns:
        's' unless n is exactly one, in which case ''.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    """
    return '' if n == 1 else 's'
def make_contractions(txt: str) -> str:
    """This code glues words in txt together to form (English)
    contractions.

    Args:
        txt: the input text to be contractionized.

    Returns:
        Output text identical to original input except for any
        recognized contractions are formed.

    .. note::
        The order in which we create contractions is defined by the
        implementation and what I thought made more sense when writing
        this code.

    >>> make_contractions('I said you can not go.')
    "I said you can't go."
    """
    # Second-word patterns: the parenthesized letters are the regex
    # capture groups kept to the right of the apostrophe (e.g. "i(s)"
    # turns "it is" into "it's").
    ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],

    # Special cases: can't, shan't and won't.
    txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
    r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE
    r'\b(w)ill\s*(n)(o)(t)\b',
    flags=re.IGNORECASE,

    # General case: for every (first word, second word) pair in the
    # table, contract adjacent occurrences anywhere in txt.
    for first_list, second_list in first_second:
        for first in first_list:
            for second in second_list:
                # Disallow there're/where're. They're valid English
                # but sound weird.
                if (first in ('there', 'where')) and second == 'a(re)':
                pattern = fr'\b({first})\s+{second}\b'
                # "(n)o(t)" patterns keep two capture groups on the
                # right of the apostrophe; everything else keeps one.
                if second == '(n)o(t)':
                    replacement = r"\1\2'\3"
                replacement = r"\1'\2"
                txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)
def thify(n: int) -> str:
    """Return the English ordinal suffix appropriate for a number.

    Args:
        n: how many of them are there?

    Returns:
        The proper cardinal suffix for a number.

    Suggested usage::

        print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')
    """
    # Sanity check: `digit` -- presumably the string form of n; confirm
    # against its assignment -- must parse as an integer before a
    # suffix is chosen from its final character.
    assert is_integer_number(digit)
def ngrams(txt: str, n: int):
    """Generate the word-level ngrams of a string.

    Args:
        txt: the string to create ngrams using
        n: how many words per ngram created?

    Returns:
        Generates the ngrams from the input string.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']
    """
    words = txt.split()
    for ngram in ngrams_presplit(words, n):
        yield " ".join(ngram)
def ngrams_presplit(words: Sequence[str], n: int):
    """
    Same as :meth:`ngrams` but with the string pre-split.

    Args:
        words: the pre-split sequence of words to make ngrams from
        n: how many words per ngram

    Returns:
        The ngrams of the word sequence, as produced by
        :meth:`list_utils.ngrams`.
    """
    return list_utils.ngrams(words, n)
def bigrams(txt: str):
    """Generates the bigrams (n=2) of the given string.

    Args:
        txt: the string to break into bigrams

    Returns:
        The two-word chunks of txt; see :meth:`ngrams`.
    """
    return ngrams(txt, 2)
def trigrams(txt: str):
    """Generates the trigrams (n=3) of the given string.

    Args:
        txt: the string to break into trigrams

    Returns:
        The three-word chunks of txt; see :meth:`ngrams`.
    """
    return ngrams(txt, 3)
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
) -> List[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.

    Args:
        input_lines: A sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: an iterable collection of numeric sequences that
            indicate one or more column numbers to copy to form the Nth
            position in the output list.  See example below.
        delim: for column_specs that indicate we should copy more than
            one column from the input into this position, use delim to
            separate source data.  Defaults to ''.

    Returns:
        A list of string created by following the instructions set forth
        in column_specs.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim='!',
    ... )
    ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']
    """
    out: List[str] = []

    # Each spec enumerates the input column number(s) glued together
    # (with delim) to produce one slot of the output list.
    for spec in column_specs:
        hunk = ''
        for col in spec:
            hunk = hunk + delim + input_lines[col]
        out.append(hunk.strip(delim))
    return out
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim='',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: a sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: instructions for what dictionary keys to apply
            to individual or compound input column data.  See example
            below.
        delim: when forming compound output data by gluing more than
            one input column together, use this character to separate
            the source data.  Defaults to ''.

    Returns:
        A dict formed by applying the column_specs instructions.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim='!',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}
    """
    ret: Dict[str, str] = {}

    # Each spec pairs an output key with the input column number(s)
    # whose data (joined by delim) becomes that key's value.
    for key, columns in column_specs:
        hunk = ''
        for col in columns:
            hunk = hunk + delim + input_lines[col]
        ret[key] = hunk.strip(delim)
    return ret
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """
    Interpolate a string with data from a dict.

    Args:
        txt: the mad libs template
        values: what you and your kids chose for each category.

    Returns:
        txt with each {key} placeholder replaced by values[key].

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    # format() does the substitution; sprintf with end='' returns its
    # single string argument unchanged.
    return sprintf(txt.format(**values), end='')
def to_ascii(txt: str):
    """Encode a string as ASCII bytes (or pass bytes through).

    Args:
        txt: the input data to encode; str or bytes.

    Returns:
        txt encoded as an ASCII byte string.

    Raises:
        TypeError: if the input is neither str nor bytes.  (This was
            previously a bare Exception; TypeError is a subclass, so
            callers catching Exception still work.)

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(txt, str):
        return txt.encode('ascii')
    if isinstance(txt, bytes):
        # Already bytes; pass through unchanged.
        return txt
    raise TypeError('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode a string's bytes in base64.

    Args:
        txt: the input data to encode
        encoding: the text encoding used to turn txt into bytes
        errors: how encoding errors should be handled

    Returns:
        txt encoded with a 64-character alphabet.  Similar to and
        compatible with uuencode/uudecode.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    raw_bytes = txt.encode(encoding, errors)
    return base64.encodebytes(raw_bytes)
def is_base64(txt: str) -> bool:
    """Check whether a string is composed of base64 alphabet characters.

    Args:
        txt: the string to check

    Returns:
        True if txt is a valid base64 encoded string.  This assumes
        txt was encoded with Python's standard base64 alphabet (which
        is the same as what uuencode/uudecode uses).

    .. note::
        The alphabet below does not include '='; inputs carrying
        base64 padding characters are rejected.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True
    """
    legal = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = set(legal.encode('ascii'))
    # strip() forgives leading/trailing whitespace (e.g. the trailing
    # newline encodebytes emits); every remaining byte must be legal.
    return all(ch in alphabet for ch in to_ascii(txt.strip()))
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Decode base64-encoded bytes back into a python string.

    Args:
        b64: bytestring of 64-bit encoded data to decode / convert.
        encoding: the text encoding applied to the decoded bytes
        errors: how decoding errors should be handled

    Returns:
        The decoded form of b64 as a normal python string.  Similar to
        and compatible with uuencode / uudecode.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    decoded = base64.decodebytes(b64)
    return decoded.decode(encoding, errors)
def chunk(txt: str, chunk_size: int):
    """Yield evenly sized pieces of a string.

    Args:
        txt: a string to be chunked into evenly spaced pieces.
        chunk_size: the size of each chunk to make

    Returns:
        A generator of the original string chunked into evenly spaced
        pieces.  If len(txt) is not an even multiple of chunk_size a
        warning is emitted and the final chunk is short.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    if len(txt) % chunk_size != 0:
        # Bug fix: the old message had an unbalanced '(' around len(txt).
        msg = f"String to chunk's length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})"
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for x in range(0, len(txt), chunk_size):
        yield txt[x : x + chunk_size]
def to_bitstring(txt: str, *, delimiter='') -> str:
    """Convert a string's ASCII bytes into a string of '0's and '1's.

    Args:
        txt: the string to convert into a bitstring
        delimiter: character to insert between adjacent bytes.  Note that
            only bitstrings with delimiter='' are interpretable by
            :meth:`from_bitstring`.

    Returns:
        txt converted to ascii/binary and then chopped into bytes.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'
    """
    as_bytes = to_ascii(txt)
    # bin() prefixes with '0b'; slice that off before padding.
    bits = bin(int.from_bytes(as_bytes, 'big'))[2:]
    # Left-pad with zeros up to the next whole-byte boundary.
    padded = bits.zfill(8 * ((len(bits) + 7) // 8))
    return delimiter.join(chunk(padded, 8))
def is_bitstring(txt: str) -> bool:
    """Check whether a string looks like a bitstring.

    Args:
        txt: the string to check

    Returns:
        True if txt is a recognized bitstring and False otherwise.
        Note that if delimiter is non empty this code will not
        recognize the bitstring.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    # A bitstring is exactly a valid binary integer literal once '0b'
    # is prepended.
    return is_binary_integer_number(f'0b{txt}')
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert a bitstring back into a python string.

    Args:
        bits: the bitstring to convert back into a python string
        encoding: the encoding to use
        errors: how decoding errors should be handled

    Returns:
        The regular python string represented by bits.  Note that this
        code does not work with to_bitstring when delimiter is non-empty.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'
    """
    value = int(bits, 2)
    nbytes = (value.bit_length() + 7) // 8
    # An all-zero bitstring decodes to '' -- map that to a NUL char.
    return value.to_bytes(nbytes, 'big').decode(encoding, errors) or '\0'
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Chunk up an IPv4 address string for sane sorting.

    Args:
        txt: an IP address to chunk up for sorting purposes

    Returns:
        A tuple of IP components arranged such that the sorting of
        IP addresses using a normal comparator will do something sane
        and desirable, or None if txt is not a valid IPv4 address.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if not is_ip_v4(txt):
        # Consistency fix: report through the module logger (as the
        # rest of this file does) instead of a stray print to stdout.
        logger.warning(f"not IP: {txt}")
        return None
    return tuple(int(x) for x in txt.split('.'))
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Chunk up a path so ancestors sort before descendants.

    Args:
        volume: the string to chunk up for sorting purposes

    Returns:
        A tuple of volume's components such that the sorting of
        volumes using a normal comparator will do something sane
        and desirable.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    # Empty components (leading/trailing/duplicate slashes) are dropped.
    return tuple(component for component in volume.split('/') if component)
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """
    Execute several replace operations in a row.

    Args:
        in_str: the string in which to replace characters
        replace_set: the set of target characters to replace
        replacement: the character to replace any member of replace_set
            with

    Returns:
        The string with replacements executed.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    # Replacements are applied sequentially, one target char at a time.
    out = in_str
    for target in replace_set:
        out = out.replace(target, replacement)
    return out
def replace_nth(in_str: str, source: str, target: str, nth: int):
    """
    Replaces the nth occurrance of a substring within a string.

    Args:
        in_str: the string in which to run the replacement
        source: the substring to replace
        target: the replacement text
        nth: which occurrance of source to replace?

    Raises:
        IndexError: if in_str contains fewer than nth occurrances of
            source.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'
    """
    # Bug fix: escape source so regex metacharacters (e.g. '.') are
    # matched literally, agreeing with the literal str.replace below.
    where = [m.start() for m in re.finditer(re.escape(source), in_str)][nth - 1]
    before = in_str[:where]
    after = in_str[where:]
    after = after.replace(source, target, 1)
    return before + after
2389 if __name__ == '__main__':