2 # -*- coding: utf-8 -*-
4 """The MIT License (MIT)
6 Copyright (c) 2016-2020 Davide Zanotti
7 Modifications Copyright (c) 2021-2022 Scott Gasch
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
16 The above copyright notice and this permission notice shall be included in all
17 copies or substantial portions of the Software.
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 This class is based on: https://github.com/daveoncode/python-string-utils.
31 import contextlib # type: ignore
42 from itertools import zip_longest
54 from uuid import uuid4
58 logger = logging.getLogger(__name__)
# Numeric-literal recognizers (decimal with optional sign/fraction/exponent,
# and prefixed hex / octal / binary integers).
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([e|E]\d+)?|\.\d+)$")

HEX_NUMBER_RE = re.compile(r"^([+|-]?)0[x|X]([0-9A-Fa-f]+)$")

OCT_NUMBER_RE = re.compile(r"^([+|-]?)0[O|o]([0-7]+)$")

BIN_NUMBER_RE = re.compile(r"^([+|-]?)0[B|b]([0|1]+)$")

# Building blocks of a URL; combined below into whole-string and
# anywhere-in-string matchers.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)

# Matches "@" signs that are escaped or quoted inside an email local part.
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

EMAILS_RAW_STRING = r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"

EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))

EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))

CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

SNAKE_CASE_TEST_RE = re.compile(r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE)

SNAKE_CASE_TEST_DASH_RE = re.compile(r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE)

SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")

# Issuer-specific credit card number formats, keyed by card type.
CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}

JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)

UUID_RE = re.compile(r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE)

UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# "Shallow" because it only checks shape, not that octets are <= 255.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)

MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)

ANYWHERE_MAC_ADDRESS_RE = re.compile(r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE)

WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)

HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

SPACES_RE = re.compile(r"\s")

NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Leading whitespace on a line, excluding newline characters.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")

# ANSI CSI escape sequences, e.g. "\x1b[33m".  (Spelled with \x1B rather
# than a raw ESC byte so the source stays printable.)
ESCAPE_SEQUENCE_RE = re.compile(r"\x1B\[[^A-Za-z]*[A-Za-z]")

# Binary size suffixes and their multipliers (powers of 1024), used by
# suffix_string_to_number / number_to_suffix_string below.
NUM_SUFFIXES = {
    "Pb": 0x4000000000000,
    "P": 0x4000000000000,
    "Tb": 0x10000000000,
    "T": 0x10000000000,
    "Gb": 0x40000000,
    "G": 0x40000000,
    "Mb": 0x100000,
    "M": 0x100000,
    "Kb": 0x400,
    "K": 0x400,
}
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the input string is either None or an empty string
        (including a string of only whitespace), False otherwise.
    """
    return in_str is None or len(in_str.strip()) == 0
def is_string(obj: Any) -> bool:
    """
    Args:
        obj: the object to test

    Returns:
        True if the object is a string and False otherwise.
    """
    return isinstance(obj, str)
def is_empty_string(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is empty and False otherwise.
    """
    # Alias for is_empty, kept for backward compatibility.
    return is_empty(in_str)
def is_empty(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str is a string that is empty (or whitespace-only)
        and False otherwise; non-strings always return False.
    """
    return is_string(in_str) and in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """
    Args:
        in_str: the object to test

    Returns:
        True if the object is a string and is not empty ('') and
        is not only composed of whitespace; False otherwise
        (including for all non-string inputs).
    """
    return is_string(in_str) and in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string contains a valid numeric value
        (signed/unsigned, integer or decimal, optionally in
        scientific notation) and False otherwise.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return NUMBER_RE.match(in_str) is not None
def is_integer_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string contains a valid (signed or unsigned,
        decimal, hex, octal, or binary) integral expression and
        False otherwise (e.g. '42.0' is not integral).
    """
    return (
        (is_number(in_str) and "." not in in_str)
        or is_hexidecimal_integer_number(in_str)
        or is_octal_integer_number(in_str)
        or is_binary_integer_number(in_str)
    )
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is a hex integer number (requires the
        '0x'/'0X' prefix, optional sign) and False otherwise.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return HEX_NUMBER_RE.match(in_str) is not None
def is_octal_integer_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is a valid octal integral number (requires
        the '0o'/'0O' prefix, optional sign) and False otherwise.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return OCT_NUMBER_RE.match(in_str) is not None
def is_binary_integer_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string contains a binary integral number (requires
        the '0b'/'0B' prefix, optional sign) and False otherwise.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return BIN_NUMBER_RE.match(in_str) is not None
def to_int(in_str: str) -> int:
    """
    Args:
        in_str: the string to convert

    Returns:
        The integral value of the string, honoring binary ('0b'),
        octal ('0o'), and hexadecimal ('0x') prefixes; falls back
        to base 10.

    Raises:
        ValueError: if in_str is not a string or cannot be converted.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if is_binary_integer_number(in_str):
        return int(in_str, 2)
    if is_octal_integer_number(in_str):
        return int(in_str, 8)
    if is_hexidecimal_integer_number(in_str):
        return int(in_str, 16)
    return int(in_str)
def is_decimal_number(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check

    Returns:
        True if the given string represents a decimal and False
        otherwise.  A decimal may be signed or unsigned and may use
        scientific notation.

    .. note::
        Integers without a decimal point are not considered decimals;
        they return False (e.g. '42' -> False, '42.0' -> True).
    """
    return is_number(in_str) and "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    """
    Args:
        in_str: the string to strip of escape sequences.

    Returns:
        in_str with escape sequences removed.

    .. note::
        What is considered to be an "escape sequence" is defined by
        the ESCAPE_SEQUENCE_RE regular expression.  While this gets
        common ones, there may exist valid sequences it doesn't match.
    """
    in_str = ESCAPE_SEQUENCE_RE.sub("", in_str)
    return in_str
def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """
    Args:
        in_str: string or number to which to add thousands separator(s)
        separator_char: the separator character to add (defaults to comma)
        places: add a separator every N places (defaults to three)

    Returns:
        A numeric string with thousands separators added appropriately,
        e.g. '12345678' -> '12,345,678'.

    Raises:
        ValueError: if in_str is neither a number nor a numeric string.
    """
    if isinstance(in_str, numbers.Number):
        # Accept real numbers too by stringifying them first.
        in_str = f'{in_str}'
    if is_number(in_str):
        return _add_thousands_separator(in_str, separator_char=separator_char, places=places)
    raise ValueError(in_str)
500 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
503 (in_str, decimal_part) = in_str.split('.')
504 tmp = [iter(in_str[::-1])] * places
505 ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
506 if len(decimal_part) > 0:
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Args:
        in_str: the string to test
        allowed_schemes: an optional list of allowed schemes (e.g.
            ['http', 'https', 'ftp']).  If passed, only URLs that
            begin with one of the schemes passed will be considered
            to be valid.  Otherwise, any scheme:// will be considered
            valid.

    Returns:
        True if in_str contains a valid URL and False otherwise.
    """
    if not is_full_string(in_str):
        return False
    valid = URL_RE.match(in_str) is not None
    if allowed_schemes:
        return valid and any([in_str.startswith(s) for s in allowed_schemes])
    return valid
def is_email(in_str: Any) -> bool:
    """
    Args:
        in_str: the email address to check

    Returns:
        True if in_str contains a valid email (as defined by
        https://tools.ietf.org/html/rfc3696#section-3) and
        False otherwise.
    """
    if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
        return False

    try:
        # we expect 2 tokens, one before "@" and one after, otherwise
        # we have an exception and the email is not valid.
        head, tail = in_str.split("@")

        # head's size must be <= 64, tail <= 255, head must not start
        # with a dot or contain multiple consecutive dots.
        if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
            return False

        # removes escaped spaces, so that later on the test regex will
        # accept the string.
        head = head.replace("\\ ", "")
        if head.startswith('"') and head.endswith('"'):
            head = head.replace(" ", "")[1:-1]
        return EMAIL_RE.match(head + "@" + tail) is not None

    except ValueError:
        # borderline case in which we have multiple "@" signs but the
        # head part is correctly escaped.
        if ESCAPED_AT_SIGN.search(in_str) is not None:
            # replace "@" with "a" in the head
            return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
        return False
def suffix_string_to_number(in_str: str) -> Optional[int]:
    """Takes a string like "33Gb" and converts it into a number (of bytes).

    Args:
        in_str: the string with a suffix to be interpreted and removed.

    Returns:
        An integer number of bytes or None to indicate an error.
    """

    def suffix_capitalize(s: str) -> str:
        # Normalize a suffix into the "Xy" form used as NUM_SUFFIXES keys.
        if len(s) == 1:
            return s.upper()
        elif len(s) == 2:
            return f"{s[0].upper()}{s[1].lower()}"
        return suffix_capitalize(s[0:1])

    if is_string(in_str):
        if is_integer_number(in_str):
            return to_int(in_str)
        # Try two-character suffixes ("Gb") before one-character ("G").
        suffixes = [in_str[-2:], in_str[-1:]]
        rest = [in_str[:-2], in_str[:-1]]
        for x in range(len(suffixes)):
            s = suffixes[x]
            s = suffix_capitalize(s)
            multiplier = NUM_SUFFIXES.get(s, None)
            if multiplier is not None:
                r = rest[x]
                if is_integer_number(r):
                    return to_int(r) * multiplier
                if is_decimal_number(r):
                    return int(float(r) * multiplier)
    return None
def number_to_suffix_string(num: int) -> Optional[str]:
    """Take a number (of bytes) and returns a string like "43.8Gb".

    Args:
        num: an integer number of bytes

    Returns:
        A string with a suffix representing num bytes concisely or
        None to indicate an error.
    """
    d = 0.0
    suffix = None
    # NUM_SUFFIXES is ordered largest-first; take the first that fits.
    for (sfx, size) in NUM_SUFFIXES.items():
        if num >= size:
            d = num / size
            suffix = sfx
            break
    if suffix is not None:
        return f"{d:.1f}{suffix}"
    return f"{num:d}"
def is_credit_card(in_str: Any, card_type: str = None) -> bool:
    """
    Args:
        in_str: a string to check
        card_type: if provided, contains the card type to validate
            with.  Otherwise, all known credit card number types will
            be checked.  Supported card types: VISA, MASTERCARD,
            AMERICAN_EXPRESS, DINERS_CLUB, DISCOVER, JCB.

    Returns:
        True if in_str is a valid credit card number.

    Raises:
        KeyError: if card_type is provided but not a supported type.
    """
    if not is_full_string(in_str):
        return False
    if card_type is not None:
        if card_type not in CREDIT_CARDS:
            raise KeyError(
                f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
            )
        return CREDIT_CARDS[card_type].match(in_str) is not None
    for c in CREDIT_CARDS:
        if CREDIT_CARDS[c].match(in_str) is not None:
            return True
    return False
def is_camel_case(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if the string is formatted as camel case and False otherwise.
        A string is considered camel case when:

        * it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
        * it contains both lowercase and uppercase letters
        * it does not start with a number
    """
    return is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """
    Args:
        in_str: the string to test
        separator: the word separator character (default underscore)

    Returns:
        True if the string is snake case and False otherwise.  A
        string is considered snake case when:

        * it's composed only by lowercase/uppercase letters and digits
        * it contains at least one underscore (or provided separator)
        * it does not start with a number
    """
    if is_full_string(in_str):
        re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
        re_template = r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
        # Use a precompiled regex for common separators; build one on
        # the fly for anything else.
        r = re_map.get(
            separator,
            re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE),
        )
        return r.match(in_str) is not None
    return False
def is_json(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str contains valid JSON (an object or array) and
        False otherwise.
    """
    # Cheap regex pre-check before paying for a full json.loads parse.
    if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
        try:
            return isinstance(json.loads(in_str), (dict, list))
        except (TypeError, ValueError, OverflowError):
            pass
    return False
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Args:
        in_str: the string to test
        allow_hex: also accept a dashless 32-char hex representation?

    Returns:
        True if in_str contains a valid UUID and False otherwise.
    """
    # string casting is used to allow UUID itself as input data type
    s = str(in_str)
    if allow_hex:
        return UUID_HEX_OK_RE.match(s) is not None
    return UUID_RE.match(s) is not None
def is_ip_v4(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str contains a valid IPv4 address and False otherwise.
    """
    if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # checks that each entry in the ip is in the valid range (0 to 255)
    for token in in_str.split("."):
        if not 0 <= int(token) <= 255:
            return False
    return True
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string to extract an IPv4 address from.

    Returns:
        The first extracted IPv4 address from in_str or None if
        none were found or an error occurred.
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_IP_V4_RE.search(in_str)
    if m is not None:
        return m.group(0)
    return None
def is_ip_v6(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IPv6 address and False otherwise.
    """
    return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract an IPv6 address.

    Returns:
        The first IPv6 address found in in_str or None if no address
        was found or an error occurred.
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_IP_V6_RE.search(in_str)
    if m is not None:
        return m.group(0)
    return None
def is_ip(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IP address (either IPv4 or
        IPv6) and False otherwise.
    """
    return is_ip_v6(in_str) or is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract an IP address.

    Returns:
        The first IP address (IPv4 or IPv6) found in in_str or
        None to indicate none found or an error condition.
    """
    ip = extract_ip_v4(in_str)
    if ip is None:
        ip = extract_ip_v6(in_str)
    return ip
def is_mac_address(in_str: Any) -> bool:
    """
    Args:
        in_str: the string to test

    Returns:
        True if in_str is a valid MAC address (colon- or dash-
        separated, case-insensitive) and False otherwise.
    """
    return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """
    Args:
        in_str: the string from which to extract a MAC address.
        separator: the separator to normalize the result with.

    Returns:
        The first MAC address found in in_str (with ':' / '-'
        normalized to separator) or None to indicate no match
        or an error.
    """
    if not is_full_string(in_str):
        return None
    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is not None:
        mac = m.group(0)
        # Bugfix: str.replace returns a new string; the original code
        # discarded its result so `separator` had no effect.
        mac = mac.replace(":", separator).replace("-", separator)
        return mac
    return None
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Args:
        in_str: string to test
        separator: the slug's word separator (default dash)

    Returns:
        True if in_str is a slug string (lowercase alphanumerics
        joined by separator) and False otherwise.
    """
    if not is_full_string(in_str):
        return False
    rex = r"^([a-z\d]+" + re.escape(separator) + r"*?)*[a-z\d]$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check for tags in

    Returns:
        True if the given string contains HTML/XML tags and False
        otherwise.

    .. note::
        By design, this function matches ANY type of tag, so don't
        expect to use it as an HTML validator.  It's a quick sanity
        check at best.  See something like BeautifulSoup for a more
        full-featured solution.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return HTML_RE.search(in_str) is not None
def words_count(in_str: str) -> int:
    """
    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    .. note::
        This method is "smart" in that it does consider only sequences
        of one or more letter and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no
        spaces in the string).

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return len(WORDS_COUNT_RE.findall(in_str))
def word_count(in_str: str) -> int:
    """
    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.  See
        :meth:`words_count` (for which this is an alias) for details
        of what counts as a "word".
    """
    return words_count(in_str)
def generate_uuid(omit_dashes: bool = False) -> str:
    """
    Args:
        omit_dashes: should we omit the dashes in the generated UUID?

    Returns:
        A generated UUID string (using `uuid.uuid4()`) with or without
        dashes per the omit_dashes arg, e.g.
        '97e3a716-6b33-4ab9-9bb1-8128cb24d76b' or
        '97e3a7166b334ab99bb18128cb24d76b'.
    """
    uid = uuid4()
    if omit_dashes:
        return uid.hex
    return str(uid)
def generate_random_alphanumeric_string(size: int) -> str:
    """
    Args:
        size: number of characters to generate

    Returns:
        A string of the specified size containing random characters
        (uppercase/lowercase ascii letters and digits).

    Raises:
        ValueError: if size < 1.

    .. note::
        Uses the `random` module; not suitable for security-sensitive
        tokens (use `secrets` for those).
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    chars = string.ascii_letters + string.digits
    return "".join(random.choice(chars) for _ in range(size))
def reverse(in_str: str) -> str:
    """
    Args:
        in_str: the string to reverse

    Returns:
        The reversed (character by character) string.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return in_str[::-1]
def camel_case_to_snake_case(in_str, *, separator="_"):
    """
    Args:
        in_str: the camel case string to convert
        separator: the snake case separator to use (default underscore)

    Returns:
        A snake case string equivalent to the camel case input
        (e.g. 'MacAddressExtractorFactory' ->
        'mac_address_extractor_factory') or the original string if it
        is not a valid camel case string.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str
    return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, in_str).lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Args:
        in_str: the snake case string to convert
        upper_case_first: capitalize the first word too?
        separator: the snake case separator (default underscore)

    Returns:
        A camel case string that is equivalent to the snake case string
        provided (e.g. 'this_is_a_test' -> 'ThisIsATest') or the
        original string back again if it is not valid snake case.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str
    tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
    if not upper_case_first:
        tokens[0] = tokens[0].lower()
    return from_char_list(tokens)
def to_char_list(in_str: str) -> List[str]:
    """
    Args:
        in_str: the string to split into a char list

    Returns:
        A list of strings of length one each, e.g.
        'test' -> ['t', 'e', 's', 't'].

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return list(in_str)
def from_char_list(in_list: List[str]) -> str:
    """
    Args:
        in_list: A list of characters to convert into a string.

    Returns:
        The string resulting from gluing the characters in in_list
        together, e.g. ['t', 'e', 's', 't'] -> 'test'.
    """
    return "".join(in_list)
def shuffle(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing same chars of the given one but in
        a randomized order.  Note that in rare cases this could result
        in the same original string as no check is done.  Returns
        None to indicate error conditions (non-string input).
    """
    if not is_string(in_str):
        return None
    chars = to_char_list(in_str)
    random.shuffle(chars)
    return from_char_list(chars)
def scramble(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing same chars of the given one but in
        a randomized order (alias of :meth:`shuffle`).  Returns
        None to indicate error conditions.
    """
    return shuffle(in_str)
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Args:
        in_str: the string to strip tags from
        keep_tag_content: should we keep the inner contents of tags?

    Returns:
        A string with all HTML tags removed (optionally with tag
        contents preserved).

    .. note::
        This method uses simple regular expressions to strip tags and
        is not a full fledged HTML parser by any means.  Consider using
        something like BeautifulSoup if your needs are more than this
        simple code can fulfill.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # HTML_TAG_ONLY_RE removes just the tags; HTML_RE removes the tags
    # along with their enclosed content.
    r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
    return r.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Args:
        in_str: the string to asciify.

    Returns:
        An output string roughly equivalent to the original string
        where all content is ascii-only.  This is accomplished
        by translating all non-ascii chars into their closest possible
        ASCII representation (eg: ó -> o, Ë -> E, ç -> c...).

    .. warning::
        Some chars may be lost if impossible to translate.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # "NFKD" is the algorithm which is able to successfully translate
    # the most of non-ascii chars.
    normalized = unicodedata.normalize("NFKD", in_str)

    # encode string forcing ascii and ignore any errors
    # (unrepresentable chars will be stripped out)
    ascii_bytes = normalized.encode("ascii", "ignore")

    # turns encoded bytes into an utf-8 string
    return ascii_bytes.decode("utf-8")
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Args:
        in_str: the string to slugify
        separator: the character to use during slugification (default
            dash)

    Returns:
        The converted string.  The returned string has the following
        properties:

        * all letters are in lower case
        * all punctuation signs and non alphanumeric chars are removed
        * words are divided using provided separator
        * all chars are encoded as ascii (by using :meth:`asciify`)

        e.g. 'Top 10 Reasons To Love Dogs!!!' ->
        'top-10-reasons-to-love-dogs'

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # replace spaces with join sign
    out = SPACES_RE.sub(separator, out)

    # normalize joins (remove duplicates)
    out = re.sub(re.escape(separator) + r"+", separator, out)
    return asciify(out)
def to_bool(in_str: str) -> bool:
    """
    Args:
        in_str: the string to convert to boolean

    Returns:
        A boolean equivalent of the original string based on its
        contents.  All conversion is case insensitive.  A positive
        boolean (True) is returned if the string value is any of
        "true", "1", "yes", "y", "t", or "on"; otherwise False is
        returned.

    Raises:
        ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return in_str.lower() in ("true", "1", "yes", "y", "t", "on")
def to_date(in_str: str) -> Optional[datetime.date]:
    """
    Args:
        in_str: the string to convert into a date

    Returns:
        The datetime.date the string contained or None to indicate
        an error.  This parser is relatively clever; see
        :class:`python_modules.dateparse.dateparse_utils` docs for
        details.
    """
    # Imported locally to avoid a circular / heavyweight import at
    # module load time.
    import dateparse.dateparse_utils as du

    try:
        d = du.DateParser()  # type: ignore
        d.parse(in_str)
        return d.get_date()
    except du.ParseException:  # type: ignore
        msg = f'Unable to parse date {in_str}.'
        logger.warning(msg)
    return None
def extract_date(in_str: Any) -> Optional[datetime.datetime]:
    """Finds and extracts a date from the string, if possible.

    Args:
        in_str: the string to extract a date from

    Returns:
        a datetime if a date was found, otherwise None.
    """
    import itertools

    import dateparse.dateparse_utils as du

    d = du.DateParser()  # type: ignore
    chunks = in_str.split()
    # Try progressively shorter word n-grams until one parses as a date.
    for ngram in itertools.chain(
        list_utils.ngrams(chunks, 5),
        list_utils.ngrams(chunks, 4),
        list_utils.ngrams(chunks, 3),
        list_utils.ngrams(chunks, 2),
        list_utils.ngrams(chunks, 1),
    ):
        try:
            expr = " ".join(ngram)
            logger.debug(f"Trying {expr}")
            if d.parse(expr) is not None:
                return d.get_datetime()
        except du.ParseException:  # type: ignore
            pass
    return None
def is_valid_date(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check

    Returns:
        True if the string represents a valid date that we can
        recognize and False otherwise.  This parser is relatively
        clever; see :class:`python_modules.dateparse.dateparse_utils`
        docs for details.
    """
    import dateparse.dateparse_utils as dp

    try:
        d = dp.DateParser()  # type: ignore
        _ = d.parse(in_str)
        return True
    except dp.ParseException:  # type: ignore
        msg = f'Unable to parse date {in_str}.'
        logger.warning(msg)
    return False
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """
    Args:
        in_str: string to parse into a datetime

    Returns:
        A python datetime parsed from in_str or None to indicate
        an error.  This parser is relatively clever; see
        :class:`python_modules.dateparse.dateparse_utils` docs for
        details.
    """
    import dateparse.dateparse_utils as dp

    try:
        d = dp.DateParser()  # type: ignore
        dt = d.parse(in_str)
        if isinstance(dt, datetime.datetime):
            return dt
    except ValueError:
        msg = f'Unable to parse datetime {in_str}.'
        logger.warning(msg)
    return None
def valid_datetime(in_str: str) -> bool:
    """Checks whether a string parses as a datetime.

    Args:
        in_str: the string to check

    Returns:
        True if in_str contains a valid datetime and False otherwise.
        This parser is relatively clever; see
        :class:`python_modules.dateparse.dateparse_utils` docs for
        details.

    >>> valid_datetime('next wednesday at noon')
    True
    >>> valid_datetime('3 weeks ago at midnight')
    True
    >>> valid_datetime('next easter at 5:00 am')
    True
    >>> valid_datetime('sometime soon')
    False
    """
    parsed = to_datetime(in_str)
    if parsed is not None:
        return True
    logger.warning(f'Unable to parse datetime {in_str}.')
    return False
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """Collapse runs of a repeated (sub)string down to a single copy.

    Args:
        in_str: the string to squeeze
        character_to_squeeze: the character to remove runs of
            more than one in a row (default = space)

    Returns:
        A "squeezed string" where runs of more than one
        character_to_squeeze are collapsed into one.

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'
    """
    # re.escape lets the caller pass regex metacharacters safely.
    run_pattern = '(' + re.escape(character_to_squeeze) + ')+'
    return re.sub(run_pattern, character_to_squeeze, in_str)
def dedent(in_str: str) -> Optional[str]:
    """Removes leading indentation (per MARGIN_RE) from each line.

    Args:
        in_str: the string to dedent

    Returns:
        A string with tab indentation removed or None on error.

    .. note::

        Inspired by analogous Scala function.

    >>> dedent('\t\ttest\\n\t\ting')
    'test\\ning'
    """
    if not is_string(in_str):
        return None
    dedented = [MARGIN_RE.sub('', one_line) for one_line in in_str.split('\n')]
    return '\n'.join(dedented)
def indent(in_str: str, amount: int) -> str:
    """Indents every line of a string by a number of spaces.

    Args:
        in_str: the string to indent
        amount: count of spaces to indent each line by

    Returns:
        An indented string created by prepending amount spaces
        to every line (including empty ones).

    Raises:
        ValueError: if in_str is not a string.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    prefix = " " * amount
    return '\n'.join(prefix + one_line for one_line in in_str.split('\n'))
def sprintf(*args, **kwargs) -> str:
    """Format values print()-style, returning the result as a string.

    This function uses the same syntax as the builtin print
    function but, instead of printing, returns the interpolated
    string.

    Args:
        args: the positional values to render, separated by sep
        kwargs: only ``sep`` and ``end`` are honored, with the same
            semantics as print's (defaults " " and "\\n").

    Returns:
        An interpolated string capturing print output, like man(3)
        sprintf.

    Raises:
        TypeError: if sep/end are not strings or an unrecognized
            keyword argument is passed.
    """
    ret = ""

    sep = kwargs.pop("sep", None)
    if sep is not None:
        if not isinstance(sep, str):
            raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None:
        if not isinstance(end, str):
            raise TypeError("end must be None or a string")

    if kwargs:
        # Bugfix: the message used to say "sprint()"; this function is
        # named sprintf.
        raise TypeError("invalid keyword arguments to sprintf()")

    # Mirror print()'s defaults.
    if sep is None:
        sep = " "
    if end is None:
        end = "\n"

    for i, arg in enumerate(args):
        if i:
            ret += sep
        if isinstance(arg, str):
            ret += arg
        else:
            ret += str(arg)
    ret += end
    return ret
def strip_ansi_sequences(in_str: str) -> str:
    """Removes ANSI CSI escape sequences from a string.

    Args:
        in_str: the string to strip

    Returns:
        in_str with recognized ANSI escape sequences removed.

    .. warning::
        This method works by using a regular expression.
        It works for all ANSI escape sequences I've tested with but
        may miss some; caveat emptor.

    >>> import ansi as a
    >>> s = a.fg('blue') + 'blue!' + a.reset()
    >>> len(s)            # '\x1b[38;5;21mblue!\x1b[m'
    18
    >>> len(strip_ansi_sequences(s))
    5
    >>> strip_ansi_sequences(s)
    'blue!'
    """
    # CSI sequences are ESC '[' followed by parameter bytes (digits
    # and ';') and a single letter terminator.  The previous pattern,
    # r'\x1b\[[\d+;]*[a-z]', had a stray literal '+' in the character
    # class and missed uppercase terminators such as 'J' (clear
    # screen) and 'K' (erase line).
    return re.sub(r'\x1b\[[\d;]*[A-Za-z]', '', in_str)
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that captures outputs to stdout to a buffer
    without printing them.

    >>> with SprintfStdout() as buf:
    ...     print("1, 2, 3")
    ...
    >>> print(buf(), end='')
    1, 2, 3

    """

    def __init__(self) -> None:
        # Collects everything written to stdout inside the context.
        self.destination = io.StringIO()
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()
        # Return a callable rather than the buffer itself so callers
        # can fetch the captured text even after the context closes.
        return lambda: self.destination.getvalue()

    def __exit__(self, *args) -> Literal[False]:
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        return False  # never swallow exceptions
def capitalize_first_letter(in_str: str) -> str:
    """Capitalizes the first letter of a string.

    Args:
        in_str: the string to capitalize

    Returns:
        in_str with the first character capitalized; the empty
        string is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    """
    # Guard against IndexError on the empty string.
    if not in_str:
        return in_str
    return in_str[0].upper() + in_str[1:]
def it_they(n: int) -> str:
    """Choose the right pronoun for a count.

    Args:
        n: how many of them are there?

    Returns:
        'it' if n is one or 'they' otherwize.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')
    """
    return "it" if n == 1 else "they"
def is_are(n: int) -> str:
    """Choose the right verb form for a count.

    Args:
        n: how many of them are there?

    Returns:
        'is' if n is one or 'are' otherwize.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')
    """
    return "is" if n == 1 else "are"
def pluralize(n: int) -> str:
    """Produce a plural suffix for a count.

    Args:
        n: how many of them are there?

    Returns:
        's' unless n is exactly one, in which case ''.

    .. note::
        NOTE(review): zero is treated here as plural ('0 files'),
        matching English usage; the docstring in the original said
        "greater than one" -- confirm the n == 0 case against the
        original implementation.

    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    """
    return '' if n == 1 else 's'
def make_contractions(txt: str) -> str:
    """This code glues words in txt together to form (English)
    contractions.

    Args:
        txt: the input text to be contractionized.

    Returns:
        Output text identical to original input except for any
        recognized contractions are formed.

    .. note::
        The order in which we create contractions is defined by the
        implementation and what I thought made more sense when
        writing this code.

    >>> make_contractions('I said you can not go.')
    "I said you can't go."
    """
    # Table of (first-word list, second-word list) pairs.  The
    # parenthesized letters in a second word are what survive after
    # the apostrophe: 'i(s)' turns "it is" into "it's".
    first_second = [
        # NOTE(review): the earlier rows of this table were elided in
        # the listing under review; only the final second-word list is
        # visible.  The subject list below is a placeholder to keep
        # the structure parseable -- restore from the original source.
        (
            ['i', 'you', 'he', 'she', 'it', 'we', 'they'],
            ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
        ),
    ]

    # Special cases: can't, shan't and won't.  These drop or reorder
    # letters, so the generic rule below cannot produce them.
    txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
    txt = re.sub(r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE)
    txt = re.sub(
        r'\b(w)ill\s*(n)(o)(t)\b',
        # won't = w + o + n + ' + t
        # NOTE(review): this replacement line was elided in the
        # listing under review; confirm against the original.
        r"\1\3\2'\4",
        txt,
        count=0,
        flags=re.IGNORECASE,
    )

    for first_list, second_list in first_second:
        for first in first_list:
            for second in second_list:
                # Disallow there're/where're. They're valid English
                # but sound weird.
                if (first in ('there', 'where')) and second == 'a(re)':
                    continue

                pattern = fr'\b({first})\s+{second}\b'
                if second == '(n)o(t)':
                    replacement = r"\1\2'\3"
                else:
                    replacement = r"\1'\2"
                txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)
    return txt
def thify(n: int) -> str:
    """Produce the ordinal suffix for a number.

    Args:
        n: how many of them are there?

    Returns:
        The proper cardinal suffix for a number, e.g. 'st' for 1,
        'nd' for 2, 'th' for 12.

    Suggested usage::

        attempt_count = 3
        print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')
    """
    digit = str(n)
    assert is_integer_number(digit)
    # Bugfix: 11, 12 and 13 are irregular -- deciding by the last
    # digit alone would yield '11st', '12nd' and '13rd'.
    if digit[-2:] in ('11', '12', '13'):
        return 'th'
    last = digit[-1:]
    if last == '1':
        return 'st'
    if last == '2':
        return 'nd'
    if last == '3':
        return 'rd'
    return 'th'
def ngrams(txt: str, n: int):
    """Generates the word n-grams of a string, as strings.

    Args:
        txt: the string to create ngrams using
        n: how many words per ngram created?

    Returns:
        Generates the ngrams from the input string.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']
    """
    words = txt.split()
    for gram in ngrams_presplit(words, n):
        yield ' '.join(gram)
def ngrams_presplit(words: Sequence[str], n: int):
    """Same as :meth:`ngrams` but with the string pre-split.

    Args:
        words: the pre-split sequence of words
        n: how many words per ngram?

    Returns:
        The n-grams of words, as produced by list_utils.ngrams.
    """
    return list_utils.ngrams(words, n)
def bigrams(txt: str):
    """Generates the bigrams (n=2) of the given string.

    Convenience wrapper around :meth:`ngrams` with n fixed at 2.
    """
    return ngrams(txt, 2)
def trigrams(txt: str):
    """Generates the trigrams (n=3) of the given string.

    Convenience wrapper around :meth:`ngrams` with n fixed at 3.
    """
    return ngrams(txt, 3)
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
) -> List[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.

    Args:
        input_lines: A sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: an iterable collection of numeric sequences that
            indicate one or more column numbers to copy to form the Nth
            position in the output list.  See example below.
        delim: for column_specs that indicate we should copy more than
            one column from the input into this position, use delim to
            separate source data.  Defaults to ''.

    Returns:
        A list of string created by following the instructions set forth
        in column_specs.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim='!',
    ... )
    ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']
    """
    results = []

    # Each spec names one or more input columns whose contents are
    # glued together (separated by delim) to form one output slot.
    for spec in column_specs:
        glued = ''
        for col in spec:
            glued += delim + input_lines[col]
        results.append(glued.strip(delim))
    return results
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim='',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: a sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: instructions for what dictionary keys to apply
            to individual or compound input column data.  See example
            below.
        delim: when forming compound output data by gluing more than
            one input column together, use this character to separate
            the source data.  Defaults to ''.

    Returns:
        A dict formed by applying the column_specs instructions.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim='!',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}
    """
    results: Dict[str, str] = {}

    # Each spec is ("key", [col1, col2...]): the named input columns
    # are glued together (separated by delim) under that key.
    for key, cols in column_specs:
        glued = ''
        for col in cols:
            glued += delim + input_lines[col]
        results[key] = glued.strip(delim)
    return results
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """
    Interpolate a string with data from a dict.

    Args:
        txt: the mad libs template
        values: what you and your kids chose for each category.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    # str.format already yields the finished string; routing it
    # through sprintf(..., end='') was a no-op indirection.
    return txt.format(**values)
def to_ascii(txt: str):
    """Encode a string (or pass through bytes) as ASCII.

    Args:
        txt: the input data to encode

    Returns:
        txt encoded as an ASCII byte string.

    Raises:
        TypeError: if txt is neither str nor bytes.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(txt, str):
        return txt.encode('ascii')
    if isinstance(txt, bytes):
        return txt
    # TypeError is more precise than a bare Exception and is still
    # caught by any pre-existing `except Exception` handlers.
    raise TypeError('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode a string as base64 bytes.

    Args:
        txt: the input data to encode

    Returns:
        txt encoded with a 64-chracter alphabet.  Similar to and compatible
        with uuencode/uudecode.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
def is_base64(txt: str) -> bool:
    """Check whether a string is plausibly base64 encoded.

    Args:
        txt: the string to check

    Returns:
        True if txt is a valid base64 encoded string.  This assumes
        txt was encoded with Python's standard base64 alphabet which
        is the same as what uuencode/uudecode uses).

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    """
    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = set(a.encode('ascii'))
    # Bugfix: '=' is legal base64 (trailing padding) and used to be
    # rejected.  (Its position within the string is not validated.)
    alphabet.add(ord('='))
    for char in to_ascii(txt.strip()):
        if char not in alphabet:
            return False
    return True
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Decode base64 bytes back into a python string.

    Args:
        b64: bytestring of 64-bit encoded data to decode / convert.

    Returns:
        The decoded form of b64 as a normal python string.  Similar to
        and compatible with uuencode / uudecode.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    decoded = base64.decodebytes(b64)
    return decoded.decode(encoding, errors)
def chunk(txt: str, chunk_size: int):
    """Chunks a string into evenly spaced pieces.

    Args:
        txt: a string to be chunked into evenly spaced pieces.
        chunk_size: the size of each chunk to make

    Returns:
        The original string chunked into evenly spaced pieces.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    if len(txt) % chunk_size != 0:
        # Warn but carry on; the final yielded chunk will be short.
        # Bugfix: the message had an unbalanced paren after len(txt).
        msg = f'String to chunk\'s length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for x in range(0, len(txt), chunk_size):
        yield txt[x : x + chunk_size]
def to_bitstring(txt: str, *, delimiter='') -> str:
    """Renders a string's bytes as a string of '0's and '1's.

    Args:
        txt: the string to convert into a bitstring
        delimiter: character to insert between adjacent bytes.  Note that
            only bitstrings with delimiter='' are interpretable by
            :meth:`from_bitstring`.

    Returns:
        txt converted to ascii/binary and then chopped into bytes.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'
    """
    as_bytes = to_ascii(txt)
    # bin() prepends '0b'; strip it off.
    bits = bin(int.from_bytes(as_bytes, 'big'))[2:]
    # Left-pad to a whole number of 8-bit bytes.
    padded = bits.zfill(8 * ((len(bits) + 7) // 8))
    return delimiter.join(chunk(padded, 8))
def is_bitstring(txt: str) -> bool:
    """Checks whether a string looks like a bitstring of '0's and '1's.

    Args:
        txt: the string to check

    Returns:
        True if txt is a recognized bitstring and False otherwise.
        Note that if delimiter is non empty this code will not
        recognize the bitstring.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    candidate = f'0b{txt}'
    return is_binary_integer_number(candidate)
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Decodes a bitstring back into a python string.

    Args:
        bits: the bitstring to convert back into a python string
        encoding: the encoding to use

    Returns:
        The regular python string represented by bits.  Note that this
        code does not work with to_bitstring when delimiter is non-empty.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'
    """
    value = int(bits, 2)
    num_bytes = (value.bit_length() + 7) // 8
    # An all-zero bitstring decodes to '' which we map to NUL.
    return value.to_bytes(num_bytes, 'big').decode(encoding, errors) or '\0'
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Chunk up an IPv4 address so that addresses sort numerically.

    Args:
        txt: an IP address to chunk up for sorting purposes

    Returns:
        A tuple of IP components arranged such that the sorting of
        IP addresses using a normal comparator will do something sane
        and desireable, or None if txt is not an IPv4 address.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if not is_ip_v4(txt):
        # Bugfix: library code shouldn't print() to stdout; log the
        # rejection instead and return None explicitly.
        logger.warning("not IP: %s", txt)
        return None
    return tuple(int(x) for x in txt.split('.'))
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Chunk up a filesystem path so ancestors sort before descendants.

    Args:
        volume: the string to chunk up for sorting purposes

    Returns:
        A tuple of volume's components such that the sorting of
        volumes using a normal comparator will do something sane
        and desireable.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    components = volume.split('/')
    # Empty components (leading/trailing/duplicate slashes) are dropped.
    return tuple(part for part in components if part)
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """
    Execute several replace operations in a row.

    Args:
        in_str: the string in which to replace characters
        replace_set: the set of target characters to replace
        replacement: the character to replace any member of replace_set
            with

    Returns:
        The string with replacements executed.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    result = in_str
    # One pass per character in replace_set; each is swapped for
    # replacement wherever it appears.
    for victim in replace_set:
        result = result.replace(victim, replacement)
    return result
def replace_nth(in_str: str, source: str, target: str, nth: int):
    """
    Replaces the nth occurrance of a substring within a string.

    Args:
        in_str: the string in which to run the replacement
        source: the substring to replace
        target: the replacement text
        nth: which occurrance of source to replace?

    Raises:
        IndexError: if source occurs fewer than nth times in in_str.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'
    """
    # Bugfix: re.escape treats source as a literal string; without it
    # regex metacharacters in source (e.g. '.') matched the wrong
    # positions.
    where = [m.start() for m in re.finditer(re.escape(source), in_str)][nth - 1]
    before = in_str[:where]
    after = in_str[where:]
    after = after.replace(source, target, 1)
    return before + after
2292 if __name__ == '__main__':