2 # -*- coding: utf-8 -*-
4 """The MIT License (MIT)
6 Copyright (c) 2016-2020 Davide Zanotti
8 Modifications Copyright (c) 2021-2022 Scott Gasch
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
17 The above copyright notice and this permission notice shall be included in all
18 copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 This class is based on:
29 https://github.com/daveoncode/python-string-utils. See `NOTICE
30 <https://wannabe.guru.org/gitweb/?p=pyutils.git;a=blob_plain;f=NOTICE;hb=HEAD>`__
31 in the root of this module for a detailed enumeration of what work is
32 Davide's and what work was added by Scott.
37 import contextlib # type: ignore
48 from itertools import zip_longest
61 from uuid import uuid4
63 from pyutils import list_utils
65 logger = logging.getLogger(__name__)
# NOTE: inside a character class "|" is a literal pipe, not alternation, so
# classes such as [e|E] or [+|-] would also accept "|".  The classes below
# list only the characters that are actually valid.

# Signed decimal number with optional fraction and (unsigned) exponent.
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Signed hexadecimal integer literal, e.g. "-0xFF".
HEX_NUMBER_RE = re.compile(r"^([+\-]?)0[xX]([0-9A-Fa-f]+)$")

# Signed octal integer literal, e.g. "0o777".
OCT_NUMBER_RE = re.compile(r"^([+\-]?)0[Oo]([0-7]+)$")

# Signed binary integer literal, e.g. "0b1011".
BIN_NUMBER_RE = re.compile(r"^([+\-]?)0[Bb]([01]+)$")
76 r"([a-z-]+://)" # scheme
77 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
79 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
80 r"(:\d{2,})?" # port number
81 r"(/[a-z\d_%+-]*)*" # folders
82 r"(\.[a-z\d_%+-]+)*" # file extension
83 r"(\?[a-z\d_+%-=]*)?" # query string
87 URL_RE = re.compile(rf"^{URLS_RAW_STRING}$", re.IGNORECASE)
89 URLS_RE = re.compile(rf"({URLS_RAW_STRING})", re.IGNORECASE)
91 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
94 r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
97 EMAIL_RE = re.compile(rf"^{EMAILS_RAW_STRING}$")
99 EMAILS_RE = re.compile(rf"({EMAILS_RAW_STRING})")
101 CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")
103 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
105 SNAKE_CASE_TEST_RE = re.compile(
106 r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
109 SNAKE_CASE_TEST_DASH_RE = re.compile(
110 r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
113 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
115 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
118 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
119 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
120 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
121 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
122 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
123 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
126 JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)
128 UUID_RE = re.compile(
129 r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
132 UUID_HEX_OK_RE = re.compile(
133 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
137 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
139 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
# Eight colon-separated groups of up to four *hexadecimal* digits.  Only
# a-f (case-insensitive) and 0-9 are legal in an IPv6 group.
IP_V6_RE = re.compile(r"^([a-f\d]{0,4}:){7}[a-f\d]{0,4}$", re.IGNORECASE)

# Same shape as IP_V6_RE but unanchored, for locating an address in free text.
ANYWHERE_IP_V6_RE = re.compile(r"([a-f\d]{0,4}:){7}[a-f\d]{0,4}", re.IGNORECASE)
145 MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
147 ANYWHERE_MAC_ADDRESS_RE = re.compile(
148 r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
151 WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)
153 HTML_RE = re.compile(
154 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
155 re.IGNORECASE | re.MULTILINE | re.DOTALL,
158 HTML_TAG_ONLY_RE = re.compile(
159 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
160 re.IGNORECASE | re.MULTILINE | re.DOTALL,
163 SPACES_RE = re.compile(r"\s")
165 NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)
167 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
169 ESCAPE_SEQUENCE_RE = re.compile(r"\x1B\[[^A-Za-z]*[A-Za-z]")
230 NUM_WORDS["and"] = (1, 0)
231 for i, word in enumerate(UNIT_WORDS):
232 NUM_WORDS[word] = (1, i)
233 for i, word in enumerate(TENS_WORDS):
234 NUM_WORDS[word] = (1, i * 10)
235 for i, word in enumerate(MAGNITUDE_SCALES):
237 NUM_WORDS[word] = (100, 0)
239 NUM_WORDS[word] = (10 ** (i * 3), 0)
240 NUM_WORDS['score'] = (20, 0)
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """Check whether a string is missing or blank.

    Args:
        in_str: the string to test

    Returns:
        True if the input is None or holds nothing but whitespace (the
        empty string included), False otherwise.

    See also :meth:`is_string` and :meth:`is_empty_string`.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty("   \t   ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    # Whatever survives strip() is real content.
    return not in_str.strip()
def is_string(in_str: Any) -> bool:
    """Check whether an object is a ``str``.

    Args:
        in_str: the object to test

    Returns:
        True if the object is a string and False otherwise.  Note that
        ``bytes`` objects are not strings.

    See also :meth:`is_empty_string`, :meth:`is_none_or_empty`.

    >>> is_string('test')
    True
    >>> is_string(b'test')
    False
    >>> is_string(100.5)
    False
    >>> is_string([1, 2, 3])
    False
    """
    verdict = isinstance(in_str, str)
    return verdict
def is_empty_string(in_str: Any) -> bool:
    """Check whether an object is a blank string.

    Args:
        in_str: the string to test

    Returns:
        True if in_str is a string that is empty or contains only
        whitespace, and False otherwise (including for non-strings).

    See also :meth:`is_none_or_empty`, :meth:`is_full_string`.

    >>> is_empty_string('')
    True
    >>> is_empty_string('test')
    False
    """
    if not isinstance(in_str, str):
        return False
    return in_str.strip() == ""
def is_empty(in_str: Any) -> bool:
    """Check whether an object is a blank string.

    Args:
        in_str: the string to test

    Returns:
        True if in_str is a string that is empty or made up solely of
        whitespace, and False otherwise.  Non-string objects are never
        considered empty.

    See also :meth:`is_none_or_empty`, :meth:`is_full_string`.

    >>> is_empty('')
    True
    >>> is_empty('    \t\t    ')
    True
    >>> is_empty('test')
    False
    >>> is_empty(100.5)
    False
    >>> is_empty([1, 2, 3])
    False
    """
    if not isinstance(in_str, str):
        return False
    stripped = in_str.strip()
    return stripped == ""
def is_full_string(in_str: Any) -> bool:
    """Check whether an object is a string with real content.

    Args:
        in_str: the object to test

    Returns:
        True if the object is a string that is neither empty ('') nor
        composed only of whitespace; False otherwise.

    See also :meth:`is_string`, :meth:`is_empty_string`, :meth:`is_none_or_empty`.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string('      ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    if not isinstance(in_str, str):
        return False
    stripped = in_str.strip()
    return stripped != ""
def is_number(in_str: str) -> bool:
    """Check whether a string holds a number.

    Args:
        in_str: the string to test

    Returns:
        True if the string contains a valid numeric value (as defined
        by NUMBER_RE) and False otherwise.

    Raises:
        TypeError: the input argument isn't a string

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
    etc...

    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number(99)
    Traceback (most recent call last):
    ...
    TypeError: 99
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    TypeError: [1, 2, 3]
    """
    if is_string(in_str):
        matched = NUMBER_RE.match(in_str)
        return matched is not None
    raise TypeError(in_str)
def is_integer_number(in_str: str) -> bool:
    """Check whether a string holds an integral number.

    Args:
        in_str: the string to test

    Returns:
        True if the string contains a valid (signed or unsigned,
        decimal, hex, octal or binary, regular or scientific) integral
        expression and False otherwise.

    Raises:
        TypeError: the input argument isn't a string

    See also :meth:`is_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
    etc...

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    # A plain number without a decimal point is integral...
    if is_number(in_str) and "." not in in_str:
        return True
    # ...and so is any prefixed hex / octal / binary literal.
    return (
        is_hexidecimal_integer_number(in_str)
        or is_octal_integer_number(in_str)
        or is_binary_integer_number(in_str)
    )
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """Check whether a string holds a ``0x``-prefixed hexadecimal integer.

    Args:
        in_str: the string to test

    Returns:
        True if the string is a hex integer number and False otherwise.

    Raises:
        TypeError: the input argument isn't a string

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_octal_integer_number`, :meth:`is_binary_integer_number`, etc...

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('0x1A3E')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number('test')
    False
    >>> is_hexidecimal_integer_number(12345)  # Not a string
    Traceback (most recent call last):
    ...
    TypeError: 12345
    >>> is_hexidecimal_integer_number(0x1A3E)
    Traceback (most recent call last):
    ...
    TypeError: 6718
    """
    if is_string(in_str):
        matched = HEX_NUMBER_RE.match(in_str)
        return matched is not None
    raise TypeError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """Check whether a string holds a ``0o``-prefixed octal integer.

    Args:
        in_str: the string to test

    Returns:
        True if the string is a valid octal integral number and False otherwise.

    Raises:
        TypeError: the input argument isn't a string

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_binary_integer_number`,
    etc...

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    >>> is_octal_integer_number('test')
    False
    """
    if is_string(in_str):
        matched = OCT_NUMBER_RE.match(in_str)
        return matched is not None
    raise TypeError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """Check whether a string holds a ``0b``-prefixed binary integer.

    Args:
        in_str: the string to test

    Returns:
        True if the string contains a binary integral number and False otherwise.

    Raises:
        TypeError: the input argument isn't a string

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
    etc...

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0B10101')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    >>> is_binary_integer_number('test')
    False
    """
    if is_string(in_str):
        matched = BIN_NUMBER_RE.match(in_str)
        return matched is not None
    raise TypeError(in_str)
def to_int(in_str: str) -> int:
    """Convert a string holding an integer literal into an int.

    Args:
        in_str: the string to convert

    Returns:
        The integral value of the string.  Binary (``0b``), octal (``0o``)
        and hexadecimal (``0x``) prefixes are honored; anything else is
        parsed as base 10.

    Raises:
        TypeError: the input argument isn't a string
        ValueError: the string cannot be parsed as an integer

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
    :meth:`is_binary_integer_number`, etc...

    >>> to_int('1234')
    1234
    >>> to_int('0x1234')
    4660
    >>> to_int('0b01101')
    13
    >>> to_int('0o777')
    511
    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'
    >>> to_int(123)
    Traceback (most recent call last):
    ...
    TypeError: 123
    """
    if not is_string(in_str):
        raise TypeError(in_str)

    # Probe the prefixed notations in order; the first one that
    # recognizes the text decides which base int() is given.
    notations = (
        (is_binary_integer_number, 2),
        (is_octal_integer_number, 8),
        (is_hexidecimal_integer_number, 16),
    )
    for recognizes, base in notations:
        if recognizes(in_str):
            return int(in_str, base)
    return int(in_str)
562 def number_string_to_integer(in_str: str) -> int:
563 """Convert a string containing a written-out number into an int.
566 in_str: the string containing the long-hand written out integer number
567 in English. See examples below.
570 The integer whose value was parsed from in_str.
573 ValueError: unable to parse a chunk of the number string
575 See also :meth:`integer_to_number_string`.
578 This code only handles integers; it will not work with decimals / floats.
580 >>> number_string_to_integer("one hundred fifty two")
583 >>> number_string_to_integer("ten billion two hundred million fifty four thousand three")
586 >>> number_string_to_integer("four-score and 7")
589 >>> number_string_to_integer("fifty xyzzy three")
590 Traceback (most recent call last):
592 ValueError: Unknown word: xyzzy
594 if isinstance(in_str, int):
598 in_str = in_str.replace('-', ' ')
599 for w in in_str.split():
600 if w not in NUM_WORDS:
601 if is_integer_number(w):
605 raise ValueError("Unknown word: " + w)
606 scale, increment = NUM_WORDS[w]
607 current = current * scale + increment
611 return result + current
614 def integer_to_number_string(num: int) -> str:
616 Opposite of :meth:`number_string_to_integer`; converts a number to a written out
617 longhand format in English.
620 num: the integer number to convert
623 The long-hand written out English form of the number. See examples below.
625 See also :meth:`number_string_to_integer`.
628 This method does not handle decimals or floats, only ints.
630 >>> integer_to_number_string(9)
633 >>> integer_to_number_string(42)
636 >>> integer_to_number_string(123219982)
637 'one hundred twenty three million two hundred nineteen thousand nine hundred eighty two'
641 return UNIT_WORDS[num]
643 ret = TENS_WORDS[num // 10]
646 ret += ' ' + UNIT_WORDS[leftover]
649 # If num > 100 go find the highest chunk and convert that, then recursively
650 # convert the rest. NUM_WORDS contains items like 'thousand' -> (1000, 0).
651 # The second item in the tuple is an increment that can be ignored; the first
652 # is the numeric "scale" of the entry. So find the greatest entry in NUM_WORDS
653 # still less than num. For 123,456 it would be thousand. Then pull out the
654 # 123, convert it, and append "thousand". Then do the rest.
656 for name, val in NUM_WORDS.items():
658 scales[name] = val[0]
659 scale = max(scales.items(), key=lambda _: _[1])
661 # scale[1] = numeric magnitude (e.g. 1000)
662 # scale[0] = name (e.g. "thousand")
663 ret = integer_to_number_string(num // scale[1]) + ' ' + scale[0]
664 leftover = num % scale[1]
666 ret += ' ' + integer_to_number_string(leftover)
def is_decimal_number(in_str: str) -> bool:
    """Check whether a string holds a number with a decimal point.

    Args:
        in_str: the string to check

    Returns:
        True if the given string represents a decimal or False
        otherwise.  A decimal may be signed or unsigned or use
        a "scientific notation".

    Raises:
        TypeError: the input argument isn't a string

    See also :meth:`is_integer_number`.

    .. note::
        We do not consider integers without a decimal point
        to be decimals; they return False (see example).

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    if not is_number(in_str):
        return False
    return "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    """Remove escape sequences from a string.

    Args:
        in_str: the string to strip of escape sequences.

    Returns:
        in_str with escape sequences removed.

    See also: :mod:`pyutils.ansi`.

    .. note::
        What is considered to be an "escape sequence" is defined
        by a regular expression.  While this gets common ones,
        there may exist valid sequences that it doesn't match.

    >>> strip_escape_sequences('\x1B[12;11;22mthis is a test!')
    'this is a test!'
    """
    return ESCAPE_SEQUENCE_RE.sub("", in_str)
def add_thousands_separator(
    in_str: str, *, separator_char: str = ',', places: int = 3
) -> str:
    """Add thousands separators to a number.

    Args:
        in_str: string or number to which to add thousands separator(s)
        separator_char: the separator character to add (defaults to comma)
        places: add a separator every N places (defaults to three)

    Returns:
        A numeric string with thousands separators added appropriately.

    Raises:
        ValueError: a non-numeric string argument is presented

    >>> add_thousands_separator('12345678')
    '12,345,678'
    >>> add_thousands_separator(12345678)
    '12,345,678'
    >>> add_thousands_separator(12345678.99)
    '12,345,678.99'
    >>> add_thousands_separator('test')
    Traceback (most recent call last):
    ...
    ValueError: test
    """
    # Real numbers are accepted too; work on their string form.
    text = f'{in_str}' if isinstance(in_str, numbers.Number) else in_str
    if not is_number(text):
        raise ValueError(text)
    return _add_thousands_separator(
        text, separator_char=separator_char, places=places
    )
752 def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
753 """Internal helper"""
756 (in_str, decimal_part) = in_str.split('.')
757 tmp = [iter(in_str[::-1])] * places
758 ret = separator_char.join("".join(x) for x in zip_longest(*tmp, fillvalue=""))[::-1]
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """Check whether a string is a URL.

    Args:
        in_str: the string to test
        allowed_schemes: an optional list of allowed schemes (e.g.
            ['http', 'https', 'ftp']).  If passed, only URLs that
            begin with one of the schemes passed will be considered
            to be valid.  Otherwise, any scheme:// will be considered
            valid.  Schemes are compared case-insensitively.

    Returns:
        True if in_str contains a valid URL and False otherwise.

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('https://mysite.com')
    True
    >>> is_url('.mysite.com')
    False
    >>> is_url('scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash')
    True
    >>> is_url('https://mysite.com', allowed_schemes=['ftp'])
    False
    >>> is_url('HTTPS://mysite.com', allowed_schemes=['https'])
    True
    """
    if not is_full_string(in_str):
        return False
    if URL_RE.match(in_str) is None:
        return False
    if not allowed_schemes:
        return True

    # URL_RE is case-insensitive, so the scheme filter must be as well;
    # otherwise 'HTTP://...' is a valid URL yet never matches 'http'.
    lowered = in_str.lower()
    return any(lowered.startswith(scheme.lower()) for scheme in allowed_schemes)
def is_email(in_str: Any) -> bool:
    """Check whether a string is an email address.

    Args:
        in_str: the email address to check

    Returns: True if the in_str contains a valid email (as defined by
        https://tools.ietf.org/html/rfc3696#section-3) or False
        otherwise.

    >>> is_email('my.email@the-provider.com')
    True
    >>> is_email('@gmail.com')
    False
    """
    if not is_full_string(in_str):
        return False
    if len(in_str) > 320 or in_str.startswith("."):
        return False

    pieces = in_str.split("@")
    if len(pieces) != 2:
        # Borderline case: there are several "@" signs but the extra
        # ones in the head are quoted or escaped.  Neutralize them
        # (swap "@" for "a") and validate the result instead.
        if ESCAPED_AT_SIGN.search(in_str) is not None:
            return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
        return False
    head, tail = pieces

    # The head is limited to 64 characters and the tail to 255; the head
    # must not end with a dot or contain consecutive dots.
    too_long = len(head) > 64 or len(tail) > 255
    bad_dots = head.endswith(".") or ".." in head
    if too_long or bad_dots:
        return False

    # Drop escaped spaces (and, for a quoted head, the quotes and any
    # spaces inside them) so that the validation regex accepts it.
    head = head.replace("\\ ", "")
    if head.startswith('"') and head.endswith('"'):
        head = head.replace(" ", "")[1:-1]
    return EMAIL_RE.match(head + "@" + tail) is not None
def suffix_string_to_number(in_str: str) -> Optional[int]:
    """Takes a string like "33Gb" and converts it into a number (of bytes)
    like 34603008.

    Args:
        in_str: the string with a suffix to be interpreted and removed.

    Returns:
        An integer number of bytes or None to indicate an error (the
        input is not a string, is empty, or cannot be interpreted).

    See also :meth:`number_to_suffix_string`.

    >>> suffix_string_to_number('1Mb')
    1048576
    >>> suffix_string_to_number('13.1Gb')
    14066017894
    >>> suffix_string_to_number('12345')
    12345
    >>> x = suffix_string_to_number('a lot')
    >>> x is None
    True
    >>> suffix_string_to_number('') is None
    True
    """

    def suffix_capitalize(s: str) -> str:
        # Normalize a one or two character suffix to "X" / "Xy" form
        # before it is looked up in NUM_SUFFIXES.  Slicing (instead of
        # indexing / recursing) keeps this safe for the empty string.
        return s[:1].upper() + s[1:2].lower()

    if not is_string(in_str) or not in_str:
        return None
    if is_integer_number(in_str):
        return to_int(in_str)

    # Try a two character suffix ("Gb") first, then a single one ("G").
    for suffix_len in (2, 1):
        suffix = suffix_capitalize(in_str[-suffix_len:])
        multiplier = NUM_SUFFIXES.get(suffix, None)
        if multiplier is None:
            continue
        number = in_str[:-suffix_len]
        if is_integer_number(number):
            return to_int(number) * multiplier
        if is_decimal_number(number):
            return int(float(number) * multiplier)
    return None
888 def number_to_suffix_string(num: int) -> Optional[str]:
889 """Take a number (of bytes) and returns a string like "43.8Gb".
892 num: an integer number of bytes
895 A string with a suffix representing num bytes concisely or
896 None to indicate an error.
898 See also: :meth:`suffix_string_to_number`.
900 >>> number_to_suffix_string(14066017894)
902 >>> number_to_suffix_string(1024 * 1024)
907 for (sfx, size) in NUM_SUFFIXES.items():
912 if suffix is not None:
913 return f"{d:.1f}{suffix}"
def is_credit_card(in_str: Any, card_type: Optional[str] = None) -> bool:
    """Check whether a string looks like a credit card number.

    Args:
        in_str: a string to check
        card_type: if provided, contains the card type to validate
            with.  Otherwise, all known credit card number types will
            be accepted.

            Supported card types are the following:

            * VISA
            * MASTERCARD
            * AMERICAN_EXPRESS
            * DINERS_CLUB
            * DISCOVER
            * JCB

    Returns:
        True if in_str is a valid credit card number.

    Raises:
        KeyError: card_type is invalid

    .. warning::
        This code is not verifying the authenticity of the credit card (i.e.
        not checking whether it's a real card that can be charged); rather
        it's only checking that the number follows the "rules" for numbering
        established by credit card issuers.

    >>> is_credit_card('4111111111111111')
    True
    >>> is_credit_card('4111111111111111', card_type='MASTERCARD')
    False
    """
    if not is_full_string(in_str):
        return False

    if card_type is not None:
        if card_type not in CREDIT_CARDS:
            raise KeyError(
                f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
            )
        return CREDIT_CARDS[card_type].match(in_str) is not None

    # No particular issuer requested: any known numbering scheme will do.
    return any(regex.match(in_str) is not None for regex in CREDIT_CARDS.values())
def is_camel_case(in_str: Any) -> bool:
    """Check whether a string is formatted as camel case.

    Args:
        in_str: the string to test

    Returns:
        True if the string is formatted as camel case and False otherwise.
        A string is considered camel case when:

        * it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
        * it contains both lowercase and uppercase letters
        * it does not start with a number

    See also :meth:`is_snake_case`, :meth:`is_slug`, and :meth:`camel_case_to_snake_case`.

    >>> is_camel_case('MyString')
    True
    >>> is_camel_case('mystring')
    False
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """Check whether a string is formatted as snake case.

    Args:
        in_str: the string to test
        separator: the snake case separator character to use

    Returns: True if the string is snake case and False otherwise.  A
        string is considered snake case when:

        * it's composed only by lowercase/uppercase letters and digits
        * it contains at least one underscore (or provided separator)
        * it does not start with a number

    See also :meth:`is_camel_case`, :meth:`is_slug`, and :meth:`snake_case_to_camel_case`.

    >>> is_snake_case('this_is_a_test')
    True
    >>> is_snake_case('___This_Is_A_Test_1_2_3___')
    True
    >>> is_snake_case('this-is-a-test')
    False
    >>> is_snake_case('this-is-a-test', separator='-')
    True
    >>> is_snake_case('this.is a test', separator='.')
    False
    """
    # Blank strings can never match (every pattern needs a letter or
    # digit), so a plain type check is all the guarding required.
    if not isinstance(in_str, str):
        return False

    # The two common separators have precompiled patterns.
    if separator == "_":
        return SNAKE_CASE_TEST_RE.match(in_str) is not None
    if separator == "-":
        return SNAKE_CASE_TEST_DASH_RE.match(in_str) is not None

    # Any other separator: build the equivalent pattern on demand.  It is
    # anchored at the end like the precompiled ones so that a valid
    # prefix followed by junk is rejected.
    sign = re.escape(separator)
    pattern = rf"^([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)$"
    return re.match(pattern, in_str, re.IGNORECASE) is not None
def is_json(in_str: Any) -> bool:
    """Check whether a string contains a JSON object or array.

    Args:
        in_str: the string to test

    Returns:
        True if the in_str contains valid JSON and False otherwise.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    # Cheap pre-check: it must at least be wrapped in {} or [].
    if not is_full_string(in_str) or JSON_WRAPPER_RE.match(in_str) is None:
        return False
    try:
        parsed = json.loads(in_str)
    except (TypeError, ValueError, OverflowError):
        return False
    return isinstance(parsed, (dict, list))
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """Check whether a value is a UUID.

    Args:
        in_str: the string to test
        allow_hex: if True, also accept the bare hex form in which the
            dashes between the groups are omitted.

    Returns:
        True if the in_str contains a valid UUID and False otherwise.

    See also :meth:`generate_uuid`.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # string casting is used to allow UUID itself as input data type
    candidate = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(candidate) is not None
def is_ip_v4(in_str: Any) -> bool:
    """Check whether a string is an IPv4 address.

    Args:
        in_str: the string to test

    Returns:
        True if in_str contains a valid IPv4 address and False otherwise.

    See also :meth:`extract_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('nope')
    False
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # The regex only checks the shape; every octet must also be 0..255.
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """Pull the first IPv4-looking address out of a string.

    Args:
        in_str: the string to extract an IPv4 address from.

    Returns:
        The first extracted IPv4 address from in_str or None if
        none were found or an error occurred.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V4_RE.search(in_str)
    return found.group(0) if found is not None else None
def is_ip_v6(in_str: Any) -> bool:
    """Check whether a string is an IPv6 address.

    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IPv6 address and False otherwise.

    See also :meth:`is_ip_v4`, :meth:`extract_ip_v4`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """Pull the first IPv6 address out of a string.

    Args:
        in_str: the string from which to extract an IPv6 address.

    Returns:
        The first IPv6 address found in in_str or None if no address
        was found or an error occurred.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v4`,
    and :meth:`is_ip`.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V6_RE.search(in_str)
    return found.group(0) if found is not None else None
def is_ip(in_str: Any) -> bool:
    """Check whether a string is an IP address of either family.

    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IP address (either IPv4 or
        IPv6) and False otherwise.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3')
    False
    >>> is_ip('1.2.3.999')
    False
    """
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """Pull the first IP address (of either family) out of a string.

    Args:
        in_str: the string from which to extract an IP address.

    Returns:
        The first IP address (IPv4 or IPv6) found in in_str or
        None to indicate none found or an error condition.  IPv4 is
        looked for first; IPv6 is only tried when no IPv4 address is found.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4 = extract_ip_v4(in_str)
    if v4 is not None:
        return v4
    return extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """Check whether a string is a MAC address.

    Args:
        in_str: the string to test

    Returns:
        True if in_str is a valid MAC address and False otherwise.  Both
        ":" and "-" are accepted between the hex bytes.

    See also :meth:`extract_mac_address`, :meth:`is_ip`, etc...

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """Pull the first MAC address out of a string.

    Args:
        in_str: the string from which to extract a MAC address.
        separator: the MAC address hex byte separator to use in the
            returned address.

    Returns:
        The first MAC address found in in_str, with the separators
        between its hex bytes replaced by `separator`, or None to
        indicate no match or an error.

    See also :meth:`is_mac_address`, :meth:`is_ip`, and :meth:`extract_ip`.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'

    >>> extract_mac_address('34-29-8F-12-0D-2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('34:29:8F:12:0D:2F', separator='-')
    '34-29-8F-12-0D-2F'
    """
    if not is_full_string(in_str):
        return None

    found = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if found is None:
        return None

    # str.replace() returns a new string, so calling it without using the
    # result changes nothing.  Split on either accepted separator and
    # re-join in one pass so the requested separator is actually applied.
    return separator.join(re.split(r"[:-]", found.group(0)))
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """Check whether a string is a slug.

    Args:
        in_str: string to test
        separator: the slug character to use

    Returns:
        True if in_str is a slug string and False otherwise.  A slug is
        made of runs of lowercase letters / digits joined by one or more
        separators; it must begin and end with a letter or digit.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`slugify`.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    # Blank strings cannot match the pattern below, so a type check is
    # the only guard needed.
    if not isinstance(in_str, str):
        return False

    # Every repetition starts with a separator, which keeps the match
    # unambiguous.  A nested-quantifier pattern such as
    # "([a-z\d]+-*?)*" backtracks exponentially on long inputs that
    # fail near the end.
    sep = re.escape(separator)
    rex = rf"^[a-z\d]+(?:(?:{sep})+[a-z\d]+)*$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """Check whether a string contains HTML/XML tags.

    Args:
        in_str: the string to check for tags in

    Returns:
        True if the given string contains HTML/XML tags and False
        otherwise.

    Raises:
        TypeError: the input argument isn't a string

    See also :meth:`strip_html`.

    .. warning::
        By design, this function matches ANY type of tag, so don't expect
        to use it as an HTML validator.  It's a quick sanity check at
        best.  See something like BeautifulSoup for a more full-featured
        HTML parser.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise TypeError(in_str)
def words_count(in_str: str) -> int:
    """Count the words in a string.

    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    Raises:
        TypeError: the input argument isn't a string

    .. note::
        This method is "smart" in that it does consider only sequences
        of one or more letter and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if is_string(in_str):
        words = WORDS_COUNT_RE.findall(in_str)
        return len(words)
    raise TypeError(in_str)
def word_count(in_str: str) -> int:
    """Count the words in a string; an alias for :meth:`words_count`.

    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    Raises:
        TypeError: the input argument isn't a string

    .. note::
        This method is "smart" in that it does consider only sequences
        of one or more letter and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> word_count('hello world')
    2
    >>> word_count('one,two,three.stop')
    4
    """
    count = words_count(in_str)
    return count
1365 def generate_uuid(omit_dashes: bool = False) -> str:
1368 omit_dashes: should we omit the dashes in the generated UUID?
1371 A generated UUID string (using `uuid.uuid4()`) with or without
1372 dashes per the omit_dashes arg.
1374 See also :meth:`is_uuid`, :meth:`generate_random_alphanumeric_string`.
1376 generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
1377 generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
def generate_random_alphanumeric_string(size: int) -> str:
    """Generate a random string of letters and digits.

    Args:
        size: number of characters to generate

    Returns:
        A string of the specified size containing random characters
        (uppercase/lowercase ascii letters and digits).

    Raises:
        ValueError: size < 1

    See also :meth:`asciify`, :meth:`generate_uuid`.

    >>> len(generate_random_alphanumeric_string(9))
    9
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    # One random.choice() call per character, in order.
    picked = []
    for _ in range(size):
        picked.append(random.choice(alphabet))
    return "".join(picked)
def reverse(in_str: str) -> str:
    """Reverse a string.

    Args:
        in_str: the string to reverse

    Returns:
        The reversed (character by character) string.

    Raises:
        TypeError: the input argument isn't a string

    >>> reverse('test')
    'tset'
    """
    if not isinstance(in_str, str):
        raise TypeError(in_str)
    return "".join(reversed(in_str))
def camel_case_to_snake_case(in_str: str, *, separator: str = "_") -> str:
    """Convert a camel case string into snake case.

    Args:
        in_str: the camel case string to convert
        separator: the snake case separator character to use

    Returns:
        A snake case string equivalent to the camel case input or the
        original string if it is not a valid camel case string or some
        other error occurs.

    Raises:
        TypeError: the input argument isn't a string

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise TypeError(in_str)
    if not is_camel_case(in_str):
        return in_str

    # Each regex match ends exactly where a new word begins.  Cut the
    # input at those points and lowercase the words only, so that the
    # caller's separator is inserted verbatim (not lowercased with them).
    words = []
    start = 0
    for boundary in CAMEL_CASE_REPLACE_RE.finditer(in_str):
        words.append(in_str[start:boundary.end()].lower())
        start = boundary.end()
    words.append(in_str[start:].lower())
    return separator.join(words)
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """Convert a snake case string into camel case.

    Args:
        in_str: the snake case string to convert
        upper_case_first: should we capitalize the first letter?
        separator: the separator character to use

    Returns:
        A camel case string that is equivalent to the snake case string
        provided or the original string back again if it is not valid
        snake case or another error occurs.

    Raises:
        TypeError: the input argument isn't a string

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise TypeError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str

    # Title-case every non-blank piece between separators.
    words = []
    for piece in in_str.split(separator):
        if is_full_string(piece):
            words.append(piece.title())
    if not upper_case_first:
        words[0] = words[0].lower()
    return from_char_list(words)
def to_char_list(in_str: str) -> List[str]:
    """Split a string into a list of its characters.

    Args:
        in_str: the string to split into a char list

    Returns:
        A list of strings of length one each; an empty list when
        in_str is not a string.

    See also :meth:`from_char_list`.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    return list(in_str) if is_string(in_str) else []
def from_char_list(in_list: List[str]) -> str:
    """Glue a list of characters (or strings) back into one string.

    Args:
        in_list: A list of characters to convert into a string.

    Returns:
        The string resulting from concatenating the items of in_list
        in order, with nothing between them.

    See also :meth:`to_char_list`.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
def shuffle(in_str: str) -> Optional[str]:
    """Randomly reorder the characters of a string.

    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string holding the same characters as in_str in a random
        order (which may, rarely, equal the original since no check is
        done), or None when in_str is not a string.
    """
    if is_string(in_str):
        letters = to_char_list(in_str)
        random.shuffle(letters)
        return from_char_list(letters)
    return None
def scramble(in_str: str) -> Optional[str]:
    """Randomly reorder the characters of a string; same as :meth:`shuffle`.

    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        Whatever :meth:`shuffle` returns for in_str: a randomly
        reordered copy, or None to indicate an error condition.

    See also :mod:`pyutils.unscrambler`.
    """
    scrambled = shuffle(in_str)
    return scrambled
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """Remove HTML tags from a string.

    Args:
        in_str: the string to strip tags from
        keep_tag_content: should we keep the inner contents of tags?

    Returns:
        A string with all HTML tags removed (and, unless
        keep_tag_content is set, the text inside them too).

    Raises:
        TypeError: the input argument isn't a string

    See also :meth:`contains_html`.

    .. note::
        This uses simple regular expressions and is not an HTML
        parser.  Consider something like BeautifulSoup if your needs
        are more than this simple code can fulfill.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise TypeError(in_str)
    if keep_tag_content:
        pattern = HTML_TAG_ONLY_RE
    else:
        pattern = HTML_RE
    return pattern.sub("", in_str)
def asciify(in_str: str) -> str:
    """Translate a string into a roughly equivalent ASCII-only string.

    Args:
        in_str: the string to asciify.

    Returns:
        A string in which every non-ASCII character has been replaced
        by its closest ASCII representation (eg: ó -> o, Ë -> E,
        ç -> c...).

    Raises:
        TypeError: the input argument isn't a string

    See also :meth:`to_ascii`, :meth:`generate_random_alphanumeric_string`.

    .. warning::
        Some chars may be lost if impossible to translate.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise TypeError(in_str)

    # NFKD decomposes accented characters into a base character plus
    # combining marks; encoding to ASCII with errors ignored then drops
    # the marks and anything else that has no ASCII form.
    decomposed = unicodedata.normalize("NFKD", in_str)
    return decomposed.encode("ascii", "ignore").decode("utf-8")
def slugify(in_str: str, *, separator: str = "-") -> str:
    """Convert a string into a "slug".

    Args:
        in_str: the string to slugify
        separator: the text placed between words (default "-").  It is
            always inserted literally, so it may be empty, longer than
            one character, or contain backslashes.

    Returns:
        The converted string.  The returned string has the following
        properties:

        * it has no leading or trailing whitespace
        * all letters are in lower case
        * all punctuation signs and non alphanumeric chars are removed
        * words are divided using provided separator
        * all chars are encoded as ascii (by using :meth:`asciify`)

    Raises:
        TypeError: the input argument isn't a string

    See also :meth:`is_slug` and :meth:`asciify`.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    """
    if not is_string(in_str):
        raise TypeError(in_str)

    # Replace any character that is NOT a letter or number with a space.
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # Replace spaces with the separator.  A callable replacement makes
    # re insert the separator verbatim instead of parsing it as a
    # template (where a backslash would be an escape / group reference).
    out = SPACES_RE.sub(lambda _match: separator, out)

    # Collapse runs of the separator.  The escaped separator is wrapped
    # in a group so "+" repeats the whole separator (not just its last
    # character) and an empty separator does not produce a bad pattern.
    run_of_separators = r"(?:" + re.escape(separator) + r")+"
    out = re.sub(run_of_separators, lambda _match: separator, out)
    return asciify(out)
def to_bool(in_str: str) -> bool:
    """Interpret a string as a boolean.

    Args:
        in_str: the string to convert to boolean

    Returns:
        True when in_str, compared case-insensitively, is one of
        "true", "1", "yes", "y", "t" or "on"; False otherwise.

    Raises:
        TypeError: the input argument isn't a string

    See also :mod:`pyutils.argparse_utils`.

    >>> to_bool('True')
    True
    >>> to_bool('no')
    False
    """
    if not is_string(in_str):
        raise TypeError(in_str)
    truthy = {"true", "1", "yes", "y", "t", "on"}
    lowered = in_str.lower()
    return lowered in truthy
def to_date(in_str: str) -> Optional[datetime.date]:
    """Parse a string into a date.

    Args:
        in_str: the string to convert into a date

    Returns:
        The datetime.date the string contained or None to indicate
        an error.  Parsing is delegated to
        :mod:`pyutils.datetimes.dateparse_utils`; see its docs for the
        accepted formats.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`extract_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> to_date('9/11/2001')
    datetime.date(2001, 9, 11)
    >>> to_date('xyzzy')
    """
    import pyutils.datetimes.dateparse_utils as du

    parsed = None
    try:
        parser = du.DateParser()  # type: ignore
        parser.parse(in_str)
        parsed = parser.get_date()
    except du.ParseException:  # type: ignore
        # Unparseable input is reported to the caller as None.
        parsed = None
    return parsed
def extract_date(in_str: Any) -> Optional[datetime.datetime]:
    """Finds and extracts a date from the string, if possible.

    Args:
        in_str: the string to extract a date from

    Returns:
        a datetime if date was found, otherwise None

    The string is split on whitespace and every run of 5, then 4,
    then 3, then 2 consecutive words is offered to the date parser;
    the first run that parses wins.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> extract_date("filename.txt    dec 13, 2022")
    datetime.datetime(2022, 12, 13, 0, 0)

    >>> extract_date("Dear Santa, please get me a pony.")
    """
    import pyutils.datetimes.dateparse_utils as du

    parser = du.DateParser()  # type: ignore
    words = in_str.split()
    # Longest candidates first so "dec 13, 2022" beats "dec 13,".
    for width in (5, 4, 3, 2):
        for candidate in list_utils.ngrams(words, width):
            try:
                expr = " ".join(candidate)
                logger.debug("Trying %s", expr)
                if parser.parse(expr):
                    return parser.get_datetime()
            except du.ParseException:  # type: ignore
                # This candidate is not a date; move on to the next one.
                continue
    return None
def is_valid_date(in_str: str) -> bool:
    """Check whether a string can be parsed as a date.

    Args:
        in_str: the string to check

    Returns:
        True if the date parser accepts in_str without raising a
        ParseException and False otherwise.  See
        :mod:`pyutils.datetimes.dateparse_utils` for what is accepted.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> is_valid_date('1/2/2022')
    True
    >>> is_valid_date('xyzzy')
    False
    """
    import pyutils.datetimes.dateparse_utils as dp

    try:
        parser = dp.DateParser()  # type: ignore
        parser.parse(in_str)
    except dp.ParseException:  # type: ignore
        return False
    return True
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """Parse a string into a datetime.

    Args:
        in_str: string to parse into a datetime

    Returns:
        A python datetime parsed from in_str or None to indicate
        an error.  Parsing is delegated to
        :mod:`pyutils.datetimes.dateparse_utils`.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`valid_datetime`.

    >>> to_datetime('7/20/1969 02:56 GMT')
    datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
    """
    import pyutils.datetimes.dateparse_utils as dp

    result = None
    try:
        parser = dp.DateParser()  # type: ignore
        candidate = parser.parse(in_str)
        if isinstance(candidate, datetime.datetime):
            result = candidate
    except Exception:
        # Deliberately best-effort: any parse failure means "no datetime".
        result = None
    return result
def valid_datetime(in_str: str) -> bool:
    """Check whether a string can be parsed as a datetime.

    Args:
        in_str: the string to check

    Returns:
        True if :meth:`to_datetime` produces a datetime for in_str and
        False otherwise.

    >>> valid_datetime('next wednesday at noon')
    True
    >>> valid_datetime('sometime soon')
    False
    """
    return to_datetime(in_str) is not None
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """Collapse runs of a character (or substring) into a single copy.

    Args:
        in_str: the string to squeeze
        character_to_squeeze: the character to remove runs of
            more than one in a row (default = space).  It is matched
            and re-inserted literally, so regex metacharacters and
            backslashes are safe.

    Returns:
        A "squeezed string" where every run of one or more consecutive
        character_to_squeeze is replaced by exactly one.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'
    """
    run = r'(' + re.escape(character_to_squeeze) + r')+'
    # A callable replacement is inserted verbatim; passing the string
    # itself would make re parse it as a template and fail on '\\'.
    return re.sub(run, lambda _match: character_to_squeeze, in_str)
def dedent(in_str: str) -> Optional[str]:
    """Strip the leading margin from every line of a string.

    Args:
        in_str: the string to dedent

    Returns:
        A string with tab indentation removed or None on error
        (i.e. when in_str is not a string).

    See also :meth:`indent`.

    >>> dedent('\t\ttest\\n\t\ting')
    'test\\ning'
    """
    if not is_string(in_str):
        return None
    newline = '\n'
    stripped_lines = []
    for line in in_str.split(newline):
        stripped_lines.append(MARGIN_RE.sub('', line))
    return newline.join(stripped_lines)
def indent(in_str: str, amount: int) -> str:
    """Prefix every line of a string with spaces.

    Args:
        in_str: the string to indent
        amount: count of spaces to indent each line by

    Returns:
        An indented string created by prepending amount spaces to
        each newline-separated line.

    Raises:
        TypeError: the input argument isn't a string

    See also :meth:`dedent`.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise TypeError(in_str)
    newline = '\n'
    margin = " " * amount
    return newline.join(margin + line for line in in_str.split(newline))
1947 def _sprintf(*args, **kwargs) -> str:
1948 """Internal helper."""
1951 sep = kwargs.pop("sep", None)
1953 if not isinstance(sep, str):
1954 raise TypeError("sep must be None or a string")
1956 end = kwargs.pop("end", None)
1958 if not isinstance(end, str):
1959 raise TypeError("end must be None or a string")
1962 raise TypeError("invalid keyword arguments to sprint()")
1968 for n, arg in enumerate(args):
1971 if isinstance(arg, str):
def strip_ansi_sequences(in_str: str) -> str:
    """Remove ANSI escape (CSI) sequences from a string.

    Args:
        in_str: the string to strip

    Returns:
        in_str with recognized ANSI escape sequences removed.

    See also :mod:`pyutils.ansi`.

    .. note::
        This method works by using a regular expression that matches
        ESC, "[", a run of digits / ";" / "+", and one ASCII letter.
        Both lower-case finals (e.g. the "m" of color codes) and
        upper-case finals (e.g. the "J" of clear-screen or "H" of
        cursor-home) are recognized.  Other escape forms may still be
        missed; caveat emptor.

    >>> strip_ansi_sequences('\\x1b[38;5;21mblue!\\x1b[m')
    'blue!'
    >>> strip_ansi_sequences('\\x1b[2J\\x1b[Hcleared')
    'cleared'
    """
    return re.sub(r'\x1b\[[\d+;]*[A-Za-z]', '', in_str)
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that captures outputs to stdout to a buffer
    without printing them.  Entering the context yields a callable
    that returns everything captured so far.

    >>> with SprintfStdout() as buf:
    ...     print("test")
    ...     print("1, 2, 3")
    ...
    >>> print(buf(), end='')
    test
    1, 2, 3
    """

    def __init__(self) -> None:
        # In-memory sink that stdout is pointed at while active.
        self.destination = io.StringIO()
        # Created on __enter__; declared here for type checkers only.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        redirect = contextlib.redirect_stdout(self.destination)
        self.recorder = redirect
        redirect.__enter__()
        # Hand back an accessor for the captured text.
        return self.destination.getvalue

    def __exit__(self, *args) -> Literal[False]:
        # Restore the real stdout, then rewind the buffer.
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Never suppress an exception raised inside the with-block.
        return False
def capitalize_first_letter(in_str: str) -> str:
    """Upper-case the first character of a string, leaving the rest alone.

    Args:
        in_str: the string to capitalize

    Returns:
        in_str with the first character capitalized.  An empty string
        is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    # Slicing (rather than indexing [0]) keeps the empty string from
    # raising IndexError.
    return in_str[:1].upper() + in_str[1:]
def it_they(n: int) -> str:
    """Pick the pronoun that agrees with a count.

    Args:
        n: how many of them are there?

    Returns:
        'it' if n is one or 'they' otherwise.

    See also :meth:`is_are`, :meth:`pluralize`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> it_they(1)
    'it'
    >>> it_they(100)
    'they'
    """
    return "it" if n == 1 else "they"
def is_are(n: int) -> str:
    """Pick the verb form that agrees with a count.

    Args:
        n: how many of them are there?

    Returns:
        'is' if n is one or 'are' otherwise.

    See also :meth:`it_they`, :meth:`pluralize`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'
    """
    return "is" if n == 1 else "are"
def pluralize(n: int) -> str:
    """Pick the plural suffix that agrees with a count.

    Args:
        n: how many of them are there?

    Returns:
        '' if n is exactly one, otherwise 's'.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> pluralize(15)
    's'
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    >>> count = 4
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 4 files.
    """
    return "" if n == 1 else "s"
def make_contractions(txt: str) -> str:
    """This code glues words in txt together to form (English)
    contractions.

    Args:
        txt: the input text to be contractionized.

    Returns:
        Output text identical to original input except for any
        recognized contractions are formed.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`thify`.

    .. note::
        The order in which we create contractions is defined by the
        implementation: the irregular forms (can't, shan't, won't)
        first, then "<verb> not", then "<pronoun> <auxiliary>".

    >>> make_contractions('It is nice today.')
    "It's nice today."

    >>> make_contractions('She could not see!')
    "She couldn't see!"

    >>> make_contractions('But she will not go.')
    "But she won't go."

    >>> make_contractions('I said you can not go.')
    "I said you can't go."
    """
    negation = '(n)o(t)'
    # Each entry pairs a list of leading words with the list of
    # following-word patterns they may be contracted with.  The groups
    # in the second patterns mark the letters that survive.
    first_second = [
        (
            [
                'are',
                'could',
                'did',
                'has',
                'have',
                'is',
                'must',
                'should',
                'was',
                'were',
                'would',
            ],
            [negation],
        ),
        (
            [
                "I",
                "you",
                "he",
                "she",
                "it",
                "we",
                "they",
                "how",
                "why",
                "when",
                "where",
                "who",
                "there",
            ],
            ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
        ),
    ]

    # Special cases: can't, shan't and won't.
    irregular = (
        (r'\b(can)\s*no(t)\b', r"\1'\2"),
        (r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3"),
        (r'\b(w)ill\s*(n)(o)(t)\b', r"\1\3\2'\4"),
    )
    for pattern, replacement in irregular:
        txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)

    no_are_contraction = set(['there', 'where'])
    for leading_words, following_patterns in first_second:
        for first in leading_words:
            for second in following_patterns:
                # Disallow there're/where're.  They're valid English
                # but sound weird.
                if second == 'a(re)' and first in no_are_contraction:
                    continue

                # "not" keeps its n before the apostrophe (couldn't);
                # everything else puts the apostrophe first (it's).
                replacement = r"\1\2'\3" if second == negation else r"\1'\2"
                pattern = fr'\b({first})\s+{second}\b'
                txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)

    return txt
def thify(n: int) -> str:
    """Return the English ordinal suffix ("st", "nd", "rd" or "th")
    for a number.

    Args:
        n: the number whose suffix is wanted

    Returns:
        The proper suffix for the number: "st", "nd" or "rd" when the
        number ends in 1, 2 or 3 respectively, except that numbers
        ending in 11, 12 or 13 take "th", as does everything else.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`.

    Suggested usage::

        attempt_count = 0
        while True:
            attempt_count += 1
            if try_the_thing():
                break
            print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    """
    digits = str(n)
    assert is_integer_number(digits)
    # The "teens" are irregular: 11th, 12th, 13th (also 111th, -12th...).
    if digits[-2:] in ("11", "12", "13"):
        return "th"
    suffix_by_last_digit = {"1": "st", "2": "nd", "3": "rd"}
    return suffix_by_last_digit.get(digits[-1:], "th")
# Alias: get_cardinal_suffix is bound to the very same function as thify.
get_cardinal_suffix = thify
def add_cardinal_suffix(n: int):
    """Render a number followed by its suffix (1st, 2nd, 3rd, 4th...).

    Args:
        n: the number to return as a string with a cardinal suffix.

    Returns:
        A string containing the number with its cardinal suffix, as
        chosen by :meth:`get_cardinal_suffix`.

    >>> add_cardinal_suffix(123)
    '123rd'

    >>> add_cardinal_suffix(1)
    '1st'

    >>> add_cardinal_suffix(0)
    '0th'

    >>> add_cardinal_suffix(-123)
    '-123rd'
    """
    suffix = get_cardinal_suffix(n)
    return f'{n}{suffix}'
def remove_cardinal_suffix(txt: str) -> Optional[str]:
    """Drop a trailing "st", "nd", "rd" or "th" from a string.

    Args:
        txt: the number with cardinal suffix to strip.

    Returns:
        The same string with its cardinal suffix removed or None on
        error (i.e. when txt does not end with one of the suffixes).

    >>> remove_cardinal_suffix('123rd')
    '123'

    >>> remove_cardinal_suffix('-10th')
    '-10'

    >>> remove_cardinal_suffix('1ero') is None
    True
    """
    if txt.endswith(('st', 'nd', 'rd', 'th')):
        return txt[:-2]
    return None
def ngrams(txt: str, n: int) -> Generator[str, str, None]:
    """Generate the word n-grams of a string.

    Args:
        txt: the string to create ngrams using
        n: how many words per ngram created?

    Returns:
        Generates the ngrams from the input string; txt is split on
        whitespace and each ngram is its words joined by one space.

    See also :meth:`ngrams_presplit`, :meth:`bigrams`, :meth:`trigrams`.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']
    """
    tokens = txt.split()
    for group in ngrams_presplit(tokens, n):
        # Tokens come from str.split() so they carry no whitespace.
        yield ' '.join(group)
def ngrams_presplit(
    words: Sequence[str], n: int
) -> Generator[Sequence[str], str, None]:
    """
    Same as :meth:`ngrams` but with the string pre-split.

    Args:
        words: the already-split words to build ngrams from
        n: how many words per ngram created?

    Returns:
        Whatever ``list_utils.ngrams(words, n)`` produces.

    See also :meth:`ngrams`, :meth:`bigrams`, :meth:`trigrams`.
    """
    groups = list_utils.ngrams(words, n)
    return groups
def bigrams(txt: str) -> Generator[str, str, None]:
    """Generates the bigrams (n=2) of the given string.

    Args:
        txt: the string to create bigrams using

    See also :meth:`ngrams`, :meth:`trigrams`.

    >>> [x for x in bigrams('this is a test')]
    ['this is', 'is a', 'a test']
    """
    words_per_gram = 2
    return ngrams(txt, words_per_gram)
def trigrams(txt: str) -> Generator[str, str, None]:
    """Generates the trigrams (n=3) of the given string.

    Args:
        txt: the string to create trigrams using

    See also :meth:`ngrams`, :meth:`bigrams`.
    """
    words_per_gram = 3
    return ngrams(txt, words_per_gram)
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim: str = ''
) -> Iterable[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.

    Args:
        input_lines: A sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: an iterable collection of numeric sequences that
            indicate one or more column numbers to copy to form the Nth
            position in the output list.  See example below.
        delim: for column_specs that indicate we should copy more than
            one column from the input into this position, use delim to
            separate source data.  Defaults to ''.

    Returns:
        A list of strings, one per entry of column_specs, each made by
        joining the selected input columns with delim.  Column text is
        copied untouched, even when it begins or ends with characters
        that also occur in delim.

    Raises:
        IndexError: a column number is outside input_lines

    See also :meth:`shuffle_columns_into_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim='!',
    ... )
    ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']
    """
    # Column specs map input lines' columns into outputs.
    # [col1, col2...]
    # str.join puts delim only *between* columns, so there is nothing
    # to strip afterwards (stripping would also eat real data).
    return [delim.join(input_lines[n] for n in spec) for spec in column_specs]
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim: str = '',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: a sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: instructions for what dictionary keys to apply
            to individual or compound input column data.  See example
            below.
        delim: when forming compound output data by gluing more than
            one input column together, use this character to separate
            the source data.  Defaults to ''.

    Returns:
        A dict formed by applying the column_specs instructions: each
        key maps to its selected input columns joined with delim.
        Column text is copied untouched, even when it begins or ends
        with characters that also occur in delim.

    Raises:
        IndexError: a column number is outside input_lines

    See also :meth:`shuffle_columns_into_list`, :meth:`interpolate_using_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim='!',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}
    """
    # Column specs map input lines' columns into outputs.
    # "key", [col1, col2...]
    out: Dict[str, str] = {}
    for key, columns in column_specs:
        # join places delim only between columns; no stripping needed.
        out[key] = delim.join(input_lines[n] for n in columns)
    return out
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """
    Interpolate a string with data from a dict.

    Args:
        txt: the mad libs template, using ``{name}`` placeholders
        values: what you and your kids chose for each category,
            keyed by placeholder name.

    Returns:
        txt with each placeholder replaced via ``str.format``.

    See also :meth:`shuffle_columns_into_list`, :meth:`shuffle_columns_into_dict`.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    filled_in = txt.format(**values)
    return _sprintf(filled_in, end='')
def to_ascii(txt: str):
    """Encode text as an ASCII byte string.

    Args:
        txt: the input data to encode

    Returns:
        txt encoded as an ASCII byte string.  Input that is already
        bytes is returned as-is.

    Raises:
        TypeError: the input argument isn't a string or bytes

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`to_bytes`,
    :meth:`generate_random_alphanumeric_string`, :meth:`asciify`.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(txt, bytes):
        return txt
    if not isinstance(txt, str):
        raise TypeError('to_ascii works with strings and bytes')
    return txt.encode('ascii')
def to_base64(
    txt: str, *, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> bytes:
    """Encode a string as base64.

    Args:
        txt: the input data to encode
        encoding: the encoding to use during conversion
        errors: how to handle encoding errors

    Returns:
        txt encoded with a 64-character alphabet, as produced by
        :func:`base64.encodebytes` (note the trailing newline).

    See also :meth:`is_base64`, :meth:`to_ascii`, :meth:`to_bitstring`,
    :meth:`from_base64`.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
def is_base64(txt: str) -> bool:
    """Check whether a string looks like base64 encoded data.

    Args:
        txt: the string (or bytes) to check

    Returns:
        True if txt, ignoring surrounding whitespace, consists only of
        characters from Python's standard base64 alphabet (A-Z, a-z,
        0-9, "+", "/"), optionally followed by one or two "=" padding
        characters.  Text that cannot be represented in ASCII is
        never base64, so False is returned for it.

    See also :meth:`to_base64`, :meth:`from_base64`.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # Padding is ok.
    True
    """
    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = set(a.encode('ascii'))
    try:
        data = to_ascii(txt.strip())
    except UnicodeEncodeError:
        return False

    # Base64 pads the final quantum with at most two '=' characters,
    # and only at the very end.
    payload = data.rstrip(b'=')
    if len(data) - len(payload) > 2:
        return False
    return all(char in alphabet for char in payload)
def from_base64(
    b64: bytes, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """Decode base64 data back into a string.

    Args:
        b64: bytestring of base64 encoded data to decode / convert.
        encoding: the encoding to use during conversion
        errors: how to handle encoding errors

    Returns:
        The decoded form of b64 as a normal python string, obtained
        with :func:`base64.decodebytes` followed by ``bytes.decode``.

    See also :meth:`to_base64`, :meth:`is_base64`.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    raw = base64.decodebytes(b64)
    return raw.decode(encoding, errors)
def chunk(txt: str, chunk_size: int):
    """Split a string into consecutive pieces of a fixed size.

    Args:
        txt: a string to be chunked into evenly spaced pieces.
        chunk_size: the size of each chunk to make

    Returns:
        A generator over the pieces of txt, in order.  When len(txt)
        is not a multiple of chunk_size a warning is both logged and
        issued via :mod:`warnings`, and the last piece is shorter.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    if len(txt) % chunk_size != 0:
        msg = f'String to chunk\'s length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        # stacklevel=2 attributes the warning to our caller.
        warnings.warn(msg, stacklevel=2)
    for start in range(0, len(txt), chunk_size):
        yield txt[start : start + chunk_size]
def to_bitstring(txt: str, *, delimiter: str = '') -> str:
    """Render text as a string of '0' and '1' characters.

    Args:
        txt: the string to convert into a bitstring
        delimiter: character to insert between adjacent bytes.  Note that
            only bitstrings with delimiter='' are interpretable by
            :meth:`from_bitstring`.

    Returns:
        txt converted to ascii/binary: eight bits per byte, most
        significant bit first, bytes separated by delimiter.  Every
        byte is emitted, including leading NUL bytes.  Empty input
        yields '00000000'.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'
    """
    etxt = to_ascii(txt)
    if not etxt:
        # Historical behavior: empty input renders as one zero byte.
        return '00000000'
    # Format byte by byte; going through one big int would silently
    # drop leading zero bytes.
    return delimiter.join(f'{byte:08b}' for byte in etxt)
def is_bitstring(txt: str) -> bool:
    """Check whether a string is an (undelimited) bitstring.

    Args:
        txt: the string to check

    Returns:
        True if txt is a recognized bitstring and False otherwise.
        The check is whether ``'0b' + txt`` passes
        :meth:`is_binary_integer_number`, so bitstrings produced with
        a non-empty delimiter are not recognized.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`to_bitstring`,
    :meth:`chunk`.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    as_binary_literal = f'0b{txt}'
    return is_binary_integer_number(as_binary_literal)
def from_bitstring(
    bits: str, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """Turn a bitstring back into a regular string.

    Args:
        bits: the bitstring to convert back into a python string
        encoding: the encoding to use during conversion
        errors: how to handle encoding errors

    Returns:
        The regular python string represented by bits.  Leading zero
        bytes in bits are preserved.  Note that this code does not
        work with to_bitstring when delimiter is non-empty.

    Raises:
        ValueError: bits is not a base-2 number

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'
    """
    n = int(bits, 2)
    # Size the output from the bitstring's own length when that is
    # larger: n.bit_length() alone forgets leading all-zero bytes.
    byte_count = max((n.bit_length() + 7) // 8, len(bits) // 8)
    return n.to_bytes(byte_count, 'big').decode(encoding, errors) or '\0'
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a key that sorts numerically.

    Args:
        txt: an IP address to chunk up for sorting purposes

    Returns:
        A tuple of the address's dot-separated components as ints, so
        that sorting with a normal comparator orders addresses
        numerically.  When :meth:`is_ip_v4` rejects txt a message is
        printed and None is returned.

    See also :meth:`is_ip_v4`.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if is_ip_v4(txt):
        octets = txt.split('.')
        return tuple(map(int, octets))
    print(f"not IP: {txt}")
    return None
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Turn a slash-separated path into a key that sorts parents first.

    Args:
        volume: the string to chunk up for sorting purposes

    Returns:
        A tuple of volume's non-empty '/'-separated components, so
        that sorting with a normal comparator puts each path before
        the paths beneath it.

    See also :mod:`pyutils.files.file_utils`.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    # filter(None, ...) drops the empty pieces produced by leading,
    # trailing or doubled slashes.
    return tuple(filter(None, volume.split('/')))
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """
    Execute several replace operations in a row.

    Args:
        in_str: the string in which to replace characters
        replace_set: the set of target characters to replace
        replacement: the character to replace any member of replace_set
            with

    See also :meth:`replace_nth`.

    Returns:
        The string with replacements executed.  The characters of
        replace_set are processed one after another, in order, each
        on the result of the previous replacement.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    result = in_str
    targets = iter(replace_set)
    while True:
        target = next(targets, None)
        if target is None:
            break
        result = result.replace(target, replacement)
    return result
def replace_nth(in_str: str, source: str, target: str, nth: int):
    """
    Replaces the nth occurrence of a substring within a string.

    Args:
        in_str: the string in which to run the replacement
        source: the substring to replace; it is matched literally,
            so regex metacharacters such as "." have no special meaning
        target: the replacement text
        nth: which occurrence of source to replace?  (1 = the first)

    Returns:
        in_str with only its nth occurrence of source replaced by
        target.

    Raises:
        IndexError: in_str has fewer than nth occurrences of source

    See also :meth:`replace_all`.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'
    >>> replace_nth('a.b.c', '.', '-', 2)
    'a.b-c'
    """
    # Escape source so the search agrees with the literal str.replace
    # used below; otherwise "." would locate the wrong position.
    starts = [m.start() for m in re.finditer(re.escape(source), in_str)]
    where = starts[nth - 1]
    before = in_str[:where]
    after = in_str[where:].replace(source, target, 1)
    return before + after
2777 if __name__ == '__main__':