2 # -*- coding: utf-8 -*-
4 """The MIT License (MIT)
6 Copyright (c) 2016-2020 Davide Zanotti
8 Modifications Copyright (c) 2021-2022 Scott Gasch
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
17 The above copyright notice and this permission notice shall be included in all
18 copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 This class is based on:
29 https://github.com/daveoncode/python-string-utils. See `NOTICE
30 <https://wannabe.guru.org/gitweb/?p=pyutils.git;a=blob_plain;f=NOTICE;hb=HEAD>`__
31 in the root of this module for a detailed enumeration of what work is
32 Davide's and what work was added by Scott.
37 import contextlib # type: ignore
48 from itertools import zip_longest
60 from uuid import uuid4
62 from pyutils import list_utils
64 logger = logging.getLogger(__name__)
# NOTE: inside a character class "|" is a literal pipe, not alternation, so
# each class below lists only the characters that are really allowed.
# Decimal number: optional sign, digits with optional fraction and optional
# exponent, or a bare fraction such as ".5".
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Hexadecimal integer: optional sign, "0x"/"0X" prefix, hex digits.
HEX_NUMBER_RE = re.compile(r"^([+-]?)0[xX]([0-9A-Fa-f]+)$")

# Octal integer: optional sign, "0o"/"0O" prefix, octal digits.
OCT_NUMBER_RE = re.compile(r"^([+-]?)0[Oo]([0-7]+)$")

# Binary integer: optional sign, "0b"/"0B" prefix, binary digits.
BIN_NUMBER_RE = re.compile(r"^([+-]?)0[Bb]([01]+)$")
75 r"([a-z-]+://)" # scheme
76 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
78 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
79 r"(:\d{2,})?" # port number
80 r"(/[a-z\d_%+-]*)*" # folders
81 r"(\.[a-z\d_%+-]+)*" # file extension
82 r"(\?[a-z\d_+%-=]*)?" # query string
86 URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)
88 URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)
90 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
93 r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
96 EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))
98 EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))
100 CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")
102 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
104 SNAKE_CASE_TEST_RE = re.compile(
105 r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
108 SNAKE_CASE_TEST_DASH_RE = re.compile(
109 r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
112 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
114 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
117 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
118 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
119 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
120 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
121 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
122 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
125 JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)
127 UUID_RE = re.compile(
128 r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
131 UUID_HEX_OK_RE = re.compile(
132 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
136 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
138 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
140 IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)
142 ANYWHERE_IP_V6_RE = re.compile(r"([a-z\d]{0,4}:){7}[a-z\d]{0,4}", re.IGNORECASE)
144 MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
146 ANYWHERE_MAC_ADDRESS_RE = re.compile(
147 r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
150 WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)
152 HTML_RE = re.compile(
153 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
154 re.IGNORECASE | re.MULTILINE | re.DOTALL,
157 HTML_TAG_ONLY_RE = re.compile(
158 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
159 re.IGNORECASE | re.MULTILINE | re.DOTALL,
162 SPACES_RE = re.compile(r"\s")
164 NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)
166 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
# Matches one ANSI-style escape sequence: the ESC character, "[", any run of
# non-letter parameter characters, and the single letter that terminates it.
# ESC is spelled as \x1b rather than embedded as a raw control character so
# that editors and text tooling cannot mangle the pattern.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")
229 NUM_WORDS["and"] = (1, 0)
230 for i, word in enumerate(UNIT_WORDS):
231 NUM_WORDS[word] = (1, i)
232 for i, word in enumerate(TENS_WORDS):
233 NUM_WORDS[word] = (1, i * 10)
234 for i, word in enumerate(MAGNITUDE_SCALES):
236 NUM_WORDS[word] = (100, 0)
238 NUM_WORDS[word] = (10 ** (i * 3), 0)
239 NUM_WORDS['score'] = (20, 0)
def is_none_or_empty(in_str: Optional[str]) -> bool:
    r"""Check whether a string is missing or blank.

    Args:
        in_str: the string to test

    Returns:
        True if in_str is None, empty, or made up only of whitespace;
        False otherwise.

    See also :meth:`is_string` and :meth:`is_empty_string`.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty("   \t   ")
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    # Whatever survives stripping is real content.
    return not in_str.strip()
def is_string(obj: Any) -> bool:
    """Tell whether an arbitrary object is a string.

    Args:
        obj: the object to test

    Returns:
        True if obj is an instance of str (or a subclass of it) and
        False otherwise.

    See also :meth:`is_empty_string`, :meth:`is_none_or_empty`.

    >>> is_string('test')
    True
    >>> is_string([1, 2, 3])
    False
    """
    result = isinstance(obj, str)
    return result
def is_empty_string(in_str: Any) -> bool:
    """Alias of :meth:`is_empty`.

    Args:
        in_str: the object to test

    Returns:
        Whatever :meth:`is_empty` returns for in_str.

    See also :meth:`is_none_or_empty`, :meth:`is_full_string`.
    """
    verdict = is_empty(in_str)
    return verdict
def is_empty(in_str: Any) -> bool:
    r"""Check whether an object is a blank string.

    Args:
        in_str: the object to test

    Returns:
        True if in_str is a string that is empty or contains only
        whitespace.  False for any other string and for every object
        that is not a string.

    See also :meth:`is_none_or_empty`, :meth:`is_full_string`.

    >>> is_empty(' \t\t ')
    True
    >>> is_empty([1, 2, 3])
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """Check whether an object is a string with real content.

    Args:
        in_str: the object to test

    Returns:
        True if in_str is a string that still has characters left after
        surrounding whitespace is stripped; False for blank strings and
        for every object that is not a string.

    See also :meth:`is_string`, :meth:`is_empty_string`, :meth:`is_none_or_empty`.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string('      ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """Check whether a string holds a decimal number.

    Args:
        in_str: the string to test

    Returns:
        True if the whole string matches NUMBER_RE and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
    etc...

    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]
    """
    if is_string(in_str):
        return NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """Check whether a string holds an integer in any supported notation.

    Args:
        in_str: the string to test

    Returns:
        True if the string is a number (per :meth:`is_number`) without a
        decimal point, or a prefixed hexadecimal, octal or binary integer;
        False otherwise.

    See also :meth:`is_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
    etc...

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    if is_number(in_str) and "." not in in_str:
        return True
    # Prefixed notations are tried in the same order as before.
    prefixed_checks = (
        is_hexidecimal_integer_number,
        is_octal_integer_number,
        is_binary_integer_number,
    )
    return any(check(in_str) for check in prefixed_checks)
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """Check whether a string holds a "0x"-prefixed hexadecimal integer.

    Args:
        in_str: the string to test

    Returns:
        True if the whole string matches HEX_NUMBER_RE and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_octal_integer_number`, :meth:`is_binary_integer_number`, etc...

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('0x1A3E')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number('test')
    False
    >>> is_hexidecimal_integer_number(12345)  # Not a string
    Traceback (most recent call last):
    ...
    ValueError: 12345
    """
    if is_string(in_str):
        return HEX_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """Check whether a string holds a "0o"-prefixed octal integer.

    Args:
        in_str: the string to test

    Returns:
        True if the whole string matches OCT_NUMBER_RE and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_binary_integer_number`,
    etc...

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    >>> is_octal_integer_number('test')
    False
    """
    if is_string(in_str):
        return OCT_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """Check whether a string holds a "0b"-prefixed binary integer.

    Args:
        in_str: the string to test

    Returns:
        True if the whole string matches BIN_NUMBER_RE and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
    etc...

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0B10101')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    >>> is_binary_integer_number('test')
    False
    """
    if is_string(in_str):
        return BIN_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def to_int(in_str: str) -> int:
    """Convert a string to an int, honoring 0b / 0o / 0x prefixes.

    Args:
        in_str: the string to convert

    Returns:
        The integral value of the string.

    Raises:
        ValueError: if in_str is not a string or cannot be parsed as an
            integer.

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
    :meth:`is_binary_integer_number`, etc...

    >>> to_int('1234')
    1234
    >>> to_int('0b01101')
    13
    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # Recognizer / radix pairs, tried in order; the first recognizer that
    # accepts the string decides the base.
    prefixed_formats = (
        (is_binary_integer_number, 2),
        (is_octal_integer_number, 8),
        (is_hexidecimal_integer_number, 16),
    )
    for recognizes, base in prefixed_formats:
        if recognizes(in_str):
            return int(in_str, base)
    # No prefix: plain base-10 parse, which raises ValueError on junk.
    return int(in_str)
542 def number_string_to_integer(in_str: str) -> int:
543 """Convert a string containing a written-out number into an int.
546 in_str: the string containing the long-hand written out integer number
547 in English. See examples below.
550 The integer whose value was parsed from in_str.
552 See also :meth:`integer_to_number_string`.
555 This code only handles integers; it will not work with decimals / floats.
557 >>> number_string_to_integer("one hundred fifty two")
560 >>> number_string_to_integer("ten billion two hundred million fifty four thousand three")
563 >>> number_string_to_integer("four-score and 7")
566 >>> number_string_to_integer("fifty xyzzy three")
567 Traceback (most recent call last):
569 ValueError: Unknown word: xyzzy
571 if type(in_str) == int:
575 in_str = in_str.replace('-', ' ')
576 for word in in_str.split():
577 if word not in NUM_WORDS:
578 if is_integer_number(word):
582 raise ValueError("Unknown word: " + word)
583 scale, increment = NUM_WORDS[word]
584 current = current * scale + increment
588 return result + current
591 def integer_to_number_string(num: int) -> str:
593 Opposite of :meth:`number_string_to_integer`; converts a number to a written out
594 longhand format in English.
597 num: the integer number to convert
600 The long-hand written out English form of the number. See examples below.
602 See also :meth:`number_string_to_integer`.
605 This method does not handle decimals or floats, only ints.
607 >>> integer_to_number_string(9)
610 >>> integer_to_number_string(42)
613 >>> integer_to_number_string(123219982)
614 'one hundred twenty three million two hundred nineteen thousand nine hundred eighty two'
618 return UNIT_WORDS[num]
620 ret = TENS_WORDS[num // 10]
623 ret += ' ' + UNIT_WORDS[leftover]
626 # If num > 100 go find the highest chunk and convert that, then recursively
627 # convert the rest. NUM_WORDS contains items like 'thousand' -> (1000, 0).
628 # The second item in the tuple is an increment that can be ignored; the first
629 # is the numeric "scale" of the entry. So find the greatest entry in NUM_WORDS
630 # still less than num. For 123,456 it would be thousand. Then pull out the
631 # 123, convert it, and append "thousand". Then do the rest.
633 for name, val in NUM_WORDS.items():
635 scales[name] = val[0]
636 scale = max(scales.items(), key=lambda _: _[1])
638 # scale[1] = numeric magnitude (e.g. 1000)
639 # scale[0] = name (e.g. "thousand")
640 ret = integer_to_number_string(num // scale[1]) + ' ' + scale[0]
641 leftover = num % scale[1]
643 ret += ' ' + integer_to_number_string(leftover)
def is_decimal_number(in_str: str) -> bool:
    """Check whether a string holds a number written with a decimal point.

    Args:
        in_str: the string to check

    Returns:
        True if the string is a number (per :meth:`is_number`) and contains
        a "."; False otherwise.

    See also :meth:`is_integer_number`.

    .. note::
        Integers written without a decimal point are not considered
        decimals; they return False (see example).

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    if not is_number(in_str):
        return False
    return "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    r"""Remove escape sequences from a string.

    Args:
        in_str: the string to strip of escape sequences.

    Returns:
        in_str with every match of ESCAPE_SEQUENCE_RE removed.

    See also: :mod:`pyutils.ansi`.

    .. note::
        What counts as an "escape sequence" is defined by a regular
        expression.  It catches the common ones but there may be valid
        sequences that it does not match.

    >>> strip_escape_sequences('\x1b[12;11;22mthis is a test!')
    'this is a test!'
    """
    return ESCAPE_SEQUENCE_RE.sub("", in_str)
def add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """Insert a grouping separator into a number or numeric string.

    Args:
        in_str: string or number to which to add thousands separator(s)
        separator_char: the separator character to add (defaults to comma)
        places: add a separator every N places (defaults to three)

    Returns:
        A numeric string with thousands separators added appropriately.

    Raises:
        ValueError: if in_str is neither a number nor a numeric string.

    >>> add_thousands_separator('12345678')
    '12,345,678'
    >>> add_thousands_separator(12345678)
    '12,345,678'
    >>> add_thousands_separator(12345678.99)
    '12,345,678.99'
    >>> add_thousands_separator('test')
    Traceback (most recent call last):
    ...
    ValueError: test
    """
    # Real numbers are accepted too; work on their text form.
    if isinstance(in_str, numbers.Number):
        in_str = f"{in_str}"
    if not is_number(in_str):
        raise ValueError(in_str)
    return _add_thousands_separator(
        in_str, separator_char=separator_char, places=places
    )
def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """Internal helper for :meth:`add_thousands_separator`.

    Args:
        in_str: a numeric string, already validated by the caller
        separator_char: the text inserted between digit groups
        places: the number of digits per group, counted from the right

    Returns:
        in_str with separator_char inserted between groups of the part
        before the decimal point.  A leading sign and any fractional part
        are carried over unchanged.

    Raises:
        ValueError: if places is less than one.
    """
    if places < 1:
        raise ValueError(f"places must be >= 1, got {places}")

    # Set a leading sign aside so it never ends up in a digit group
    # (otherwise "-123456" would become "-,123,456").
    sign = ""
    if in_str[:1] in ("+", "-"):
        sign, in_str = in_str[0], in_str[1:]

    integer_part, _, decimal_part = in_str.partition(".")

    # Group left to right: the first group takes the leftover characters,
    # every following group is exactly `places` long.  Joining in reading
    # order keeps a multi-character separator intact.
    head = len(integer_part) % places
    groups = [integer_part[:head]] if head else []
    groups.extend(
        integer_part[i : i + places] for i in range(head, len(integer_part), places)
    )

    ret = sign + separator_char.join(groups)
    if decimal_part:
        ret += "." + decimal_part
    return ret
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """Check whether a string is a URL, optionally restricting its scheme.

    Args:
        in_str: the string to test
        allowed_schemes: an optional list of allowed schemes (e.g.
            ['http', 'https', 'ftp']).  If passed (and not empty), only
            URLs whose scheme is exactly one of the listed schemes are
            considered valid; the comparison ignores case and a trailing
            "://" on a list entry.  Otherwise any scheme:// is accepted.

    Returns:
        True if in_str is a full string matching URL_RE (and its scheme is
        allowed, when allowed_schemes is given) and False otherwise.

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('https://mysite.com')
    True
    >>> is_url('.mysite.com')
    False
    >>> is_url('https://mysite.com', allowed_schemes=['http'])
    False
    """
    if not is_full_string(in_str):
        return False
    if URL_RE.match(in_str) is None:
        return False
    if not allowed_schemes:
        return True

    # URL_RE guarantees the string starts with "<scheme>://".  Compare the
    # whole scheme instead of a prefix so that 'http' does not silently
    # admit 'https', and fold case because URL_RE itself is case-insensitive.
    scheme = in_str.partition("://")[0].lower()
    return any(scheme == s.lower().rstrip(":/") for s in allowed_schemes)
def is_email(in_str: Any) -> bool:
    """Check whether a string looks like a valid email address.

    Args:
        in_str: the email address to check

    Returns:
        True if in_str contains a valid email (as defined by
        https://tools.ietf.org/html/rfc3696#section-3) or False
        otherwise.

    >>> is_email('@gmail.com')
    False
    """
    if not is_full_string(in_str):
        return False
    if len(in_str) > 320 or in_str.startswith("."):
        return False

    pieces = in_str.split("@")
    if len(pieces) != 2:
        # Borderline case: more than one "@" but the ones in the head part
        # are escaped or quoted.  Replace those with "a" and check again.
        if ESCAPED_AT_SIGN.search(in_str) is not None:
            return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
        return False

    head, tail = pieces

    # Size limits: head <= 64 and tail <= 255 characters.  The head may
    # neither end with a dot nor contain consecutive dots.
    if len(head) > 64 or len(tail) > 255:
        return False
    if head.endswith(".") or ".." in head:
        return False

    # Drop escaped spaces (and, for a quoted head, all spaces plus the
    # surrounding quotes) so that the regular expression can match.
    head = head.replace("\\ ", "")
    if head.startswith('"') and head.endswith('"'):
        head = head.replace(" ", "")[1:-1]

    return EMAIL_RE.match(head + "@" + tail) is not None
def suffix_string_to_number(in_str: str) -> Optional[int]:
    """Take a string like "33Gb" and convert it into a number (of bytes).

    Args:
        in_str: the string with a suffix to be interpreted and removed.

    Returns:
        An integer number of bytes or None to indicate an error (in_str is
        not a string, has no suffix known to NUM_SUFFIXES, or the part in
        front of the suffix is not a number).

    See also :meth:`number_to_suffix_string`.

    >>> suffix_string_to_number('1Mb')
    1048576
    >>> suffix_string_to_number('13.1Gb')
    14066017894
    """

    def suffix_capitalize(s: str) -> str:
        """Normalize a suffix of up to two characters to "Xx" / "X" form."""
        if len(s) == 2:
            return f"{s[0].upper()}{s[1].lower()}"
        # Zero or one character.  An empty suffix stays empty instead of
        # recursing forever, which is what happened for an empty in_str.
        return s[:1].upper()

    if not is_string(in_str):
        return None
    if is_integer_number(in_str):
        return to_int(in_str)

    # Try a two-character suffix first, then a one-character suffix, each
    # paired with the text that is left in front of it.
    candidates = (
        (in_str[-2:], in_str[:-2]),
        (in_str[-1:], in_str[:-1]),
    )
    for suffix, rest in candidates:
        multiplier = NUM_SUFFIXES.get(suffix_capitalize(suffix))
        if multiplier is None:
            continue
        if is_integer_number(rest):
            return to_int(rest) * multiplier
        if is_decimal_number(rest):
            return int(float(rest) * multiplier)
    return None
855 def number_to_suffix_string(num: int) -> Optional[str]:
856 """Take a number (of bytes) and returns a string like "43.8Gb".
859 num: an integer number of bytes
862 A string with a suffix representing num bytes concisely or
863 None to indicate an error.
865 See also: :meth:`suffix_string_to_number`.
867 >>> number_to_suffix_string(14066017894)
869 >>> number_to_suffix_string(1024 * 1024)
874 for (sfx, size) in NUM_SUFFIXES.items():
879 if suffix is not None:
880 return f"{d:.1f}{suffix}"
885 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
888 in_str: a string to check
889 card_type: if provided, contains the card type to validate
890 with. Otherwise, all known credit card number types will
893 Supported card types are the following:
903 True if in_str is a valid credit card number.
906 This code is not verifying the authenticity of the credit card (i.e.
907 not checking whether it's a real card that can be charged); rather
908 it's only checking that the number follows the "rules" for numbering
909 established by credit card issuers.
912 if not is_full_string(in_str):
915 if card_type is not None:
916 if card_type not in CREDIT_CARDS:
918 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
920 return CREDIT_CARDS[card_type].match(in_str) is not None
921 for c in CREDIT_CARDS:
922 if CREDIT_CARDS[c].match(in_str) is not None:
def is_camel_case(in_str: Any) -> bool:
    """Check whether a string is written in camel case.

    Args:
        in_str: the string to test

    Returns:
        True if the string is formatted as camel case and False otherwise.
        A string is considered camel case when:

        * it is composed only of letters ([a-zA-Z]) and optionally numbers ([0-9])
        * it contains both lowercase and uppercase letters
        * it does not start with a number

    See also :meth:`is_snake_case`, :meth:`is_slug`, and :meth:`camel_case_to_snake_case`.
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """Check whether a string is written in snake case.

    Args:
        in_str: the string to test
        separator: the character that joins the words (defaults to "_")

    Returns:
        True if the string is snake case and False otherwise.  A
        string is considered snake case when:

        * it is composed only of lowercase/uppercase letters and digits
        * it contains at least one underscore (or provided separator)
        * it does not start with a number

    See also :meth:`is_camel_case`, :meth:`is_slug`, and :meth:`snake_case_to_camel_case`.

    >>> is_snake_case('this_is_a_test')
    True
    >>> is_snake_case('___This_Is_A_Test_1_2_3___')
    True
    >>> is_snake_case('this-is-a-test')
    False
    >>> is_snake_case('this-is-a-test', separator='-')
    True
    """
    if not is_full_string(in_str):
        return False

    # The two common separators have precompiled expressions.
    if separator == "_":
        return SNAKE_CASE_TEST_RE.match(in_str) is not None
    if separator == "-":
        return SNAKE_CASE_TEST_DASH_RE.match(in_str) is not None

    # Any other separator: build the expression on demand.  It is anchored
    # at the end like the precompiled ones, so that trailing characters
    # outside the allowed set make the test fail instead of being ignored.
    re_template = (
        r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)$"
    )
    r = re.compile(re_template.format(sign=re.escape(separator)), re.IGNORECASE)
    return r.match(in_str) is not None
def is_json(in_str: Any) -> bool:
    """Check whether a string holds a JSON object or array.

    Args:
        in_str: the string to test

    Returns:
        True if in_str is wrapped in braces or brackets and parses as a
        JSON object or array; False otherwise.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    if not is_full_string(in_str):
        return False
    # Cheap shape check before paying for a real parse.
    if JSON_WRAPPER_RE.match(in_str) is None:
        return False
    try:
        parsed = json.loads(in_str)
    except (TypeError, ValueError, OverflowError):
        return False
    return isinstance(parsed, (dict, list))
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """Check whether a value is a UUID.

    Args:
        in_str: the string (or UUID object) to test
        allow_hex: when True, the dashes between the groups are optional

    Returns:
        True if the text form of in_str is a valid UUID and False otherwise.

    See also :meth:`generate_uuid`.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # Cast to str so that a UUID object itself is accepted as input.
    candidate = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(candidate) is not None
def is_ip_v4(in_str: Any) -> bool:
    """Check whether a string is an IPv4 address.

    Args:
        in_str: the string to test

    Returns:
        True if in_str contains a valid IPv4 address and False otherwise.

    See also :meth:`extract_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('nope')
    False
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str):
        return False
    if SHALLOW_IP_V4_RE.match(in_str) is None:
        return False
    # The shape is right; every field must also be in the range 0 to 255.
    return all(0 <= int(field) <= 255 for field in in_str.split("."))
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """Pull the first IPv4-looking address out of a string.

    Args:
        in_str: the string to extract an IPv4 address from.

    Returns:
        The first extracted IPv4 address from in_str or None if
        none were found or an error occurred.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V4_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip_v6(in_str: Any) -> bool:
    """Check whether a string is an IPv6 address.

    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IPv6 address and False otherwise.

    See also :meth:`is_ip_v4`, :meth:`extract_ip_v4`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """Pull the first IPv6-looking address out of a string.

    Args:
        in_str: the string from which to extract an IPv6 address.

    Returns:
        The first IPv6 address found in in_str or None if no address
        was found or an error occurred.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v4`,
    and :meth:`is_ip`.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V6_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip(in_str: Any) -> bool:
    """Check whether a string is an IP address of either family.

    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IP address (either IPv4 or
        IPv6) and False otherwise.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3.999')
    False
    """
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """Pull the first IP address (IPv4 preferred, then IPv6) out of a string.

    Args:
        in_str: the string from which to extract an IP address.

    Returns:
        The first IP address (IPv4 or IPv6) found in in_str or
        None to indicate none found or an error condition.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4_address = extract_ip_v4(in_str)
    if v4_address is not None:
        return v4_address
    # No IPv4 address; fall back to looking for an IPv6 one.
    return extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """Check whether a string is a MAC address.

    Args:
        in_str: the string to test

    Returns:
        True if in_str is a valid MAC address (six two-digit hex groups
        joined by ":" or "-") and False otherwise.

    See also :meth:`extract_mac_address`, :meth:`is_ip`, etc...

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """Pull the first MAC address out of a string.

    Args:
        in_str: the string from which to extract a MAC address.
        separator: the text to put between the hex groups of the returned
            address, whichever of ":" or "-" the input used.

    Returns:
        The first MAC address found in in_str, with its groups joined by
        separator, or None to indicate no match or an error.

    See also :meth:`is_mac_address`, :meth:`is_ip`, and :meth:`extract_ip`.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'
    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'
    >>> extract_mac_address('34-29-8F-12-0D-2F')
    '34:29:8F:12:0D:2F'
    >>> extract_mac_address('34:29:8F:12:0D:2F', separator='-')
    '34-29-8F-12-0D-2F'
    """
    if not is_full_string(in_str):
        return None

    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None

    # Strings are immutable, so the normalized value must be captured; a
    # bare str.replace() call throws its result away.  One translate() pass
    # maps both accepted input separators to the requested one.
    normalize = str.maketrans({":": separator, "-": separator})
    return m.group(0).translate(normalize)
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """Check whether a string is a slug.

    Args:
        in_str: string to test
        separator: the text that joins the words of the slug

    Returns:
        True if in_str is a slug string and False otherwise.  A slug is
        made of runs of lowercase letters and digits joined by one or more
        separators; it starts and ends with a letter or digit.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`slugify`.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False

    # Each word is followed by a mandatory separator run before the next
    # word, so there is only one way to split any input.  That keeps
    # matching linear; nested optional repeats backtrack exponentially on
    # long non-slug input.  The separator is wrapped in its own group so a
    # multi-character separator repeats as a unit.
    sep = re.escape(separator)
    rex = r"^[a-z\d]+(?:(?:" + sep + r")+[a-z\d]+)*$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """Check whether a string contains anything that looks like a tag.

    Args:
        in_str: the string to check for tags in

    Returns:
        True if the given string contains HTML/XML tags and False
        otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`strip_html`.

    .. warning::
        By design, this function matches ANY type of tag, so don't expect
        to use it as an HTML validator.  It's a quick sanity check at
        best.  See something like BeautifulSoup for a more full-featured
        solution.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
def words_count(in_str: str) -> int:
    """Count the words in a string.

    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    Raises:
        ValueError: if in_str is not a string.

    .. note::
        Only sequences of one or more letters and/or numbers count as
        "words", so a string like "! @ # % ... []" yields zero.
        Punctuation also separates words: "one,two,three.stop" counts
        as 4, not 1, even though it contains no spaces.

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # One match per word; tally them without building a list.
    return sum(1 for _ in WORDS_COUNT_RE.finditer(in_str))
def word_count(in_str: str) -> int:
    """Alias of :meth:`words_count`.

    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string, exactly as
        :meth:`words_count` reports it.

    >>> word_count('hello world')
    2
    >>> word_count('one,two,three.stop')
    4
    """
    total = words_count(in_str)
    return total
1319 def generate_uuid(omit_dashes: bool = False) -> str:
1322 omit_dashes: should we omit the dashes in the generated UUID?
1325 A generated UUID string (using `uuid.uuid4()`) with or without
1326 dashes per the omit_dashes arg.
1328 See also :meth:`is_uuid`, :meth:`generate_random_alphanumeric_string`.
1330 generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
1331 generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
def generate_random_alphanumeric_string(size: int) -> str:
    """Build a random string of ASCII letters and digits.

    Args:
        size: number of characters to generate

    Returns:
        A string of the specified size containing random characters
        (uppercase/lowercase ascii letters and digits).

    Raises:
        ValueError: if size is less than one.

    See also :meth:`asciify`, :meth:`generate_uuid`.
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    # One random.choice() call per character keeps the sequence drawn from
    # the random module unchanged for a given seed.
    return "".join([random.choice(alphabet) for _ in range(size)])
def reverse(in_str: str) -> str:
    """Reverse a string.

    Args:
        in_str: the string to reverse

    Returns:
        The reversed (character by character) string.

    Raises:
        ValueError: if in_str is not a string.

    >>> reverse('test')
    'tset'
    """
    if is_string(in_str):
        return "".join(reversed(in_str))
    raise ValueError(in_str)
def camel_case_to_snake_case(in_str, *, separator="_"):
    """Convert a camel case string into a snake case one.

    Args:
        in_str: the camel case string to convert
        separator: the text inserted between words (defaults to "_")

    Returns:
        A snake case string equivalent to the camel case input, or the
        original string if it is not a valid camel case string.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str

    def with_separator(match):
        # Keep the matched word boundary text and put the separator after it.
        return match.group(1) + separator

    separated = CAMEL_CASE_REPLACE_RE.sub(with_separator, in_str)
    return separated.lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """Convert a snake case string into a camel case one.

    Args:
        in_str: the snake case string to convert
        upper_case_first: should the first word be capitalized too?
        separator: the text that joins the words of in_str (defaults to "_")

    Returns:
        A camel case string that is equivalent to the snake case string
        provided, or the original string back again if it is not valid
        snake case.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str

    # Title-case every non-blank piece between separators.
    words = []
    for piece in in_str.split(separator):
        if is_full_string(piece):
            words.append(piece.title())
    if not upper_case_first:
        words[0] = words[0].lower()
    return from_char_list(words)
1430 def to_char_list(in_str: str) -> List[str]:
1433 in_str: the string to split into a char list
1436 A list of strings of length one each.
1438 See also :meth:`from_char_list`.
1440 >>> to_char_list('test')
1441 ['t', 'e', 's', 't']
1443 if not is_string(in_str):
def from_char_list(in_list: List[str]) -> str:
    """Glue a list of strings together with nothing in between.

    Args:
        in_list: A list of characters to convert into a string.

    Returns:
        The string resulting from gluing the characters in in_list
        together.

    See also :meth:`to_char_list`.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
def shuffle(in_str: str) -> Optional[str]:
    """Shuffle the characters of a string randomly.

    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing the same characters as the given one but
        in a randomized order.  In rare cases this can be identical to
        the original string since no check is done.  Returns None if
        in_str is not a string.
    """
    if is_string(in_str):
        letters = to_char_list(in_str)
        random.shuffle(letters)
        return from_char_list(letters)
    return None
def scramble(in_str: str) -> Optional[str]:
    """Randomly reorder the characters of a string; an alias of :meth:`shuffle`.

    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        Whatever :meth:`shuffle` returns for in_str: a new string
        containing same chars of the given one but in a randomized
        order, or None to indicate error conditions.

    See also :mod:`pyutils.unscrambler`.

    >>> len(scramble('awesome'))
    7
    """
    scrambled = shuffle(in_str)
    return scrambled
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """Remove HTML tags from a string.

    Args:
        in_str: the string to strip tags from
        keep_tag_content: should we keep the inner contents of tags?

    Returns:
        A string with all HTML tags removed (optionally with tag contents
        preserved).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`contains_html`.

    .. note::
        This method uses simple regular expressions to strip tags and is
        not a full fledged HTML parser by any means.  Consider using
        something like BeautifulSoup if your needs are more than this
        simple code can fulfill.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Pick the pattern that matches only the tags when content must survive.
    if keep_tag_content:
        tag_pattern = HTML_TAG_ONLY_RE
    else:
        tag_pattern = HTML_RE
    return tag_pattern.sub("", in_str)
def asciify(in_str: str) -> str:
    """Translate a string into an ASCII-only approximation of itself.

    Args:
        in_str: the string to asciify.

    Returns:
        An output string roughly equivalent to the original string
        whose contents are ascii-only.  This is accomplished by
        translating all non-ascii chars into their closest possible
        ASCII representation (eg: ó -> o, Ë -> E, ç -> c...).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`to_ascii`, :meth:`generate_random_alphanumeric_string`.

    .. warning::
        Some chars may be lost if impossible to translate.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # NFKD decomposes accented characters into a base character plus
    # combining marks, which lets the ASCII encoder keep the base.
    decomposed = unicodedata.normalize("NFKD", in_str)

    # Anything that still has no ASCII form is silently dropped by
    # "ignore"; the resulting bytes are pure ASCII, so decoding is safe.
    return decomposed.encode("ascii", "ignore").decode("utf-8")
def slugify(in_str: str, *, separator: str = "-") -> str:
    """Convert a string into a "slug" suitable for URLs or filenames.

    Args:
        in_str: the string to slugify
        separator: the string to use between words during slugification
            (default is a dash)

    Returns:
        The converted string.  The returned string has the following properties:

        * it has no spaces
        * all letters are in lower case
        * all punctuation signs and non alphanumeric chars are removed
        * words are divided using provided separator
        * all chars are encoded as ascii (by using :meth:`asciify`)
        * is safe for URL

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_slug` and :meth:`asciify`.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # replace spaces with join sign; a callable replacement inserts the
    # separator literally, so backslashes in it are not treated as
    # regex template escapes.
    out = SPACES_RE.sub(lambda _match: separator, out)

    # normalize joins (remove duplicates).  The escaped separator is
    # grouped so that the "+" applies to the whole separator and not
    # just to its last character.
    if separator:
        out = re.sub(
            r"(?:" + re.escape(separator) + r")+", lambda _match: separator, out
        )
    return asciify(out)
def to_bool(in_str: str) -> bool:
    """Interpret a string as a boolean.

    Args:
        in_str: the string to convert to boolean

    Returns:
        A boolean equivalent of the original string based on its contents.
        All conversion is case insensitive.  A positive boolean (True) is
        returned if the string value is any of the following:

        * "true"
        * "t"
        * "1"
        * "yes"
        * "y"
        * "on"

        Otherwise False is returned.

    Raises:
        ValueError: if in_str is not a string.

    See also :mod:`pyutils.argparse_utils`.

    >>> to_bool('True')
    True
    >>> to_bool('on')
    True
    >>> to_bool('no')
    False
    >>> to_bool('huh?')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    affirmative = ("true", "1", "yes", "y", "t", "on")
    lowered = in_str.lower()
    return lowered in affirmative
1652 def to_date(in_str: str) -> Optional[datetime.date]:
1655 in_str: the string to convert into a date
1658 The datetime.date the string contained or None to indicate
1659 an error. This parser is relatively clever; see
1660 :class:`datetimez.dateparse_utils` docs for details.
1662 See also: :mod:`pyutils.datetimez.dateparse_utils`, :meth:`extract_date`,
1663 :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.
1665 >>> to_date('9/11/2001')
1666 datetime.date(2001, 9, 11)
1667 >>> to_date('xyzzy')
1669 import pyutils.datetimez.dateparse_utils as du
1672 d = du.DateParser() # type: ignore
1675 except du.ParseException: # type: ignore
1676 msg = f'Unable to parse date {in_str}.'
1681 def extract_date(in_str: Any) -> Optional[datetime.datetime]:
1682 """Finds and extracts a date from the string, if possible.
1685 in_str: the string to extract a date from
1688 a datetime if date was found, otherwise None
1690 See also: :mod:`pyutils.datetimez.dateparse_utils`, :meth:`to_date`,
1691 :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.
1693 >>> extract_date("filename.txt dec 13, 2022")
1694 datetime.datetime(2022, 12, 13, 0, 0)
1696 >>> extract_date("Dear Santa, please get me a pony.")
1701 import pyutils.datetimez.dateparse_utils as du
1703 d = du.DateParser() # type: ignore
1704 chunks = in_str.split()
1705 for ngram in itertools.chain(
1706 list_utils.ngrams(chunks, 5),
1707 list_utils.ngrams(chunks, 4),
1708 list_utils.ngrams(chunks, 3),
1709 list_utils.ngrams(chunks, 2),
1712 expr = " ".join(ngram)
1713 logger.debug(f"Trying {expr}")
1715 return d.get_datetime()
1716 except du.ParseException: # type: ignore
1721 def is_valid_date(in_str: str) -> bool:
1724 in_str: the string to check
1727 True if the string represents a valid date that we can recognize
1728 and False otherwise. This parser is relatively clever; see
1729 :class:`datetimez.dateparse_utils` docs for details.
1731 See also: :mod:`pyutils.datetimez.dateparse_utils`, :meth:`to_date`,
1732 :meth:`extract_date`, :meth:`to_datetime`, :meth:`valid_datetime`.
1734 >>> is_valid_date('1/2/2022')
1736 >>> is_valid_date('christmas')
1738 >>> is_valid_date('next wednesday')
1740 >>> is_valid_date('xyzzy')
1743 import pyutils.datetimez.dateparse_utils as dp
1746 d = dp.DateParser() # type: ignore
1749 except dp.ParseException: # type: ignore
1750 msg = f'Unable to parse date {in_str}.'
1755 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1758 in_str: string to parse into a datetime
1761 A python datetime parsed from in_str or None to indicate
1762 an error. This parser is relatively clever; see
1763 :class:`datetimez.dateparse_utils` docs for details.
1765 See also: :mod:`pyutils.datetimez.dateparse_utils`, :meth:`to_date`,
1766 :meth:`extract_date`, :meth:`valid_datetime`.
1768 >>> to_datetime('7/20/1969 02:56 GMT')
1769 datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
1771 import pyutils.datetimez.dateparse_utils as dp
1774 d = dp.DateParser() # type: ignore
1775 dt = d.parse(in_str)
1776 if isinstance(dt, datetime.datetime):
1779 msg = f'Unable to parse datetime {in_str}.'
1784 def valid_datetime(in_str: str) -> bool:
1787 in_str: the string to check
1790 True if in_str contains a valid datetime and False otherwise.
1791 This parser is relatively clever; see
1792 :class:`datetimez.dateparse_utils` docs for details.
1794 >>> valid_datetime('next wednesday at noon')
1796 >>> valid_datetime('3 weeks ago at midnight')
1798 >>> valid_datetime('next easter at 5:00 am')
1800 >>> valid_datetime('sometime soon')
1803 _ = to_datetime(in_str)
1806 msg = f'Unable to parse datetime {in_str}.'
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """Collapse runs of a repeated character (or substring) into one copy.

    Args:
        in_str: the string to squeeze
        character_to_squeeze: the character (or substring) to remove runs
            of more than one in a row (default = space)

    Returns:
        A "squeezed string" where every run of one or more consecutive
        copies of character_to_squeeze is replaced by a single copy.

    >>> squeeze('a    b  c')
    'a b c'

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'
    """
    if not character_to_squeeze:
        # Nothing to squeeze; an empty pattern would only match empty strings.
        return in_str

    run_of_copies = r'(?:' + re.escape(character_to_squeeze) + r')+'

    # A callable replacement inserts character_to_squeeze literally.  Passing
    # it as a plain string would make re treat it as a replacement template,
    # where a backslash is an escape and can raise re.error.
    return re.sub(run_of_copies, lambda _match: character_to_squeeze, in_str)
def dedent(in_str: str) -> Optional[str]:
    """Strip the leading margin from every line of a string.

    Args:
        in_str: the string to dedent

    Returns:
        A string with tab indentation removed or None on error
        (in_str is not a string).

    See also :meth:`indent`.

    >>> dedent('\t\ttest\\n\t\ting')
    'test\\ning'
    """
    if not is_string(in_str):
        return None
    newline = '\n'
    # Each line is cleaned independently and the lines are re-joined
    # with the same separator they were split on.
    return newline.join(MARGIN_RE.sub('', row) for row in in_str.split(newline))
def indent(in_str: str, amount: int) -> str:
    """Indent every line of a string by a fixed number of spaces.

    Args:
        in_str: the string to indent
        amount: count of spaces to indent each line by

    Returns:
        An indented string created by prepending amount spaces to
        each newline-separated line of in_str.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`dedent`.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    newline = '\n'
    padding = " " * amount
    return newline.join(padding + row for row in in_str.split(newline))
1876 def _sprintf(*args, **kwargs) -> str:
1877 """Internal helper."""
1880 sep = kwargs.pop("sep", None)
1882 if not isinstance(sep, str):
1883 raise TypeError("sep must be None or a string")
1885 end = kwargs.pop("end", None)
1887 if not isinstance(end, str):
1888 raise TypeError("end must be None or a string")
1891 raise TypeError("invalid keyword arguments to sprint()")
1897 for i, arg in enumerate(args):
1900 if isinstance(arg, str):
def strip_ansi_sequences(in_str: str) -> str:
    """Remove ANSI escape (CSI) sequences from a string.

    Args:
        in_str: the string to strip

    Returns:
        in_str with recognized ANSI escape sequences removed.

    See also :mod:`pyutils.ansi`.

    .. warning::
        This method works by using a regular expression that recognizes
        ``ESC [`` followed by digits / ``;`` / ``+`` and one terminating
        ASCII letter (upper or lower case).  Other escape sequences are
        not recognized; caveat emptor.

    >>> s = '\\x1b[38;5;21mblue!\\x1b[m'
    >>> len(s)
    18
    >>> len(strip_ansi_sequences(s))
    5
    >>> strip_ansi_sequences(s)
    'blue!'
    >>> strip_ansi_sequences('\\x1b[2J\\x1b[Hcleared')
    'cleared'
    """
    # The final byte may be an upper case letter too (e.g. "J" to clear the
    # screen, "H" to move the cursor), not only lower case ones like "m".
    return re.sub(r'\x1b\[[\d+;]*[a-zA-Z]', '', in_str)
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that captures outputs to stdout to a buffer
    without printing them.  Entering the context yields a callable
    that returns everything captured so far.

    >>> with SprintfStdout() as buf:
    ...     print("test")
    ...     print("1, 2, 3")
    ...
    >>> print(buf(), end='')
    test
    1, 2, 3
    """

    def __init__(self) -> None:
        # In-memory buffer that receives everything written to stdout.
        self.destination = io.StringIO()
        # The active redirect; only assigned once the context is entered.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        """Start redirecting stdout and return a getter for the captured text."""
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *args) -> Literal[False]:
        """Stop redirecting stdout; exceptions are never suppressed."""
        self.recorder.__exit__(*args)
        # Rewind so the buffer can be read from the beginning afterwards.
        self.destination.seek(0)
        return False
def capitalize_first_letter(in_str: str) -> str:
    """Upper-case the first character of a string, leaving the rest alone.

    Args:
        in_str: the string to capitalize

    Returns:
        in_str with the first character capitalized.  An empty string
        is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    # Slicing (rather than indexing) keeps the empty string from raising
    # IndexError.
    return in_str[:1].upper() + in_str[1:]
1981 def it_they(n: int) -> str:
1984 n: how many of them are there?
1987 'it' if n is one or 'they' otherwize.
1989 See also :meth:`is_are`, :meth:`pluralize`, :meth:`make_contractions`,
1994 n = num_files_saved_to_tmp()
1995 print(f'Saved file{pluralize(n)} successfully.')
1996 print(f'{it_they(n)} {is_are(n)} located in /tmp.')
2008 def is_are(n: int) -> str:
2011 n: how many of them are there?
2014 'is' if n is one or 'are' otherwize.
2016 See also :meth:`it_they`, :meth:`pluralize`, :meth:`make_contractions`,
2021 n = num_files_saved_to_tmp()
2022 print(f'Saved file{pluralize(n)} successfully.')
2023 print(f'{it_they(n)} {is_are(n)} located in /tmp.')
2036 def pluralize(n: int) -> str:
2039 n: how many of them are there?
2042 's' if n is greater than one otherwize ''.
2044 See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`,
2049 n = num_files_saved_to_tmp()
2050 print(f'Saved file{pluralize(n)} successfully.')
2051 print(f'{it_they(n)} {is_are(n)} located in /tmp.')
2056 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
2059 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
2067 def make_contractions(txt: str) -> str:
2068 """This code glues words in txt together to form (English)
2072 txt: the input text to be contractionized.
2075 Output text identical to original input except for any
2076 recognized contractions are formed.
2078 See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`.
2081 The order in which we create contractions is defined by the
2082 implementation and what I thought made more sense when writing
2085 >>> make_contractions('It is nice today.')
2088 >>> make_contractions('I can not even...')
2091 >>> make_contractions('She could not see!')
2094 >>> make_contractions('But she will not go.')
2097 >>> make_contractions('Verily, I shall not.')
2100 >>> make_contractions('No you cannot.')
2103 >>> make_contractions('I said you can not go.')
2104 "I said you can't go."
2140 ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
2144 # Special cases: can't, shan't and won't.
2145 txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
2147 r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE
2150 r'\b(w)ill\s*(n)(o)(t)\b',
2154 flags=re.IGNORECASE,
2157 for first_list, second_list in first_second:
2158 for first in first_list:
2159 for second in second_list:
2160 # Disallow there're/where're. They're valid English
2162 if (first in ('there', 'where')) and second == 'a(re)':
2165 pattern = fr'\b({first})\s+{second}\b'
2166 if second == '(n)o(t)':
2167 replacement = r"\1\2'\3"
2169 replacement = r"\1'\2"
2170 txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)
2175 def thify(n: int) -> str:
2178 n: how many of them are there?
2181 The proper cardinal suffix for a number.
2183 See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`.
2192 print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')
2202 assert is_integer_number(digit)
2214 def ngrams(txt: str, n: int):
2217 txt: the string to create ngrams using
2218 n: how many words per ngram created?
2221 Generates the ngrams from the input string.
2223 See also :meth:`ngrams_presplit`, :meth:`bigrams`, :meth:`trigrams`.
2225 >>> [x for x in ngrams('This is a test', 2)]
2226 ['This is', 'is a', 'a test']
2229 for ngram in ngrams_presplit(words, n):
def ngrams_presplit(words: Sequence[str], n: int):
    """
    Same as :meth:`ngrams` but with the string pre-split.

    Args:
        words: the already-split words to create ngrams from
        n: how many words per ngram created?

    Returns:
        Whatever :meth:`list_utils.ngrams` produces for words and n.

    See also :meth:`ngrams`, :meth:`bigrams`, :meth:`trigrams`.
    """
    presplit_ngrams = list_utils.ngrams(words, n)
    return presplit_ngrams
def bigrams(txt: str):
    """Generates the bigrams (n=2) of the given string.

    Args:
        txt: the string to create bigrams using

    Returns:
        The result of :meth:`ngrams` called with n=2.

    See also :meth:`ngrams`, :meth:`trigrams`.

    >>> [x for x in bigrams('this is a test')]
    ['this is', 'is a', 'a test']
    """
    return ngrams(txt, n=2)
def trigrams(txt: str):
    """Generates the trigrams (n=3) of the given string.

    Args:
        txt: the string to create trigrams using

    Returns:
        The result of :meth:`ngrams` called with n=3.

    See also :meth:`ngrams`, :meth:`bigrams`.
    """
    return ngrams(txt, n=3)
2264 def shuffle_columns_into_list(
2265 input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim=''
2267 """Helper to shuffle / parse columnar data and return the results as a
2271 input_lines: A sequence of strings that represents text that
2272 has been broken into columns by the caller
2273 column_specs: an iterable collection of numeric sequences that
2274 indicate one or more column numbers to copy to form the Nth
2275 position in the output list. See example below.
2276 delim: for column_specs that indicate we should copy more than
2277 one column from the input into this position, use delim to
2278 separate source data. Defaults to ''.
2281 A list of string created by following the instructions set forth
2284 See also :meth:`shuffle_columns_into_dict`.
2286 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
2287 >>> shuffle_columns_into_list(
2289 ... [ [8], [2, 3], [5, 6, 7] ],
2292 ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']
2296 # Column specs map input lines' columns into outputs.
2298 for spec in column_specs:
2301 hunk = hunk + delim + input_lines[n]
2302 hunk = hunk.strip(delim)
2307 def shuffle_columns_into_dict(
2308 input_lines: Sequence[str],
2309 column_specs: Iterable[Tuple[str, Iterable[int]]],
2311 ) -> Dict[str, str]:
2312 """Helper to shuffle / parse columnar data and return the results
2316 input_lines: a sequence of strings that represents text that
2317 has been broken into columns by the caller
2318 column_specs: instructions for what dictionary keys to apply
2319 to individual or compound input column data. See example
2321 delim: when forming compound output data by gluing more than
2322 one input column together, use this character to separate
2323 the source data. Defaults to ''.
2326 A dict formed by applying the column_specs instructions.
2328 See also :meth:`shuffle_columns_into_list`, :meth:`interpolate_using_dict`.
2330 >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
2331 >>> shuffle_columns_into_dict(
2333 ... [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
2336 {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}
2340 # Column specs map input lines' columns into outputs.
2341 # "key", [col1, col2...]
2342 for spec in column_specs:
2345 hunk = hunk + delim + input_lines[n]
2346 hunk = hunk.strip(delim)
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """
    Interpolate a string with data from a dict.

    Args:
        txt: the mad libs template
        values: what you and your kids chose for each category.

    Returns:
        txt with each ``{name}`` placeholder filled in from values
        (via ``str.format``).

    See also :meth:`shuffle_columns_into_list`, :meth:`shuffle_columns_into_dict`.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    filled_in = txt.format(**values)
    # end='' keeps _sprintf from appending anything after the text.
    return _sprintf(filled_in, end='')
def to_ascii(txt: str) -> bytes:
    """Encode text as an ASCII byte string.

    Args:
        txt: the input data to encode; bytes are also accepted and are
            returned as-is

    Returns:
        txt encoded as an ASCII byte string.

    Raises:
        TypeError: if txt is neither a str nor bytes.
        UnicodeEncodeError: if txt is a str containing non-ASCII characters.

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`to_bytes`,
    :meth:`generate_random_alphanumeric_string`, :meth:`asciify`.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(txt, bytes):
        return txt
    if isinstance(txt, str):
        return txt.encode('ascii')
    # TypeError is a subclass of Exception, so callers that caught the
    # generic Exception raised previously keep working.
    raise TypeError('to_ascii works with strings and bytes')
def to_base64(txt: str, *, encoding='utf-8', errors='surrogatepass') -> bytes:
    """Encode a string as base64.

    Args:
        txt: the input data to encode
        encoding: the text encoding used to turn txt into bytes first
        errors: the error handling scheme passed to ``str.encode``

    Returns:
        txt encoded with a 64-character alphabet.  Similar to and compatible
        with uuencode/uudecode.

    See also :meth:`is_base64`, :meth:`to_ascii`, :meth:`to_bitstring`,
    :meth:`from_base64`.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
def is_base64(txt: str) -> bool:
    """Check whether a string (or bytes) looks like base64 encoded data.

    Args:
        txt: the string to check

    Returns:
        True if txt is a valid base64 encoded string.  This assumes
        txt was encoded with Python's standard base64 alphabet (which
        is the same as what uuencode/uudecode uses).  Surrounding
        whitespace is ignored and up to two trailing ``=`` padding
        characters are accepted.

    See also :meth:`to_base64`, :meth:`from_base64`.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('aGVsbG8=')    # So is padding.
    True
    """
    a = string.ascii_uppercase + string.ascii_lowercase + string.digits + '+/'
    alphabet = set(a.encode('ascii'))

    data = to_ascii(txt.strip())

    # Standard base64 output ends in zero, one or two "=" padding
    # characters; they are legal only at the very end of the data.
    body = data.rstrip(b'=')
    if len(data) - len(body) > 2:
        return False

    # Iterating bytes yields ints, which is what alphabet contains.
    return all(byte in alphabet for byte in body)
def from_base64(b64: bytes, encoding='utf-8', errors='surrogatepass') -> str:
    """Decode base64 data back into a string.

    Args:
        b64: bytestring of base64 encoded data to decode / convert.
        encoding: the text encoding used to turn the decoded bytes into a str
        errors: the error handling scheme passed to ``bytes.decode``

    Returns:
        The decoded form of b64 as a normal python string.  Similar to
        and compatible with uuencode / uudecode.

    See also :meth:`to_base64`, :meth:`is_base64`.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    decoded = base64.decodebytes(b64)
    return decoded.decode(encoding, errors)
def chunk(txt: str, chunk_size: int):
    """Generate consecutive fixed-size pieces of a string.

    Args:
        txt: a string to be chunked into evenly spaced pieces.
        chunk_size: the size of each chunk to make

    Returns:
        The original string chunked into evenly spaced pieces.  This is
        a generator.  If len(txt) is not a multiple of chunk_size a
        warning is logged and issued via :mod:`warnings`, and the final
        piece is shorter than chunk_size.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    if len(txt) % chunk_size != 0:
        msg = (
            f"String to chunk's length ({len(txt)}) is not an even multiple "
            f"of chunk_size ({chunk_size})"
        )
        logger.warning(msg)
        # stacklevel=2 attributes the warning to the code consuming the
        # generator rather than to this function.
        warnings.warn(msg, stacklevel=2)
    for x in range(0, len(txt), chunk_size):
        yield txt[x : x + chunk_size]
def to_bitstring(txt: str, *, delimiter='') -> str:
    """Render a string (or bytes) as a string of '0' and '1' characters.

    Args:
        txt: the string to convert into a bitstring
        delimiter: character to insert between adjacent bytes.  Note that
            only bitstrings with delimiter='' are interpretable by
            :meth:`from_bitstring`.

    Returns:
        txt converted to ascii/binary and then chopped into bytes.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'
    """
    encoded = to_ascii(txt)

    # bin() prefixes its result with "0b"; slice that off.
    digits = bin(int.from_bytes(encoded, 'big'))[2:]

    # Left-pad with zeros up to the next whole number of 8-bit groups.
    padded_width = 8 * ((len(digits) + 7) // 8)
    return delimiter.join(chunk(digits.zfill(padded_width), 8))
def is_bitstring(txt: str) -> bool:
    """Check whether a string is a bitstring.

    Args:
        txt: the string to check

    Returns:
        True if txt is a recognized bitstring and False otherwise.
        Note that if delimiter is non empty this code will not
        recognize the bitstring.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`to_bitstring`,
    :meth:`chunk`.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    # Reuse the binary-literal check by giving txt a "0b" prefix.
    candidate = f'0b{txt}'
    return is_binary_integer_number(candidate)
def from_bitstring(bits: str, encoding='utf-8', errors='surrogatepass') -> str:
    """Convert a bitstring back into a regular string.

    Args:
        bits: the bitstring to convert back into a python string
        encoding: the encoding to use
        errors: the error handling scheme passed to ``bytes.decode``

    Returns:
        The regular python string represented by bits.  Note that this
        code does not work with to_bitstring when delimiter is non-empty.

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'
    """
    value = int(bits, 2)

    # Smallest number of whole bytes that can hold the value.
    byte_count = (value.bit_length() + 7) // 8
    text = value.to_bytes(byte_count, 'big').decode(encoding, errors)

    # A value of zero needs no bytes and decodes to ''; report it as NUL.
    return text if text else '\0'
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a tuple that sorts numerically.

    Args:
        txt: an IP address to chunk up for sorting purposes

    Returns:
        A tuple of IP components arranged such that the sorting of
        IP addresses using a normal comparator will do something sane
        with them.  None is returned (and a warning is logged) when
        txt is not an IPv4 address.

    See also :meth:`is_ip_v4`.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if not is_ip_v4(txt):
        # Report through the module logger instead of writing to stdout.
        logger.warning("not IP: %s", txt)
        return None
    return tuple(int(x) for x in txt.split('.'))
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Turn a slash-separated path into a tuple of its components.

    Args:
        volume: the string to chunk up for sorting purposes

    Returns:
        A tuple of volume's components such that the sorting of
        volumes using a normal comparator will do something sane
        with them.  Empty components (from leading, trailing or
        doubled slashes) are omitted.

    See also :mod:`pyutils.files.file_utils`.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    # filter(None, ...) discards the empty strings produced by split.
    return tuple(filter(None, volume.split('/')))
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """
    Execute several replace operations in a row.

    Args:
        in_str: the string in which to replace characters
        replace_set: the set of target characters to replace
        replacement: the character to replace any member of replace_set
            with

    See also :meth:`replace_nth`.

    Returns:
        The string with replacements executed.  The members of
        replace_set are processed one at a time, in order.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
def replace_nth(in_str: str, source: str, target: str, nth: int):
    """
    Replaces the nth occurrence of a substring within a string.

    Args:
        in_str: the string in which to run the replacement
        source: the substring to replace (matched literally)
        target: the replacement text
        nth: which occurrence of source to replace?  (1 = the first)

    Returns:
        in_str with the nth non-overlapping occurrence of source
        replaced by target.

    Raises:
        IndexError: if in_str contains fewer than nth occurrences of source.

    See also :meth:`replace_all`.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'
    >>> replace_nth('a.b.c.d', '.', '-', 2)
    'a.b-c.d'
    """
    # source is a plain substring, so escape it before handing it to the
    # regex engine; otherwise characters like "." or "+" would be treated
    # as pattern syntax.
    where = [m.start() for m in re.finditer(re.escape(source), in_str)][nth - 1]
    return in_str[:where] + target + in_str[where + len(source) :]
2641 if __name__ == '__main__':