2 # -*- coding: utf-8 -*-
4 """The MIT License (MIT)
6 Copyright (c) 2016-2020 Davide Zanotti
8 Modifications Copyright (c) 2021-2022 Scott Gasch
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
17 The above copyright notice and this permission notice shall be included in all
18 copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 This class is based on:
29 https://github.com/daveoncode/python-string-utils. See `NOTICE
30 <https://wannabe.guru.org/gitweb/?p=pyutils.git;a=blob_plain;f=NOTICE;hb=HEAD>`__
31 in the root of this module for a detailed enumeration of what work is
32 Davide's and what work was added by Scott.
37 import contextlib # type: ignore
48 from itertools import zip_longest
61 from uuid import uuid4
63 from pyutils import list_utils
65 logger = logging.getLogger(__name__)
# Signed decimal number: digits with an optional fraction and an optional
# unsigned exponent, or a bare fraction such as ".5".
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Signed integer literals with an explicit 0x / 0o / 0b prefix.  Group 1 is
# the sign, group 2 the digits.  A "|" inside [...] is a literal pipe rather
# than alternation, so each character class simply lists its members.
HEX_NUMBER_RE = re.compile(r"^([+-]?)0[xX]([0-9A-Fa-f]+)$")

OCT_NUMBER_RE = re.compile(r"^([+-]?)0[Oo]([0-7]+)$")

BIN_NUMBER_RE = re.compile(r"^([+-]?)0[Bb]([01]+)$")
76 r"([a-z-]+://)" # scheme
77 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
79 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
80 r"(:\d{2,})?" # port number
81 r"(/[a-z\d_%+-]*)*" # folders
82 r"(\.[a-z\d_%+-]+)*" # file extension
83 r"(\?[a-z\d_+%-=]*)?" # query string
# Matches a string that is, in its entirety, a single URL (case-insensitive).
URL_RE = re.compile(rf"^{URLS_RAW_STRING}$", re.IGNORECASE)

# Unanchored variant that captures URLs embedded anywhere in a larger text.
URLS_RE = re.compile(rf"({URLS_RAW_STRING})", re.IGNORECASE)

# Matches a run of "@" that has a double quote somewhere after it, or a
# backslash-escaped "@".  is_email() uses it to neutralise extra "@" signs.
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
94 r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
# Whole-string email match.  No flags are passed, so it is case-sensitive.
EMAIL_RE = re.compile(rf"^{EMAILS_RAW_STRING}$")

# Unanchored variant that captures email addresses embedded in other text.
EMAILS_RE = re.compile(rf"({EMAILS_RAW_STRING})")

# Whole-string test: ASCII letters/digits only, not starting with a digit,
# with at least one lower->upper or upper->lower transition.
CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")

# Captures a lowercase letter or a run of uppercase letters that is directly
# followed by an uppercase letter, i.e. the spot where a separator belongs.
CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
105 SNAKE_CASE_TEST_RE = re.compile(
106 r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
109 SNAKE_CASE_TEST_DASH_RE = re.compile(
110 r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
# Captures an underscore (group 1) and the lowercase letter or digit that
# immediately follows it (group 2).
SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")

# Same as above for dash-separated strings.
SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
118 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
119 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
120 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
121 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
122 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
123 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
# Cheap structural pre-check used by is_json(): the text, ignoring surrounding
# whitespace, opens with "[" or "{" and closes with "]" or "}".  The bracket
# kinds are not required to match each other.
JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)
128 UUID_RE = re.compile(
129 r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
132 UUID_HEX_OK_RE = re.compile(
133 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
# Whole-string shape check: four dot-separated groups of 1-3 digits.  The
# 0-255 range of each group is NOT checked here; is_ip_v4() does that.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

# Unanchored variant used to find an IPv4-looking token inside other text.
ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
# Eight colon-separated groups of up to four hexadecimal digits.  Only hex
# digits (0-9, a-f; case-insensitive) are valid in an IPv6 group, so the
# character class is limited to those rather than the whole alphabet.
IP_V6_RE = re.compile(r"^([a-f\d]{0,4}:){7}[a-f\d]{0,4}$", re.IGNORECASE)

# Unanchored variant used to find an IPv6 address inside other text.
ANYWHERE_IP_V6_RE = re.compile(r"([a-f\d]{0,4}:){7}[a-f\d]{0,4}", re.IGNORECASE)
# Whole-string MAC address: six two-hex-digit groups separated by ":" or "-"
# (case-insensitive).
MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
147 ANYWHERE_MAC_ADDRESS_RE = re.compile(
148 r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
# One "word": a run of word characters other than underscore, together with
# any non-word characters around it.  words_count() counts the matches.
WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)
153 HTML_RE = re.compile(
154 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
155 re.IGNORECASE | re.MULTILINE | re.DOTALL,
158 HTML_TAG_ONLY_RE = re.compile(
159 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
160 re.IGNORECASE | re.MULTILINE | re.DOTALL,
# A single whitespace character.
SPACES_RE = re.compile(r"\s")

# A run of characters that are not word characters, or a run of underscores.
NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)

# Leading whitespace other than CR/LF.  No MULTILINE flag is passed, so "^"
# anchors at the start of the string only.
MARGIN_RE = re.compile(r"^[^\S\r\n]+")

# ESC "[" followed by any non-letters and terminated by one ASCII letter.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1B\[[^A-Za-z]*[A-Za-z]")
230 NUM_WORDS["and"] = (1, 0)
231 for i, word in enumerate(UNIT_WORDS):
232 NUM_WORDS[word] = (1, i)
233 for i, word in enumerate(TENS_WORDS):
234 NUM_WORDS[word] = (1, i * 10)
235 for i, word in enumerate(MAGNITUDE_SCALES):
237 NUM_WORDS[word] = (100, 0)
239 NUM_WORDS[word] = (10 ** (i * 3), 0)
240 NUM_WORDS['score'] = (20, 0)
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """Tell whether a string is missing or blank.

    Args:
        in_str: the string to test

    Returns:
        True if the input is None or contains nothing but whitespace,
        False otherwise.

    See also :meth:`is_string` and :meth:`is_empty_string`.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    stripped = in_str.strip()
    return len(stripped) == 0
def is_string(in_str: Any) -> bool:
    """Tell whether an object is a ``str`` instance.

    Args:
        in_str: the object to test

    Returns:
        True if the object is a string and False otherwise.

    See also :meth:`is_empty_string`, :meth:`is_none_or_empty`.

    >>> is_string('test')
    True
    >>> is_string(123)
    False
    >>> is_string([1, 2, 3])
    False
    """
    return isinstance(in_str, str)
def is_empty_string(in_str: Any) -> bool:
    """Alias of :meth:`is_empty`.

    Args:
        in_str: the string to test

    Returns:
        True if the string is empty and False otherwise.

    See also :meth:`is_none_or_empty`, :meth:`is_full_string`.
    """
    result = is_empty(in_str)
    return result
def is_empty(in_str: Any) -> bool:
    """Tell whether an object is a string with no visible content.

    Args:
        in_str: the string to test

    Returns:
        True if in_str is a string that is empty or whitespace-only.
        False for any other string and for every non-string object.

    See also :meth:`is_none_or_empty`, :meth:`is_full_string`.

    >>> is_empty('')
    True
    >>> is_empty('test')
    False
    >>> is_empty([1, 2, 3])
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """Tell whether an object is a string with some non-whitespace content.

    Args:
        in_str: the object to test

    Returns:
        True if the object is a string and is not empty ('') and
        is not only composed of whitespace.

    See also :meth:`is_string`, :meth:`is_empty_string`, :meth:`is_none_or_empty`.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string(' ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """Tell whether a string holds a decimal numeric value.

    Args:
        in_str: the string to test

    Returns:
        True if the string contains a valid numeric value (as defined by
        NUMBER_RE) and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
    etc...

    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]
    """
    if is_string(in_str):
        return NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """Tell whether a string holds an integral value.

    Args:
        in_str: the string to test

    Returns:
        True if the string contains a valid (signed or unsigned,
        decimal, hex, octal or binary, regular or scientific) integral
        expression and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
    etc...

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    # A plain number without a decimal point is integral.
    if is_number(in_str) and "." not in in_str:
        return True
    # Otherwise try the prefixed notations, in the same order as before.
    prefixed_checks = (
        is_hexidecimal_integer_number,
        is_octal_integer_number,
        is_binary_integer_number,
    )
    return any(check(in_str) for check in prefixed_checks)
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """Tell whether a string is a ``0x``-prefixed hexadecimal integer.

    Args:
        in_str: the string to test

    Returns:
        True if the string is a hex integer number and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_octal_integer_number`, :meth:`is_binary_integer_number`, etc...

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number('test')
    False
    >>> is_hexidecimal_integer_number(12345)  # Not a string
    Traceback (most recent call last):
    ...
    ValueError: 12345
    """
    if is_string(in_str):
        matched = HEX_NUMBER_RE.match(in_str)
        return matched is not None
    raise ValueError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """Tell whether a string is a ``0o``-prefixed octal integer.

    Args:
        in_str: the string to test

    Returns:
        True if the string is a valid octal integral number and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_binary_integer_number`,
    etc...

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    >>> is_octal_integer_number('test')
    False
    """
    if is_string(in_str):
        matched = OCT_NUMBER_RE.match(in_str)
        return matched is not None
    raise ValueError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """Tell whether a string is a ``0b``-prefixed binary integer.

    Args:
        in_str: the string to test

    Returns:
        True if the string contains a binary integral number and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
    etc...

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0B10101')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    >>> is_binary_integer_number('test')
    False
    """
    if is_string(in_str):
        matched = BIN_NUMBER_RE.match(in_str)
        return matched is not None
    raise ValueError(in_str)
def to_int(in_str: str) -> int:
    """Convert a string holding an integer in base 2, 8, 10 or 16 to an int.

    Args:
        in_str: the string to convert

    Returns:
        The integral value of the string or raises on error.

    Raises:
        ValueError: if in_str is not a string or cannot be parsed.

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
    :meth:`is_binary_integer_number`, etc...

    >>> to_int('0b01101')
    13
    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # Try the prefixed notations first; whichever recognises the string
    # decides the base handed to int().
    notations = (
        (is_binary_integer_number, 2),
        (is_octal_integer_number, 8),
        (is_hexidecimal_integer_number, 16),
    )
    for recognises, base in notations:
        if recognises(in_str):
            return int(in_str, base)
    # No prefix: plain base-10 conversion (raises ValueError on junk).
    return int(in_str)
543 def number_string_to_integer(in_str: str) -> int:
544 """Convert a string containing a written-out number into an int.
547 in_str: the string containing the long-hand written out integer number
548 in English. See examples below.
551 The integer whose value was parsed from in_str.
553 See also :meth:`integer_to_number_string`.
556 This code only handles integers; it will not work with decimals / floats.
558 >>> number_string_to_integer("one hundred fifty two")
561 >>> number_string_to_integer("ten billion two hundred million fifty four thousand three")
564 >>> number_string_to_integer("four-score and 7")
567 >>> number_string_to_integer("fifty xyzzy three")
568 Traceback (most recent call last):
570 ValueError: Unknown word: xyzzy
572 if isinstance(in_str, int):
576 in_str = in_str.replace('-', ' ')
577 for w in in_str.split():
578 if w not in NUM_WORDS:
579 if is_integer_number(w):
583 raise ValueError("Unknown word: " + w)
584 scale, increment = NUM_WORDS[w]
585 current = current * scale + increment
589 return result + current
592 def integer_to_number_string(num: int) -> str:
594 Opposite of :meth:`number_string_to_integer`; converts a number to a written out
595 longhand format in English.
598 num: the integer number to convert
601 The long-hand written out English form of the number. See examples below.
603 See also :meth:`number_string_to_integer`.
606 This method does not handle decimals or floats, only ints.
608 >>> integer_to_number_string(9)
611 >>> integer_to_number_string(42)
614 >>> integer_to_number_string(123219982)
615 'one hundred twenty three million two hundred nineteen thousand nine hundred eighty two'
619 return UNIT_WORDS[num]
621 ret = TENS_WORDS[num // 10]
624 ret += ' ' + UNIT_WORDS[leftover]
627 # If num > 100 go find the highest chunk and convert that, then recursively
628 # convert the rest. NUM_WORDS contains items like 'thousand' -> (1000, 0).
629 # The second item in the tuple is an increment that can be ignored; the first
630 # is the numeric "scale" of the entry. So find the greatest entry in NUM_WORDS
631 # still less than num. For 123,456 it would be thousand. Then pull out the
632 # 123, convert it, and append "thousand". Then do the rest.
634 for name, val in NUM_WORDS.items():
636 scales[name] = val[0]
637 scale = max(scales.items(), key=lambda _: _[1])
639 # scale[1] = numeric magnitude (e.g. 1000)
640 # scale[0] = name (e.g. "thousand")
641 ret = integer_to_number_string(num // scale[1]) + ' ' + scale[0]
642 leftover = num % scale[1]
644 ret += ' ' + integer_to_number_string(leftover)
def is_decimal_number(in_str: str) -> bool:
    """Tell whether a string holds a number written with a decimal point.

    Args:
        in_str: the string to check

    Returns:
        True if the given string represents a decimal or False
        otherwise.  A decimal may be signed or unsigned or use
        a "scientific notation".

    See also :meth:`is_integer_number`.

    .. note::
        We do not consider integers without a decimal point
        to be decimals; they return False (see example).

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    if not is_number(in_str):
        return False
    return "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    """Remove escape sequences from a string.

    Args:
        in_str: the string to strip of escape sequences.

    Returns:
        in_str with escape sequences removed.

    See also: :mod:`pyutils.ansi`.

    .. note::
        What is considered to be an "escape sequence" is defined
        by a regular expression.  While this gets common ones,
        there may exist valid sequences that it doesn't match.

    >>> strip_escape_sequences('\x1B[12;11;22mthis is a test!')
    'this is a test!'
    """
    return ESCAPE_SEQUENCE_RE.sub("", in_str)
def add_thousands_separator(
    in_str: str, *, separator_char: str = ',', places: int = 3
) -> str:
    """Insert a separator between groups of digits in a number.

    Args:
        in_str: string or number to which to add thousands separator(s)
        separator_char: the separator character to add (defaults to comma)
        places: add a separator every N places (defaults to three)

    Returns:
        A numeric string with thousands separators added appropriately.

    Raises:
        ValueError: if the input is not a number or a numeric string.

    >>> add_thousands_separator('12345678')
    '12,345,678'
    >>> add_thousands_separator(12345678)
    '12,345,678'
    >>> add_thousands_separator(12345678.99)
    '12,345,678.99'
    >>> add_thousands_separator('test')
    Traceback (most recent call last):
    ...
    ValueError: test
    """
    # Real numbers are accepted too; work on their string form from here on.
    text = f'{in_str}' if isinstance(in_str, numbers.Number) else in_str
    if not is_number(text):
        raise ValueError(text)
    return _add_thousands_separator(
        text, separator_char=separator_char, places=places
    )
def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """Internal helper: group the integer digits of a numeric string.

    Args:
        in_str: a numeric string, optionally signed and optionally carrying
            a fractional part after a single "."
        separator_char: text inserted between digit groups
        places: number of digits per group, counted from the right

    Returns:
        in_str with separator_char inserted between groups of the integer
        part.  A leading sign and the fractional part are kept untouched.

    Raises:
        ValueError: if places is less than one.
    """
    if places < 1:
        raise ValueError(f"places must be >= 1, not {places}")

    # Peel the sign off first so that it never counts as a digit when
    # grouping ("-123456" must become "-123,456", not "-,123,456").
    sign = ""
    if in_str[:1] in ("+", "-"):
        sign, in_str = in_str[0], in_str[1:]

    integer_part, _, decimal_part = in_str.partition(".")

    # Group from the right: slice the reversed digits, then restore the
    # order of both the digits in each chunk and the chunks themselves.
    # The separator itself is never reversed, so it may be multi-character.
    backwards = integer_part[::-1]
    chunks = [
        backwards[start:start + places][::-1]
        for start in range(0, len(backwards), places)
    ]
    ret = separator_char.join(reversed(chunks))

    if decimal_part:
        ret += "." + decimal_part
    return sign + ret
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """Tell whether a string is a URL, optionally restricted to some schemes.

    Args:
        in_str: the string to test
        allowed_schemes: an optional list of allowed schemes (e.g.
            ['http', 'https', 'ftp']).  If passed, only URLs whose scheme
            is one of those listed will be considered to be valid.
            Otherwise, any scheme:// will be considered valid.  Schemes
            are compared case-insensitively and may be given with or
            without a trailing ":" or "://".

    Returns:
        True if in_str contains a valid URL and False otherwise.

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('https://mysite.com')
    True
    >>> is_url('.mysite.com')
    False
    >>> is_url('scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash')
    True
    >>> is_url('https://mysite.com', allowed_schemes=['http'])
    False
    """
    if not is_full_string(in_str):
        return False
    if URL_RE.match(in_str) is None:
        return False
    if not allowed_schemes:
        return True

    # URL_RE guarantees a "scheme://" prefix.  Compare the whole scheme
    # rather than a string prefix so that allowing "http" does not also let
    # "https://..." through, and ignore case just like URL_RE does.
    scheme = in_str.split("://", 1)[0].lower()
    allowed = {s.rstrip(":/").lower() for s in allowed_schemes}
    return scheme in allowed
def is_email(in_str: Any) -> bool:
    """Tell whether a string is an email address.

    Args:
        in_str: the email address to check

    Returns: True if the in_str contains a valid email (as defined by
        https://tools.ietf.org/html/rfc3696#section-3) or False
        otherwise.

    >>> is_email('@gmail.com')
    False
    """
    # Quick rejects: not a usable string, longer than 320 chars overall,
    # or starting with a dot.
    if not is_full_string(in_str) or len(in_str) > 320 or in_str.startswith("."):
        return False

    try:
        # Exactly two tokens are expected, one on each side of the "@".
        head, tail = in_str.split("@")
    except ValueError:
        # Borderline case: several "@" signs, but those in the head part
        # are quoted or escaped.  Swap them for a harmless "a" and re-check.
        if ESCAPED_AT_SIGN.search(in_str) is None:
            return False
        return is_email(ESCAPED_AT_SIGN.sub("a", in_str))

    # head's size must be <= 64, tail <= 255, head must not end with a
    # dot or contain multiple consecutive dots.
    if len(head) > 64 or len(tail) > 255 or head.endswith(".") or (".." in head):
        return False

    # Drop escaped spaces (and, for a quoted head, the quotes and any
    # spaces inside them) so that the test regex will accept the string.
    head = head.replace("\\ ", "")
    if head.startswith('"') and head.endswith('"'):
        head = head.replace(" ", "")[1:-1]
    return EMAIL_RE.match(head + "@" + tail) is not None
def suffix_string_to_number(in_str: str) -> Optional[int]:
    """Takes a string like "33Gb" and converts it into a number (of bytes)
    like 34603008.

    Args:
        in_str: the string with a suffix to be interpreted and removed.

    Returns:
        An integer number of bytes or None to indicate an error (input is
        not a string, is empty, or carries no known suffix).

    See also :meth:`number_to_suffix_string`.

    >>> suffix_string_to_number('1Mb')
    1048576
    >>> suffix_string_to_number('12345')
    12345
    >>> x = suffix_string_to_number('a lot')
    >>> print(x)
    None
    >>> print(suffix_string_to_number(''))
    None
    """

    def suffix_capitalize(s: str) -> str:
        # Canonical capitalisation used by the NUM_SUFFIXES keys: "gb" ->
        # "Gb", "k" -> "K".  Anything that is not two characters long is
        # reduced to its upper-cased first character; slicing keeps the
        # empty string safe instead of recursing on it forever.
        if len(s) == 2:
            return f"{s[0].upper()}{s[1].lower()}"
        return s[:1].upper()

    if not is_string(in_str):
        return None
    if is_integer_number(in_str):
        return to_int(in_str)

    # Try a two-character suffix first, then a one-character one; each
    # candidate is paired with what is left of the string in front of it.
    candidates = (
        (in_str[-2:], in_str[:-2]),
        (in_str[-1:], in_str[:-1]),
    )
    for raw_suffix, amount in candidates:
        multiplier = NUM_SUFFIXES.get(suffix_capitalize(raw_suffix), None)
        if multiplier is None:
            continue
        if is_integer_number(amount):
            return to_int(amount) * multiplier
        if is_decimal_number(amount):
            return int(float(amount) * multiplier)
    return None
863 def number_to_suffix_string(num: int) -> Optional[str]:
864 """Take a number (of bytes) and returns a string like "43.8Gb".
867 num: an integer number of bytes
870 A string with a suffix representing num bytes concisely or
871 None to indicate an error.
873 See also: :meth:`suffix_string_to_number`.
875 >>> number_to_suffix_string(14066017894)
877 >>> number_to_suffix_string(1024 * 1024)
882 for (sfx, size) in NUM_SUFFIXES.items():
887 if suffix is not None:
888 return f"{d:.1f}{suffix}"
893 def is_credit_card(in_str: Any, card_type: str = None) -> bool:
896 in_str: a string to check
897 card_type: if provided, contains the card type to validate
898 with. Otherwise, all known credit card number types will
901 Supported card types are the following:
911 True if in_str is a valid credit card number.
914 This code is not verifying the authenticity of the credit card (i.e.
915 not checking whether it's a real card that can be charged); rather
916 it's only checking that the number follows the "rules" for numbering
917 established by credit card issuers.
920 if not is_full_string(in_str):
923 if card_type is not None:
924 if card_type not in CREDIT_CARDS:
926 f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
928 return CREDIT_CARDS[card_type].match(in_str) is not None
929 for c in CREDIT_CARDS:
930 if CREDIT_CARDS[c].match(in_str) is not None:
def is_camel_case(in_str: Any) -> bool:
    """Tell whether a string is written in camel case.

    Args:
        in_str: the string to test

    Returns:
        True if the string is formatted as camel case and False otherwise.
        A string is considered camel case when:

        * it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
        * it contains both lowercase and uppercase letters
        * it does not start with a number

    See also :meth:`is_snake_case`, :meth:`is_slug`, and :meth:`camel_case_to_snake_case`.
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """Tell whether a string is written in snake case.

    Args:
        in_str: the string to test
        separator: the snake case separator character to use

    Returns: True if the string is snake case and False otherwise.  A
        string is considered snake case when:

        * it's composed only by lowercase/uppercase letters and digits
        * it contains at least one underscore (or provided separator)
        * it does not start with a number

    See also :meth:`is_camel_case`, :meth:`is_slug`, and :meth:`snake_case_to_camel_case`.

    >>> is_snake_case('this_is_a_test')
    True
    >>> is_snake_case('___This_Is_A_Test_1_2_3___')
    True
    >>> is_snake_case('this-is-a-test')
    False
    >>> is_snake_case('this-is-a-test', separator='-')
    True
    """
    if not is_full_string(in_str):
        return False

    # The two common separators have precompiled patterns.
    precompiled = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
    pattern = precompiled.get(separator)
    if pattern is None:
        # Build a pattern for any other separator only when it is needed.
        # It is anchored at both ends, like the precompiled ones, so that
        # trailing characters outside the allowed set are rejected.
        sign = re.escape(separator)
        pattern = re.compile(
            rf"^([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)$",
            re.IGNORECASE,
        )
    return pattern.match(in_str) is not None
def is_json(in_str: Any) -> bool:
    """Tell whether a string holds a JSON object or array.

    Args:
        in_str: the string to test

    Returns:
        True if the in_str contains valid JSON and False otherwise.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    # Cheap structural pre-check before paying for a real parse.
    if not is_full_string(in_str) or JSON_WRAPPER_RE.match(in_str) is None:
        return False
    try:
        parsed = json.loads(in_str)
    except (TypeError, ValueError, OverflowError):
        return False
    return isinstance(parsed, (dict, list))
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """Tell whether a value is textually a UUID.

    Args:
        in_str: the string to test
        allow_hex: should the dash-less hexadecimal form also be accepted?

    Returns:
        True if the in_str contains a valid UUID and False otherwise.

    See also :meth:`generate_uuid`.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # Cast to str so that a UUID object itself may be passed in.
    candidate = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(candidate) is not None
def is_ip_v4(in_str: Any) -> bool:
    """Tell whether a string is a dotted-quad IPv4 address.

    Args:
        in_str: the string to test

    Returns:
        True if in_str contains a valid IPv4 address and False otherwise.

    See also :meth:`extract_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('nope')
    False
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str):
        return False
    if SHALLOW_IP_V4_RE.match(in_str) is None:
        return False
    # The shape is right; now every octet must be in the range 0 to 255.
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """Pull the first IPv4-looking token out of a string.

    Args:
        in_str: the string to extract an IPv4 address from.

    Returns:
        The first extracted IPv4 address from in_str or None if
        none were found or an error occurred.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V4_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip_v6(in_str: Any) -> bool:
    """Tell whether a string is an IPv6 address.

    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IPv6 address and False otherwise.

    See also :meth:`is_ip_v4`, :meth:`extract_ip_v4`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """Pull the first IPv6 address out of a string.

    Args:
        in_str: the string from which to extract an IPv6 address.

    Returns:
        The first IPv6 address found in in_str or None if no address
        was found or an error occurred.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v4`,
    and :meth:`is_ip`.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V6_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip(in_str: Any) -> bool:
    """Tell whether a string is an IP address of either family.

    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IP address (either IPv4 or
        IPv6) and False otherwise.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3.999')
    False
    """
    # IPv6 is tried first, IPv4 only if that fails.
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """Pull the first IP address (IPv4 preferred) out of a string.

    Args:
        in_str: the string from which to extract an IP address.

    Returns:
        The first IP address (IPv4 or IPv6) found in in_str or
        None to indicate none found or an error condition.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4_address = extract_ip_v4(in_str)
    if v4_address is not None:
        return v4_address
    # No IPv4 address present: fall back to looking for an IPv6 one.
    return extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """Tell whether a string is a MAC address.

    Args:
        in_str: the string to test

    Returns:
        True if in_str is a valid MAC address False otherwise.

    See also :meth:`extract_mac_address`, :meth:`is_ip`, etc...

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """Pull the first MAC address out of a string.

    Args:
        in_str: the string from which to extract a MAC address.
        separator: the MAC address hex byte separator to use in the
            returned address, whichever of ":" or "-" the input used.

    Returns:
        The first MAC address found in in_str, with its bytes joined by
        separator, or None to indicate no match or an error.

    See also :meth:`is_mac_address`, :meth:`is_ip`, and :meth:`extract_ip`.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'

    >>> extract_mac_address('34:29:8F:12:0D:2F', separator='-')
    '34-29-8F-12-0D-2F'
    """
    if not is_full_string(in_str):
        return None

    found = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if found is None:
        return None

    # Strings are immutable, so the normalised copy has to be the value
    # that is returned.  translate() rewrites both accepted separators in
    # one pass.
    return found.group(0).translate({ord(":"): separator, ord("-"): separator})
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """Tell whether a string is a slug.

    A slug here is one or more runs of lowercase ASCII letters / digits,
    joined by one or more occurrences of the separator; it neither starts
    nor ends with the separator.

    Args:
        in_str: string to test
        separator: the slug character to use

    Returns:
        True if in_str is a slug string and False otherwise.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`slugify`.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    # Empty and whitespace-only strings cannot match the pattern below, so
    # the type check is the only guard needed.
    if not isinstance(in_str, str):
        return False

    # Each alphanumeric run is matched by exactly one "+" with mandatory
    # separators in between.  That keeps matching linear; a nested
    # "(x+sep*)*" form backtracks exponentially on long non-matching
    # input.  The separator is grouped as a whole so that a multi-character
    # separator is repeated as a unit.
    sep = re.escape(separator)
    rex = rf"^[a-z\d]+(?:(?:{sep})+[a-z\d]+)*$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """Tell whether a string contains anything that looks like a tag.

    Args:
        in_str: the string to check for tags in

    Returns:
        True if the given string contains HTML/XML tags and False
        otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`strip_html`.

    .. warning::
        By design, this function matches ANY type of tag, so don't expect
        to use it as an HTML validator.  It's a quick sanity check at
        best.  See something like BeautifulSoup for a more full-featured
        solution.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
def words_count(in_str: str) -> int:
    """Count the words in a string.

    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    Raises:
        ValueError: if in_str is not a string.

    .. note::
        This method is "smart" in that it does consider only sequences
        of one or more letter and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # Every regex match is one word; tally them without building a list.
    return sum(1 for _ in WORDS_COUNT_RE.finditer(in_str))
def word_count(in_str: str) -> int:
    """Alias of :meth:`words_count`.

    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    .. note::
        This method is "smart" in that it does consider only sequences
        of one or more letter and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> word_count('hello world')
    2
    >>> word_count('one,two,three.stop')
    4
    """
    total = words_count(in_str)
    return total
1331 def generate_uuid(omit_dashes: bool = False) -> str:
1334 omit_dashes: should we omit the dashes in the generated UUID?
1337 A generated UUID string (using `uuid.uuid4()`) with or without
1338 dashes per the omit_dashes arg.
1340 See also :meth:`is_uuid`, :meth:`generate_random_alphanumeric_string`.
1342 generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
1343 generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
def generate_random_alphanumeric_string(size: int) -> str:
    """Build a random string of ASCII letters and digits.

    Args:
        size: number of characters to generate

    Returns:
        A string of the specified size containing random characters
        (uppercase/lowercase ascii letters and digits).

    Raises:
        ValueError: if size is less than one.

    See also :meth:`asciify`, :meth:`generate_uuid`.
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    # One random.choice() call per character, in order, so that a seeded
    # generator keeps producing the same strings.
    return "".join([random.choice(alphabet) for _ in range(size)])
def reverse(in_str: str) -> str:
    """Reverse a string.

    Args:
        in_str: the string to reverse

    Returns:
        The reversed (character by character) string.

    Raises:
        ValueError: if in_str is not a string.

    >>> reverse('test')
    'tset'
    """
    if is_string(in_str):
        return "".join(reversed(in_str))
    raise ValueError(in_str)
def camel_case_to_snake_case(in_str: str, *, separator: str = "_"):
    """Convert a camel case string to snake case.

    Args:
        in_str: the camel case string to convert
        separator: the snake case separator character to use

    Returns:
        A snake case string equivalent to the camel case input or the
        original string if it is not a valid camel case string.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str

    def with_separator(match):
        # Keep the matched word fragment and put the separator after it.
        return match.group(1) + separator

    separated = CAMEL_CASE_REPLACE_RE.sub(with_separator, in_str)
    return separated.lower()
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """Convert a snake case string to camel case.

    Args:
        in_str: the snake case string to convert
        upper_case_first: should we capitalize the first letter?
        separator: the separator character to use

    Returns:
        A camel case string that is equivalent to the snake case string
        provided or the original string back again if it is not valid
        snake case.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str

    # Title-case every non-blank piece between separators.
    words = []
    for piece in in_str.split(separator):
        if is_full_string(piece):
            words.append(piece.title())
    if not upper_case_first:
        words[0] = words[0].lower()
    return from_char_list(words)
def to_char_list(in_str: str) -> List[str]:
    """
    Args:
        in_str: the string to split into a char list

    Returns:
        A list of strings of length one each; an empty list if in_str
        is not a string.

    See also :meth:`from_char_list`.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    if is_string(in_str):
        return list(in_str)
    return []
def from_char_list(in_list: List[str]) -> str:
    """
    Args:
        in_list: A list of characters to convert into a string.

    Returns:
        The string resulting from gluing the elements of in_list
        together, in order, with nothing between them.

    See also :meth:`to_char_list`.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    glue = ""
    return glue.join(in_list)
def shuffle(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing the same chars as the given one but in
        a randomized order.  Note that in rare cases this could result
        in the same original string as no check is done.  Returns
        None to indicate error conditions (in_str is not a string).

    >>> s = shuffle('awesome')
    >>> sorted(s) == sorted('awesome')
    True
    """
    if is_string(in_str):
        scrambled = to_char_list(in_str)
        random.shuffle(scrambled)  # in-place shuffle of the char list
        return from_char_list(scrambled)
    return None
def scramble(in_str: str) -> Optional[str]:
    """
    Alias for :meth:`shuffle`.

    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing the same chars as the given one but in
        a randomized order.  Note that in rare cases this could result
        in the same original string as no check is done.  Returns
        None to indicate error conditions.

    See also :mod:`pyutils.unscrambler`.

    >>> s = scramble('awesome')
    >>> sorted(s) == sorted('awesome')
    True
    """
    scrambled = shuffle(in_str)
    return scrambled
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Args:
        in_str: the string to strip tags from
        keep_tag_content: should we keep the inner contents of tags?

    Returns:
        A string with all HTML tags removed (optionally with tag contents
        preserved).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`contains_html`.

    .. note::
        This method uses simple regular expressions to strip tags and is
        not a full fledged HTML parser by any means.  Consider using
        something like BeautifulSoup if your needs are more than this
        simple code can fulfill.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # Tag-only pattern leaves the text between tags in place; the other
    # pattern removes it along with the tags.
    if keep_tag_content:
        pattern = HTML_TAG_ONLY_RE
    else:
        pattern = HTML_RE
    return pattern.sub("", in_str)
def asciify(in_str: str) -> str:
    """
    Args:
        in_str: the string to asciify.

    Returns:
        An output string roughly equivalent to the original string
        whose contents are ASCII-only.  This is accomplished by
        translating non-ASCII chars into their closest possible ASCII
        representation (eg: ó -> o, Ë -> E, ç -> c...).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`to_ascii`, :meth:`generate_random_alphanumeric_string`.

    .. warning::
        Some chars may be lost if impossible to translate.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # NFKD decomposition splits accented chars into base char + combining
    # mark; the ASCII encode below then keeps the base and drops the mark.
    decomposed = unicodedata.normalize("NFKD", in_str)

    # Anything that still isn't ASCII is silently dropped by "ignore".
    return decomposed.encode("ascii", "ignore").decode("utf-8")
def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Args:
        in_str: the string to slugify
        separator: the character to use during slugification (default
            is a dash)

    Returns:
        The converted string.  The returned string has the following properties:

        * it has no spaces
        * all letters are in lower case
        * all punctuation signs and non alphanumeric chars are removed
        * words are divided using provided separator
        * all chars are encoded as ascii (by using :meth:`asciify`)

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_slug` and :meth:`asciify`.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Anything that is not a letter or number becomes a space.
    spaced = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # Spaces become the join sign.
    joined = SPACES_RE.sub(separator, spaced)

    # Collapse runs of the join sign into a single one.
    collapse_runs = re.escape(separator) + r"+"
    deduped = re.sub(collapse_runs, separator, joined)
    return asciify(deduped)
def to_bool(in_str: str) -> bool:
    """
    Args:
        in_str: the string to convert to boolean

    Returns:
        A boolean equivalent of the original string based on its contents.
        All conversion is case insensitive.  A positive boolean (True) is
        returned if the string value is any of the following:

        * "true"
        * "t"
        * "1"
        * "yes"
        * "y"
        * "on"

        Otherwise False is returned.

    Raises:
        ValueError: if in_str is not a string.

    See also :mod:`pyutils.argparse_utils`.

    >>> to_bool('True')
    True
    >>> to_bool('1')
    True
    >>> to_bool('ON')
    True
    >>> to_bool('no')
    False
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return in_str.lower() in {"true", "1", "yes", "y", "t", "on"}
1667 def to_date(in_str: str) -> Optional[datetime.date]:
1670 in_str: the string to convert into a date
1673 The datetime.date the string contained or None to indicate
1674 an error. This parser is relatively clever; see
1675 :class:`datetimes.dateparse_utils` docs for details.
1677 See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`extract_date`,
1678 :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.
1680 >>> to_date('9/11/2001')
1681 datetime.date(2001, 9, 11)
1682 >>> to_date('xyzzy')
1684 import pyutils.datetimes.dateparse_utils as du
1687 d = du.DateParser() # type: ignore
1690 except du.ParseException: # type: ignore
def extract_date(in_str: Any) -> Optional[datetime.datetime]:
    """Finds and extracts a date from the string, if possible.

    Args:
        in_str: the string to extract a date from

    Returns:
        a datetime if date was found, otherwise None

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> extract_date("filename.txt    dec 13, 2022")
    datetime.datetime(2022, 12, 13, 0, 0)

    >>> extract_date("Dear Santa, please get me a pony.")

    """
    import itertools

    import pyutils.datetimes.dateparse_utils as du

    parser = du.DateParser()  # type: ignore
    words = in_str.split()

    # Try the widest windows of words first (5 words) and work down to
    # 2-word windows; the first window that parses wins.
    candidates = itertools.chain.from_iterable(
        list_utils.ngrams(words, size) for size in (5, 4, 3, 2)
    )
    for ngram in candidates:
        try:
            expr = " ".join(ngram)
            logger.debug("Trying %s", expr)
            parser.parse(expr)
            return parser.get_datetime()
        except du.ParseException:  # type: ignore
            continue
    return None
def is_valid_date(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check

    Returns:
        True if the string represents a valid date that we can recognize
        and False otherwise.  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> is_valid_date('1/2/2022')
    True
    >>> is_valid_date('christmas')
    True
    >>> is_valid_date('next wednesday')
    True
    >>> is_valid_date('xyzzy')
    False
    """
    import pyutils.datetimes.dateparse_utils as dp

    try:
        # Only whether parsing succeeds matters; the result is discarded.
        dp.DateParser().parse(in_str)  # type: ignore
    except dp.ParseException:  # type: ignore
        return False
    return True
1768 def to_datetime(in_str: str) -> Optional[datetime.datetime]:
1771 in_str: string to parse into a datetime
1774 A python datetime parsed from in_str or None to indicate
1775 an error. This parser is relatively clever; see
1776 :class:`datetimes.dateparse_utils` docs for details.
1778 See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
1779 :meth:`extract_date`, :meth:`valid_datetime`.
1781 >>> to_datetime('7/20/1969 02:56 GMT')
1782 datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
1784 import pyutils.datetimes.dateparse_utils as dp
1787 d = dp.DateParser() # type: ignore
1788 dt = d.parse(in_str)
1789 if isinstance(dt, datetime.datetime):
def valid_datetime(in_str: str) -> bool:
    """
    Args:
        in_str: the string to check

    Returns:
        True if in_str contains a valid datetime and False otherwise.
        This is decided by whether :meth:`to_datetime` returns a value
        for in_str.  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    >>> valid_datetime('next wednesday at noon')
    True
    >>> valid_datetime('3 weeks ago at midnight')
    True
    >>> valid_datetime('next easter at 5:00 am')
    True
    >>> valid_datetime('sometime soon')
    False
    """
    return to_datetime(in_str) is not None
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """
    Args:
        in_str: the string to squeeze
        character_to_squeeze: the character (or substring) to remove
            runs of more than one in a row (default = space)

    Returns:
        A "squeezed string" where every run of one or more consecutive
        copies of character_to_squeeze has been replaced by exactly one
        copy.  If character_to_squeeze is empty, in_str is returned
        unchanged.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'
    """
    if not character_to_squeeze:
        # Nothing to squeeze; avoid building a pattern that matches the
        # empty string at every position.
        return in_str
    run = re.compile(r'(?:' + re.escape(character_to_squeeze) + r')+')
    # A callable replacement is inserted verbatim.  A plain string would
    # be treated as a template, so a backslash in character_to_squeeze
    # would be misread as an escape (and can raise re.error).
    return run.sub(lambda _match: character_to_squeeze, in_str)
def dedent(in_str: str) -> Optional[str]:
    """
    Args:
        in_str: the string to dedent

    Returns:
        A string with the leading margin (whatever MARGIN_RE matches)
        removed from every line, or None if in_str is not a string.

    See also :meth:`indent`.

    >>> dedent('\t\ttest\\n\t\ting')
    'test\\ning'
    """
    if not is_string(in_str):
        return None
    newline = '\n'
    stripped = []
    for line in in_str.split(newline):
        stripped.append(MARGIN_RE.sub('', line))
    return newline.join(stripped)
def indent(in_str: str, amount: int) -> str:
    """
    Args:
        in_str: the string to indent
        amount: count of spaces to indent each line by

    Returns:
        An indented string created by prepending amount spaces to
        every line of in_str.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`dedent`.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    padding = " " * amount
    return '\n'.join(padding + line for line in in_str.split('\n'))
1886 def _sprintf(*args, **kwargs) -> str:
1887 """Internal helper."""
1890 sep = kwargs.pop("sep", None)
1892 if not isinstance(sep, str):
1893 raise TypeError("sep must be None or a string")
1895 end = kwargs.pop("end", None)
1897 if not isinstance(end, str):
1898 raise TypeError("end must be None or a string")
1901 raise TypeError("invalid keyword arguments to sprint()")
1907 for n, arg in enumerate(args):
1910 if isinstance(arg, str):
def strip_ansi_sequences(in_str: str) -> str:
    """
    Args:
        in_str: the string to strip

    Returns:
        in_str with recognized ANSI escape sequences removed.

    See also :mod:`pyutils.ansi`.

    .. warning::
        This method works by using a regular expression that matches
        CSI sequences (``ESC [`` + parameter bytes + intermediate bytes
        + one final byte).  Other kinds of escape sequences (e.g. OSC,
        ``ESC ]``) are not removed; caveat emptor.

    >>> s = '\\x1b[38;5;21mblue!\\x1b[m'
    >>> len(s)
    18
    >>> len(strip_ansi_sequences(s))
    5
    >>> strip_ansi_sequences(s)
    'blue!'
    >>> strip_ansi_sequences('\\x1b[2Jcleared\\x1b[H')
    'cleared'
    """
    # Parameter bytes are 0x30-0x3F ("0" through "?"); "+" is also kept
    # in that class because earlier versions of this pattern accepted it.
    # Intermediate bytes are 0x20-0x2F and the final byte is 0x40-0x7E,
    # which covers uppercase finals such as "J", "H" and "K" as well as
    # the lowercase ones.
    return re.sub(r'\x1b\[[0-?+]*[ -/]*[@-~]', '', in_str)
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that captures outputs to stdout to a buffer
    without printing them.  Entering the context yields a callable
    that returns everything captured so far.

    >>> with SprintfStdout() as buf:
    ...     print("test")
    ...     print("1, 2, 3")
    ...
    >>> print(buf(), end='')
    test
    1, 2, 3
    """

    def __init__(self) -> None:
        # In-memory buffer that receives everything written to stdout.
        self.destination = io.StringIO()
        # Set in __enter__; declared here for type checkers only.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()

        def captured() -> str:
            return self.destination.getvalue()

        return captured

    def __exit__(self, *args) -> Literal[False]:
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        # Never suppress an exception raised inside the with-body.
        return False
def capitalize_first_letter(in_str: str) -> str:
    """
    Args:
        in_str: the string to capitalize

    Returns:
        in_str with the first character capitalized; the rest of the
        string is left untouched.  An empty string is returned as is.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    # Slicing (rather than indexing [0]) keeps the empty string from
    # raising IndexError.
    return in_str[:1].upper() + in_str[1:]
1991 def it_they(n: int) -> str:
1994 n: how many of them are there?
1997 'it' if n is one or 'they' otherwize.
1999 See also :meth:`is_are`, :meth:`pluralize`, :meth:`make_contractions`,
2004 n = num_files_saved_to_tmp()
2005 print(f'Saved file{pluralize(n)} successfully.')
2006 print(f'{it_they(n)} {is_are(n)} located in /tmp.')
2018 def is_are(n: int) -> str:
2021 n: how many of them are there?
2024 'is' if n is one or 'are' otherwize.
2026 See also :meth:`it_they`, :meth:`pluralize`, :meth:`make_contractions`,
2031 n = num_files_saved_to_tmp()
2032 print(f'Saved file{pluralize(n)} successfully.')
2033 print(f'{it_they(n)} {is_are(n)} located in /tmp.')
2046 def pluralize(n: int) -> str:
2049 n: how many of them are there?
2052 's' if n is greater than one otherwize ''.
2054 See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`,
2059 n = num_files_saved_to_tmp()
2060 print(f'Saved file{pluralize(n)} successfully.')
2061 print(f'{it_they(n)} {is_are(n)} located in /tmp.')
2066 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
2069 >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
2077 def make_contractions(txt: str) -> str:
2078 """This code glues words in txt together to form (English)
2082 txt: the input text to be contractionized.
2085 Output text identical to original input except for any
2086 recognized contractions are formed.
2088 See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`.
2091 The order in which we create contractions is defined by the
2092 implementation and what I thought made more sense when writing
2095 >>> make_contractions('It is nice today.')
2098 >>> make_contractions('I can not even...')
2101 >>> make_contractions('She could not see!')
2104 >>> make_contractions('But she will not go.')
2107 >>> make_contractions('Verily, I shall not.')
2110 >>> make_contractions('No you cannot.')
2113 >>> make_contractions('I said you can not go.')
2114 "I said you can't go."
2150 ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
2154 # Special cases: can't, shan't and won't.
2155 txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
2157 r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE
2160 r'\b(w)ill\s*(n)(o)(t)\b',
2164 flags=re.IGNORECASE,
2167 for first_list, second_list in first_second:
2168 for first in first_list:
2169 for second in second_list:
2170 # Disallow there're/where're. They're valid English
2172 if (first in set(['there', 'where'])) and second == 'a(re)':
2175 pattern = fr'\b({first})\s+{second}\b'
2176 if second == '(n)o(t)':
2177 replacement = r"\1\2'\3"
2179 replacement = r"\1'\2"
2180 txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)
2185 def thify(n: int) -> str:
2188 n: how many of them are there?
2191 The proper cardinal suffix for a number.
2193 See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`.
2202 print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')
2212 assert is_integer_number(digit)
2224 get_cardinal_suffix = thify
def add_cardinal_suffix(n: int):
    """
    Args:
        n: the number to return as a string with a cardinal suffix.

    Returns:
        A string containing the number with its cardinal suffix, as
        chosen by :meth:`get_cardinal_suffix`.

    >>> add_cardinal_suffix(123)
    '123rd'

    >>> add_cardinal_suffix(1)
    '1st'

    >>> add_cardinal_suffix(0)
    '0th'

    >>> add_cardinal_suffix(-123)
    '-123rd'
    """
    suffix = get_cardinal_suffix(n)
    return f'{n}{suffix}'
def remove_cardinal_suffix(txt: str) -> Optional[str]:
    """
    Args:
        txt: the number with cardinal suffix to strip.

    Returns:
        The same string with its two-character cardinal suffix
        ('st', 'nd', 'rd' or 'th') removed, or None if txt does not
        end with one of those suffixes.

    >>> remove_cardinal_suffix('123rd')
    '123'

    >>> remove_cardinal_suffix('-10th')
    '-10'

    >>> remove_cardinal_suffix('1ero') is None
    True
    """
    if txt.endswith(('st', 'nd', 'rd', 'th')):
        return txt[:-2]
    return None
def ngrams(txt: str, n: int) -> Generator[str, str, None]:
    """
    Args:
        txt: the string to create ngrams using
        n: how many words per ngram created?

    Returns:
        Generates the ngrams from the input string.  The string is
        split on whitespace and each ngram is yielded as its n words
        joined by single spaces.

    See also :meth:`ngrams_presplit`, :meth:`bigrams`, :meth:`trigrams`.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']
    """
    words = txt.split()
    for ngram in ngrams_presplit(words, n):
        yield ' '.join(ngram)
def ngrams_presplit(
    words: Sequence[str], n: int
) -> Generator[Sequence[str], str, None]:
    """
    Same as :meth:`ngrams` but with the string pre-split into words.
    This simply delegates to ``list_utils.ngrams``.

    Args:
        words: the already-split words to build ngrams from
        n: how many words per ngram created?

    See also :meth:`ngrams`, :meth:`bigrams`, :meth:`trigrams`.
    """
    word_groups = list_utils.ngrams(words, n)
    return word_groups
def bigrams(txt: str) -> Generator[str, str, None]:
    """Generates the bigrams (n=2) of the given string.

    Args:
        txt: the string to create bigrams from

    See also :meth:`ngrams`, :meth:`trigrams`.

    >>> [x for x in bigrams('this is a test')]
    ['this is', 'is a', 'a test']
    """
    words_per_gram = 2
    return ngrams(txt, words_per_gram)
def trigrams(txt: str) -> Generator[str, str, None]:
    """Generates the trigrams (n=3) of the given string.

    Args:
        txt: the string to create trigrams from

    See also :meth:`ngrams`, :meth:`bigrams`.
    """
    words_per_gram = 3
    return ngrams(txt, words_per_gram)
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim: str = ''
) -> List[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.

    Args:
        input_lines: A sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: an iterable collection of numeric sequences that
            indicate one or more column numbers to copy to form the Nth
            position in the output list.  See example below.
        delim: for column_specs that indicate we should copy more than
            one column from the input into this position, use delim to
            separate source data.  Defaults to ''.

    Returns:
        A list of strings created by following the instructions set forth
        in column_specs.  Column data is copied verbatim: delim is only
        ever inserted *between* columns and is never stripped from the
        data itself.

    Raises:
        IndexError: if a column number is out of range for input_lines.

    See also :meth:`shuffle_columns_into_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim='!',
    ... )
    ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']
    """
    # Each spec lists the input columns that are glued together (with
    # delim between them) to form one output entry.
    return [delim.join(input_lines[n] for n in spec) for spec in column_specs]
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim: str = '',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: a sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: instructions for what dictionary keys to apply
            to individual or compound input column data.  Each entry is
            a ``(key, [col1, col2...])`` pair.  See example below.
        delim: when forming compound output data by gluing more than
            one input column together, use this character to separate
            the source data.  Defaults to ''.

    Returns:
        A dict formed by applying the column_specs instructions.  Column
        data is copied verbatim: delim is only ever inserted *between*
        columns and is never stripped from the data itself.

    Raises:
        IndexError: if a column number is out of range for input_lines.

    See also :meth:`shuffle_columns_into_list`, :meth:`interpolate_using_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul  9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim='!',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}
    """
    out: Dict[str, str] = {}
    for spec in column_specs:
        # spec[0] is the output key; spec[1] lists the source columns.
        out[spec[0]] = delim.join(input_lines[n] for n in spec[1])
    return out
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """
    Interpolate a string with data from a dict.

    Args:
        txt: the mad libs template; ``{name}`` placeholders are filled
            in using str.format.
        values: what you and your kids chose for each category.

    Returns:
        txt with every placeholder replaced by the matching entry of
        values.

    Raises:
        KeyError: if txt names a placeholder that is not in values.

    See also :meth:`shuffle_columns_into_list`, :meth:`shuffle_columns_into_dict`.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    filled_in = txt.format(**values)
    return filled_in
def to_ascii(txt: str) -> bytes:
    """
    Args:
        txt: the input data to encode; bytes are also accepted and are
            passed through untouched.

    Returns:
        txt encoded as an ASCII byte string.

    Raises:
        TypeError: if txt is neither a string nor bytes.
        UnicodeEncodeError: if txt is a string containing non-ASCII chars.

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`to_bytes`,
    :meth:`generate_random_alphanumeric_string`, :meth:`asciify`.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(txt, str):
        return txt.encode('ascii')
    if isinstance(txt, bytes):
        return txt
    # TypeError is still an Exception, so existing handlers keep working.
    raise TypeError('to_ascii works with strings and bytes')
def to_base64(
    txt: str, *, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> bytes:
    """
    Args:
        txt: the input data to encode
        encoding: the encoding to use during conversion
        errors: how to handle encoding errors

    Returns:
        txt encoded with a 64-character alphabet, as produced by
        base64.encodebytes (so the result ends with a newline).

    See also :meth:`is_base64`, :meth:`to_ascii`, :meth:`to_bitstring`,
    :meth:`from_base64`.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    raw = txt.encode(encoding, errors)
    return base64.encodebytes(raw)
def is_base64(txt: str) -> bool:
    """
    Args:
        txt: the string (or bytes) to check

    Returns:
        True if txt looks like a valid base64 encoded string.  This
        assumes txt was encoded with Python's standard base64 alphabet
        (A-Z, a-z, 0-9, '+' and '/').  Leading/trailing whitespace, the
        line breaks that base64.encodebytes inserts into long output,
        and up to two trailing '=' padding characters are all accepted.
        Text containing non-ASCII characters is never base64.

    Raises:
        TypeError: if txt is neither a string nor bytes.

    See also :meth:`to_base64`, :meth:`from_base64`.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64('YQ==')    # Padding is ok.
    True
    """
    if isinstance(txt, str):
        try:
            data = txt.encode('ascii')
        except UnicodeEncodeError:
            return False
    elif isinstance(txt, bytes):
        data = txt
    else:
        raise TypeError('is_base64 works with strings and bytes')

    alphabet = set((string.ascii_letters + string.digits + '+/').encode('ascii'))

    # Drop surrounding whitespace plus the newlines encodebytes emits
    # after every 76 output characters.
    payload = data.strip().translate(None, b'\r\n')

    # '=' is only legal as (at most two chars of) trailing padding.
    unpadded = payload.rstrip(b'=')
    if len(payload) - len(unpadded) > 2:
        return False
    return all(byte in alphabet for byte in unpadded)
def from_base64(
    b64: bytes, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """
    Args:
        b64: bytestring of base64 encoded data to decode / convert.
        encoding: the encoding to use during conversion
        errors: how to handle encoding errors

    Returns:
        The decoded form of b64 as a normal python string: the bytes
        are base64-decoded and then decoded to text using encoding.

    See also :meth:`to_base64`, :meth:`is_base64`.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    raw = base64.decodebytes(b64)
    return raw.decode(encoding, errors)
def chunk(txt: str, chunk_size: int) -> Generator[str, None, None]:
    """
    Args:
        txt: a string to be chunked into evenly spaced pieces.
        chunk_size: the size of each chunk to make; must be >= 1.

    Returns:
        The original string chunked into evenly spaced pieces.  If the
        length of txt is not a multiple of chunk_size a warning is
        issued and the last chunk is shorter than the others.

    Raises:
        ValueError: if chunk_size is less than one.  Because this is a
            generator, the error surfaces when iteration starts.

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    if chunk_size < 1:
        # Zero would divide by zero below; a negative size would
        # silently produce no chunks at all.
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size}')
    if len(txt) % chunk_size != 0:
        msg = (
            f"String to chunk's length ({len(txt)}) is not an even "
            f"multiple of chunk_size ({chunk_size})"
        )
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for start in range(0, len(txt), chunk_size):
        yield txt[start : start + chunk_size]
def to_bitstring(txt: str, *, delimiter: str = '') -> str:
    """
    Args:
        txt: the string to convert into a bitstring
        delimiter: character to insert between adjacent bytes.  Note that
            only bitstrings with delimiter='' are interpretable by
            :meth:`from_bitstring`.

    Returns:
        txt converted to ascii/binary and then chopped into bytes: eight
        binary digits per input byte, most significant bit first.  Every
        byte is represented, including leading NUL bytes.  Empty input
        yields a single all-zero byte ('00000000').

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'
    """
    etxt = to_ascii(txt)
    if not etxt:
        # Preserved from earlier versions, which rendered empty input
        # as one zero byte.
        return '00000000'
    # Formatting byte by byte keeps leading zero bytes, which a round
    # trip through one big integer would drop.
    return delimiter.join(f'{byte:08b}' for byte in etxt)
def is_bitstring(txt: str) -> bool:
    """
    Args:
        txt: the string to check

    Returns:
        True if txt is a recognized bitstring and False otherwise.
        The check prefixes txt with '0b' and asks
        :meth:`is_binary_integer_number`, so a bitstring produced with
        a non-empty delimiter will not be recognized.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`to_bitstring`,
    :meth:`chunk`.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    candidate = f'0b{txt}'
    return is_binary_integer_number(candidate)
def from_bitstring(
    bits: str, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """
    Args:
        bits: the bitstring to convert back into a python string
        encoding: the encoding to use during conversion
        errors: how to handle encoding errors

    Returns:
        The regular python string represented by bits.  Note that this
        code does not work with to_bitstring when delimiter is non-empty.
        Leading all-zero bytes in bits are preserved as NUL characters.

    Raises:
        ValueError: if bits is not a valid base-2 number.

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'
    """
    n = int(bits, 2)
    byte_count = (n.bit_length() + 7) // 8
    if bits and not bits.strip('01'):
        # A plain run of 0/1 digits: size the output from the digit
        # count so leading zero bytes are not lost.
        byte_count = (len(bits) + 7) // 8
    return n.to_bytes(byte_count, 'big').decode(encoding, errors) or '\0'
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """
    Args:
        txt: an IP address to chunk up for sorting purposes

    Returns:
        A tuple of IP components arranged such that the sorting of
        IP addresses using a normal comparator will do something sane
        with them.  If txt is not an IPv4 address, a "not IP" message
        is printed and None is returned.

    See also :meth:`is_ip_v4`.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if is_ip_v4(txt):
        # Numeric components compare by value, so 9 sorts before 10.
        return tuple(int(octet) for octet in txt.split('.'))
    print(f"not IP: {txt}")
    return None
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """
    Args:
        volume: the string to chunk up for sorting purposes

    Returns:
        A tuple of volume's non-empty '/'-separated components such
        that the sorting of volumes using a normal comparator will do
        something sane with them (a path sorts before its descendants).

    See also :mod:`pyutils.files.file_utils`.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    # Empty pieces come from leading, trailing or doubled slashes.
    return tuple(part for part in volume.split('/') if part)
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """
    Execute several replace operations in a row.

    Args:
        in_str: the string in which to replace characters
        replace_set: the set of target characters to replace
        replacement: the character to replace any member of replace_set
            with

    See also :meth:`replace_nth`.

    Returns:
        The string with replacements executed.  The replacements are
        applied one character of replace_set at a time, in order.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    result = in_str
    for target in replace_set:
        result = result.replace(target, replacement)
    return result
def replace_nth(in_str: str, source: str, target: str, nth: int) -> str:
    """
    Replaces the nth occurrence of a substring within a string.

    Args:
        in_str: the string in which to run the replacement
        source: the substring to replace; it is matched literally, so
            regular expression metacharacters have no special meaning
        target: the replacement text
        nth: which occurrence of source to replace?  (1 = the first)

    Returns:
        in_str with the nth (non-overlapping) occurrence of source
        replaced by target.

    Raises:
        IndexError: if in_str contains fewer than nth occurrences of
            source.

    See also :meth:`replace_all`.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'

    >>> replace_nth('a.b.c', '.', '-', 2)
    'a.b-c'
    """
    # Escape source so occurrences are located using the same literal
    # semantics as the replacement itself.
    starts = [m.start() for m in re.finditer(re.escape(source), in_str)]
    where = starts[nth - 1]
    return in_str[:where] + target + in_str[where + len(source) :]
2713 if __name__ == '__main__':