2 # -*- coding: utf-8 -*-
4 """The MIT License (MIT)
6 Copyright (c) 2016-2020 Davide Zanotti
8 Modifications Copyright (c) 2021-2022 Scott Gasch
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
17 The above copyright notice and this permission notice shall be included in all
18 copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 This class is based on:
29 https://github.com/daveoncode/python-string-utils. See `NOTICE
30 <https://wannabe.guru.org/gitweb/?p=pyutils.git;a=blob_plain;f=NOTICE;hb=HEAD>`__
31 in the root of this module for a detailed enumeration of what work is
32 Davide's and what work was added by Scott.
37 import contextlib # type: ignore
48 from itertools import zip_longest
60 from uuid import uuid4
62 from pyutils import list_utils
64 logger = logging.getLogger(__name__)
# NOTE: inside a character class "|" is a literal pipe, not alternation, so
# the alternatives are simply listed (e.g. [eE], [01]) to avoid accepting
# strings such as "0b1|0" or "1|5" as numbers.

# Signed/unsigned decimal number with optional fraction and exponent.
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Optionally signed hexadecimal integer with a mandatory 0x / 0X prefix.
HEX_NUMBER_RE = re.compile(r"^([+-]?)0[xX]([0-9A-Fa-f]+)$")

# Optionally signed octal integer with a mandatory 0o / 0O prefix.
OCT_NUMBER_RE = re.compile(r"^([+-]?)0[Oo]([0-7]+)$")

# Optionally signed binary integer with a mandatory 0b / 0B prefix.
BIN_NUMBER_RE = re.compile(r"^([+-]?)0[Bb]([01]+)$")
75 r"([a-z-]+://)" # scheme
76 r"([a-z_\d-]+:[a-z_\d-]+@)?" # user:password
78 r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)" # domain
79 r"(:\d{2,})?" # port number
80 r"(/[a-z\d_%+-]*)*" # folders
81 r"(\.[a-z\d_%+-]+)*" # file extension
82 r"(\?[a-z\d_+%-=]*)?" # query string
86 URL_RE = re.compile(rf"^{URLS_RAW_STRING}$", re.IGNORECASE)
88 URLS_RE = re.compile(rf"({URLS_RAW_STRING})", re.IGNORECASE)
90 ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')
93 r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
96 EMAIL_RE = re.compile(rf"^{EMAILS_RAW_STRING}$")
98 EMAILS_RE = re.compile(rf"({EMAILS_RAW_STRING})")
100 CAMEL_CASE_TEST_RE = re.compile(r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$")
102 CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")
104 SNAKE_CASE_TEST_RE = re.compile(
105 r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
108 SNAKE_CASE_TEST_DASH_RE = re.compile(
109 r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
112 SNAKE_CASE_REPLACE_RE = re.compile(r"(_)([a-z\d])")
114 SNAKE_CASE_REPLACE_DASH_RE = re.compile(r"(-)([a-z\d])")
117 "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
118 "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
119 "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
120 "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
121 "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
122 "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
125 JSON_WRAPPER_RE = re.compile(r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL)
127 UUID_RE = re.compile(
128 r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$", re.IGNORECASE
131 UUID_HEX_OK_RE = re.compile(
132 r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
136 SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
138 ANYWHERE_IP_V4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
# IPv6 groups are hexadecimal, so only [a-f] (case-insensitive) and digits
# are allowed; the previous [a-z] class accepted groups such as "zzzz".
IP_V6_RE = re.compile(r"^([a-f\d]{0,4}:){7}[a-f\d]{0,4}$", re.IGNORECASE)

# Same shape as IP_V6_RE but unanchored, for searching inside larger text.
ANYWHERE_IP_V6_RE = re.compile(r"([a-f\d]{0,4}:){7}[a-f\d]{0,4}", re.IGNORECASE)
144 MAC_ADDRESS_RE = re.compile(r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE)
146 ANYWHERE_MAC_ADDRESS_RE = re.compile(
147 r"([0-9A-F]{2}[:-]){5}([0-9A-F]{2})", re.IGNORECASE
150 WORDS_COUNT_RE = re.compile(r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE)
152 HTML_RE = re.compile(
153 r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
154 re.IGNORECASE | re.MULTILINE | re.DOTALL,
157 HTML_TAG_ONLY_RE = re.compile(
158 r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
159 re.IGNORECASE | re.MULTILINE | re.DOTALL,
162 SPACES_RE = re.compile(r"\s")
164 NO_LETTERS_OR_NUMBERS_RE = re.compile(r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE)
166 MARGIN_RE = re.compile(r"^[^\S\r\n]+")
168 ESCAPE_SEQUENCE_RE = re.compile(r"\x1B\[[^A-Za-z]*[A-Za-z]")
229 NUM_WORDS["and"] = (1, 0)
230 for i, word in enumerate(UNIT_WORDS):
231 NUM_WORDS[word] = (1, i)
232 for i, word in enumerate(TENS_WORDS):
233 NUM_WORDS[word] = (1, i * 10)
234 for i, word in enumerate(MAGNITUDE_SCALES):
236 NUM_WORDS[word] = (100, 0)
238 NUM_WORDS[word] = (10 ** (i * 3), 0)
239 NUM_WORDS['score'] = (20, 0)
def is_none_or_empty(in_str: Optional[str]) -> bool:
    """Check whether a string is missing or blank.

    Args:
        in_str: the string to test

    Returns:
        True if the input is None, an empty string, or a string made up
        only of whitespace; False otherwise.

    See also :meth:`is_string` and :meth:`is_empty_string`.

    >>> is_none_or_empty("")
    True
    >>> is_none_or_empty(None)
    True
    >>> is_none_or_empty('Test')
    False
    """
    if in_str is None:
        return True
    # Whitespace-only strings strip down to nothing and count as empty.
    return len(in_str.strip()) == 0
def is_string(in_str: Any) -> bool:
    """Report whether an object is a ``str`` instance.

    Args:
        in_str: the object to test

    Returns:
        True if the object is a string and False otherwise.

    See also :meth:`is_empty_string`, :meth:`is_none_or_empty`.

    >>> is_string('test')
    True
    >>> is_string([1, 2, 3])
    False
    """
    string_like = isinstance(in_str, str)
    return string_like
def is_empty_string(in_str: Any) -> bool:
    """Alias of :meth:`is_empty`.

    Args:
        in_str: the string to test

    Returns:
        True if the string is empty and False otherwise.

    See also :meth:`is_none_or_empty`, :meth:`is_full_string`.
    """
    empty = is_empty(in_str)
    return empty
def is_empty(in_str: Any) -> bool:
    """Check whether an object is a string with no visible content.

    Args:
        in_str: the string to test

    Returns:
        True if the string is empty (after stripping whitespace) and
        False otherwise. Non-string objects are never considered empty.

    See also :meth:`is_none_or_empty`, :meth:`is_full_string`.

    >>> is_empty('')
    True
    >>> is_empty([1, 2, 3])
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() == ""
def is_full_string(in_str: Any) -> bool:
    """Check whether an object is a string with visible content.

    Args:
        in_str: the object to test

    Returns:
        True if the object is a string and is not empty ('') and
        is not only composed of whitespace.

    See also :meth:`is_string`, :meth:`is_empty_string`, :meth:`is_none_or_empty`.

    >>> is_full_string('test!')
    True
    >>> is_full_string('')
    False
    >>> is_full_string('      ')
    False
    >>> is_full_string(100.999)
    False
    >>> is_full_string({"a": 1, "b": 2})
    False
    """
    if not is_string(in_str):
        return False
    return in_str.strip() != ""
def is_number(in_str: str) -> bool:
    """Check whether a string holds a decimal number.

    Args:
        in_str: the string to test

    Returns:
        True if the string contains a valid numeric value (as defined by
        NUMBER_RE) and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
    etc...

    >>> is_number(100.5)
    Traceback (most recent call last):
    ...
    ValueError: 100.5
    >>> is_number("100.5")
    True
    >>> is_number("test")
    False
    >>> is_number([1, 2, 3])
    Traceback (most recent call last):
    ...
    ValueError: [1, 2, 3]
    """
    if is_string(in_str):
        matched = NUMBER_RE.match(in_str)
        return matched is not None
    raise ValueError(in_str)
def is_integer_number(in_str: str) -> bool:
    """Check whether a string holds an integer in any supported notation.

    Args:
        in_str: the string to test

    Returns:
        True if the string contains a valid (signed or unsigned,
        decimal, hex, octal or binary, regular or scientific) integral
        expression and False otherwise.

    Raises:
        ValueError: if in_str is not a string (raised by the helpers).

    See also :meth:`is_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
    etc...

    >>> is_integer_number('42')
    True
    >>> is_integer_number('42.0')
    False
    """
    # A plain number with no decimal point is an integer.
    if is_number(in_str) and "." not in in_str:
        return True
    # Otherwise try the prefixed notations, in the same order as before.
    if is_hexidecimal_integer_number(in_str):
        return True
    if is_octal_integer_number(in_str):
        return True
    return is_binary_integer_number(in_str)
def is_hexidecimal_integer_number(in_str: str) -> bool:
    """Check whether a string holds a 0x-prefixed hexadecimal integer.

    Args:
        in_str: the string to test

    Returns:
        True if the string is a hex integer number and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_octal_integer_number`, :meth:`is_binary_integer_number`, etc...

    >>> is_hexidecimal_integer_number('0x12345')
    True
    >>> is_hexidecimal_integer_number('0x1A3E')
    True
    >>> is_hexidecimal_integer_number('1234')  # Needs 0x
    False
    >>> is_hexidecimal_integer_number('-0xff')
    True
    >>> is_hexidecimal_integer_number('test')
    False
    >>> is_hexidecimal_integer_number(12345)  # Not a string
    Traceback (most recent call last):
    ...
    ValueError: 12345
    >>> is_hexidecimal_integer_number(101.4)
    Traceback (most recent call last):
    ...
    ValueError: 101.4
    >>> is_hexidecimal_integer_number(0x1A3E)
    Traceback (most recent call last):
    ...
    ValueError: 6718
    """
    if is_string(in_str):
        return HEX_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_octal_integer_number(in_str: str) -> bool:
    """Check whether a string holds a 0o-prefixed octal integer.

    Args:
        in_str: the string to test

    Returns:
        True if the string is a valid octal integral number and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_binary_integer_number`,
    etc...

    >>> is_octal_integer_number('0o777')
    True
    >>> is_octal_integer_number('-0O115')
    True
    >>> is_octal_integer_number('0xFF')  # Not octal, needs 0o
    False
    >>> is_octal_integer_number('7777')  # Needs 0o
    False
    >>> is_octal_integer_number('test')
    False
    """
    if is_string(in_str):
        return OCT_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def is_binary_integer_number(in_str: str) -> bool:
    """Check whether a string holds a 0b-prefixed binary integer.

    Args:
        in_str: the string to test

    Returns:
        True if the string contains a binary integral number and False otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
    etc...

    >>> is_binary_integer_number('0b10111')
    True
    >>> is_binary_integer_number('-0b111')
    True
    >>> is_binary_integer_number('0B10101')
    True
    >>> is_binary_integer_number('0b10102')
    False
    >>> is_binary_integer_number('0xFFF')
    False
    >>> is_binary_integer_number('test')
    False
    """
    if is_string(in_str):
        return BIN_NUMBER_RE.match(in_str) is not None
    raise ValueError(in_str)
def to_int(in_str: str) -> int:
    """Convert a string holding an integer (any supported base) to an int.

    Args:
        in_str: the string to convert

    Returns:
        The integral value of the string or raises on error.

    Raises:
        ValueError: if in_str is not a string or cannot be parsed.

    See also :meth:`is_integer_number`, :meth:`is_decimal_number`,
    :meth:`is_hexidecimal_integer_number`, :meth:`is_octal_integer_number`,
    :meth:`is_binary_integer_number`, etc...

    >>> to_int('1234')
    1234
    >>> to_int('0b01101')
    13
    >>> to_int('test')
    Traceback (most recent call last):
    ...
    ValueError: invalid literal for int() with base 10: 'test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # Prefixed notations are probed in order; int() understands the prefix
    # when it is given the matching base.
    prefixed_notations = (
        (is_binary_integer_number, 2),
        (is_octal_integer_number, 8),
        (is_hexidecimal_integer_number, 16),
    )
    for recognizes, base in prefixed_notations:
        if recognizes(in_str):
            return int(in_str, base)

    # No known prefix: fall back to plain base-10 parsing.
    return int(in_str)
def number_string_to_integer(in_str: str) -> int:
    """Convert a string containing a written-out number into an int.

    Args:
        in_str: the string containing the long-hand written out integer number
            in English.  See examples below.

    Returns:
        The integer whose value was parsed from in_str.

    Raises:
        ValueError: if a word is neither a known number word nor an
            integer literal.

    See also :meth:`integer_to_number_string`.

    .. warning::
        This code only handles integers; it will not work with decimals / floats.

    >>> number_string_to_integer("one hundred fifty two")
    152

    >>> number_string_to_integer("ten billion two hundred million fifty four thousand three")
    10200054003

    >>> number_string_to_integer("four-score and 7")
    87

    >>> number_string_to_integer("fifty xyzzy three")
    Traceback (most recent call last):
    ...
    ValueError: Unknown word: xyzzy
    """
    # Already numeric: nothing to parse.
    if isinstance(in_str, int):
        return int(in_str)

    total = 0  # sum of the completed magnitude groups
    pending = 0  # value of the group currently being assembled
    for token in in_str.replace('-', ' ').split():
        if token in NUM_WORDS:
            scale, increment = NUM_WORDS[token]
            pending = pending * scale + increment
            # Scales above one hundred (thousand, million, ...) close the
            # current group.
            if scale > 100:
                total += pending
                pending = 0
        elif is_integer_number(token):
            # Bare digits mixed into the text are added to the current group.
            pending += int(token)
        else:
            raise ValueError("Unknown word: " + token)
    return total + pending
def integer_to_number_string(num: int) -> str:
    """
    Opposite of :meth:`number_string_to_integer`; converts a number to a written out
    longhand format in English.

    Args:
        num: the integer number to convert

    Returns:
        The long-hand written out English form of the number.  See examples below.

    See also :meth:`number_string_to_integer`.

    .. warning::
        This method does not handle decimals or floats, only ints.

    >>> integer_to_number_string(9)
    'nine'

    >>> integer_to_number_string(42)
    'forty two'

    >>> integer_to_number_string(123219982)
    'one hundred twenty three million two hundred nineteen thousand nine hundred eighty two'
    """
    # Small numbers come straight out of the lookup tables.
    if num < 20:
        return UNIT_WORDS[num]
    if num < 100:
        tens, units = num // 10, num % 10
        words = TENS_WORDS[tens]
        if units != 0:
            words += ' ' + UNIT_WORDS[units]
        return words

    # For larger numbers pick the biggest scale word (e.g. 'thousand' for
    # 123,456) whose magnitude does not exceed num.  NUM_WORDS maps a word to
    # (scale, increment); only the scale matters here.  The leading chunk
    # (123) is converted recursively, followed by the scale word and then the
    # remainder.
    candidates = {}
    for word, entry in NUM_WORDS.items():
        if entry[0] <= num:
            candidates[word] = entry[0]
    scale_name, scale_value = max(candidates.items(), key=lambda item: item[1])

    words = integer_to_number_string(num // scale_value) + ' ' + scale_name
    remainder = num % scale_value
    if remainder != 0:
        words += ' ' + integer_to_number_string(remainder)
    return words
def is_decimal_number(in_str: str) -> bool:
    """Check whether a string holds a number with a decimal point.

    Args:
        in_str: the string to check

    Returns:
        True if the given string represents a decimal or False
        otherwise.  A decimal may be signed or unsigned or use
        a "scientific notation".

    See also :meth:`is_integer_number`.

    .. note::
        We do not consider integers without a decimal point
        to be decimals; they return False (see example).

    >>> is_decimal_number('42.0')
    True
    >>> is_decimal_number('42')
    False
    """
    if not is_number(in_str):
        return False
    return "." in in_str
def strip_escape_sequences(in_str: str) -> str:
    """Remove escape sequences (as matched by ESCAPE_SEQUENCE_RE) from a string.

    Args:
        in_str: the string to strip of escape sequences.

    Returns:
        in_str with escape sequences removed.

    See also: :mod:`pyutils.ansi`.

    .. note::
        What is considered to be an "escape sequence" is defined
        by a regular expression.  While this gets common ones,
        there may exist valid sequences that it doesn't match.

    >>> strip_escape_sequences('\x1B[12;11;22mthis is a test!')
    'this is a test!'
    """
    return ESCAPE_SEQUENCE_RE.sub("", in_str)
def add_thousands_separator(
    in_str: str, *, separator_char: str = ',', places: int = 3
) -> str:
    """Insert a separator between groups of digits in a number.

    Args:
        in_str: string or number to which to add thousands separator(s)
        separator_char: the separator character to add (defaults to comma)
        places: add a separator every N places (defaults to three)

    Returns:
        A numeric string with thousands separators added appropriately.

    Raises:
        ValueError: if in_str is not a number or a numeric string.

    >>> add_thousands_separator('12345678')
    '12,345,678'
    >>> add_thousands_separator(12345678)
    '12,345,678'
    >>> add_thousands_separator(12345678.99)
    '12,345,678.99'
    >>> add_thousands_separator('test')
    Traceback (most recent call last):
    ...
    ValueError: test
    """
    # Numeric input is rendered to text first so that a single code path
    # handles both strings and numbers.
    if isinstance(in_str, numbers.Number):
        in_str = f'{in_str}'
    if not is_number(in_str):
        raise ValueError(in_str)
    return _add_thousands_separator(
        in_str, separator_char=separator_char, places=places
    )
def _add_thousands_separator(in_str: str, *, separator_char=',', places=3) -> str:
    """Internal helper: group the integer digits of a numeric string.

    Args:
        in_str: a numeric string, optionally signed and optionally with a
            single decimal point.
        separator_char: text inserted between digit groups.
        places: number of digits per group, counted from the right.

    Returns:
        The string with separators added to its integer part; a leading
        sign and the decimal part are carried over unchanged.
    """
    # Set a leading sign aside so it is not grouped as if it were a digit
    # ("-123456" must become "-123,456", not "-,123,456").
    sign = ""
    if in_str[:1] in ("+", "-"):
        sign, in_str = in_str[0], in_str[1:]

    integer_part, _, decimal_part = in_str.partition('.')

    # Passing the SAME iterator `places` times makes zip_longest pull
    # consecutive characters into each tuple, i.e. it chunks the reversed
    # digits into groups of `places`.
    chunker = [iter(integer_part[::-1])] * places
    ret = separator_char.join(
        "".join(chunk) for chunk in zip_longest(*chunker, fillvalue="")
    )[::-1]
    if decimal_part:
        ret += '.' + decimal_part
    return sign + ret
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """Check whether a string is a URL.

    Args:
        in_str: the string to test
        allowed_schemes: an optional list of allowed schemes (e.g.
            ['http', 'https', 'ftp'].  If passed, only URLs that
            begin with the one of the schemes passed will be considered
            to be valid.  Otherwise, any scheme:// will be considered
            valid.

    Returns:
        True if in_str contains a valid URL and False otherwise.

    >>> is_url('http://www.mysite.com')
    True
    >>> is_url('https://mysite.com')
    True
    >>> is_url('.mysite.com')
    False
    >>> is_url('scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash')
    True
    """
    if not is_full_string(in_str):
        return False

    if URL_RE.match(in_str) is None:
        return False
    if not allowed_schemes:
        return True
    # Plain prefix test: the URL must start with one of the given schemes.
    return any(in_str.startswith(scheme) for scheme in allowed_schemes)
def is_email(in_str: Any) -> bool:
    """Check whether a string is an email address.

    Args:
        in_str: the email address to check

    Returns: True if the in_str contains a valid email (as defined by
        https://tools.ietf.org/html/rfc3696#section-3) or False
        otherwise.

    >>> is_email('@gmail.com')
    False
    """
    if not is_full_string(in_str):
        return False
    if len(in_str) > 320 or in_str.startswith("."):
        return False

    pieces = in_str.split("@")
    if len(pieces) != 2:
        # Borderline case: anything other than exactly one "@".  The address
        # can still be valid if the extra "@" signs in the head are escaped
        # or quoted; replace those with "a" and test again.
        if ESCAPED_AT_SIGN.search(in_str) is None:
            return False
        return is_email(ESCAPED_AT_SIGN.sub("a", in_str))

    local_part, domain = pieces

    # The head is limited to 64 characters and the tail to 255; the head
    # must not end with a dot or contain consecutive dots.
    if len(local_part) > 64 or len(domain) > 255:
        return False
    if local_part.endswith(".") or ".." in local_part:
        return False

    # Drop escaped spaces (and, for a quoted head, all spaces and the quotes
    # themselves) so that the test regex will accept the string.
    local_part = local_part.replace("\\ ", "")
    if local_part.startswith('"') and local_part.endswith('"'):
        local_part = local_part.replace(" ", "")[1:-1]
    return EMAIL_RE.match(local_part + "@" + domain) is not None
def suffix_string_to_number(in_str: str) -> Optional[int]:
    """Takes a string like "33Gb" and converts it into a number (of bytes)
    like 34603008.

    Args:
        in_str: the string with a suffix to be interpreted and removed.

    Returns:
        An integer number of bytes or None to indicate an error (input
        is not a string, is empty, or has no recognized suffix/number).

    See also :meth:`number_to_suffix_string`.

    >>> suffix_string_to_number('1Mb')
    1048576
    >>> suffix_string_to_number('13.1Gb')
    14066017894
    """

    def suffix_capitalize(s: str) -> str:
        """Normalize a candidate suffix to the 'Xy' / 'X' key form."""
        if len(s) == 2:
            return f"{s[0].upper()}{s[1].lower()}"
        # One character, or empty when in_str itself is empty.  This must not
        # recurse: the old recursive fallback never terminated for "".
        return s[0:1].upper()

    if not is_string(in_str):
        return None
    if is_integer_number(in_str):
        return to_int(in_str)

    # Try a two-character suffix first ("Gb"), then a one-character one ("G"),
    # each paired with the text that precedes it.
    candidates = (
        (in_str[-2:], in_str[:-2]),
        (in_str[-1:], in_str[:-1]),
    )
    for suffix, number_part in candidates:
        multiplier = NUM_SUFFIXES.get(suffix_capitalize(suffix), None)
        if multiplier is None:
            continue
        if is_integer_number(number_part):
            return to_int(number_part) * multiplier
        if is_decimal_number(number_part):
            return int(float(number_part) * multiplier)
    return None
def number_to_suffix_string(num: int) -> Optional[str]:
    """Take a number (of bytes) and returns a string like "43.8Gb".

    Args:
        num: an integer number of bytes

    Returns:
        A string with a suffix representing num bytes concisely.  When
        num is smaller than every entry in NUM_SUFFIXES it is rendered
        as a plain integer.

    See also: :meth:`suffix_string_to_number`.

    >>> number_to_suffix_string(14066017894)
    '13.1Gb'
    >>> number_to_suffix_string(1024 * 1024)
    '1.0Mb'
    """
    # The first suffix (in NUM_SUFFIXES iteration order) that fits wins.
    for suffix, size in NUM_SUFFIXES.items():
        if num >= size:
            scaled = num / size
            return f"{scaled:.1f}{suffix}"
    return f'{num:d}'
def is_credit_card(in_str: Any, card_type: str = None) -> bool:
    """Check whether a string looks like a credit card number.

    Args:
        in_str: a string to check
        card_type: if provided, contains the card type to validate
            with.  Otherwise, all known credit card number types will
            be accepted.

            Supported card types are the keys of CREDIT_CARDS: VISA,
            MASTERCARD, AMERICAN_EXPRESS, DINERS_CLUB, DISCOVER, JCB.

    Returns:
        True if in_str is a valid credit card number.

    Raises:
        KeyError: if card_type is given but is not a supported type.

    .. warning::
        This code is not verifying the authenticity of the credit card (i.e.
        not checking whether it's a real card that can be charged); rather
        it's only checking that the number follows the "rules" for numbering
        established by credit card issuers.
    """
    if not is_full_string(in_str):
        return False

    if card_type is None:
        # No type requested: any known issuer pattern is good enough.
        return any(
            pattern.match(in_str) is not None for pattern in CREDIT_CARDS.values()
        )

    if card_type not in CREDIT_CARDS:
        raise KeyError(
            f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
        )
    return CREDIT_CARDS[card_type].match(in_str) is not None
def is_camel_case(in_str: Any) -> bool:
    """Check whether a string is camel case.

    Args:
        in_str: the string to test

    Returns:
        True if the string is formatted as camel case and False otherwise.
        A string is considered camel case when:

        * it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
        * it contains both lowercase and uppercase letters
        * it does not start with a number

    See also :meth:`is_snake_case`, :meth:`is_slug`, and :meth:`camel_case_to_snake_case`.
    """
    if not is_full_string(in_str):
        return False
    return CAMEL_CASE_TEST_RE.match(in_str) is not None
def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """Check whether a string is snake case.

    Args:
        in_str: the string to test
        separator: the snake case separator character to use

    Returns: True if the string is snake case and False otherwise.  A
        string is considered snake case when:

        * it's composed only by lowercase/uppercase letters and digits
        * it contains at least one underscore (or provided separator)
        * it does not start with a number

    See also :meth:`is_camel_case`, :meth:`is_slug`, and :meth:`snake_case_to_camel_case`.

    >>> is_snake_case('this_is_a_test')
    True
    >>> is_snake_case('___This_Is_A_Test_1_2_3___')
    True
    >>> is_snake_case('this-is-a-test')
    False
    >>> is_snake_case('this-is-a-test', separator='-')
    True
    """
    if not is_full_string(in_str):
        return False

    # The two common separators have precompiled patterns; only build a
    # pattern on demand for anything else.
    if separator == "_":
        rex = SNAKE_CASE_TEST_RE
    elif separator == "-":
        rex = SNAKE_CASE_TEST_DASH_RE
    else:
        sign = re.escape(separator)
        # Anchored at both ends, like the precompiled patterns, so that
        # trailing junk after a snake-case prefix is rejected.
        rex = re.compile(
            rf"^([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)$",
            re.IGNORECASE,
        )
    return rex.match(in_str) is not None
def is_json(in_str: Any) -> bool:
    """Check whether a string holds a JSON object or array.

    Args:
        in_str: the string to test

    Returns:
        True if the in_str contains valid JSON and False otherwise.

    >>> is_json('{"name": "Peter"}')
    True
    >>> is_json('[1, 2, 3]')
    True
    >>> is_json('{nope}')
    False
    """
    if not is_full_string(in_str):
        return False
    # Cheap shape check first: must be wrapped in {...} or [...].
    if JSON_WRAPPER_RE.match(in_str) is None:
        return False
    try:
        parsed = json.loads(in_str)
    except (TypeError, ValueError, OverflowError):
        return False
    return isinstance(parsed, (dict, list))
def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """Check whether a value is formatted as a UUID.

    Args:
        in_str: the string to test
        allow_hex: if True, also accept the form in which the dashes are
            omitted (matched against UUID_HEX_OK_RE)

    Returns:
        True if the in_str contains a valid UUID and False otherwise.

    See also :meth:`generate_uuid`.

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf')
    True
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf')
    False
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True)
    True
    """
    # string casting is used to allow UUID itself as input data type
    text = str(in_str)
    pattern = UUID_HEX_OK_RE if allow_hex else UUID_RE
    return pattern.match(text) is not None
def is_ip_v4(in_str: Any) -> bool:
    """Check whether a string is a dotted-quad IPv4 address.

    Args:
        in_str: the string to test

    Returns:
        True if in_str contains a valid IPv4 address and False otherwise.

    See also :meth:`extract_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v4('255.200.100.75')
    True
    >>> is_ip_v4('nope')
    False
    >>> is_ip_v4('255.200.100.999')  # 999 out of range
    False
    """
    if not is_full_string(in_str):
        return False
    if SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # The regex only checks the shape; each entry must also be in the
    # valid range (0 to 255).
    return all(0 <= int(octet) <= 255 for octet in in_str.split("."))
def extract_ip_v4(in_str: Any) -> Optional[str]:
    """Find the first IPv4-shaped substring in a string.

    Args:
        in_str: the string to extract an IPv4 address from.

    Returns:
        The first extracted IPv4 address from in_str or None if
        none were found or an error occurred.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> extract_ip_v4('   The secret IP address: 127.0.0.1 (use it wisely)   ')
    '127.0.0.1'
    >>> extract_ip_v4('Your mom dresses you funny.')
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V4_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip_v6(in_str: Any) -> bool:
    """Check whether a string is an IPv6 address (as matched by IP_V6_RE).

    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IPv6 address and False otherwise.

    See also :meth:`is_ip_v4`, :meth:`extract_ip_v4`, :meth:`extract_ip_v6`,
    and :meth:`is_ip`.

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?')    # invalid "?"
    False
    """
    if not is_full_string(in_str):
        return False
    return IP_V6_RE.match(in_str) is not None
def extract_ip_v6(in_str: Any) -> Optional[str]:
    """Find the first IPv6-shaped substring in a string.

    Args:
        in_str: the string from which to extract an IPv6 address.

    Returns:
        The first IPv6 address found in in_str or None if no address
        was found or an error occurred.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v4`,
    and :meth:`is_ip`.

    >>> extract_ip_v6('IP: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip_v6("(and she's ugly too, btw)")
    """
    if not is_full_string(in_str):
        return None
    found = ANYWHERE_IP_V6_RE.search(in_str)
    return None if found is None else found.group(0)
def is_ip(in_str: Any) -> bool:
    """Check whether a string is an IP address of either family.

    Args:
        in_str: the string to test.

    Returns:
        True if in_str contains a valid IP address (either IPv4 or
        IPv6).

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> is_ip('255.200.100.75')
    True
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334')
    True
    >>> is_ip('1.2.3.999')
    False
    """
    # IPv6 is tried first, then IPv4.
    if is_ip_v6(in_str):
        return True
    return is_ip_v4(in_str)
def extract_ip(in_str: Any) -> Optional[str]:
    """Find the first IP address (IPv4 preferred, then IPv6) in a string.

    Args:
        in_str: the string from which to extract in IP address.

    Returns:
        The first IP address (IPv4 or IPv6) found in in_str or
        None to indicate none found or an error condition.

    See also :meth:`is_ip_v4`, :meth:`is_ip_v6`, :meth:`extract_ip_v6`,
    and :meth:`extract_ip_v4`.

    >>> extract_ip('Attacker: 255.200.100.75')
    '255.200.100.75'
    >>> extract_ip('Remote host: 2001:db8:85a3:0000:0000:8a2e:370:7334')
    '2001:db8:85a3:0000:0000:8a2e:370:7334'
    >>> extract_ip('1.2.3')
    """
    v4_address = extract_ip_v4(in_str)
    if v4_address is not None:
        return v4_address
    return extract_ip_v6(in_str)
def is_mac_address(in_str: Any) -> bool:
    """Check whether a string is a MAC address.

    Args:
        in_str: the string to test

    Returns:
        True if in_str is a valid MAC address False otherwise.

    See also :meth:`extract_mac_address`, :meth:`is_ip`, etc...

    >>> is_mac_address("34:29:8F:12:0D:2F")
    True
    >>> is_mac_address('34:29:8f:12:0d:2f')
    True
    >>> is_mac_address('34-29-8F-12-0D-2F')
    True
    >>> is_mac_address("test")
    False
    """
    if not is_full_string(in_str):
        return False
    return MAC_ADDRESS_RE.match(in_str) is not None
def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """Find the first MAC address in a string and normalize its separator.

    Args:
        in_str: the string from which to extract a MAC address.
        separator: the separator to use between the bytes of the returned
            address; both ":" and "-" in the match are replaced by it.

    Returns:
        The first MAC address found in in_str or None to indicate no
        match or an error.

    See also :meth:`is_mac_address`, :meth:`is_ip`, and :meth:`extract_ip`.

    >>> extract_mac_address(' MAC Address: 34:29:8F:12:0D:2F')
    '34:29:8F:12:0D:2F'

    >>> extract_mac_address('? (10.0.0.30) at d8:5d:e2:34:54:86 on em0 expires in 1176 seconds [ethernet]')
    'd8:5d:e2:34:54:86'
    """
    if not is_full_string(in_str):
        return None

    m = ANYWHERE_MAC_ADDRESS_RE.search(in_str)
    if m is None:
        return None

    # str.replace returns a new string (strings are immutable), so the
    # result has to be kept; discarding it left `separator` without effect.
    mac = m.group(0)
    mac = mac.replace(":", separator)
    mac = mac.replace("-", separator)
    return mac
def is_slug(in_str: Any, separator: str = "-") -> bool:
    """Check whether a string is a slug.

    Args:
        in_str: string to test
        separator: the slug character to use

    Returns:
        True if in_str is a slug string and False otherwise.  A slug is
        made of runs of lowercase letters/digits joined by one or more
        separators; it must start and end with a letter or digit.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`slugify`.

    >>> is_slug('my-blog-post-title')
    True
    >>> is_slug('My blog post title')
    False
    """
    if not is_full_string(in_str):
        return False
    # Written as word(sep+ word)* rather than (word+ sep*)* word: it accepts
    # the same strings for a single-character separator, but has no nested
    # quantifier that can backtrack exponentially on non-matching input.
    rex = r"^[a-z\d]+(?:(?:" + re.escape(separator) + r")+[a-z\d]+)*$"
    return re.match(rex, in_str) is not None
def contains_html(in_str: str) -> bool:
    """Check whether a string contains anything that looks like a tag.

    Args:
        in_str: the string to check for tags in

    Returns:
        True if the given string contains HTML/XML tags and False
        otherwise.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`strip_html`.

    .. warning::
        By design, this function matches ANY type of tag, so don't expect
        to use it as an HTML validator.  It's a quick sanity check at
        best.  See something like BeautifulSoup for a more full-featured
        solution.

    >>> contains_html('my string is <strong>bold</strong>')
    True
    >>> contains_html('my string is not bold')
    False
    """
    if is_string(in_str):
        return HTML_RE.search(in_str) is not None
    raise ValueError(in_str)
def words_count(in_str: str) -> int:
    """Count the words in a string.

    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    Raises:
        ValueError: if in_str is not a string.

    .. note::
        This method is "smart" in that it does consider only sequences
        of one or more letter and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> words_count('hello world')
    2
    >>> words_count('one,two,three.stop')
    4
    """
    if is_string(in_str):
        matches = WORDS_COUNT_RE.findall(in_str)
        return len(matches)
    raise ValueError(in_str)
def word_count(in_str: str) -> int:
    """Alias of :meth:`words_count`.

    Args:
        in_str: the string to count words in

    Returns:
        The number of words contained in the given string.

    .. note::
        This method is "smart" in that it does consider only sequences
        of one or more letter and/or numbers to be "words".  Thus a
        string like this: "! @ # % ... []" will return zero.  Moreover
        it is aware of punctuation, so the count for a string like
        "one,two,three.stop" will be 4 not 1 (even if there are no spaces
        in the string).

    >>> word_count('hello world')
    2
    >>> word_count('one,two,three.stop')
    4
    """
    count = words_count(in_str)
    return count
def generate_uuid(omit_dashes: bool = False) -> str:
    """Generate a random (version 4) UUID string.

    Args:
        omit_dashes: should we omit the dashes in the generated UUID?

    Returns:
        A generated UUID string (using `uuid.uuid4()`) with or without
        dashes per the omit_dashes arg.

    See also :meth:`is_uuid`, :meth:`generate_random_alphanumeric_string`.

    Examples (output is random, so these are not doctests)::

        generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
        generate_uuid(omit_dashes=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
    """
    fresh = uuid4()
    # .hex is the 32-character form without dashes.
    return fresh.hex if omit_dashes else str(fresh)
def generate_random_alphanumeric_string(size: int) -> str:
    """Build a random string of ASCII letters and digits.

    Args:
        size: number of characters to generate

    Returns:
        A string of the specified size containing random characters
        (uppercase/lowercase ascii letters and digits).

    Raises:
        ValueError: if size is less than one.

    See also :meth:`asciify`, :meth:`generate_uuid`.

    Example (output depends on the state of the `random` module)::

        generate_random_alphanumeric_string(9)
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(size):
        picked.append(random.choice(alphabet))
    return from_char_list(picked)
def reverse(in_str: str) -> str:
    """Reverse a string.

    Args:
        in_str: the string to reverse

    Returns:
        The reversed (character by character) string.

    Raises:
        ValueError: if in_str is not a string.

    >>> reverse('test')
    'tset'
    """
    if is_string(in_str):
        # A slice with step -1 walks the string backwards.
        return in_str[::-1]
    raise ValueError(in_str)
def camel_case_to_snake_case(in_str: str, *, separator: str = "_"):
    """Convert a camel case string to snake case.

    Args:
        in_str: the camel case string to convert
        separator: the snake case separator character to use

    Returns:
        A snake case string equivalent to the camel case input or the
        original string if it is not a valid camel case string or some
        other error occurs.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> camel_case_to_snake_case('MacAddressExtractorFactory')
    'mac_address_extractor_factory'
    >>> camel_case_to_snake_case('Luke Skywalker')
    'Luke Skywalker'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if is_camel_case(in_str):

        def with_separator(match):
            # Keep the matched letters and append the separator after them.
            return match.group(1) + separator

        return CAMEL_CASE_REPLACE_RE.sub(with_separator, in_str).lower()
    return in_str
def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """Convert a snake case string to camel case.

    Args:
        in_str: the snake case string to convert
        upper_case_first: should we capitalize the first letter?
        separator: the separator character to use

    Returns:
        A camel case string that is equivalent to the snake case string
        provided or the original string back again if it is not valid
        snake case or another error occurs.

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_camel_case`, :meth:`is_snake_case`, and :meth:`is_slug`.

    >>> snake_case_to_camel_case('this_is_a_test')
    'ThisIsATest'
    >>> snake_case_to_camel_case('Han Solo')
    'Han Solo'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str

    # Title-case every non-blank piece between separators.
    words = []
    for piece in in_str.split(separator):
        if is_full_string(piece):
            words.append(piece.title())
    if not upper_case_first:
        words[0] = words[0].lower()
    return from_char_list(words)
def to_char_list(in_str: str) -> List[str]:
    """Split a string into a list of its characters.

    Args:
        in_str: the string to split into a char list

    Returns:
        A list of strings of length one each.  A non-string input
        yields an empty list.

    See also :meth:`from_char_list`.

    >>> to_char_list('test')
    ['t', 'e', 's', 't']
    """
    if is_string(in_str):
        return list(in_str)
    return []
def from_char_list(in_list: List[str]) -> str:
    """Glue a list of characters (or string fragments) into one string.

    Args:
        in_list: A list of characters to convert into a string.

    Returns:
        The string resulting from gluing the characters in in_list
        together.

    See also :meth:`to_char_list`.

    >>> from_char_list(['t', 'e', 's', 't'])
    'test'
    """
    return "".join(in_list)
def shuffle(in_str: str) -> Optional[str]:
    """Randomly reorder the characters of a string.

    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing same chars of the given one but in
        a randomized order.  Note that in rare cases this could result
        in the same original string as no check is done.  Returns
        None to indicate error conditions (in_str is not a string).

    The result depends on the state of the :mod:`random` module's
    generator, e.g. ``shuffle('awesome')`` returns some permutation of
    the letters in 'awesome'.
    """
    if not is_string(in_str):
        return None
    chars = to_char_list(in_str)
    # random.shuffle works in place on the list of characters.
    random.shuffle(chars)
    return from_char_list(chars)
def scramble(in_str: str) -> Optional[str]:
    """Alias of :meth:`shuffle`.

    Args:
        in_str: a string to shuffle randomly by character

    Returns:
        A new string containing same chars of the given one but in
        a randomized order.  Note that in rare cases this could result
        in the same original string as no check is done.  Returns
        None to indicate error conditions.

    See also :mod:`pyutils.unscrambler`.
    """
    return shuffle(in_str)
def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """Remove HTML tags from a string.

    Args:
        in_str: the string to strip tags from
        keep_tag_content: should we keep the inner contents of tags?

    Returns:
        A string with all HTML tags removed (optionally with tag contents
        preserved).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`contains_html`.

    .. note::
        This method uses simple regular expressions to strip tags and is
        not a full fledged HTML parser by any means.  Consider using
        something like BeautifulSoup if your needs are more than this
        simple code can fulfill.

    >>> strip_html('test: <a href="foo/bar">click here</a>')
    'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True)
    'test: click here'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # HTML_TAG_ONLY_RE is used when the inner text is to be kept, HTML_RE
    # when the whole element is to be removed.
    pattern = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
    return pattern.sub("", in_str)
def asciify(in_str: str) -> str:
    """Translate a string into a roughly equivalent ASCII-only string.

    Args:
        in_str: the string to asciify.

    Returns:
        An output string roughly equivalent to the original string
        where all content is ascii-only.  This is accomplished
        by translating all non-ascii chars into their closest possible
        ASCII representation (eg: ó -> o, Ë -> E, ç -> c...).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`to_ascii`, :meth:`generate_random_alphanumeric_string`.

    .. warning::
        Some chars may be lost if impossible to translate.

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË')
    'eeuuooaaeynAAACIINOE'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # NFKD decomposes accented characters into a base character plus
    # combining marks, so the base character survives the ASCII encoding.
    normalized = unicodedata.normalize("NFKD", in_str)

    # Encode forcing ascii and ignore any errors (unrepresentable chars,
    # including the combining marks, are stripped out).
    ascii_bytes = normalized.encode("ascii", "ignore")

    # The bytes are pure ASCII by construction; decode with the same codec.
    return ascii_bytes.decode("ascii")
def slugify(in_str: str, *, separator: str = "-") -> str:
    """Convert a string into a "slug".

    Args:
        in_str: the string to slugify
        separator: the character to use during slugification (default
            is a dash)

    Returns:
        The converted string.  The returned string has the following properties:

        * all letters are in lower case
        * all punctuation signs and non alphanumeric chars are removed
        * words are divided using provided separator
        * all chars are encoded as ascii (by using :meth:`asciify`)

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`is_slug` and :meth:`asciify`.

    >>> slugify('Top 10 Reasons To Love Dogs!!!')
    'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët')
    'monster-magnet'
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    def _separator(_match) -> str:
        # Callable replacement: the separator is inserted literally even if
        # it contains backslashes or group references.
        return separator

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # replace spaces with join sign
    out = SPACES_RE.sub(_separator, out)

    # normalize joins (remove duplicates); the group makes the repetition
    # apply to the whole separator, not just its last character, and keeps
    # the pattern valid when the separator is empty.
    out = re.sub(r"(?:" + re.escape(separator) + r")+", _separator, out)
    return asciify(out)
def to_bool(in_str: str) -> bool:
    """Convert a string into a boolean.

    Args:
        in_str: the string to convert to boolean

    Returns:
        A boolean equivalent of the original string based on its contents.
        All conversion is case insensitive.  A positive boolean (True) is
        returned if the string value is any of the following:

        * "true"
        * "t"
        * "1"
        * "yes"
        * "y"
        * "on"

        Otherwise False is returned.

    Raises:
        ValueError: if in_str is not a string.

    See also :mod:`pyutils.argparse_utils`.

    >>> to_bool('True')
    True
    >>> to_bool('no')
    False
    >>> to_bool('on')
    True
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    # A set literal of constants is built once by the compiler rather than
    # being rebuilt from a list on every call.
    return in_str.lower() in {"true", "1", "yes", "y", "t", "on"}
def to_date(in_str: str) -> Optional[datetime.date]:
    """Parse a string into a date.

    Args:
        in_str: the string to convert into a date

    Returns:
        The datetime.date the string contained or None to indicate
        an error.  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`extract_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> to_date('9/11/2001')
    datetime.date(2001, 9, 11)
    >>> to_date('xyzzy')
    """
    # Imported lazily, inside the function, as in the rest of this module.
    import pyutils.datetimes.dateparse_utils as du

    try:
        d = du.DateParser()  # type: ignore
        d.parse(in_str)
        return d.get_date()
    except du.ParseException:  # type: ignore
        # Unparseable input is reported as a warning and signalled by None.
        logger.warning('Unable to parse date %s.', in_str)
    return None
def extract_date(in_str: Any) -> Optional[datetime.datetime]:
    """Finds and extracts a date from the string, if possible.

    The string is split on whitespace and every run of 5, 4, 3 and then 2
    consecutive words is handed to the date parser; the first run that
    parses wins.

    Args:
        in_str: the string to extract a date from

    Returns:
        a datetime if date was found, otherwise None

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`is_valid_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> extract_date("filename.txt dec 13, 2022")
    datetime.datetime(2022, 12, 13, 0, 0)

    >>> extract_date("Dear Santa, please get me a pony.")

    """
    import itertools

    import pyutils.datetimes.dateparse_utils as du

    d = du.DateParser()  # type: ignore
    chunks = in_str.split()
    # Longest candidates first so that the most specific expression wins.
    candidates = itertools.chain(
        list_utils.ngrams(chunks, 5),
        list_utils.ngrams(chunks, 4),
        list_utils.ngrams(chunks, 3),
        list_utils.ngrams(chunks, 2),
    )
    for ngram in candidates:
        expr = " ".join(ngram)
        logger.debug("Trying %s", expr)
        try:
            if d.parse(expr):
                return d.get_datetime()
        except du.ParseException:  # type: ignore
            # This candidate is not a date; move on to the next one.
            continue
    return None
def is_valid_date(in_str: str) -> bool:
    """Check whether a string can be parsed as a date.

    Args:
        in_str: the string to check

    Returns:
        True if the string represents a valid date that we can recognize
        and False otherwise.  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`to_datetime`, :meth:`valid_datetime`.

    >>> is_valid_date('1/2/2022')
    True
    >>> is_valid_date('christmas')
    True
    >>> is_valid_date('next wednesday')
    True
    >>> is_valid_date('xyzzy')
    False
    """
    import pyutils.datetimes.dateparse_utils as dp

    try:
        d = dp.DateParser()  # type: ignore
        # Only success or failure of the parse matters here.
        d.parse(in_str)
        return True
    except dp.ParseException:  # type: ignore
        logger.warning('Unable to parse date %s.', in_str)
    return False
def to_datetime(in_str: str) -> Optional[datetime.datetime]:
    """Parse a string into a datetime.

    Args:
        in_str: string to parse into a datetime

    Returns:
        A python datetime parsed from in_str or None to indicate
        an error.  This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    See also: :mod:`pyutils.datetimes.dateparse_utils`, :meth:`to_date`,
    :meth:`extract_date`, :meth:`valid_datetime`.

    >>> to_datetime('7/20/1969 02:56 GMT')
    datetime.datetime(1969, 7, 20, 2, 56, tzinfo=<StaticTzInfo 'GMT'>)
    """
    import pyutils.datetimes.dateparse_utils as dp

    try:
        d = dp.DateParser()  # type: ignore
        dt = d.parse(in_str)
        if isinstance(dt, datetime.datetime):
            return dt
    # NOTE(review): the parser's failure modes are not visible from this
    # function; a broad handler keeps the "None on any error" contract
    # promised above.  Narrow it if the parser's exceptions are known.
    except Exception:
        logger.warning('Unable to parse datetime %s.', in_str)
    return None
def valid_datetime(in_str: str) -> bool:
    """Check whether a string can be parsed as a datetime.

    Args:
        in_str: the string to check

    Returns:
        True if in_str contains a valid datetime and False otherwise.
        This parser is relatively clever; see
        :class:`datetimes.dateparse_utils` docs for details.

    >>> valid_datetime('next wednesday at noon')
    True
    >>> valid_datetime('3 weeks ago at midnight')
    True
    >>> valid_datetime('next easter at 5:00 am')
    True
    >>> valid_datetime('sometime soon')
    False
    """
    if to_datetime(in_str) is not None:
        return True
    logger.warning('Unable to parse datetime %s.', in_str)
    return False
def squeeze(in_str: str, character_to_squeeze: str = ' ') -> str:
    """Collapse runs of a character (or substring) into a single instance.

    Args:
        in_str: the string to squeeze
        character_to_squeeze: the character to remove runs of
            more than one in a row (default = space)

    Returns: A "squeezed string" where runs of more than one
        character_to_squeeze are collapsed into one.

    >>> squeeze(' this        is       a    test    ')
    ' this is a test '

    >>> squeeze('one|!||!|two|!||!|three', character_to_squeeze='|!|')
    'one|!|two|!|three'

    """

    def _single(_match) -> str:
        # A callable replacement is inserted literally.  Passing the text
        # itself as the replacement would have it interpreted as a template,
        # so squeezing a backslash raised re.error.
        return character_to_squeeze

    return re.sub(
        r'(' + re.escape(character_to_squeeze) + r')+',
        _single,
        in_str,
    )
def dedent(in_str: str) -> Optional[str]:
    """Remove the leading margin from every line of a string.

    Args:
        in_str: the string to dedent

    Returns:
        A string with tab indentation removed or None on error
        (in_str is not a string).

    See also :meth:`indent`.

    >>> dedent('\\t\\ttest\\n\\t\\ting')
    'test\\ning'
    """
    if not is_string(in_str):
        return None
    line_separator = '\n'
    # MARGIN_RE is applied to each line independently.
    stripped = (MARGIN_RE.sub('', line) for line in in_str.split(line_separator))
    return line_separator.join(stripped)
def indent(in_str: str, amount: int) -> str:
    """Indent every line of a string by a number of spaces.

    Args:
        in_str: the string to indent
        amount: count of spaces to indent each line by

    Returns:
        An indented string created by prepending amount spaces to
        every line (including empty ones).

    Raises:
        ValueError: if in_str is not a string.

    See also :meth:`dedent`.

    >>> indent('This is a test', 4)
    '    This is a test'
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    line_separator = '\n'
    margin = " " * amount
    return line_separator.join(
        margin + line for line in in_str.split(line_separator)
    )
def _sprintf(*args, **kwargs) -> str:
    """Internal helper: like print() but returns the text instead.

    Args:
        *args: the values to render; non-strings are converted with str().
        **kwargs: only ``sep`` (default ' ') and ``end`` (default a
            newline) are accepted; both must be None or a string.

    Returns:
        The arguments joined by ``sep`` and followed by ``end``.

    Raises:
        TypeError: if sep or end is not None or a string, or if any other
            keyword argument is supplied.
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")

    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")

    # Anything left over is a keyword this helper does not understand.
    if kwargs:
        raise TypeError("invalid keyword arguments to sprint()")

    if sep is None:
        sep = " "
    if end is None:
        end = "\n"
    # A single join replaces repeated string concatenation.
    pieces = (arg if isinstance(arg, str) else str(arg) for arg in args)
    return sep.join(pieces) + end
def strip_ansi_sequences(in_str: str) -> str:
    """Remove ANSI CSI escape sequences (colors, cursor movement, ...).

    Args:
        in_str: the string to strip

    Returns:
        in_str with recognized ANSI escape sequences removed.

    See also :mod:`pyutils.ansi`.

    .. note::
        This method works by using a regular expression that matches
        CSI sequences: ESC [ followed by parameter bytes, optional
        intermediate bytes and one final byte.  Other kinds of escape
        sequence (e.g. OSC) are not removed; caveat emptor.

    >>> s = '\\x1b[38;5;21mblue!\\x1b[m'
    >>> len(s)
    18
    >>> len(strip_ansi_sequences(s))
    5
    >>> strip_ansi_sequences(s)
    'blue!'
    >>> strip_ansi_sequences('\\x1b[2J\\x1b[Hcleared')
    'cleared'
    """
    # Parameter bytes are 0x30-0x3F ("0-?"), intermediates 0x20-0x2F (" -/")
    # and the final byte 0x40-0x7E ("@-~").  Matching only a-z as the final
    # byte missed sequences such as ESC[2J, ESC[H, ESC[K and ESC[1A.  '+' is
    # tolerated among the parameters because the previous pattern allowed it.
    return re.sub(r'\x1b\[[0-?+]*[ -/]*[@-~]', '', in_str)
class SprintfStdout(contextlib.AbstractContextManager):
    """
    A context manager that captures outputs to stdout to a buffer
    without printing them.  Entering the context yields a callable
    that returns everything captured so far.

    >>> with SprintfStdout() as buf:
    ...     print("test")
    ...     print("1, 2, 3")
    ...
    >>> print(buf(), end='')
    test
    1, 2, 3
    """

    def __init__(self) -> None:
        # Buffer that receives everything written to stdout in the context.
        self.destination = io.StringIO()
        # The active redirection; only assigned while the context is entered.
        self.recorder: contextlib.redirect_stdout

    def __enter__(self) -> Callable[[], str]:
        """Start redirecting stdout; returns a getter for the captured text."""
        self.recorder = contextlib.redirect_stdout(self.destination)
        self.recorder.__enter__()
        return lambda: self.destination.getvalue()

    def __exit__(self, *args) -> Literal[False]:
        """Restore stdout and rewind the buffer; never swallows exceptions."""
        self.recorder.__exit__(*args)
        self.destination.seek(0)
        return False
def capitalize_first_letter(in_str: str) -> str:
    """Capitalize the first character and leave the rest untouched.

    Args:
        in_str: the string to capitalize

    Returns:
        in_str with the first character capitalized.  An empty string
        is returned unchanged.

    >>> capitalize_first_letter('test')
    'Test'
    >>> capitalize_first_letter("ALREADY!")
    'ALREADY!'
    >>> capitalize_first_letter('')
    ''
    """
    # Slicing, unlike indexing with [0], is safe on the empty string.
    return in_str[:1].upper() + in_str[1:]
def it_they(n: int) -> str:
    """Choose the pronoun that agrees with a count.

    Args:
        n: how many of them are there?

    Returns:
        'it' if n is one or 'they' otherwise.

    See also :meth:`is_are`, :meth:`pluralize`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> it_they(1)
    'it'
    >>> it_they(100)
    'they'
    """
    return "it" if n == 1 else "they"
def is_are(n: int) -> str:
    """Choose the verb form that agrees with a count.

    Args:
        n: how many of them are there?

    Returns:
        'is' if n is one or 'are' otherwise.

    See also :meth:`it_they`, :meth:`pluralize`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> is_are(1)
    'is'
    >>> is_are(2)
    'are'
    """
    return "is" if n == 1 else "are"
def pluralize(n: int) -> str:
    """Choose the plural suffix that agrees with a count.

    Args:
        n: how many of them are there?

    Returns:
        '' if n is exactly one, otherwise 's' (so zero is plural:
        "0 files").

    See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`,
    :meth:`thify`.

    Suggested usage::

        n = num_files_saved_to_tmp()
        print(f'Saved file{pluralize(n)} successfully.')
        print(f'{it_they(n)} {is_are(n)} located in /tmp.')

    >>> count = 0
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There are 0 files.
    >>> count = 1
    >>> print(f'There {is_are(count)} {count} file{pluralize(count)}.')
    There is 1 file.
    """
    return "" if n == 1 else "s"
def make_contractions(txt: str) -> str:
    """This code glues words in txt together to form (English)
    contractions.

    Args:
        txt: the input text to be contractionized.

    Returns:
        Output text identical to original input except for any
        recognized contractions are formed.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`pluralize`.

    .. note::
        The order in which we create contractions is defined by the
        implementation and what I thought made more sense when writing
        this code.

    >>> make_contractions('It is nice today.')
    "It's nice today."

    >>> make_contractions('I can not even...')
    "I can't even..."

    >>> make_contractions('She could not see!')
    "She couldn't see!"

    >>> make_contractions('But she will not go.')
    "But she won't go."

    >>> make_contractions('Verily, I shall not.')
    "Verily, I shan't."

    >>> make_contractions('No you cannot.')
    "No you can't."

    >>> make_contractions('I said you can not go.')
    "I said you can't go."
    """
    # Each entry pairs a list of first words with a list of second-word
    # patterns.  The parenthesised letters of the second word are the ones
    # that survive in the contraction.
    # NOTE(review): the word lists were reconstructed; confirm them against
    # the upstream file.
    first_second = [
        (
            [
                'are',
                'could',
                'did',
                'has',
                'have',
                'is',
                'must',
                'should',
                'was',
                'were',
                'would',
            ],
            ['(n)o(t)'],
        ),
        (
            [
                "I",
                "you",
                "he",
                "she",
                "it",
                "we",
                "they",
                "how",
                "why",
                "when",
                "where",
                "who",
                "there",
            ],
            ['woul(d)', 'i(s)', 'a(re)', 'ha(s)', 'ha(ve)', 'ha(d)', 'wi(ll)'],
        ),
    ]

    # Special cases: can't, shan't and won't.
    txt = re.sub(r'\b(can)\s*no(t)\b', r"\1'\2", txt, count=0, flags=re.IGNORECASE)
    txt = re.sub(
        r'\b(sha)ll\s*(n)o(t)\b', r"\1\2'\3", txt, count=0, flags=re.IGNORECASE
    )
    txt = re.sub(
        r'\b(w)ill\s*(n)(o)(t)\b',
        # Reorder the captured letters: w + o + n + ' + t.
        r"\1\3\2'\4",
        txt,
        count=0,
        flags=re.IGNORECASE,
    )

    for first_list, second_list in first_second:
        for first in first_list:
            for second in second_list:
                # Disallow there're/where're.  They're valid English
                # but sound weird.
                if second == 'a(re)' and first in ('there', 'where'):
                    continue

                pattern = fr'\b({first})\s+{second}\b'
                if second == '(n)o(t)':
                    # e.g. could + n + ' + t
                    replacement = r"\1\2'\3"
                else:
                    # e.g. it + ' + s
                    replacement = r"\1'\2"
                txt = re.sub(pattern, replacement, txt, count=0, flags=re.IGNORECASE)

    return txt
def thify(n: int) -> str:
    """Return the English ordinal suffix for a number.

    Args:
        n: the number to find the suffix for

    Returns:
        The proper ordinal suffix for a number: 'st', 'nd', 'rd' or 'th'.
        Numbers ending in 11, 12 and 13 take 'th'.

    Raises:
        ValueError: if n is not an integer.

    See also :meth:`it_they`, :meth:`is_are`, :meth:`make_contractions`.

    Suggested usage::

        attempt_count = 0
        while True:
            attempt_count += 1
            if try_the_thing():
                break
            print(f'The {attempt_count}{thify(attempt_count)} failed, trying again.')

    >>> thify(1)
    'st'
    >>> thify(33)
    'rd'
    >>> thify(16)
    'th'
    >>> thify(11)
    'th'
    """
    digits = str(n).lstrip('+-')
    # Validate explicitly: an assert would be stripped under "python -O".
    if not (digits.isascii() and digits.isdigit()):
        raise ValueError(f'{n!r} is not an integer')
    # The teens are irregular: 11th, 12th, 13th (also 111th, 212th, ...).
    if digits[-2:] in ('11', '12', '13'):
        return 'th'
    return {'1': 'st', '2': 'nd', '3': 'rd'}.get(digits[-1], 'th')
def ngrams(txt: str, n: int):
    """Generate the word n-grams of a string.

    Args:
        txt: the string to create ngrams using
        n: how many words per ngram created?

    Returns:
        Generates the ngrams from the input string; each one is a
        string of n whitespace-separated words joined by single spaces.

    See also :meth:`ngrams_presplit`, :meth:`bigrams`, :meth:`trigrams`.

    >>> [x for x in ngrams('This is a test', 2)]
    ['This is', 'is a', 'a test']
    """
    words = txt.split()
    for ngram in ngrams_presplit(words, n):
        # split() never leaves whitespace inside a word, so a plain join
        # yields exactly the words separated by single spaces.
        yield ' '.join(ngram)
def ngrams_presplit(words: Sequence[str], n: int):
    """
    Same as :meth:`ngrams` but with the string pre-split.

    Args:
        words: the words to create ngrams from
        n: how many words per ngram created?

    Returns:
        Whatever :meth:`list_utils.ngrams` produces for (words, n).

    See also :meth:`ngrams`, :meth:`bigrams`, :meth:`trigrams`.
    """
    return list_utils.ngrams(words, n)
def bigrams(txt: str):
    """Generates the bigrams (n=2) of the given string.

    Args:
        txt: the string to create bigrams from

    See also :meth:`ngrams`, :meth:`trigrams`.

    >>> [x for x in bigrams('this is a test')]
    ['this is', 'is a', 'a test']
    """
    return ngrams(txt, 2)
def trigrams(txt: str):
    """Generates the trigrams (n=3) of the given string.

    Args:
        txt: the string to create trigrams from

    See also :meth:`ngrams`, :meth:`bigrams`.
    """
    return ngrams(txt, 3)
def shuffle_columns_into_list(
    input_lines: Sequence[str], column_specs: Iterable[Iterable[int]], delim: str = ''
) -> List[str]:
    """Helper to shuffle / parse columnar data and return the results as a
    list.

    Args:
        input_lines: A sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: an iterable collection of numeric sequences that
            indicate one or more column numbers to copy to form the Nth
            position in the output list.  See example below.
        delim: for column_specs that indicate we should copy more than
            one column from the input into this position, use delim to
            separate source data.  Defaults to ''.

    Returns:
        A list of string created by following the instructions set forth
        in column_specs.

    Raises:
        IndexError: if a column number is out of range for input_lines.

    See also :meth:`shuffle_columns_into_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_list(
    ...     cols,
    ...     [ [8], [2, 3], [5, 6, 7] ],
    ...     delim='!',
    ... )
    ['acl_test.py', 'scott!wheel', 'Jul!9!11:34']
    """
    # Column specs map input lines' columns into outputs.
    # [col1, col2...]
    # join() puts delim only *between* columns, so column data that itself
    # begins or ends with a delimiter character is preserved intact.
    return [delim.join(input_lines[n] for n in spec) for spec in column_specs]
def shuffle_columns_into_dict(
    input_lines: Sequence[str],
    column_specs: Iterable[Tuple[str, Iterable[int]]],
    delim: str = '',
) -> Dict[str, str]:
    """Helper to shuffle / parse columnar data and return the results
    as a dict.

    Args:
        input_lines: a sequence of strings that represents text that
            has been broken into columns by the caller
        column_specs: instructions for what dictionary keys to apply
            to individual or compound input column data.  See example
            below.
        delim: when forming compound output data by gluing more than
            one input column together, use this character to separate
            the source data.  Defaults to ''.

    Returns:
        A dict formed by applying the column_specs instructions.

    Raises:
        IndexError: if a column number is out of range for input_lines.

    See also :meth:`shuffle_columns_into_list`, :meth:`interpolate_using_dict`.

    >>> cols = '-rwxr-xr-x 1 scott wheel 3.1K Jul 9 11:34 acl_test.py'.split()
    >>> shuffle_columns_into_dict(
    ...     cols,
    ...     [ ('filename', [8]), ('owner', [2, 3]), ('mtime', [5, 6, 7]) ],
    ...     delim='!',
    ... )
    {'filename': 'acl_test.py', 'owner': 'scott!wheel', 'mtime': 'Jul!9!11:34'}
    """
    # Column specs map input lines' columns into outputs.
    # "key", [col1, col2...]
    # join() puts delim only *between* columns, so column data that itself
    # begins or ends with a delimiter character is preserved intact.
    return {
        spec[0]: delim.join(input_lines[n] for n in spec[1])
        for spec in column_specs
    }
def interpolate_using_dict(txt: str, values: Dict[str, str]) -> str:
    """
    Interpolate a string with data from a dict.

    Args:
        txt: the mad libs template
        values: what you and your kids chose for each category.

    Returns:
        txt with every ``{name}`` replacement field filled in from values.

    Raises:
        KeyError: if txt names a field that is missing from values.

    See also :meth:`shuffle_columns_into_list`, :meth:`shuffle_columns_into_dict`.

    >>> interpolate_using_dict('This is a {adjective} {noun}.',
    ...                        {'adjective': 'good', 'noun': 'example'})
    'This is a good example.'
    """
    # str.format already returns the finished string; routing it through
    # the print-like helper with end='' added nothing.
    return txt.format(**values)
def to_ascii(txt: Union[str, bytes]) -> bytes:
    """Encode text as an ASCII byte string.

    Args:
        txt: the input data to encode

    Returns:
        txt encoded as an ASCII byte string.  Input that is already
        bytes is returned as is.

    Raises:
        TypeError: if txt is neither a string nor bytes.
        UnicodeEncodeError: if txt contains non-ASCII characters.

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`to_bytes`,
    :meth:`generate_random_alphanumeric_string`, :meth:`asciify`.

    >>> to_ascii('test')
    b'test'

    >>> to_ascii(b'1, 2, 3')
    b'1, 2, 3'
    """
    if isinstance(txt, str):
        return txt.encode('ascii')
    if isinstance(txt, bytes):
        return txt
    # TypeError is a subclass of Exception, so callers that catch the
    # previously raised bare Exception keep working.
    raise TypeError('to_ascii works with strings and bytes')
def to_base64(
    txt: str, *, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> bytes:
    """Encode a string as base64.

    Args:
        txt: the input data to encode
        encoding: the encoding to use during conversion
        errors: how to handle encoding errors

    Returns:
        txt encoded with the standard 64-character base64 alphabet, as
        produced by :func:`base64.encodebytes` (a newline is inserted
        after every 76 output characters and at the end).

    See also :meth:`is_base64`, :meth:`to_ascii`, :meth:`to_bitstring`,
    :meth:`from_base64`.

    >>> to_base64('hello?')
    b'aGVsbG8/\\n'
    """
    return base64.encodebytes(txt.encode(encoding, errors))
def is_base64(txt: str) -> bool:
    """Check whether text consists only of base64 characters.

    Args:
        txt: the string (or bytes) to check

    Returns:
        True if txt is a valid base64 encoded string.  This assumes
        txt was encoded with Python's standard base64 alphabet.
        Up to two trailing ``=`` padding characters and the line breaks
        inserted by :meth:`to_base64` are accepted.  The length of the
        data is not checked.

    See also :meth:`to_base64`, :meth:`from_base64`.

    >>> is_base64('test')    # all letters in the b64 alphabet
    True

    >>> is_base64('another test, how do you like this one?')
    False

    >>> is_base64(b'aGVsbG8/\\n')    # Ending newline is ok.
    True

    >>> is_base64(to_base64('hi'))   # '=' padding is ok.
    True
    """
    if isinstance(txt, str):
        try:
            data = txt.encode('ascii')
        except UnicodeEncodeError:
            # Non-ASCII text cannot be base64; answer rather than raise.
            return False
    elif isinstance(txt, bytes):
        data = txt
    else:
        return False
    # base64.encodebytes breaks its output into newline-terminated lines.
    data = data.strip().replace(b'\r', b'').replace(b'\n', b'')
    return re.fullmatch(rb'[A-Za-z0-9+/]*={0,2}', data) is not None
def from_base64(
    b64: bytes, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """Decode base64 data back into a string.

    Args:
        b64: bytestring of base64 encoded data to decode / convert.
        encoding: the encoding to use during conversion
        errors: how to handle encoding errors

    Returns:
        The decoded form of b64 as a normal python string; the inverse
        of :meth:`to_base64`.

    See also :meth:`to_base64`, :meth:`is_base64`.

    >>> from_base64(b'aGVsbG8/\\n')
    'hello?'
    """
    return base64.decodebytes(b64).decode(encoding, errors)
def chunk(txt: str, chunk_size: int):
    """Break a string into consecutive pieces of a fixed size.

    Args:
        txt: a string to be chunked into evenly spaced pieces.
        chunk_size: the size of each chunk to make

    Returns:
        The original string chunked into evenly spaced pieces (this is
        a generator).  If the length of txt is not a multiple of
        chunk_size a warning is issued and the last piece is shorter.

    Raises:
        ValueError: if chunk_size is not positive (raised when the
            generator is first advanced).

    >>> ' '.join(chunk('010011011100010110101010101010101001111110101000', 8))
    '01001101 11000101 10101010 10101010 10011111 10101000'
    """
    # Zero used to surface as ZeroDivisionError and negative sizes silently
    # produced nothing; reject both explicitly.
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be a positive integer, not {chunk_size}')
    if len(txt) % chunk_size != 0:
        msg = f'String to chunk\'s length ({len(txt)}) is not an even multiple of chunk_size ({chunk_size})'
        logger.warning(msg)
        warnings.warn(msg, stacklevel=2)
    for x in range(0, len(txt), chunk_size):
        yield txt[x : x + chunk_size]
def to_bitstring(txt: str, *, delimiter: str = '') -> str:
    """Render text as a string of '0' and '1' characters.

    Args:
        txt: the string to convert into a bitstring
        delimiter: character to insert between adjacent bytes.  Note that
            only bitstrings with delimiter='' are interpretable by
            :meth:`from_bitstring`.

    Returns:
        txt converted to ascii/binary and then chopped into bytes, eight
        bits per byte.  Empty input yields a single zero byte.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> to_bitstring('hello?')
    '011010000110010101101100011011000110111100111111'

    >>> to_bitstring('test', delimiter=' ')
    '01110100 01100101 01110011 01110100'

    >>> to_bitstring(b'test')
    '01110100011001010111001101110100'
    """
    etxt = to_ascii(txt)
    if not etxt:
        # Historical behaviour: empty input is rendered as one zero byte.
        return '0' * 8
    # Format byte by byte.  Converting the whole buffer to one integer
    # first loses leading NUL bytes, because an integer has no leading zeros.
    return delimiter.join(f'{byte:08b}' for byte in etxt)
def is_bitstring(txt: str) -> bool:
    """Check whether a string consists solely of '0' and '1' characters.

    Args:
        txt: the string to check

    Returns:
        True if txt is a recognized bitstring and False otherwise.
        Note that if delimiter is non empty this code will not
        recognize the bitstring.  The empty string is not a bitstring.

    See also :meth:`to_base64`, :meth:`from_bitstring`, :meth:`to_bitstring`,
    :meth:`chunk`.

    >>> is_bitstring('011010000110010101101100011011000110111100111111')
    True

    >>> is_bitstring('1234')
    False
    """
    # Checked directly rather than by pattern-matching '0b' + txt, which
    # presumably let '|' through (the binary-number pattern at the top of
    # this module uses the character class [0|1]).
    return isinstance(txt, str) and txt != '' and all(c in '01' for c in txt)
def from_bitstring(
    bits: str, encoding: str = 'utf-8', errors: str = 'surrogatepass'
) -> str:
    """Convert a string of '0' and '1' characters back into text.

    Args:
        bits: the bitstring to convert back into a python string
        encoding: the encoding to use during conversion
        errors: how to handle encoding errors

    Returns:
        The regular python string represented by bits.  Note that this
        code does not work with to_bitstring when delimiter is non-empty.

    Raises:
        ValueError: if bits is not a valid base-2 number.

    See also :meth:`to_base64`, :meth:`to_bitstring`, :meth:`is_bitstring`,
    :meth:`chunk`.

    >>> from_bitstring('011010000110010101101100011011000110111100111111')
    'hello?'
    """
    n = int(bits, 2)
    # The integer alone cannot tell how many leading all-zero bytes the
    # input had, so the number of complete bytes written out in bits is
    # taken into account as well.
    spelled_out = (bits.count('0') + bits.count('1')) // 8
    size = max((n.bit_length() + 7) // 8, spelled_out)
    return n.to_bytes(size, 'big').decode(encoding, errors) or '\0'
def ip_v4_sort_key(txt: str) -> Optional[Tuple[int, ...]]:
    """Turn an IPv4 address into a key that sorts numerically.

    Args:
        txt: an IP address to chunk up for sorting purposes

    Returns:
        A tuple of IP components arranged such that the sorting of
        IP addresses using a normal comparator will do something sane
        with them, or None if txt is not an IPv4 address.

    See also :meth:`is_ip_v4`.

    >>> ip_v4_sort_key('10.0.0.18')
    (10, 0, 0, 18)

    >>> ips = ['10.0.0.10', '100.0.0.1', '1.2.3.4', '10.0.0.9']
    >>> sorted(ips, key=lambda x: ip_v4_sort_key(x))
    ['1.2.3.4', '10.0.0.9', '10.0.0.10', '100.0.0.1']
    """
    if not is_ip_v4(txt):
        # Reported through the module logger rather than printed to stdout.
        logger.warning("not IP: %s", txt)
        return None
    return tuple(int(x) for x in txt.split('.'))
def path_ancestors_before_descendants_sort_key(volume: str) -> Tuple[str, ...]:
    """Turn a path into a key that sorts ancestors before descendants.

    Args:
        volume: the string to chunk up for sorting purposes

    Returns:
        A tuple of volume's components such that the sorting of
        volumes using a normal comparator will do something sane
        with them.  Empty components (leading, trailing or doubled
        slashes) are dropped.

    See also :mod:`pyutils.files.file_utils`.

    >>> path_ancestors_before_descendants_sort_key('/usr/local/bin')
    ('usr', 'local', 'bin')

    >>> paths = ['/usr/local', '/usr/local/bin', '/usr']
    >>> sorted(paths, key=lambda x: path_ancestors_before_descendants_sort_key(x))
    ['/usr', '/usr/local', '/usr/local/bin']
    """
    return tuple(part for part in volume.split('/') if part)
def replace_all(in_str: str, replace_set: str, replacement: str) -> str:
    """
    Replace every member of a set of characters in a single pass.

    Args:
        in_str: the string in which to replace characters
        replace_set: the set of target characters to replace
        replacement: the character to replace any member of replace_set
            with

    See also :meth:`replace_nth`.

    Returns:
        The string with replacements executed.  Text inserted as a
        replacement is never itself replaced again.

    >>> s = 'this_is a-test!'
    >>> replace_all(s, ' _-!', '')
    'thisisatest'
    """
    targets = [target for target in replace_set if target]
    if not targets:
        return in_str
    pattern = '|'.join(re.escape(target) for target in targets)
    # One pass over the input: a chain of str.replace calls would rescan its
    # own output, so a replacement containing a later member of replace_set
    # would be rewritten again.
    return re.sub(pattern, lambda _match: replacement, in_str)
def replace_nth(in_str: str, source: str, target: str, nth: int):
    """
    Replaces the nth occurrence of a substring within a string.

    Args:
        in_str: the string in which to run the replacement
        source: the substring to replace (taken literally)
        target: the replacement text
        nth: which occurrence of source to replace?  (1 = the first)

    Returns:
        in_str with the nth occurrence of source replaced by target.

    Raises:
        ValueError: if nth is less than one.
        IndexError: if source occurs fewer than nth times in in_str.

    See also :meth:`replace_all`.

    >>> replace_nth('this is a test', ' ', '-', 3)
    'this is a-test'
    >>> replace_nth('a.b.c', '.', '-', 2)
    'a.b-c'
    """
    if nth < 1:
        raise ValueError(f'nth must be at least 1, not {nth}')
    # Escape source so that it is located literally, exactly as the
    # str.replace below treats it.  Unescaped, a source such as '.' matched
    # every character and the wrong occurrence was replaced.
    starts = [m.start() for m in re.finditer(re.escape(source), in_str)]
    where = starts[nth - 1]
    before = in_str[:where]
    after = in_str[where:].replace(source, target, 1)
    return before + after
2660 if __name__ == '__main__':