#!/usr/bin/env python3
"""Helpers for validating, parsing, generating and reformatting strings."""

from itertools import zip_longest
import json
import random
import re
import string
from typing import Any, List, Optional
import unicodedata
from uuid import uuid4

# Signed decimal number with optional fraction and optional (unsigned)
# exponent.  Groups: 1 = sign, 3 = integer digits, 4 = fraction, 5 = exponent.
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")

# Prefixed integer literals.  A "|" inside a character class is a literal
# pipe, not an alternation, so the classes list only the intended characters.
HEX_NUMBER_RE = re.compile(r"^([+-]?)0[xX]([0-9A-Fa-f]+)$")
OCT_NUMBER_RE = re.compile(r"^([+-]?)0[Oo]([0-7]+)$")
BIN_NUMBER_RE = re.compile(r"^([+-]?)0[Bb]([01]+)$")

# NOTE(review): in the copy of this file under review, everything between the
# "www." part of URLS_RAW_STRING and the tail of HTML_RE had been lost (text
# between "<" and ">" was stripped), which left the names used by the
# functions below undefined.  The definitions from here through
# HTML_TAG_ONLY_RE were rebuilt to satisfy the contracts documented on those
# functions -- verify them against version control before merging.
URLS_RAW_STRING = (
    r"([a-z-]+://)"  # scheme
    r"([a-z_\d-]+:[a-z_\d-]+@)?"  # user:password
    r"(www\.)?"  # www.
    r"((?<!\.)[a-z\d]+[a-z\d.-]+\.[a-z]{2,6}"
    r"|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|localhost)"  # domain
    r"(:\d{2,})?"  # port number
    r"(/[a-z\d_%+-]*)*"  # folders
    r"(\.[a-z\d_%+-]+)*"  # file extension
    r"(\?[a-z\d_+%-=]*)?"  # query string
    r"(#\S*)?"  # hash
)

URL_RE = re.compile(r"^{}$".format(URLS_RAW_STRING), re.IGNORECASE)

URLS_RE = re.compile(r"({})".format(URLS_RAW_STRING), re.IGNORECASE)

# An "@" inside a double-quoted section, or a backslash-escaped "@".
ESCAPED_AT_SIGN = re.compile(r'(?!"[^"]*)@+(?=[^"]*")|\\@')

EMAILS_RAW_STRING = (
    r"[a-zA-Z\d._\+\-'`!%#$&*/=\?\^\{\}\|~\\]+@[a-z\d-]+\.?[a-z\d-]+\.[a-z]{2,4}"
)

EMAIL_RE = re.compile(r"^{}$".format(EMAILS_RAW_STRING))

EMAILS_RE = re.compile(r"({})".format(EMAILS_RAW_STRING))

CAMEL_CASE_TEST_RE = re.compile(
    r"^[a-zA-Z]*([a-z]+[A-Z]+|[A-Z]+[a-z]+)[a-zA-Z\d]*$"
)

CAMEL_CASE_REPLACE_RE = re.compile(r"([a-z]|[A-Z]+)(?=[A-Z])")

SNAKE_CASE_TEST_RE = re.compile(
    r"^([a-z]+\d*_[a-z\d_]*|_+[a-z\d]+[a-z\d_]*)$", re.IGNORECASE
)

SNAKE_CASE_TEST_DASH_RE = re.compile(
    r"([a-z]+\d*-[a-z\d-]*|-+[a-z\d]+[a-z\d-]*)$", re.IGNORECASE
)

CREDIT_CARDS = {
    "VISA": re.compile(r"^4\d{12}(?:\d{3})?$"),
    "MASTERCARD": re.compile(r"^5[1-5]\d{14}$"),
    "AMERICAN_EXPRESS": re.compile(r"^3[47]\d{13}$"),
    "DINERS_CLUB": re.compile(r"^3(?:0[0-5]|[68]\d)\d{11}$"),
    "DISCOVER": re.compile(r"^6(?:011|5\d{2})\d{12}$"),
    "JCB": re.compile(r"^(?:2131|1800|35\d{3})\d{11}$"),
}

JSON_WRAPPER_RE = re.compile(
    r"^\s*[\[{]\s*(.*)\s*[\}\]]\s*$", re.MULTILINE | re.DOTALL
)

UUID_RE = re.compile(
    r"^[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}$",
    re.IGNORECASE,
)

UUID_HEX_OK_RE = re.compile(
    r"^[a-f\d]{8}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{4}-?[a-f\d]{12}$",
    re.IGNORECASE,
)

# "Shallow" because it only checks the dotted shape, not the 0-255 range.
SHALLOW_IP_V4_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

IP_V6_RE = re.compile(r"^([a-z\d]{0,4}:){7}[a-z\d]{0,4}$", re.IGNORECASE)

MAC_ADDRESS_RE = re.compile(
    r"^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$", re.IGNORECASE
)

WORDS_COUNT_RE = re.compile(
    r"\W*[^\W_]+\W*", re.IGNORECASE | re.MULTILINE | re.UNICODE
)

# An opening tag optionally followed by content up to a closing tag, or a
# comment, or a doctype declaration.
HTML_RE = re.compile(
    r"((<([a-z]+:)?[a-z]+[^>]*/?>)(.*?(</([a-z]+:)?[a-z]+>))?|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# Tags only (opening, closing, comment, doctype); content between is kept.
HTML_TAG_ONLY_RE = re.compile(
    r"(<([a-z]+:)?[a-z]+[^>]*/?>|</([a-z]+:)?[a-z]+>|<!--.*-->|<!doctype.*>)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

SPACES_RE = re.compile(r"\s")

NO_LETTERS_OR_NUMBERS_RE = re.compile(
    r"[^\w\d]+|_+", re.IGNORECASE | re.UNICODE
)

# Leading horizontal whitespace (anything blank except CR / LF).
MARGIN_RE = re.compile(r"^[^\S\r\n]+")

# ESC "[" ... final letter.  The ESC prefix is required so that ordinary
# bracketed text such as "a[0] b" is left alone and no stray ESC byte remains.
ESCAPE_SEQUENCE_RE = re.compile(r"\x1b\[[^A-Za-z]*[A-Za-z]")

# Ordered from largest to smallest; number_to_suffix_string() relies on that
# order to pick the largest unit that fits.
NUM_SUFFIXES = {
    "Pb": (1024 ** 5),
    "P": (1024 ** 5),
    "Tb": (1024 ** 4),
    "T": (1024 ** 4),
    "Gb": (1024 ** 3),
    "G": (1024 ** 3),
    "Mb": (1024 ** 2),
    "M": (1024 ** 2),
    "Kb": (1024 ** 1),
    "K": (1024 ** 1),
}


def is_none_or_empty(in_str: Optional[str]) -> bool:
    """Return True if in_str is None or contains only whitespace."""
    return in_str is None or len(in_str.strip()) == 0


def is_string(obj: Any) -> bool:
    """
    Checks if an object is a string.
    """
    return isinstance(obj, str)


def is_empty_string(in_str: Any) -> bool:
    """Return True if in_str is a string that is empty or all whitespace."""
    return is_string(in_str) and in_str.strip() == ""


def is_full_string(in_str: Any) -> bool:
    """Return True if in_str is a string with some non-whitespace content."""
    return is_string(in_str) and in_str.strip() != ""


def is_number(in_str: str) -> bool:
    """
    Checks if a string is a valid number.

    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return NUMBER_RE.match(in_str) is not None


def is_integer_number(in_str: str) -> bool:
    """
    Checks whether the given string represents an integer or not.

    An integer may be signed or unsigned, may use "scientific notation",
    or may be a 0x / 0o / 0b prefixed literal.

    *Examples:*

    >>> is_integer_number('42') # returns true
    >>> is_integer_number('42.0') # returns false

    :raises ValueError: if in_str is not a string.
    """
    return (
        (is_number(in_str) and "." not in in_str)
        or is_hexidecimal_integer_number(in_str)
        or is_octal_integer_number(in_str)
        or is_binary_integer_number(in_str)
    )


def is_hexidecimal_integer_number(in_str: str) -> bool:
    """Return True if in_str is an optionally signed 0x-prefixed integer.

    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return HEX_NUMBER_RE.match(in_str) is not None


def is_octal_integer_number(in_str: str) -> bool:
    """Return True if in_str is an optionally signed 0o-prefixed integer.

    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return OCT_NUMBER_RE.match(in_str) is not None


def is_binary_integer_number(in_str: str) -> bool:
    """Return True if in_str is an optionally signed 0b-prefixed integer.

    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return BIN_NUMBER_RE.match(in_str) is not None


def to_int(in_str: str) -> int:
    """Convert a string to an int.

    Handles 0b / 0o / 0x prefixed literals and fraction-free scientific
    notation (e.g. "1e3"), i.e. everything is_integer_number() accepts;
    anything else is handed to int().

    :raises ValueError: if in_str is not a string or cannot be converted.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if is_binary_integer_number(in_str):
        return int(in_str, 2)
    if is_octal_integer_number(in_str):
        return int(in_str, 8)
    if is_hexidecimal_integer_number(in_str):
        return int(in_str, 16)
    match = NUMBER_RE.match(in_str)
    if match is not None and match.group(5) and "." not in in_str:
        # int() rejects exponents; expand exactly instead of going via float.
        mantissa, _, exponent = in_str.strip().lower().partition("e")
        return int(mantissa) * 10 ** int(exponent)
    return int(in_str)


def is_decimal_number(in_str: str) -> bool:
    """
    Checks whether the given string represents a decimal or not.

    A decimal may be signed or unsigned or use a "scientific notation".

    >>> is_decimal_number('42.0') # returns true
    >>> is_decimal_number('42') # returns false

    :raises ValueError: if in_str is not a string.
    """
    return is_number(in_str) and "." in in_str


def strip_escape_sequences(in_str: str) -> str:
    """Remove every match of ESCAPE_SEQUENCE_RE (ESC "[" ... letter)."""
    return ESCAPE_SEQUENCE_RE.sub("", in_str)


def add_thousands_separator(
    in_str: str, *, separator_char=",", places=3
) -> str:
    """Insert separator_char between groups of `places` integer digits.

    An int is also accepted and is converted to its string form first.

    >>> add_thousands_separator('1234567.891') # returns '1,234,567.891'

    :raises ValueError: if the input is not a number per is_number().
    """
    if isinstance(in_str, int):
        in_str = f"{in_str}"
    if is_number(in_str):
        return _add_thousands_separator(
            in_str, separator_char=separator_char, places=places
        )
    raise ValueError(in_str)


def _add_thousands_separator(
    in_str: str, *, separator_char=",", places=3
) -> str:
    """Group the integer digits of an already validated number string."""
    sign = ""
    if in_str[:1] in ("+", "-"):
        # Keep the sign out of the grouping, otherwise "-123456" would
        # come out as "-,123,456".
        sign, in_str = in_str[0], in_str[1:]
    integer_part, _, decimal_part = in_str.partition(".")
    # The same iterator repeated `places` times makes zip_longest hand out
    # consecutive chunks of the reversed digits.
    chunks = [iter(integer_part[::-1])] * places
    groups = ["".join(g)[::-1] for g in zip_longest(*chunks, fillvalue="")]
    ret = sign + separator_char.join(reversed(groups))
    if decimal_part:
        ret += "." + decimal_part
    return ret


# Full url example:
# scheme://username:password@www.domain.com:8042/folder/subfolder/file.extension?param=value&param2=value2#hash
def is_url(in_str: Any, allowed_schemes: Optional[List[str]] = None) -> bool:
    """
    Check if a string is a valid url.

    *Examples:*

    >>> is_url('http://www.mysite.com') # returns true
    >>> is_url('https://mysite.com') # returns true
    >>> is_url('.mysite.com') # returns false

    :param allowed_schemes: if given and non-empty, the url must also
        start with one of these prefixes.
    """
    if not is_full_string(in_str):
        return False
    if URL_RE.match(in_str) is None:
        return False
    if allowed_schemes:
        return in_str.startswith(tuple(allowed_schemes))
    return True


def is_email(in_str: Any) -> bool:
    """
    Check if a string is a valid email.

    Reference: https://tools.ietf.org/html/rfc3696#section-3

    *Examples:*

    >>> is_email('my.email@the-provider.com') # returns true
    >>> is_email('@gmail.com') # returns false
    """
    if (
        not is_full_string(in_str)
        or len(in_str) > 320
        or in_str.startswith(".")
    ):
        return False

    try:
        # we expect 2 tokens, one before "@" and one after, otherwise
        # we have an exception and the email is not valid.
        head, tail = in_str.split("@")

        # head's size must be <= 64, tail <= 255, head must not end
        # with a dot or contain multiple consecutive dots.
        if (
            len(head) > 64
            or len(tail) > 255
            or head.endswith(".")
            or (".." in head)
        ):
            return False

        # removes escaped spaces, so that later on the test regex will
        # accept the string.
        head = head.replace("\\ ", "")
        if head.startswith('"') and head.endswith('"'):
            head = head.replace(" ", "")[1:-1]
        return EMAIL_RE.match(head + "@" + tail) is not None

    except ValueError:
        # borderline case in which we have multiple "@" signs but the
        # head part is correctly escaped.
        if ESCAPED_AT_SIGN.search(in_str) is not None:
            # replace "@" with "a" in the head
            return is_email(ESCAPED_AT_SIGN.sub("a", in_str))
        return False


def suffix_string_to_number(in_str: str) -> Optional[int]:
    """Take a string like "33Gb" and convert it into a number (of bytes)
    like 35433480192.  The suffix is matched case-insensitively against
    NUM_SUFFIXES; a plain integer string is returned as that integer.
    Return None if the input string is not valid.
    """
    if not is_string(in_str):
        return None
    if is_integer_number(in_str):
        return to_int(in_str)
    # Try the two-letter suffix ("Gb") before the one-letter one ("G").
    for suffix_len in (2, 1):
        if len(in_str) <= suffix_len:
            continue
        number_part = in_str[:-suffix_len]
        suffix = in_str[-suffix_len:].capitalize()
        multiplier = NUM_SUFFIXES.get(suffix)
        if multiplier is not None and is_integer_number(number_part):
            # to_int (not int) so prefixed literals like "0x10K" work too.
            return to_int(number_part) * multiplier
    return None


def number_to_suffix_string(num: int) -> Optional[str]:
    """Take a number (of bytes) and returns a string like "43.8Gb".
    Returns None if num is smaller than the smallest unit (1024).
    """
    for sfx, size in NUM_SUFFIXES.items():
        # ">=" so that exact multiples (e.g. 1024 -> "1.0Kb") are covered.
        if num >= size:
            return f"{num / size:.1f}{sfx}"
    return None


def is_credit_card(in_str: Any, card_type: Optional[str] = None) -> bool:
    """
    Checks if a string is a valid credit card number.
    If card type is provided then it checks against that specific type only,
    otherwise any known credit card number will be accepted.

    Supported card types are the following:

    - VISA
    - MASTERCARD
    - AMERICAN_EXPRESS
    - DINERS_CLUB
    - DISCOVER
    - JCB

    :raises KeyError: if card_type is not one of the supported types.
    """
    if not is_full_string(in_str):
        return False

    if card_type is not None:
        if card_type not in CREDIT_CARDS:
            raise KeyError(
                f'Invalid card type "{card_type}". Valid types are: {CREDIT_CARDS.keys()}'
            )
        return CREDIT_CARDS[card_type].match(in_str) is not None

    return any(
        pattern.match(in_str) is not None for pattern in CREDIT_CARDS.values()
    )


def is_camel_case(in_str: Any) -> bool:
    """
    Checks if a string is formatted as camel case.

    A string is considered camel case when:

    - it's composed only by letters ([a-zA-Z]) and optionally numbers ([0-9])
    - it contains both lowercase and uppercase letters
    - it does not start with a number
    """
    return (
        is_full_string(in_str) and CAMEL_CASE_TEST_RE.match(in_str) is not None
    )


def is_snake_case(in_str: Any, *, separator: str = "_") -> bool:
    """
    Checks if a string is formatted as "snake case".

    A string is considered snake case when:

    - it's composed only by lowercase/uppercase letters and digits
    - it contains at least one underscore (or provided separator)
    - it does not start with a number
    """
    if not is_full_string(in_str):
        return False
    re_map = {"_": SNAKE_CASE_TEST_RE, "-": SNAKE_CASE_TEST_DASH_RE}
    pattern = re_map.get(separator)
    if pattern is None:
        # No precompiled pattern for this separator: build one on the fly.
        re_template = (
            r"([a-z]+\d*{sign}[a-z\d{sign}]*|{sign}+[a-z\d]+[a-z\d{sign}]*)"
        )
        pattern = re.compile(
            re_template.format(sign=re.escape(separator)), re.IGNORECASE
        )
    return pattern.match(in_str) is not None


def is_json(in_str: Any) -> bool:
    """
    Check if a string is a valid json.

    *Examples:*

    >>> is_json('{"name": "Peter"}') # returns true
    >>> is_json('[1, 2, 3]') # returns true
    >>> is_json('{nope}') # returns false
    """
    if is_full_string(in_str) and JSON_WRAPPER_RE.match(in_str) is not None:
        try:
            return isinstance(json.loads(in_str), (dict, list))
        except (TypeError, ValueError, OverflowError):
            pass
    return False


def is_uuid(in_str: Any, allow_hex: bool = False) -> bool:
    """
    Check if a string is a valid UUID.

    *Example:*

    >>> is_uuid('6f8aa2f9-686c-4ac3-8766-5712354a04cf') # returns true
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf') # returns false
    >>> is_uuid('6f8aa2f9686c4ac387665712354a04cf', allow_hex=True) # returns true
    """
    # string casting is used to allow UUID itself as input data type
    s = str(in_str)
    if allow_hex:
        return UUID_HEX_OK_RE.match(s) is not None
    return UUID_RE.match(s) is not None


def is_ip_v4(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip v4.

    *Examples:*

    >>> is_ip_v4('255.200.100.75') # returns true
    >>> is_ip_v4('nope') # returns false (not an ip)
    >>> is_ip_v4('255.200.100.999') # returns false (999 is out of range)
    """
    if not is_full_string(in_str) or SHALLOW_IP_V4_RE.match(in_str) is None:
        return False

    # checks that each entry in the ip is in the valid range (0 to 255)
    return all(0 <= int(token) <= 255 for token in in_str.split("."))


def extract_ip_v4(in_str: Any) -> Optional[str]:
    """
    Extracts the IPv4 chunk of a string or None.

    Surrounding whitespace is ignored.
    """
    if not is_full_string(in_str):
        return None
    # str.strip() returns a new string; the result must be kept.
    in_str = in_str.strip()
    m = SHALLOW_IP_V4_RE.match(in_str)
    if m is not None:
        return m.group(0)
    return None


def is_ip_v6(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip v6.

    *Examples:*

    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:7334') # returns true
    >>> is_ip_v6('2001:db8:85a3:0000:0000:8a2e:370:?') # returns false (invalid "?")
    """
    return is_full_string(in_str) and IP_V6_RE.match(in_str) is not None


def extract_ip_v6(in_str: Any) -> Optional[str]:
    """
    Extract IPv6 chunk or None.

    Surrounding whitespace is ignored.
    """
    if not is_full_string(in_str):
        return None
    in_str = in_str.strip()
    m = IP_V6_RE.match(in_str)
    if m is not None:
        return m.group(0)
    return None


def is_ip(in_str: Any) -> bool:
    """
    Checks if a string is a valid ip (either v4 or v6).

    *Examples:*

    >>> is_ip('255.200.100.75') # returns true
    >>> is_ip('2001:db8:85a3:0000:0000:8a2e:370:7334') # returns true
    >>> is_ip('1.2.3') # returns false
    """
    return is_ip_v6(in_str) or is_ip_v4(in_str)


def extract_ip(in_str: Any) -> Optional[str]:
    """Extract the IP address (v4 tried first, then v6) or None."""
    ip = extract_ip_v4(in_str)
    if ip is None:
        ip = extract_ip_v6(in_str)
    return ip


def is_mac_address(in_str: Any) -> bool:
    """Return True if in_str is a valid MAC address false otherwise."""
    return is_full_string(in_str) and MAC_ADDRESS_RE.match(in_str) is not None


def extract_mac_address(in_str: Any, *, separator: str = ":") -> Optional[str]:
    """Extract the MAC address from in_str, or None if there is none.

    Surrounding whitespace is ignored and every ":" / "-" in the address
    is rewritten to `separator`.
    """
    if not is_full_string(in_str):
        return None
    in_str = in_str.strip()
    m = MAC_ADDRESS_RE.match(in_str)
    if m is None:
        return None
    # Single pass, so a separator that itself contains ":" or "-" is not
    # replaced a second time.
    return m.group(0).translate(
        str.maketrans({":": separator, "-": separator})
    )


def is_slug(in_str: Any, separator: str = "-") -> bool:
    """
    Checks if a given string is a slug (as created by `slugify()`).

    *Examples:*

    >>> is_slug('my-blog-post-title') # returns true
    >>> is_slug('My blog post title') # returns false

    :param in_str: String to check.
    :type in_str: str
    :param separator: Join sign used by the slug.
    :type separator: str
    :return: True if slug, false otherwise.
    """
    if not is_full_string(in_str):
        return False
    # Runs of [a-z0-9] joined by one or more separators.  Written without
    # nested open-ended quantifiers so that a non-matching input cannot
    # trigger exponential backtracking.
    joined_runs = ""
    if separator:
        joined_runs = rf"(?:(?:{re.escape(separator)})+[a-z\d]+)*"
    rex = r"^[a-z\d]+" + joined_runs + r"$"
    return re.match(rex, in_str) is not None


def contains_html(in_str: str) -> bool:
    """
    Checks if the given string contains HTML/XML tags.

    By design, this function matches ANY type of tag, so don't expect
    to use it as an HTML validator, its goal is to detect "malicious"
    or undesired tags in the text.

    *Examples:*

    >>> contains_html('my string is <strong>bold</strong>') # returns true
    >>> contains_html('my string is not bold') # returns false

    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return HTML_RE.search(in_str) is not None


def words_count(in_str: str) -> int:
    """
    Returns the number of words contained into the given string.

    This method is smart, it does consider only sequence of one or more
    letter and/or numbers as "words", so a string like this: "! @ # % ... []"
    will return zero!  Moreover it is aware of punctuation, so the
    count for a string like "one,two,three.stop" will be 4 not 1 (even
    if there are no spaces in the string).

    *Examples:*

    >>> words_count('hello world') # returns 2
    >>> words_count('one,two,three.stop') # returns 4

    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return len(WORDS_COUNT_RE.findall(in_str))


def generate_uuid(as_hex: bool = False) -> str:
    """
    Generate a UUID string (using `uuid.uuid4()`).

    *Examples:*

    >>> generate_uuid() # possible output: '97e3a716-6b33-4ab9-9bb1-8128cb24d76b'
    >>> generate_uuid(as_hex=True) # possible output: '97e3a7166b334ab99bb18128cb24d76b'
    """
    uid = uuid4()
    if as_hex:
        return uid.hex
    return str(uid)


def generate_random_alphanumeric_string(size: int) -> str:
    """
    Returns a string of the specified size containing random
    characters (uppercase/lowercase ascii letters and digits).

    *Example:*

    >>> generate_random_alphanumeric_string(9) # possible output: "cx3QQbzYg"

    :raises ValueError: if size is less than 1.
    """
    if size < 1:
        raise ValueError("size must be >= 1")
    chars = string.ascii_letters + string.digits
    buffer = [random.choice(chars) for _ in range(size)]
    return from_char_list(buffer)


def reverse(in_str: str) -> str:
    """
    Returns the string with its chars reversed.

    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return in_str[::-1]


def camel_case_to_snake_case(in_str: str, *, separator: str = "_") -> str:
    """
    Convert a camel case string into a snake case one.
    (The original string is returned if is not a valid camel case string)

    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_camel_case(in_str):
        return in_str
    return CAMEL_CASE_REPLACE_RE.sub(
        lambda m: m.group(1) + separator, in_str
    ).lower()


def snake_case_to_camel_case(
    in_str: str, *, upper_case_first: bool = True, separator: str = "_"
) -> str:
    """
    Convert a snake case string into a camel case one.
    (The original string is returned if is not a valid snake case string)

    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    if not is_snake_case(in_str, separator=separator):
        return in_str
    tokens = [s.title() for s in in_str.split(separator) if is_full_string(s)]
    if not upper_case_first:
        tokens[0] = tokens[0].lower()
    return from_char_list(tokens)


def to_char_list(in_str: str) -> List[str]:
    """Return the characters of in_str as a list ([] for a non-string)."""
    if not is_string(in_str):
        return []
    return list(in_str)


def from_char_list(in_list: List[str]) -> str:
    """Concatenate the strings in in_list."""
    return "".join(in_list)


def shuffle(in_str: str) -> str:
    """Return a new string containing same chars of the given one but in
    a randomized order.

    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # turn the string into a list of chars
    chars = to_char_list(in_str)
    random.shuffle(chars)
    return from_char_list(chars)


def strip_html(in_str: str, keep_tag_content: bool = False) -> str:
    """
    Remove html code contained into the given string.

    *Examples:*

    >>> strip_html('test: <a href="foo/bar">click here</a>') # returns 'test: '
    >>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True) # returns 'test: click here'

    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
    return r.sub("", in_str)


def asciify(in_str: str) -> str:
    """
    Force string content to be ascii-only by translating all non-ascii
    chars into the closest possible representation (eg: ó -> o, Ë ->
    E, ç -> c...).

    **Bear in mind**: Some chars may be lost if impossible to translate.

    *Example:*

    >>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË') # returns 'eeuuooaaeynAAACIINOE'

    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # "NFKD" is the algorithm which is able to successfully translate
    # the most of non-ascii chars.
    normalized = unicodedata.normalize("NFKD", in_str)

    # encode string forcing ascii and ignore any errors
    # (unrepresentable chars will be stripped out)
    ascii_bytes = normalized.encode("ascii", "ignore")

    # turns encoded bytes into an utf-8 string
    return ascii_bytes.decode("utf-8")


def slugify(in_str: str, *, separator: str = "-") -> str:
    """
    Converts a string into a "slug" using provided separator.
    The returned string has the following properties:

    - it has no spaces
    - all letters are in lower case
    - all punctuation signs and non alphanumeric chars are removed
    - words are divided using provided separator
    - all chars are encoded as ascii (by using `asciify()`)
    - is safe for URL

    *Examples:*

    >>> slugify('Top 10 Reasons To Love Dogs!!!') # returns: 'top-10-reasons-to-love-dogs'
    >>> slugify('Mönstér Mägnët') # returns 'monster-magnet'

    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)

    # replace any character that is NOT letter or number with spaces
    out = NO_LETTERS_OR_NUMBERS_RE.sub(" ", in_str.lower()).strip()

    # replace spaces with join sign
    out = SPACES_RE.sub(separator, out)

    # normalize joins (remove duplicates)
    out = re.sub(re.escape(separator) + r"+", separator, out)
    return asciify(out)


def to_bool(in_str: str) -> bool:
    """
    Turns a string into a boolean based on its content (CASE INSENSITIVE).

    A positive boolean (True) is returned if the string value is one
    of the following:

    - "true"
    - "t"
    - "1"
    - "yes"
    - "y"

    Otherwise False is returned.

    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    return in_str.lower() in ("true", "1", "yes", "y", "t")


def dedent(in_str: str) -> str:
    """
    Removes leading horizontal whitespace from every line of a multi
    line string (inspired by analogous Scala function).

    *Example:*

    >>> dedent('''
    >>>                 line 1
    >>>                 line 2
    >>>                 line 3
    >>> ''')
    >>> # returns:
    >>> '''
    >>> line 1
    >>> line 2
    >>> line 3
    >>> '''

    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    line_separator = "\n"
    lines = [MARGIN_RE.sub("", line) for line in in_str.split(line_separator)]
    return line_separator.join(lines)


def indent(in_str: str, amount: int) -> str:
    """Prefix every line of in_str with `amount` spaces.

    :raises ValueError: if in_str is not a string.
    """
    if not is_string(in_str):
        raise ValueError(in_str)
    line_separator = "\n"
    lines = [" " * amount + line for line in in_str.split(line_separator)]
    return line_separator.join(lines)


def sprintf(*args, **kwargs) -> str:
    """Format the arguments the way print() would and return the result.

    :param sep: string placed between arguments (default: one space).
    :param end: string appended after the last argument (default: newline).
    :raises TypeError: if sep / end is neither None nor a string, or if
        any other keyword argument is given.
    """
    sep = kwargs.pop("sep", None)
    if sep is not None and not isinstance(sep, str):
        raise TypeError("sep must be None or a string")
    end = kwargs.pop("end", None)
    if end is not None and not isinstance(end, str):
        raise TypeError("end must be None or a string")
    if kwargs:
        raise TypeError("invalid keyword arguments to sprintf()")
    if sep is None:
        sep = " "
    if end is None:
        end = "\n"
    pieces = (arg if isinstance(arg, str) else str(arg) for arg in args)
    return sep.join(pieces) + end