# Source code for pyutils.math_utils

#!/usr/bin/env python3

# © Copyright 2021-2023, Scott Gasch

"""Helper utilities with a mathematical / statictical focus."""

import collections
import functools
import math
from heapq import heappop, heappush
from typing import Dict, Hashable, List, Optional, Tuple, cast

from pyutils import dict_utils
from pyutils.typez.type_hints import Numeric


class NumericPopulation(object):
    """This object *stores* a numeric population in a way that enables relatively
    fast addition of new numbers (:math:`O(2log_2 n)`) and instant access to the
    median value in the population (:math:`O(1)`).  It also provides other population
    summary statistics such as the :meth:`get_mode`, :meth:`get_percentile` and
    :meth:`get_stdev`.

    Internally the population is split across two heaps: ``lowers`` (a max
    heap, implemented by storing negated values) and ``highers`` (a min
    heap).  The heaps are kept within one element of each other in size so
    that the median is always at the top of one of them.

    .. note::

        Because this class stores a copy of all numbers added to it, it shouldn't
        be used for very large populations.  Consider sampling.

    >>> pop = NumericPopulation()
    >>> pop.add_number(1)
    >>> pop.add_number(10)
    >>> pop.add_number(3)
    >>> len(pop)
    3
    >>> pop.get_median()
    3
    >>> pop.add_number(7)
    >>> pop.add_number(5)
    >>> pop.get_median()
    5
    >>> pop.get_mean()
    5.2
    >>> round(pop.get_stdev(), 1)
    3.1
    >>> pop.get_percentile(20)
    3
    >>> pop.get_percentile(60)
    7
    >>> pop.maximum
    10
    >>> pop.minimum
    1
    """

    def __init__(self):
        # Values in `lowers` are stored negated so that heapq's min heap
        # behaves as a max heap for the lower part of the population.
        self.lowers, self.highers = [], []
        # Running sum of every number added; used by get_mean.
        self.aggregate = 0.0
        # Lazily built sorted view of the population; see get_percentile.
        self.sorted_copy: Optional[List[Numeric]] = None
        # Largest / smallest number seen so far (None until the first add).
        self.maximum: Optional[Numeric] = None
        self.minimum: Optional[Numeric] = None

    def add_number(self, number: Numeric):
        """Adds a number to the population.  Runtime complexity of this
        operation is :math:`O(2 log_2 n)`

        Args:
            number: the number to add to the population
        """

        if not self.highers or number > self.highers[0]:
            heappush(self.highers, number)
        else:
            heappush(self.lowers, -number)  # for lowers we need a max heap
        self.aggregate += number
        self._rebalance()
        # Compare against None explicitly: a stored extreme of 0 is falsy and
        # must not be mistaken for "no value recorded yet".
        if self.maximum is None or number > self.maximum:
            self.maximum = number
        if self.minimum is None or number < self.minimum:
            self.minimum = number

    def __len__(self):
        """
        Returns:
            the population's current size.
        """
        return len(self.lowers) + len(self.highers)

    def _rebalance(self):
        """Internal helper for rebalancing the `lowers` and `highers` heaps"""
        if len(self.lowers) - len(self.highers) > 1:
            heappush(self.highers, -heappop(self.lowers))
        elif len(self.highers) - len(self.lowers) > 1:
            heappush(self.lowers, -heappop(self.highers))

    def _iter_values(self):
        """Internal helper: yields every member of the population with its
        true sign, walking `lowers` (un-negated) first and then `highers`.
        The order within each heap is heap order, not sorted order."""
        for negated in self.lowers:
            yield -negated
        yield from self.highers

    def get_median(self) -> Numeric:
        """
        Returns:
            The median (p50) of the current population in :math:`O(1)` time.
            When the two heaps are the same size the top of `lowers` (the
            lower of the two middle values) is returned rather than an
            average.

        Raises:
            IndexError: if the population is empty.
        """
        if len(self.lowers) >= len(self.highers):
            return -self.lowers[0]
        return self.highers[0]

    def get_mean(self) -> float:
        """
        Returns:
            The mean (arithmetic mean) so far in :math:`O(1)` time.

        Raises:
            ZeroDivisionError: if the population is empty.
        """
        return self.aggregate / len(self)

    def get_mode(self) -> Tuple[Numeric, int]:
        """
        Returns:
            The population mode (most common member in the population)
            in :math:`O(n)` time.
        """
        count: Dict[Hashable, int] = collections.defaultdict(int)
        for value in self._iter_values():
            count[value] += 1
        return cast(Tuple[Numeric, int], dict_utils.item_with_max_value(count))

    def get_stdev(self) -> float:
        """
        Returns:
            The stdev of the current population in :math:`O(n)` time.
            The sum of squared deviations is divided by the population
            size (not size - 1).

        Raises:
            ZeroDivisionError: if the population is empty.
        """
        mean = self.get_mean()
        squared_deviations = 0.0
        for value in self._iter_values():
            squared_deviations += (value - mean) ** 2
        return math.sqrt(squared_deviations / len(self))

    def _create_sorted_copy_if_needed(self, count: int):
        """Internal helper: (re)builds `sorted_copy` unless a cached copy
        of the expected size already exists.

        Args:
            count: the current population size; a cached copy whose length
                differs from this is considered stale.
        """
        if not self.sorted_copy or count != len(self.sorted_copy):
            self.sorted_copy = sorted(self._iter_values())

    def get_percentile(self, n: float) -> Numeric:
        """
        Returns: the number at approximately pn% in the population
        (i.e. the nth percentile) in :math:`O(n log_2 n)` time (it
        performs a full sort).  This is not the most efficient
        algorithm.

        Not thread-safe; does caching across multiple calls without
        an invocation to :meth:`add_number` for perf reasons.

        Args:
            n: the percentile to compute
        """
        if n == 50:
            return self.get_median()
        count = len(self)
        self._create_sorted_copy_if_needed(count)
        assert self.sorted_copy
        # Map the percentile onto a list index and clamp it into range so
        # that n <= 0 yields the smallest member and n >= 100 the largest.
        index = round(count * (n / 100.0))
        index = min(count - 1, max(0, index))
        return self.sorted_copy[index]


def gcd_floats(a: float, b: float) -> float:
    """Computes the greatest common divisor of two floats using the
    Euclidean algorithm, treating any remainder smaller than 0.001 in
    magnitude as zero.

    The operands' signs are ignored, so the result is never negative.

    Returns:
        The greatest common divisor of a and b.

    Args:
        a: first operand
        b: second operand

    >>> gcd_floats(6.0, 4.0)
    2.0
    >>> gcd_floats(-6.0, 4.0)
    2.0
    >>> gcd_floats(1.5, 0.5)
    0.5
    """
    # Work on magnitudes: with a negative operand the swap/remainder steps
    # below never converge.
    a, b = abs(a), abs(b)
    while True:
        # Keep the larger value in `a` so the remainder step shrinks it.
        if a < b:
            a, b = b, a

        # base case: the remainder is (close enough to) zero
        if abs(b) < 0.001:
            return a
        a, b = b, a - math.floor(a / b) * b


def gcd_float_sequence(lst: List[float]) -> float:
    """Folds :func:`gcd_floats` over a list of floats, left to right.

    Returns:
        The greatest common divisor of a list of floats.  A list with a
        single element yields that element unchanged.

    Args:
        lst: a list of operands

    Raises:
        ValueError: if the list doesn't contain at least one number.
    """
    operand_count = len(lst)
    if operand_count <= 0:
        raise ValueError("Need at least one number")
    if operand_count == 1:
        return lst[0]
    # Two or more operands: gcd(gcd(gcd(l0, l1), l2), ...)
    return functools.reduce(gcd_floats, lst)


def truncate_float(n: float, decimals: int = 2):
    """Chops (does not round) a float to a fixed number of decimal places.

    Returns:
        A truncated float to a particular number of decimals.

    Args:
        n: the float to truncate
        decimals: how many decimal places are desired?  Must be between
            1 and 9 inclusive.

    >>> truncate_float(3.1415927, 3)
    3.141
    """
    assert decimals > 0 and decimals < 10
    scale = 10**decimals
    # int() discards the fractional part, i.e. truncates toward zero.
    scaled_whole = int(n * scale)
    return scaled_whole / scale


def percentage_to_multiplier(percent: float) -> float:
    """Turn a percent change or return (e.g. 155%) into the factor
    (i.e. multiplier) that scales a number by that percentage
    (e.g. 2.55x).

    Args:
        percent: the return percent to scale by

    Returns:
        The multiplier, i.e. one plus the percentage expressed as a
        fraction.

    >>> percentage_to_multiplier(155)
    2.55
    >>> percentage_to_multiplier(45)
    1.45
    >>> percentage_to_multiplier(-25)
    0.75

    """
    return 1.0 + percent / 100


def multiplier_to_percent(multiplier: float) -> float:
    """Convert a multiplicative factor into a percent change or return
    percentage.  This is the inverse of :func:`percentage_to_multiplier`.

    Args:
        multiplier: the multiplier for which to compute the percent change

    Returns:
        The percent change, ``(multiplier - 1) * 100``.  Multipliers below
        1.0 (including zero and negative multipliers) yield a negative
        percentage.

    >>> multiplier_to_percent(0.75)
    -25.0
    >>> multiplier_to_percent(1.0)
    0.0
    >>> multiplier_to_percent(1.99)
    99.0
    >>> multiplier_to_percent(0.0)
    -100.0
    """
    # One formula for every input: a multiplier of 0.0 is a 100% loss, so it
    # must map to -100.0 rather than being special-cased to a positive value.
    return (multiplier - 1.0) * 100.0


@functools.lru_cache(maxsize=1024, typed=True)
def is_prime(n: int) -> bool:
    """Trial-division primality test; results are memoized by an LRU cache.

    Args:
        n: the number for which primeness is to be determined.

    Returns:
        True if n is prime and False otherwise.

    Raises:
        TypeError: if argument is not an int

    .. note::

         Obviously(?) very slow for very large input numbers until
         we get quantum computers.

    >>> is_prime(13)
    True
    >>> is_prime(22)
    False
    >>> is_prime(51602981)
    True
    """
    if not isinstance(n, int):
        raise TypeError("argument passed to is_prime is not of 'int' type")

    # Corner cases: nothing at or below 1 is prime; 2 and 3 are.
    if n <= 3:
        return n > 1

    # With multiples of 2 and 3 ruled out, the only divisors left to try
    # are those of the form 6k - 1 and 6k + 1.
    if n % 2 == 0 or n % 3 == 0:
        return False

    # Try 5, 11, 17, ... (and each value + 2) up to the integer square root.
    for candidate in range(5, math.isqrt(n) + 1, 6):
        if n % candidate == 0 or n % (candidate + 2) == 0:
            return False
    return True


if __name__ == "__main__":
    # When run as a script, execute the doctests embedded in this module.
    from doctest import testmod

    testmod()