X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=math_utils.py;h=31610ba5fd2a0726b5b1f151dc87cc362b7389d4;hb=f2600f30801c849fc1d139386e3ddc3c9eb43e30;hp=e0e3f6c10732b9a3ab20a251a225a0e963c362e9;hpb=eb9e6df32ed696158bf34dba6464277b648f5c74;p=python_utils.git diff --git a/math_utils.py b/math_utils.py index e0e3f6c..31610ba 100644 --- a/math_utils.py +++ b/math_utils.py @@ -1,50 +1,110 @@ #!/usr/bin/env python3 +"""Mathematical helpers.""" + import functools import math -from typing import List -from heapq import heappush, heappop +from heapq import heappop, heappush +from typing import List, Optional -class RunningMedian(object): - """A running median computer. +class NumericPopulation(object): + """A numeric population with some statistics such as median, mean, pN, + stdev, etc... - >>> median = RunningMedian() - >>> median.add_number(1) - >>> median.add_number(10) - >>> median.add_number(3) - >>> median.get_median() + >>> pop = NumericPopulation() + >>> pop.add_number(1) + >>> pop.add_number(10) + >>> pop.add_number(3) + >>> pop.get_median() 3 - >>> median.add_number(7) - >>> median.add_number(5) - >>> median.get_median() + >>> pop.add_number(7) + >>> pop.add_number(5) + >>> pop.get_median() 5 + >>> pop.get_mean() + 5.2 + >>> round(pop.get_stdev(), 2) + 6.99 + >>> pop.get_percentile(20) + 3 + >>> pop.get_percentile(60) + 7 + """ def __init__(self): self.lowers, self.highers = [], [] + self.aggregate = 0.0 + self.sorted_copy: Optional[List[float]] = None + + def add_number(self, number: float): + """O(2 log2 n)""" - def add_number(self, number): if not self.highers or number > self.highers[0]: heappush(self.highers, number) else: heappush(self.lowers, -number) # for lowers we need a max heap - self.rebalance() + self.aggregate += number + self._rebalance() - def rebalance(self): + def _rebalance(self): if len(self.lowers) - len(self.highers) > 1: heappush(self.highers, -heappop(self.lowers)) elif len(self.highers) - len(self.lowers) > 1: heappush(self.lowers, -heappop(self.highers)) - def get_median(self): + def get_median(self) -> float: + """Returns the approximate median (p50) so far in O(1) time.""" + if len(self.lowers) == len(self.highers): - return (-self.lowers[0] + self.highers[0])/2 + return -self.lowers[0] elif len(self.lowers) > len(self.highers): return -self.lowers[0] else: return self.highers[0] + def get_mean(self) -> float: + """Returns the mean (arithmetic mean) so far in O(1) time.""" + + count = len(self.lowers) + len(self.highers) + return self.aggregate / count + + def get_stdev(self) -> float: + """Returns the stdev so far in O(n) time.""" + + mean = self.get_mean() + variance = 0.0 + for n in self.lowers: + n = -n + variance += (n - mean) ** 2 + for n in self.highers: + variance += (n - mean) ** 2 + return math.sqrt(variance) + + def get_percentile(self, n: float) -> float: + """Returns the number at approximately pn% (i.e. the nth percentile) + of the distribution in O(n log n) time (expensive, requires a + complete sort). Not thread safe. Caching does across + multiple calls without an invocation to add_number. + + """ + if n == 50: + return self.get_median() + count = len(self.lowers) + len(self.highers) + if self.sorted_copy is not None: + if count == len(self.sorted_copy): + index = round(count * (n / 100.0)) + assert 0 <= index < count + return self.sorted_copy[index] + self.sorted_copy = [-x for x in self.lowers] + for x in self.highers: + self.sorted_copy.append(x) + self.sorted_copy = sorted(self.sorted_copy) + index = round(count * (n / 100.0)) + assert 0 <= index < count + return self.sorted_copy[index] + def gcd_floats(a: float, b: float) -> float: if a < b: @@ -76,8 +136,8 @@ def truncate_float(n: float, decimals: int = 2): 3.141 """ - assert decimals > 0 and decimals < 10 - multiplier = 10 ** decimals + assert 0 < decimals < 10 + multiplier = 10**decimals return int(n * multiplier) / multiplier @@ -143,12 +203,12 @@ def is_prime(n: int) -> bool: # This is checked so that we can skip middle five numbers in below # loop - if (n % 2 == 0 or n % 3 == 0): + if n % 2 == 0 or n % 3 == 0: return False i = 5 while i * i <= n: - if (n % i == 0 or n % (i + 2) == 0): + if n % i == 0 or n % (i + 2) == 0: return False i = i + 6 return True @@ -156,4 +216,5 @@ def is_prime(n: int) -> bool: if __name__ == '__main__': import doctest + doctest.testmod()