3 # © Copyright 2021-2023, Scott Gasch
"""Helper utilities with a mathematical / statistical focus."""
import collections
import functools
import math
from heapq import heappop, heappush
from typing import Dict, List, Optional, Tuple

from pyutils import dict_utils
from pyutils.typez.simple import Numeric
class NumericPopulation(object):
    """This object *stores* a numeric population in a way that enables relatively
    fast addition of new numbers (:math:`O(2log_2 n)`) and instant access to the
    median value in the population (:math:`O(1)`).  It also provides other population
    summary statistics such as the :meth:`get_mode`, :meth:`get_percentile` and
    :meth:`get_stdev`.

    .. note::

        Because this class stores a copy of all numbers added to it, it shouldn't
        be used for very large populations.  Consider sampling.

    >>> pop = NumericPopulation()
    >>> pop.add_number(1)
    >>> pop.add_number(10)
    >>> pop.add_number(3)
    >>> len(pop)
    3
    >>> pop.get_median()
    3
    >>> pop.add_number(7)
    >>> pop.add_number(5)
    >>> pop.get_median()
    5
    >>> pop.get_mean()
    5.2
    >>> round(pop.get_stdev(), 1)
    3.1
    >>> pop.get_percentile(20)
    3
    >>> pop.get_percentile(60)
    7
    """

    def __init__(self):
        # The population is split around its median into two heaps.
        # `lowers` holds the smaller half as a max-heap (values are stored
        # negated because heapq only provides a min-heap); `highers` holds
        # the larger half as a plain min-heap.
        self.lowers, self.highers = [], []
        # Running sum of every number added; used by get_mean().
        self.aggregate = 0.0
        # Lazily built sorted snapshot used by get_percentile().
        self.sorted_copy: Optional[List[Numeric]] = None
        # Extremes seen so far; None until the first number is added.
        self.maximum = None
        self.minimum = None

    def add_number(self, number: Numeric):
        """Adds a number to the population.  Runtime complexity of this
        operation is :math:`O(2 log_2 n)`

        Args:
            number: the number to add_number to the population
        """
        if not self.highers or number > self.highers[0]:
            heappush(self.highers, number)
        else:
            heappush(self.lowers, -number)  # for lowers we need a max heap
        self.aggregate += number
        self._rebalance()
        # Compare against None explicitly: a stored extreme of 0 is falsy and
        # would otherwise be overwritten by the next number added.
        if self.maximum is None or number > self.maximum:
            self.maximum = number
        if self.minimum is None or number < self.minimum:
            self.minimum = number

    def __len__(self):
        """
        Returns:
            the population's current size.
        """
        return len(self.lowers) + len(self.highers)

    def _rebalance(self):
        """Internal helper for rebalancing the `lowers` and `highers` heaps"""
        # Keep the heap sizes within one of each other so that the median is
        # always at the top of one of them.
        if len(self.lowers) - len(self.highers) > 1:
            heappush(self.highers, -heappop(self.lowers))
        elif len(self.highers) - len(self.lowers) > 1:
            heappush(self.lowers, -heappop(self.highers))

    def get_median(self) -> Numeric:
        """
        Returns:
            The median (p50) of the current population in :math:`O(1)` time.
            With an even number of members this is the lower of the two
            middle values.

        Raises:
            IndexError: if the population is empty.
        """
        # When the heaps are the same size, or lowers is larger, the median
        # is the largest of the lower half (stored negated).
        if len(self.lowers) >= len(self.highers):
            return -self.lowers[0]
        return self.highers[0]

    def get_mean(self) -> float:
        """
        Returns:
            The mean (arithmetic mean) so far in :math:`O(1)` time.

        Raises:
            ZeroDivisionError: if the population is empty.
        """
        count = len(self)
        return self.aggregate / count

    def get_mode(self) -> Tuple[Numeric, int]:
        """
        Returns:
            The population mode (most common member in the population)
            in :math:`O(n)` time.  The result is whatever
            :meth:`dict_utils.item_with_max_value` returns for the
            value -> occurrence-count mapping (presumably a
            ``(value, count)`` pair; confirm against that helper).
        """
        count: Dict[Numeric, int] = collections.defaultdict(int)
        for n in self.lowers:
            count[-n] += 1  # undo the max-heap negation
        for n in self.highers:
            count[n] += 1
        return dict_utils.item_with_max_value(count)

    def get_stdev(self) -> float:
        """
        Returns:
            The stdev of the current population in :math:`O(n)` time.
            This is the population standard deviation (the squared
            deviations are divided by the member count).

        Raises:
            ZeroDivisionError: if the population is empty.
        """
        mean = self.get_mean()
        variance = 0.0
        for n in self.lowers:
            n = -n  # undo the max-heap negation
            variance += (n - mean) ** 2
        for n in self.highers:
            variance += (n - mean) ** 2
        count = len(self.lowers) + len(self.highers)
        return math.sqrt(variance / count)

    def _create_sorted_copy_if_needed(self, count: int):
        """Internal helper.

        Rebuilds :attr:`sorted_copy` when it is missing/empty or when its
        length no longer matches `count` (i.e. numbers were added since the
        last rebuild).

        Args:
            count: the current population size
        """
        if not self.sorted_copy or count != len(self.sorted_copy):
            members = [-x for x in self.lowers]
            members.extend(self.highers)
            self.sorted_copy = sorted(members)

    def get_percentile(self, n: float) -> Numeric:
        """
        Returns: the number at approximately pn% in the population
        (i.e. the nth percentile) in :math:`O(n log_2 n)` time (it
        performs a full sort).  This is not the most efficient
        algorithm.

        Not thread-safe; does caching across multiple calls without
        an invocation to :meth:`add_number` for perf reasons.

        Args:
            n: the percentile to compute

        Raises:
            AssertionError: if the population is empty (and n != 50).
        """
        if n == 50:
            return self.get_median()
        count = len(self)
        self._create_sorted_copy_if_needed(count)
        assert self.sorted_copy
        index = round(count * (n / 100.0))
        # Clamp into the valid index range of the sorted snapshot.
        index = max(0, index)
        index = min(count - 1, index)
        return self.sorted_copy[index]
def gcd_floats(a: float, b: float) -> float:
    """
    Returns:
        The greatest common divisor of a and b.

    Args:
        a: first operand
        b: second operand

    The operands are reduced with the Euclidean algorithm on floats; the
    reduction stops once the remainder is smaller than a small tolerance.
    Signs are ignored, so the result is non-negative.
    """
    # NOTE(review): the base-case test is not visible in the excerpt this
    # block was restored from; a tolerance of 0.001 is assumed -- confirm.
    tolerance = 0.001

    # Work on magnitudes so that negative operands terminate instead of
    # swapping back and forth forever.
    a, b = abs(a), abs(b)
    while True:
        if a < b:
            a, b = b, a
        if abs(b) < tolerance:
            return a
        # Floating-point remainder of a / b.
        a, b = b, a - math.floor(a / b) * b
def gcd_float_sequence(lst: List[float]) -> float:
    """
    Returns:
        The greatest common divisor of a list of floats.  A single-element
        list yields that element unchanged.

    Args:
        lst: a list of operands

    Raises:
        ValueError: if the list provided is empty
    """
    if len(lst) <= 0:
        raise ValueError("Need at least one number")

    # Fold gcd_floats over the operands from left to right.
    gcd = lst[0]
    for operand in lst[1:]:
        gcd = gcd_floats(gcd, operand)
    return gcd
def truncate_float(n: float, decimals: int = 2) -> float:
    """
    Returns:
        A truncated float to a particular number of decimals.  Digits
        beyond `decimals` are dropped (truncation toward zero), not rounded.

    Args:
        n: the float to truncate
        decimals: how many decimal places are desired?  Must satisfy
            0 < decimals < 10.

    .. note::

        The scaling is done in binary floating point, so a value whose
        nearest float lies just below its decimal spelling may truncate
        one unit lower than expected.

    >>> truncate_float(3.1415927, 3)
    3.141
    """
    assert 0 < decimals < 10
    multiplier = 10**decimals
    # int() drops the fractional part, i.e. truncates toward zero.
    return int(n * multiplier) / multiplier
def percentage_to_multiplier(percent: float) -> float:
    """Given a percentage that represents a return or percent change
    (e.g. 155%), determine the factor (i.e. multiplier) needed to
    scale a number by that percentage (e.g. 2.55x)

    Args:
        percent: the return percent to scale by

    Returns:
        The multiplier equivalent to a change of `percent` percent.

    >>> percentage_to_multiplier(155)
    2.55
    >>> percentage_to_multiplier(45)
    1.45
    >>> percentage_to_multiplier(-25)
    0.75
    """
    # A change of p% scales a quantity by (1 + p/100).
    return percent / 100 + 1.0
def multiplier_to_percent(multiplier: float) -> float:
    """Convert a multiplicative factor into a percent change or return
    percentage.

    Args:
        multiplier: the multiplier for which to compute the percent change

    Returns:
        The percent change that scaling by `multiplier` represents;
        negative when the multiplier is below 1.0.

    >>> multiplier_to_percent(0.75)
    -25.0
    >>> multiplier_to_percent(1.0)
    0.0
    >>> multiplier_to_percent(1.99)
    99.0
    >>> multiplier_to_percent(0.0)
    -100.0
    """
    # The change is always (multiplier - 1).  Computing 1.0 - multiplier
    # instead flips the sign, which reported a multiplier of 0 as +100%.
    percent = multiplier - 1.0
    percent *= 100.0
    return percent
@functools.lru_cache(maxsize=1024, typed=True)
def is_prime(n: int) -> bool:
    """
    Args:
        n: the number for which primeness is to be determined.

    Returns:
        True if n is prime and False otherwise.

    Raises:
        TypeError: if n is not an int.

    .. note::

        Obviously(?) very slow for very large input numbers until
        we get quantum computers.

    >>> is_prime(13)
    True
    >>> is_prime(22)
    False
    >>> is_prime(51602981)
    True
    """
    if not isinstance(n, int):
        raise TypeError("argument passed to is_prime is not of 'int' type")

    # Corner cases: nothing at or below 1 is prime; 2 and 3 are.
    if n <= 1:
        return False
    if n <= 3:
        return True

    # This is checked so that we can skip middle five numbers in below
    # loop: every remaining candidate divisor has the form 6k - 1 or 6k + 1.
    if n % 2 == 0 or n % 3 == 0:
        return False

    i = 5
    while i * i <= n:
        if n % i == 0 or n % (i + 2) == 0:
            return False
        i = i + 6
    return True
319 if __name__ == "__main__":