3 # © Copyright 2021-2022, Scott Gasch
5 """Helper utilities with a mathematical / statictical focus."""
10 from heapq import heappop, heappush
11 from typing import Dict, List, Optional, Tuple
13 from pyutils import dict_utils
16 class NumericPopulation(object):
17 """This object *store* a numerical population in a way that enables relatively
18 fast addition of new numbers (:math:`O(2log_2 n)`) and instant access to the
19 median value in the population (:math:`O(1)`). It also provides other population
20 summary statistics such as the :meth:`mode`, :meth:`get_percentile` and
25 Because this class stores a copy of all numbers added to it, it shouldn't
26 be used for very large populations. Consider sampling.
28 >>> pop = NumericPopulation()
30 >>> pop.add_number(10)
42 >>> round(pop.get_stdev(), 1)
44 >>> pop.get_percentile(20)
46 >>> pop.get_percentile(60)
51 self.lowers, self.highers = [], []
53 self.sorted_copy: Optional[List[float]] = None
57 def add_number(self, number: float):
58 """Adds a number to the population. Runtime complexity of this
59 operation is :math:`O(2 log_2 n)`
62 number: the number to add_number to the population
65 if not self.highers or number > self.highers[0]:
66 heappush(self.highers, number)
68 heappush(self.lowers, -number) # for lowers we need a max heap
69 self.aggregate += number
71 if not self.maximum or number > self.maximum:
73 if not self.minimum or number < self.minimum:
79 the population's current size.
83 n += len(self.highers)
89 """Internal helper for rebalancing the `lowers` and `highers` heaps"""
90 if len(self.lowers) - len(self.highers) > 1:
91 heappush(self.highers, -heappop(self.lowers))
92 elif len(self.highers) - len(self.lowers) > 1:
93 heappush(self.lowers, -heappop(self.highers))
95 def get_median(self) -> float:
98 The median (p50) of the current population in :math:`O(1)` time.
100 if len(self.lowers) == len(self.highers):
101 return -self.lowers[0]
102 elif len(self.lowers) > len(self.highers):
103 return -self.lowers[0]
105 return self.highers[0]
107 def get_mean(self) -> float:
110 The mean (arithmetic mean) so far in :math:`O(1)` time.
113 return self.aggregate / count
115 def get_mode(self) -> Tuple[float, int]:
118 The population mode (most common member in the population)
119 in :math:`O(n)` time.
121 count: Dict[float, int] = collections.defaultdict(int)
122 for n in self.lowers:
124 for n in self.highers:
126 return dict_utils.item_with_max_value(count)
128 def get_stdev(self) -> float:
131 The stdev of the current population in :math:`O(n)` time.
133 mean = self.get_mean()
135 for n in self.lowers:
137 variance += (n - mean) ** 2
138 for n in self.highers:
139 variance += (n - mean) ** 2
140 count = len(self.lowers) + len(self.highers)
141 return math.sqrt(variance) / count
143 def _create_sorted_copy_if_needed(self, count: int):
144 """Internal helper."""
145 if not self.sorted_copy or count != len(self.sorted_copy):
146 self.sorted_copy = []
147 for x in self.lowers:
148 self.sorted_copy.append(-x)
149 for x in self.highers:
150 self.sorted_copy.append(x)
151 self.sorted_copy = sorted(self.sorted_copy)
153 def get_percentile(self, n: float) -> float:
155 Returns: the number at approximately pn% in the population
156 (i.e. the nth percentile) in :math:`O(n log_2 n)` time (it
157 performs a full sort). This is not the most efficient
160 Not thread-safe; does caching across multiple calls without
161 an invocation to :meth:`add_number` for perf reasons.
164 n: the percentile to compute
167 return self.get_median()
169 self._create_sorted_copy_if_needed(count)
170 assert self.sorted_copy
171 index = round(count * (n / 100.0))
172 index = max(0, index)
173 index = min(count - 1, index)
174 return self.sorted_copy[index]
177 def gcd_floats(a: float, b: float) -> float:
180 The greatest common divisor of a and b.
187 return gcd_floats(b, a)
192 return gcd_floats(b, a - math.floor(a / b) * b)
195 def gcd_float_sequence(lst: List[float]) -> float:
198 The greatest common divisor of a list of floats.
201 lst: a list of operands
204 raise ValueError("Need at least one number")
208 gcd = gcd_floats(lst[0], lst[1])
209 for i in range(2, len(lst)):
210 gcd = gcd_floats(gcd, lst[i])
214 def truncate_float(n: float, decimals: int = 2):
217 A truncated float to a particular number of decimals.
220 n: the float to truncate
222 >>> truncate_float(3.1415927, 3)
225 assert 0 < decimals < 10
226 multiplier = 10**decimals
227 return int(n * multiplier) / multiplier
230 def percentage_to_multiplier(percent: float) -> float:
231 """Given a percentage that represents a return or percent change
232 (e.g. 155%), determine the factor (i.e. multiplier) needed to
233 scale a number by that percentage (e.g. 2.55x)
236 percent: the return percent to scale by
238 >>> percentage_to_multiplier(155)
240 >>> percentage_to_multiplier(45)
242 >>> percentage_to_multiplier(-25)
246 multiplier = percent / 100
251 def multiplier_to_percent(multiplier: float) -> float:
252 """Convert a multiplicative factor into a percent change or return
256 multiplier: the multiplier for which to compute the percent change
258 >>> multiplier_to_percent(0.75)
260 >>> multiplier_to_percent(1.0)
262 >>> multiplier_to_percent(1.99)
269 percent = 1.0 - percent
274 @functools.lru_cache(maxsize=1024, typed=True)
275 def is_prime(n: int) -> bool:
278 n: the number for which primeness is to be determined.
281 True if n is prime and False otherwise.
285 Obviously(?) very slow for very large input numbers until
286 we get quantum computers.
292 >>> is_prime(51602981)
295 if not isinstance(n, int):
296 raise TypeError("argument passed to is_prime is not of 'int' type")
304 # This is checked so that we can skip middle five numbers in below
306 if n % 2 == 0 or n % 3 == 0:
311 if n % i == 0 or n % (i + 2) == 0:
317 if __name__ == '__main__':