Add some useful stats to histogram.
author Scott Gasch <[email protected]>
Thu, 10 Feb 2022 16:13:08 +0000 (08:13 -0800)
committer Scott Gasch <[email protected]>
Thu, 10 Feb 2022 16:13:08 +0000 (08:13 -0800)
histogram.py
math_utils.py

index 2657c0bbd6c9f679808b80843b7c4c36c2f2eb1c..9c07df9b588aef626ecf8217a70ac4fd9676eb9d 100644 (file)
@@ -40,7 +40,7 @@ class SimpleHistogram(Generic[T]):
                 raise Exception("Buckets overlap?!")
             self.buckets[start_end] = 0
         self.sigma: float = 0.0
-        self.median: RunningMedian = RunningMedian()
+        self.stats: RunningMedian = RunningMedian()
         self.maximum: Optional[T] = None
         self.minimum: Optional[T] = None
         self.count: Count = 0
@@ -74,7 +74,7 @@ class SimpleHistogram(Generic[T]):
         self.count += 1
         self.buckets[bucket] += 1
         self.sigma += item
-        self.median.add_number(item)
+        self.stats.add_number(item)
         if self.maximum is None or item > self.maximum:
             self.maximum = item
         if self.minimum is None or item < self.minimum:
@@ -142,4 +142,11 @@ class SimpleHistogram(Generic[T]):
         txt += sigma_label.rjust(details.max_label_width)
         txt += ' ' * (bar_width - 2)
         txt += f'Σ=(100.00% n={self.count})\n'
+        txt += ' ' * (bar_width + details.max_label_width - 2)
+        txt += f'mean(μ)={self.stats.get_mean():.3f}\n'
+        txt += ' ' * (bar_width + details.max_label_width - 2)
+        txt += f'p50(η)={self.stats.get_median():.3f}\n'
+        txt += ' ' * (bar_width + details.max_label_width - 2)
+        txt += f'stdev(σ)={self.stats.get_stdev():.3f}\n'
+        txt += '\n'
         return txt
index 37fcec5f6c557cdf1a66d39b671fd8d9438ba29c..28b8e6b3b6d9a407c6f263220e314d9d4acacbc0 100644 (file)
@@ -25,12 +25,14 @@ class RunningMedian(object):
 
     def __init__(self):
         self.lowers, self.highers = [], []
+        self.aggregate = 0.0  # running sum of every number passed to add_number()
 
-    def add_number(self, number):
+    def add_number(self, number: float):
         if not self.highers or number > self.highers[0]:
             heappush(self.highers, number)
         else:
             heappush(self.lowers, -number)  # for lowers we need a max heap
+        self.aggregate += number  # keep the running sum that get_mean() divides by the count
         self.rebalance()
 
     def rebalance(self):
@@ -39,7 +41,7 @@ class RunningMedian(object):
         elif len(self.highers) - len(self.lowers) > 1:
             heappush(self.lowers, -heappop(self.highers))
 
-    def get_median(self):
+    def get_median(self) -> float:
         if len(self.lowers) == len(self.highers):
             return (-self.lowers[0] + self.highers[0]) / 2
         elif len(self.lowers) > len(self.highers):
@@ -47,6 +49,19 @@ class RunningMedian(object):
         else:
             return self.highers[0]
 
+    def get_mean(self) -> float:  # arithmetic mean of all numbers added so far
+        count = len(self.lowers) + len(self.highers)  # each number lives in exactly one heap
+        return self.aggregate / count  # raises ZeroDivisionError if nothing was added
+
+    def get_stdev(self) -> float:
+        """Return the population stdev of all added numbers; raises ZeroDivisionError if empty."""
+        mean = self.get_mean()
+        # lowers holds negated values (max-heap emulation), so undo the negation here.
+        values = [-n for n in self.lowers] + self.highers
+        # Divide the summed squared deviations by the count; the sum alone is not a variance.
+        variance = sum((v - mean) ** 2 for v in values) / len(values)
+        return math.sqrt(variance)
+
 
 def gcd_floats(a: float, b: float) -> float:
     if a < b: