Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix a few issues of the FixedBucketsValTracker #73

Merged
merged 3 commits into from
Nov 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion python/dolma/core/analyzer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import math
import multiprocessing
import re
import shutil
Expand Down Expand Up @@ -27,7 +28,7 @@ def _make_tracker(type_: str = "fixed", **kwargs: int) -> BaseBucketApi:
if type_ == "infer":
return InferBucketsValTracker(**{"n": NUM_BINS, "b": BUFF_SIZE, **kwargs})
elif type_ == "fixed":
return FixedBucketsValTracker(**{"n": NUM_BINS, **kwargs})
return FixedBucketsValTracker(**{"n": int(math.log10(NUM_BINS)), **kwargs})
else:
raise ValueError(f"Unknown tracker type {type_}")

Expand Down
17 changes: 14 additions & 3 deletions python/dolma/core/binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,14 +235,17 @@ def summarize(self, n: int, density: bool = False) -> SummaryTuple:

class FixedBucketsValTracker(BaseBucketApi):
def __init__(self, n: int = 2):
# we use n to determine the precision of the bins; for convenience we store it as a power of 10.
# 10**n will be the maximum number of bins for each power of 2.
# Too large numbers will cause numeric problems and can cause a lot of memory use.
assert n >= 0
# we use n to determine the precision of the bins; for convenience we store it as a power of 10
assert n <= 100
self.n = 10**n
self._bins: Dict[Tuple[int, int], int] = {}

def add(self, value: Union[int, float], count: int = 1):
m, e = math.frexp(value)
k = int(m * self.n), e
k = math.floor(m * self.n), e

if k not in self._bins:
self._bins[k] = 0
Expand All @@ -255,12 +258,20 @@ def __len__(self) -> int:
def full(self) -> bool:
return False

def get_bin_upper_bound(self, val: float) -> float:
    """Return the upper bound of the bin containing ``val``.

    ``val`` is split into a mantissa and a power-of-two exponent with
    ``math.frexp``; the mantissa is scaled by ``self.n`` and floored to find
    the bin index, and the next index marks the bin's upper edge.

    Args:
        val: The value whose bin is looked up.

    Returns:
        The upper edge of the bin, ``(floor(m * self.n) + 1) / self.n * 2**e``.

    Raises:
        OverflowError: If the upper edge itself is too large to be
            represented as a float.
    """
    mantissa, exponent = math.frexp(val)
    next_index = math.floor(mantissa * self.n) + 1  # Add one to obtain the next bin
    # ldexp scales by 2**exponent without building the integer 2**exponent.
    # The plain product fails with "int too large to convert to float" when
    # exponent is 1024 (values >= 2**1023), even if the bound is representable.
    return math.ldexp(next_index / self.n, exponent)

def summarize(self, n: int, density: bool = False) -> SummaryTuple:
bins, counts = zip(*sorted((m / self.n * 2**e, c) for (m, e), c in self._bins.items()))

if len(self) <= n:
# if there are fewer than n buckets, return the buckets as is
return SummaryTuple(counts=[int(c) for c in counts], bins=[float(b) for b in bins])
# To be consistent we also add the limit of the last bin, so the bins denote bin edges
upper_bin = self.get_bin_upper_bound(max(float(b) for b in bins))
return SummaryTuple(counts=[int(c) for c in counts], bins=[float(b) for b in bins] + [upper_bin])

# computing the weighted histograms
new_counts, new_values = np.histogram(a=bins, bins=n, weights=counts, density=density)
Expand Down
Loading