Skip to content

Commit

Permalink
Adds dclm fasttext classifier (#205)
Browse files Browse the repository at this point in the history
* Adds dclm fasttext classifier

* Bump version

* Normalize attribute suffix
  • Loading branch information
undfined authored Sep 24, 2024
1 parent 8a3c058 commit 06d521d
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 1 deletion.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dolma"
version = "1.0.13"
version = "1.0.14"
description = "Data filters"
license = { text = "Apache-2.0" }
readme = "README.md"
Expand Down
28 changes: 28 additions & 0 deletions python/dolma/taggers/quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,34 @@
from ..core.registry import TaggerRegistry


@TaggerRegistry.add("dclm-oh-eli5")
class DclmQualityClassifier(BaseFastTextTagger):
    """Document-level quality tagger backed by the DCLM ``fasttext-oh-eli5`` model.

    Emits one prediction per document, labelled ``score``. The value is the
    classifier's probability for the top label when that label is
    ``__label__hq``, and one minus that probability when the top label is
    ``__label__cc``.
    """

    MODEL_PATH = "https://huggingface.co/mlfoundations/fasttext-oh-eli5/resolve/main/openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train.bin"  # noqa: E501

    def __init__(self):
        super().__init__(model_path=self.MODEL_PATH, model_mode=self.DOCUMENT_LEVEL_TAGGER)

    def predict_slice(self, text_slice: TextSlice) -> Iterable[Prediction]:
        """Score a single slice of text with the fastText classifier.

        Args:
            text_slice: The slice to score. Because the tagger is constructed
                in document-level mode, this is expected to be the whole
                document.

        Returns:
            A one-element list holding a ``Prediction`` whose label is the
            top label with ``__label__`` removed and ``cc``/``hq`` rewritten
            to ``score``.
        """
        # The document is collapsed onto a single line before prediction.
        flattened = " ".join(text_slice.doc.strip().splitlines())

        # predict() yields parallel sequences of labels and probabilities;
        # only the first (top) entry of each is used.
        labels, probabilities = self.classifier.predict(flattened)
        top_label = labels[0]
        top_probability = probabilities[0]

        # Report a single score regardless of which label won: when the top
        # label is "cc", flip its probability (presumably "cc" is the
        # low-quality class and "hq" the high-quality one - confirm against
        # the model card).
        score = 1 - top_probability if top_label == "__label__cc" else top_probability

        # Normalise the attribute suffix so both possible labels map to "score".
        attribute = top_label
        for needle, substitute in (("__label__", ""), ("cc", "score"), ("hq", "score")):
            attribute = attribute.replace(needle, substitute)

        return [Prediction(label=attribute, score=score)]


@TaggerRegistry.add("dolma17-quality")
class Dolma17QualityClassifier(BaseFastTextTagger):
MODEL_PATH = "https://dolma-artifacts.org/fasttext_models/dolma-1_7/cc_wiki_wikiref_sw_pes2o_adult_fakenews_math_books_openhermes.bin" # noqa: E501
Expand Down

0 comments on commit 06d521d

Please sign in to comment.