diff --git a/src/llm_datasets/datasets/dataset_registry.py b/src/llm_datasets/datasets/dataset_registry.py index aec6d43..d89f429 100644 --- a/src/llm_datasets/datasets/dataset_registry.py +++ b/src/llm_datasets/datasets/dataset_registry.py @@ -89,6 +89,7 @@ ".hr.croatian_news_engri.CroatianNewsENGRIDataset", # it ".it.itwac.ITWacDataset", + ".it.paisa.PaisaCorpus" # mt ".mt.korpus_malti.KorpusMaltiDataset", # nl diff --git a/src/llm_datasets/datasets/it/paisa.py b/src/llm_datasets/datasets/it/paisa.py new file mode 100644 index 0000000..e9d26b6 --- /dev/null +++ b/src/llm_datasets/datasets/it/paisa.py @@ -0,0 +1,40 @@ +import gzip + +from llm_datasets.datasets.base import GB, Availability, BaseDataset, License + + +class PaisaCorpus(BaseDataset): + DATASET_ID = "paisa" + TITLE = "PaisaCorpus" + HOMEPAGE = "http://www.corpusitaliano.it/en/help/getting_started.html" + LICENSE = License( + name="Creative Commons", + url="https://creativecommons.org/licenses/by-nc-sa/3.0/", + ) + AVAILIBILITY = Availability.DIRECT_DOWNLOAD + DOWNLOAD_URLS = [ + "https://clarin.eurac.edu/repository/xmlui/bitstream/handle/20.500.12124/3/paisa.raw.utf8.gz?sequence=1&isAllowed=y" + ] + LANGUAGES = ["it"] + DESCRIPTION = """ + The Paisà corpus is a large collection of Italian web texts, licensed under + Creative Commons (Attribution-ShareAlike and Attribution-Noncommercial-ShareAlike). + It has been created in the context of the project PAISÀ. + """ + BYTES = 2.7 * GB + + def get_texts(self): + """Reads directly from .gz file. + Check for "wiki" in the dataset URL to avoid overlapping with other datasets + """ + from bs4 import BeautifulSoup + + with gzip.open(self.get_dataset_file_paths(single_file=True), "rt", encoding="utf-8") as fin: + print("Parsing file") + soup = BeautifulSoup(fin, "lxml") + print("File ready") + for text in soup.find_all("text"): + if "wiki" in text.get("url"): + continue + else: + yield text.get_text()