Skip to content

Commit

Permalink
Merge branch 'lflage-paisa'
Browse files Browse the repository at this point in the history
  • Loading branch information
malteos committed Jul 18, 2024
2 parents e27c526 + a8774dd commit d247576
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/llm_datasets/datasets/dataset_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
".hr.croatian_news_engri.CroatianNewsENGRIDataset",
# it
".it.itwac.ITWacDataset",
# The trailing comma is required: without it Python concatenates this literal
# with the next one and both registry entries are silently lost.
".it.paisa.PaisaCorpus",
# mt
".mt.korpus_malti.KorpusMaltiDataset",
# nl
Expand Down
40 changes: 40 additions & 0 deletions src/llm_datasets/datasets/it/paisa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import gzip

from llm_datasets.datasets.base import GB, Availability, BaseDataset, License


class PaisaCorpus(BaseDataset):
    """PAISÀ corpus of Italian web texts, read from the raw gzip dump.

    The dump is fetched from ``DOWNLOAD_URLS`` and parsed in ``get_texts``.
    Documents whose source URL contains ``"wiki"`` are skipped to avoid
    overlapping with other datasets.
    """

    DATASET_ID = "paisa"
    TITLE = "PaisaCorpus"
    HOMEPAGE = "http://www.corpusitaliano.it/en/help/getting_started.html"
    LICENSE = License(
        name="Creative Commons",
        url="https://creativecommons.org/licenses/by-nc-sa/3.0/",
    )
    # NOTE(review): the spelling presumably mirrors the attribute name declared
    # on BaseDataset -- confirm before renaming.
    AVAILIBILITY = Availability.DIRECT_DOWNLOAD
    DOWNLOAD_URLS = [
        "https://clarin.eurac.edu/repository/xmlui/bitstream/handle/20.500.12124/3/paisa.raw.utf8.gz?sequence=1&isAllowed=y"
    ]
    LANGUAGES = ["it"]
    DESCRIPTION = """
    The Paisà corpus is a large collection of Italian web texts, licensed under
    Creative Commons (Attribution-ShareAlike and Attribution-Noncommercial-ShareAlike).
    It has been created in the context of the project PAISÀ.
    """
    BYTES = 2.7 * GB

    def get_texts(self):
        """Yield the plain text of every non-wiki document in the dump.

        The gzip-compressed dump is opened directly (no prior extraction) and
        parsed with BeautifulSoup's ``lxml`` backend. Each ``<text>`` element
        is one document; its ``url`` attribute is inspected and documents whose
        URL contains ``"wiki"`` are skipped to avoid overlapping with other
        datasets. Documents without a ``url`` attribute are kept.

        Yields:
            str: the text content of one ``<text>`` element.
        """
        import logging

        from bs4 import BeautifulSoup

        logger = logging.getLogger(__name__)
        dump_path = self.get_dataset_file_paths(single_file=True)

        # NOTE: the whole document tree is built in memory before anything is
        # yielded, which is expensive for a dump of this size.
        with gzip.open(dump_path, "rt", encoding="utf-8") as fin:
            logger.info("Parsing file")
            soup = BeautifulSoup(fin, "lxml")
        # The stream is fully consumed by the parser, so the file can be closed
        # before iteration instead of staying open for the generator's lifetime.
        logger.info("File ready")

        for text in soup.find_all("text"):
            # Tag.get returns None for a missing attribute; normalise to "" so
            # the substring check cannot raise TypeError.
            url = text.get("url") or ""
            if "wiki" in url:
                continue
            yield text.get_text()

0 comments on commit d247576

Please sign in to comment.