From 08c3fcd39b4d9f80960a1194bde71ce078365c80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20K=C3=B6nig?= Date: Tue, 1 Oct 2024 14:36:24 +0200 Subject: [PATCH] added parliamentary corpora --- .../parliamentary-corpora/aalto-fin-parla.json | 15 +++++++++++++++ .../archives-parlementaires.json | 15 +++++++++++++++ .../assemblee-nationale.json | 15 +++++++++++++++ .../at-parlamentsreden.json | 14 ++++++++++++++ .../bul-pol-jour-speech.json | 15 +++++++++++++++ .../parliamentary-corpora/bundestag-europe.json | 15 +++++++++++++++ corpora/parliamentary-corpora/cepic.json | 15 +++++++++++++++ .../czech-parl-meetings.json | 16 ++++++++++++++++ corpora/parliamentary-corpora/czechparl.json | 15 +++++++++++++++ .../danish-parliament.json | 15 +++++++++++++++ corpora/parliamentary-corpora/dutchparl.json | 16 ++++++++++++++++ corpora/parliamentary-corpora/epic-uds.json | 15 +++++++++++++++ .../parliamentary-corpora/europarl-ell-eng.json | 15 +++++++++++++++ corpora/parliamentary-corpora/europarl.json | 15 +++++++++++++++ .../german-pol-speeches.json | 16 ++++++++++++++++ corpora/parliamentary-corpora/gerparcor.json | 15 +++++++++++++++ corpora/parliamentary-corpora/handeset.json | 15 +++++++++++++++ corpora/parliamentary-corpora/hansard.json | 15 +++++++++++++++ .../parliamentary-corpora/hellenic-parla.json | 15 +++++++++++++++ .../house-of-commons-europe.json | 15 +++++++++++++++ .../parliamentary-corpora/icelandic-parla.json | 16 ++++++++++++++++ corpora/parliamentary-corpora/kranjska.json | 15 +++++++++++++++ .../large-czech-parl-hearings.json | 15 +++++++++++++++ corpora/parliamentary-corpora/linkedsaeima.json | 15 +++++++++++++++ .../lit-parla-attribution.json | 15 +++++++++++++++ .../parliamentary-corpora/nor-parla-speech.json | 15 +++++++++++++++ .../parliamentary-corpora/parlameter-hr9.json | 16 ++++++++++++++++ .../parliamentary-corpora/parlameter-sl.json | 16 ++++++++++++++++ .../parliamentary-corpora/parlamint-ana-30.json | 16 ++++++++++++++++ 
.../parlamint-en-ana-30.json | 17 +++++++++++++++++ .../parliamentary-corpora/parlasent-bcs.json | 15 +++++++++++++++ corpora/parliamentary-corpora/parlat-beta.json | 14 ++++++++++++++ corpora/parliamentary-corpora/parlspeech.json | 15 +++++++++++++++ .../plenary-fin-parla.json | 15 +++++++++++++++ corpora/parliamentary-corpora/pol-parla.json | 16 ++++++++++++++++ corpora/parliamentary-corpora/polminer.json | 15 +++++++++++++++ .../proceedings-nor-parla.json | 15 +++++++++++++++ corpora/parliamentary-corpora/ptparl.json | 15 +++++++++++++++ corpora/parliamentary-corpora/riigikogu.json | 16 ++++++++++++++++ .../riksdag-open-data.json | 16 ++++++++++++++++ corpora/parliamentary-corpora/saeima.json | 15 +++++++++++++++ corpora/parliamentary-corpora/siparl.json | 16 ++++++++++++++++ corpora/parliamentary-corpora/slovparl.json | 16 ++++++++++++++++ .../speeches-greek-parla.json | 15 +++++++++++++++ .../parliamentary-corpora/talk-of-norway.json | 15 +++++++++++++++ corpora/parliamentary-corpora/ukparl.json | 15 +++++++++++++++ corpora/parliamentary-corpora/yu1parl.json | 17 +++++++++++++++++ 47 files changed, 719 insertions(+) create mode 100644 corpora/parliamentary-corpora/aalto-fin-parla.json create mode 100644 corpora/parliamentary-corpora/archives-parlementaires.json create mode 100644 corpora/parliamentary-corpora/assemblee-nationale.json create mode 100644 corpora/parliamentary-corpora/at-parlamentsreden.json create mode 100644 corpora/parliamentary-corpora/bul-pol-jour-speech.json create mode 100644 corpora/parliamentary-corpora/bundestag-europe.json create mode 100644 corpora/parliamentary-corpora/cepic.json create mode 100644 corpora/parliamentary-corpora/czech-parl-meetings.json create mode 100644 corpora/parliamentary-corpora/czechparl.json create mode 100644 corpora/parliamentary-corpora/danish-parliament.json create mode 100644 corpora/parliamentary-corpora/dutchparl.json create mode 100644 corpora/parliamentary-corpora/epic-uds.json create mode 100644 
corpora/parliamentary-corpora/europarl-ell-eng.json create mode 100644 corpora/parliamentary-corpora/europarl.json create mode 100644 corpora/parliamentary-corpora/german-pol-speeches.json create mode 100644 corpora/parliamentary-corpora/gerparcor.json create mode 100644 corpora/parliamentary-corpora/handeset.json create mode 100644 corpora/parliamentary-corpora/hansard.json create mode 100644 corpora/parliamentary-corpora/hellenic-parla.json create mode 100644 corpora/parliamentary-corpora/house-of-commons-europe.json create mode 100644 corpora/parliamentary-corpora/icelandic-parla.json create mode 100644 corpora/parliamentary-corpora/kranjska.json create mode 100644 corpora/parliamentary-corpora/large-czech-parl-hearings.json create mode 100644 corpora/parliamentary-corpora/linkedsaeima.json create mode 100644 corpora/parliamentary-corpora/lit-parla-attribution.json create mode 100644 corpora/parliamentary-corpora/nor-parla-speech.json create mode 100644 corpora/parliamentary-corpora/parlameter-hr9.json create mode 100644 corpora/parliamentary-corpora/parlameter-sl.json create mode 100644 corpora/parliamentary-corpora/parlamint-ana-30.json create mode 100644 corpora/parliamentary-corpora/parlamint-en-ana-30.json create mode 100644 corpora/parliamentary-corpora/parlasent-bcs.json create mode 100644 corpora/parliamentary-corpora/parlat-beta.json create mode 100644 corpora/parliamentary-corpora/parlspeech.json create mode 100644 corpora/parliamentary-corpora/plenary-fin-parla.json create mode 100644 corpora/parliamentary-corpora/pol-parla.json create mode 100644 corpora/parliamentary-corpora/polminer.json create mode 100644 corpora/parliamentary-corpora/proceedings-nor-parla.json create mode 100644 corpora/parliamentary-corpora/ptparl.json create mode 100644 corpora/parliamentary-corpora/riigikogu.json create mode 100644 corpora/parliamentary-corpora/riksdag-open-data.json create mode 100644 corpora/parliamentary-corpora/saeima.json create mode 100644 
corpora/parliamentary-corpora/siparl.json create mode 100644 corpora/parliamentary-corpora/slovparl.json create mode 100644 corpora/parliamentary-corpora/speeches-greek-parla.json create mode 100644 corpora/parliamentary-corpora/talk-of-norway.json create mode 100644 corpora/parliamentary-corpora/ukparl.json create mode 100644 corpora/parliamentary-corpora/yu1parl.json diff --git a/corpora/parliamentary-corpora/aalto-fin-parla.json b/corpora/parliamentary-corpora/aalto-fin-parla.json new file mode 100644 index 0000000..34476a5 --- /dev/null +++ b/corpora/parliamentary-corpora/aalto-fin-parla.json @@ -0,0 +1,15 @@ +{ + "Name": "Aalto Finnish Parliament ASR Corpus 2008-2020", + "URL": "http://urn.fi/urn:nbn:fi:lb-2022052002", + "Family": "Parliamentary corpora", + "Description": "This corpus, which consists of both audio recordings and transcriptions, is extracted from the Finnish parliamentary plenary session transcripts and videos by the Aalto Speech Recognition group. The original session transcripts and videos are available on the websites of the Parliament of Finland (see here and here). The corpus is split into three parts:\n
1. the 2015–2020 set\n2. the 2008–2016 set\n3. development and test sets
\nThe corpus is available for download from the Language Bank of Finland.", + "Languages": ["fin"], + "License": "CLARIN PUB", + "Size": ["119.3 million words", "3,130 hours of recordings"], + "Annotation": [], + "Infrastructure": "CLARIN", + "Access": { + "Download": "http://urn.fi/urn:nbn:fi:lb-2022052003" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/archives-parlementaires.json b/corpora/parliamentary-corpora/archives-parlementaires.json new file mode 100644 index 0000000..b4e3b09 --- /dev/null +++ b/corpora/parliamentary-corpora/archives-parlementaires.json @@ -0,0 +1,15 @@ +{ + "Name": "Archives Parlementaires", + "URL": "https://sul-philologic.stanford.edu/philologic/archparl/", + "Family": "Parliamentary corpora", + "Description": "The Archives parlementaires is a chronologically-ordered edited collection of sources on the French Revolution. It was conceived in the mid 19th century as a project to produce a definitive record of parliamentary deliberations and also includes letters, reports, speeches, and other first-hand accounts from a great variety of published and archival sources. FRDA currently contains the AP volumes covering the years 1787-1794, which can be searched using ARTFL's PhiloLogic 4 open source software platform. The texts have been marked up using TEI so that speakers, places, dates, and terms in the published index can be easily found. Users can see both scanned images of the AP pages or just the texts. 
", + "Languages": ["fra"], + "License": "", + "Size": [], + "Annotation": [], + "Infrastructure": "Other", + "Access": { + "Concordancer": "https://sul-philologic.stanford.edu/philologic/archparl/" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/assemblee-nationale.json b/corpora/parliamentary-corpora/assemblee-nationale.json new file mode 100644 index 0000000..56419e3 --- /dev/null +++ b/corpora/parliamentary-corpora/assemblee-nationale.json @@ -0,0 +1,15 @@ +{ + "Name": "Parliamentary Debates on Europe at the Assemblée nationale (2002-2012)", + "URL": "https://hdl.handle.net/11403/fr-parl/v1", + "Family": "Parliamentary corpora", + "Description": "The corpus contains French parliamentary debates from 2002 to 2012. The contextual metadata in the corpus concern the dates of the council meetings, the description of the main topic(s) of the European council meeting, the place where the European Council meeting took place; they also correspond to information about the government and the legislative session. 
The speaker metadata correspond to name, gender, occupation, parliamentary group, political orientation and the opposition and majority division.\nThe corpus is available for download from Ortolang.", + "Languages": ["fra"], + "License": "CC-BY", + "Size": ["137,000 tokens"], + "Annotation": ["contextual and speaker metadata"], + "Infrastructure": "CLARIN", + "Access": { + "Download": "https://hdl.handle.net/11403/fr-parl/v1" + }, + "Publication": "Truan and Romary (2021)" +} diff --git a/corpora/parliamentary-corpora/at-parlamentsreden.json b/corpora/parliamentary-corpora/at-parlamentsreden.json new file mode 100644 index 0000000..d4c0f4d --- /dev/null +++ b/corpora/parliamentary-corpora/at-parlamentsreden.json @@ -0,0 +1,14 @@ +{ + "Name": "Korpusbasierte Analyse österreichischer Parlamentsreden", + "URL": "https://homepages.uni-regensburg.de/~sic07430/", + "Family": "Parliamentary corpora", + "Description": "The corpus contains Austrian parliamentary debates from 2013 to 2015. It is annotated with the Stanford Tagger.\nThe corpus currently is not available.", + "Languages": ["German (Austrian)"], + "License": "", + "Size": ["1.2 million tokens"], + "Annotation": ["tokenised", "PoS-tagged"], + "Infrastructure": "Other", + "Access": { + }, + "Publication": "Sippl et al. 
(2016)" +} diff --git a/corpora/parliamentary-corpora/bul-pol-jour-speech.json b/corpora/parliamentary-corpora/bul-pol-jour-speech.json new file mode 100644 index 0000000..db52331 --- /dev/null +++ b/corpora/parliamentary-corpora/bul-pol-jour-speech.json @@ -0,0 +1,15 @@ +{ + "Name": "Corpus of Bulgarian Political and Journalistic Speech", + "URL": "http://www.political.webclark.org/?locale=bg", + "Family": "Parliamentary corpora", + "Description": "The corpus contains Bulgarian parliamentary debates from 2006 to 2012.\nThe corpus is available through a dedicated concordancer.", + "Languages": ["bul"], + "License": "", + "Size": ["10 million tokens"], + "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], + "Infrastructure": "Other", + "Access": { + "Concordancer": "http://www.political.webclark.org/?locale=bg" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/bundestag-europe.json b/corpora/parliamentary-corpora/bundestag-europe.json new file mode 100644 index 0000000..66e7267 --- /dev/null +++ b/corpora/parliamentary-corpora/bundestag-europe.json @@ -0,0 +1,15 @@ +{ + "Name": "Parliamentary Debates on Europe at the Bundestag (1998-2015)", + "URL": "https://hdl.handle.net/11403/de-parl/v1", + "Family": "Parliamentary corpora", + "Description": "The corpus contains German parliamentary debates from 1998 to 2015. The contextual metadata in the corpus concern the dates of the council meetings, the description of the main topic(s) of the European council meeting, the place where the European Council meeting took place; they also correspond to information about the government and the legislative session. 
The speaker metadata correspond to name, gender, occupation, parliamentary group, political orientation and the opposition and majority division.\nThe corpus is available for download from Ortolang.", + "Languages": ["deu"], + "License": "CC-BY", + "Size": ["417,000 tokens"], + "Annotation": ["contextual and speaker metadata"], + "Infrastructure": "CLARIN", + "Access": { + "Download": "https://hdl.handle.net/11403/de-parl/v1" + }, + "Publication": "Truan and Romary (2021)" +} diff --git a/corpora/parliamentary-corpora/cepic.json b/corpora/parliamentary-corpora/cepic.json new file mode 100644 index 0000000..76abeab --- /dev/null +++ b/corpora/parliamentary-corpora/cepic.json @@ -0,0 +1,15 @@ +{ + "Name": "The Chinese/English Political Interpreting Corpus (CEPIC)", + "URL": "https://digital.lib.hkbu.edu.hk/cepic/", + "Family": "Parliamentary corpora", + "Description": "The CEPIC consists of transcripts of speeches delivered by top political figures from Hong Kong, Beijing, Washington DC and London, as well as their translated/interpreted texts.\nThe main speech types of CEPIC include the reading of government reports such as policy addresses and budget speeches, Q&A at press conferences, parliamentary debates, as well as remarks delivered at bilateral meetings.\nThe corpus features a parallel display of up to six versions of the same speech segment, aligned at paragraph level.\nThe corpus is available for online querying through a dedicated concordancer.", + "Languages": ["zho", "eng"], + "License": "Terms of Use", + "Size": ["6.5 million words"], + "Annotation": ["PoS-tagged", "prosodic and paralinguistic features"], + "Infrastructure": "Other", + "Access": { + "Concordancer": "https://digital.lib.hkbu.edu.hk/cepic/search.php" + }, + "Publication": "Pan (2019)" +} diff --git a/corpora/parliamentary-corpora/czech-parl-meetings.json b/corpora/parliamentary-corpora/czech-parl-meetings.json new file mode 100644 index 0000000..666757b --- /dev/null +++ 
b/corpora/parliamentary-corpora/czech-parl-meetings.json @@ -0,0 +1,16 @@ +{ + "Name": "Czech Parliamentary Meetings", + "URL": "http://hdl.handle.net/11858/00-097C-0000-0005-CF9C-4", + "Family": "Parliamentary corpora", + "Description": "The corpus contains recordings of the parliamentary sessions as well as corresponding transcriptions.\nThe corpus is available for download from LINDAT and through the concordancer KonText.", + "Languages": ["ces"], + "License": "CC-BY", + "Size": ["88 hours", "0.5 million tokens"], + "Annotation": ["error correction of transcriptions", "division into speech sections with speaker information"], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer": "http://lindat.mff.cuni.cz/services/kontext/first_form?corpname=czechparl_2012_03_28_cs_w", + "Download": "http://hdl.handle.net/11858/00-097C-0000-0005-CF9C-4" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/czechparl.json b/corpora/parliamentary-corpora/czechparl.json new file mode 100644 index 0000000..5029bc2 --- /dev/null +++ b/corpora/parliamentary-corpora/czechparl.json @@ -0,0 +1,15 @@ +{ + "Name": "CzechParl", + "URL": "https://www.muni.cz/en/research/publications/914268", + "Family": "Parliamentary corpora", + "Description": "The corpus contains Czech parliamentary debates from 1993 to 2010. 
It is annotated with ajka.\nThe corpus is available through the Sketch Engine.", + "Languages": ["ces"], + "License": "", + "Size": ["81.9 million tokens"], + "Annotation": ["tokenised", "MSD-tagged and lemmatised"], + "Infrastructure": "Other", + "Access": { + "Concordancer": "https://the.sketchengine.co.uk/login/?next=%2Fcorpus%2Ffirst_form%3Fcorpname%3Dpreloaded%2Fczechparl2012%3B" + }, + "Publication": "Jakubíček and Kovář (2010)" +} diff --git a/corpora/parliamentary-corpora/danish-parliament.json b/corpora/parliamentary-corpora/danish-parliament.json new file mode 100644 index 0000000..91feb0d --- /dev/null +++ b/corpora/parliamentary-corpora/danish-parliament.json @@ -0,0 +1,15 @@ +{ + "Name": "The Danish Parliament Corpus 2009 - 2017, v2", + "URL": "http://hdl.handle.net/20.500.12115/44", + "Family": "Parliamentary corpora", + "Description": "The corpus contains Danish parliamentary debates from 2009 to 2017.\nThe corpus is available for download from the DK-CLARIN repository.", + "Languages": ["dan"], + "License": "CC-BY", + "Size": ["40.6 million words"], + "Annotation": ["no linguistic annotation"], + "Infrastructure": "CLARIN", + "Access": { + "Download": "http://hdl.handle.net/20.500.12115/44" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/dutchparl.json b/corpora/parliamentary-corpora/dutchparl.json new file mode 100644 index 0000000..0e4c2ee --- /dev/null +++ b/corpora/parliamentary-corpora/dutchparl.json @@ -0,0 +1,16 @@ +{ + "Name": "DutchParl", + "URL": "http://search.politicalmashup.nl/about.html", + "Family": "Parliamentary corpora", + "Description": "The corpus contains Dutch parliamentary debates from 1814 to 2014. It is annotated with Frog. 
See also the information on the schema used.\nThe corpus is available for download (the authors need to be contacted) and is also accessible online through the Political Mashup environment.", + "Languages": ["nld"], + "License": "", + "Size": ["800 million tokens"], + "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], + "Infrastructure": "Other", + "Access": { + "Concordancer": "http://search.politicalmashup.nl/", + "Download": "http://data.politicalmashup.nl/permanent/" + }, + "Publication": "Marx and Schuth (2010)" +} diff --git a/corpora/parliamentary-corpora/epic-uds.json b/corpora/parliamentary-corpora/epic-uds.json new file mode 100644 index 0000000..820e48b --- /dev/null +++ b/corpora/parliamentary-corpora/epic-uds.json @@ -0,0 +1,15 @@ +{ + "Name": "EPIC-UdS", + "URL": "http://hdl.handle.net/21.11119/0000-0008-F519-8", + "Family": "Parliamentary corpora", + "Description": "This is a parallel and comparable corpus of speeches held in the European Parliament; the corpus follows the European Parliament Interpreting Corpora tradition of the EPIC and EPICG corpora. It contains original speeches from 2008 to 2013 by English, German, and Spanish native speakers and their interpretation (English to and from German; Spanish to English).\nAll transcripts in the corpus are based on videos of the European Parliament Proceedings published by the European Parliament.\nAnnotation includes typical characteristics of spoken language such as false starts, hesitations and truncated words. To obtain better results for source-target alignment as well as sentence parsing the transcripts were segmented using a main clause approach: compound sentences were segmented separately. For the second version of the corpus, the transcripts were processed clause by clause with the spaCy NLP tools; the data is encoded in CoNLL-U and provides universal PoS tags, fine-grained language-specific PoS tags as well as Universal Dependency syntactic relations. 
All data was enriched with relevant metadata such as source language, name of original speaker, speech timing, mode of delivery and delivery rate.\nThe corpus is available for download from CLARIN-D (Saarland University B-centre).", + "Languages": ["eng", "deu", "spa"], + "License": "CC BY-NC-SA 4.0", + "Size": ["350,000 tokens", "20,000 sentences"], + "Annotation": ["tokenised", "PoS-tagged", "syntactically parsed", "speech phenomena"], + "Infrastructure": "CLARIN", + "Access": { + "Download": "http://hdl.handle.net/21.11119/0000-0008-F519-8" + }, + "Publication": "Przybyl et al. (2022)" +} diff --git a/corpora/parliamentary-corpora/europarl-ell-eng.json b/corpora/parliamentary-corpora/europarl-ell-eng.json new file mode 100644 index 0000000..ce6a9ff --- /dev/null +++ b/corpora/parliamentary-corpora/europarl-ell-eng.json @@ -0,0 +1,15 @@ +{ + "Name": "European Parliament Proceedings Parallel Corpus 1996-2011, parallel corpus Greek-English", + "URL": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23DE-F", + "Family": "Parliamentary corpora", + "Description": "This corpus is a bilingual Greek-English subset of the Europarl parallel corpus.\nThe corpus is available for download from the CLARIN:EL repository.", + "Languages": ["Greek-English"], + "License": "CC ZERO", + "Size": ["31.9 million words (English)", "1.2 million sentences (Greek)"], + "Annotation": ["sentence aligned"], + "Infrastructure": "CLARIN", + "Access": { + "Download": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23DE-F" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/europarl.json b/corpora/parliamentary-corpora/europarl.json new file mode 100644 index 0000000..1344e2d --- /dev/null +++ b/corpora/parliamentary-corpora/europarl.json @@ -0,0 +1,15 @@ +{ + "Name": "Europarl: European Parliament Proceedings Parallel Corpus 1996-2011", + "URL": "https://www.statmt.org/europarl/", + "Family": "Parliamentary corpora", + "Description": "This corpus contains parliamentary debates from the 
European Parliament from 1996 to 2011.\nThe corpus is available for download from a dedicated webpage.", + "Languages": ["21 languages"], + "License": "CC0", + "Size": ["33.7 million tokens"], + "Annotation": ["sentence/aligned"], + "Infrastructure": "CLARIN", + "Access": { + "Download": "http://www.statmt.org/europarl/" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/german-pol-speeches.json b/corpora/parliamentary-corpora/german-pol-speeches.json new file mode 100644 index 0000000..b265aa6 --- /dev/null +++ b/corpora/parliamentary-corpora/german-pol-speeches.json @@ -0,0 +1,16 @@ +{ + "Name": "German Political Speeches Corpus", + "URL": "https://www.dwds.de/d/korpora/politische_reden", + "Family": "Parliamentary corpora", + "Description": "The corpus contains speeches by 200 important political figures for the period between 1982 and 2020.\nA large part of the corpus contains speeches by the holders of the four highest German state offices: the Federal President, the Federal Chancellor, the President of the Bundestag and Foreign Ministers with terms of office between 1982 and 2020.\nThe corpus is available for online browsing through the DWDS platform and a subset encoded in XML with 6,685 speeches until 2019 can be downloaded.", + "Languages": ["deu"], + "License": "CC BY-SA 4.0", + "Size": ["15,240 speeches", "27 million texts"], + "Annotation": [], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer": "https://www.dwds.de/r?corpus=politische_reden", + "Download": "http://adrien.barbaresi.eu/corpora/speeches/" + }, + "Publication": "Barbaresi (2018)" +} diff --git a/corpora/parliamentary-corpora/gerparcor.json b/corpora/parliamentary-corpora/gerparcor.json new file mode 100644 index 0000000..1aeb285 --- /dev/null +++ b/corpora/parliamentary-corpora/gerparcor.json @@ -0,0 +1,15 @@ +{ + "Name": "German Parliamentary Corpus (GerParCor)", + "URL": "https://github.com/texttechnologylab/GerParCor", + "Family": "Parliamentary corpora", + 
"Description": "This corpus contains (mostly historical) German-language parliamentary proceedings from the 19th, 20th, and 21st centuries, including state and federal-level data. Additionally, the corpus contains conversions of scanned protocols and, in particular, of protocols in Fraktur converted via an OCR process based on Tesseract. All protocols were preprocessed by means of the NLP pipeline spaCy v3 and automatically annotated with metadata regarding their session date. The corpus is made available in the XML format of the UIMA project.\nThe corpus is available for download from GitHub.", + "Languages": ["deu"], + "License": "AGPL-3.0 Licence", + "Size": [], + "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "sentence segmented", "NER-tagged", "morphology", "dependency parsed"], + "Infrastructure": "Other", + "Access": { + "Download": "https://github.com/texttechnologylab/GerParCor" + }, + "Publication": "Abrami et al. (2022)" +} diff --git a/corpora/parliamentary-corpora/handeset.json b/corpora/parliamentary-corpora/handeset.json new file mode 100644 index 0000000..1bbcfcf --- /dev/null +++ b/corpora/parliamentary-corpora/handeset.json @@ -0,0 +1,15 @@ +{ + "Name": "HanDeSeT: Hansard Debates with Sentiment Tags", + "URL": "https://data.mendeley.com/datasets/xsvp45cbt4/2", + "Family": "Parliamentary corpora", + "Description": "This corpus contains English parliamentary debates from 1997 to 2017.\nThe corpus is available for download from a dedicated webpage.", + "Languages": ["eng"], + "License": "Open Parliament Licence V3.0 and Open Data Commons Open Database License (OdbL)", + "Size": ["1251 motion-speech units taken from 129 separate debates"], + "Annotation": ["sentiment tags"], + "Infrastructure": "Other", + "Access": { + "Download": "https://data.mendeley.com/datasets/xsvp45cbt4/2" + }, + "Publication": "Abercrombie and Batista-Navarro (2018)" +} diff --git a/corpora/parliamentary-corpora/hansard.json 
b/corpora/parliamentary-corpora/hansard.json new file mode 100644 index 0000000..b376477 --- /dev/null +++ b/corpora/parliamentary-corpora/hansard.json @@ -0,0 +1,15 @@ +{ + "Name": "Hansard corpus", + "URL": "http://www.clarin.ac.uk/hansard-corpus", + "Family": "Parliamentary corpora", + "Description": "The corpus contains British parliamentary debates from 1803 to 2005. It is semantically tagged with the USAS semantic tagger and the Historical Thesaurus Semantic Tagger (HTST).\nThe corpus is available through a dedicated concordancer.", + "Languages": ["eng"], + "License": "", + "Size": ["1.6 billion tokens"], + "Annotation": ["tokenised", "PoS-tagged", "lemmatised", "semantic tagging"], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer": "https://www.hansard-corpus.org/x.asp" + }, + "Publication": "Rayson et al. (2015)" +} diff --git a/corpora/parliamentary-corpora/hellenic-parla.json b/corpora/parliamentary-corpora/hellenic-parla.json new file mode 100644 index 0000000..7281ab9 --- /dev/null +++ b/corpora/parliamentary-corpora/hellenic-parla.json @@ -0,0 +1,15 @@ +{ + "Name": "Hellenic Parliament Minutes (1989-1994, 1997-2018)", + "URL": "http://hdl.grnet.gr/11500/AEGEAN-0000-0000-57FA-5", + "Family": "Parliamentary corpora", + "Description": "The corpus contains Greek parliamentary debates for two periods: 1989-1994 and 1997-2018.\nThe corpus is available for download from the CLARIN:el repository.", + "Languages": ["ell"], + "License": "CC-BY-NC", + "Size": ["181 million words"], + "Annotation": [], + "Infrastructure": "CLARIN", + "Access": { + "Download": "http://hdl.grnet.gr/11500/AEGEAN-0000-0000-57FA-5" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/house-of-commons-europe.json b/corpora/parliamentary-corpora/house-of-commons-europe.json new file mode 100644 index 0000000..f18ba0f --- /dev/null +++ b/corpora/parliamentary-corpora/house-of-commons-europe.json @@ -0,0 +1,15 @@ +{ + "Name": "Parliamentary Debates on Europe 
at the House of Commons (1998-2015)", + "URL": "https://hdl.handle.net/11403/uk-parl/v1", + "Family": "Parliamentary corpora", + "Description": "The corpus contains British parliamentary debates from 1998 to 2015. The contextual metadata in the corpus concern the dates of the council meetings, the description of the main topic(s) of the European council meeting, the place where the European Council meeting took place; they also correspond to information about the government and the legislative session. The speaker metadata correspond to name, gender, occupation, parliamentary group, political orientation and the opposition and majority division.\nThe corpus is available for download from Ortolang.", + "Languages": ["eng"], + "License": "CC-BY", + "Size": ["190,000 tokens"], + "Annotation": ["contextual and speaker metadata"], + "Infrastructure": "CLARIN", + "Access": { + "Download": "https://hdl.handle.net/11403/uk-parl/v1" + }, + "Publication": "Truan and Romary (2021)" +} diff --git a/corpora/parliamentary-corpora/icelandic-parla.json b/corpora/parliamentary-corpora/icelandic-parla.json new file mode 100644 index 0000000..15b1918 --- /dev/null +++ b/corpora/parliamentary-corpora/icelandic-parla.json @@ -0,0 +1,16 @@ +{ + "Name": "The Icelandic Parliamentary Corpus", + "URL": "https://clarin.is/en/resources/parliament/", + "Family": "Parliamentary corpora", + "Description": "This corpus contains debates in the Icelandic parliament (Alþingi) from 1911 to 2017.\nThe corpus is available for download from CLARIN-IS (as a part of the Icelandic Gigaword Corpus) and for search through the concordancer Korp.", + "Languages": ["isl"], + "License": "CC-BY 4.0", + "Size": ["238 million tokens"], + "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer": "https://malheildir.arnastofnun.is/?mode=rmh2018#?stats_reduce=word&isCaseInsensitive&searchBy=word&cqp=%5B%5D&lang=en&corpus=rmh2018_althingi", + "Download": 
"http://www.malfong.is/index.php?lang=en&pg=&dlid=95" + }, + "Publication": "Steingrímsson et al. (2018)" +} diff --git a/corpora/parliamentary-corpora/kranjska.json b/corpora/parliamentary-corpora/kranjska.json new file mode 100644 index 0000000..a7463f5 --- /dev/null +++ b/corpora/parliamentary-corpora/kranjska.json @@ -0,0 +1,15 @@ +{ + "Name": "Carniolan Provincial Assembly corpus Kranjska 1.0", + "URL": "http://hdl.handle.net/11356/1824", + "Family": "Parliamentary corpora", + "Description": "The corpus contains meeting proceedings of 694 sessions of the Carniolan Provincial Assembly from 1861 to 1913.\nThe source data (scanned and OCR processed pdf documents) originally come from The Digital Library of Slovenia dLib.si and History of Slovenia - SIstory portals. The documents are bilingual, in Slovenian and German, depending on the speaker. German was first typeset in the Gothic script and later on in Latin.\nThe documents were automatically processed and the following data extracted: titles, agenda, attending, start and end of the session, speakers, and comments. Language was detected on the sentence level, roughly 58% sentences are in Slovenian and 42% in German. Linguistic annotation (tokenisation, MSD tagging and lemmatisation) was added using Trankit for Slovenian and German, while Lingua is used for language detection.\nThe documents are in the Parla-CLARIN compliant TEI XML format. Each session in one file.", + "Languages": ["deu", "slv"], + "License": "CC-BY 4.0", + "Size": ["10.9 million words"], + "Annotation": ["tokenised", "MSD-tagged", "lemmatised"], + "Infrastructure": "CLARIN", + "Access": { + "Download": "http://hdl.handle.net/11356/1824" + }, + "Publication": "Marolt et al. 
(2023)" +} diff --git a/corpora/parliamentary-corpora/large-czech-parl-hearings.json b/corpora/parliamentary-corpora/large-czech-parl-hearings.json new file mode 100644 index 0000000..4dc77e6 --- /dev/null +++ b/corpora/parliamentary-corpora/large-czech-parl-hearings.json @@ -0,0 +1,15 @@ +{ + "Name": "Large Corpus of Czech Parliament Plenary Hearings", + "URL": "https://hdl.handle.net/11234/1-3126", + "Family": "Parliamentary corpora", + "Description": "This corpus contains audio recordings of Czech parliamentary sessions along with the corresponding transcriptions. The whole corpus has been segmented to short audio snippets making it suitable for both training and evaluation of automatic speech recognition (ASR) systems.\nThe corpus is available for download from the LINDAT repository.", + "Languages": ["ces"], + "License": "CC BY 4.0", + "Size": ["444 hours"], + "Annotation": [], + "Infrastructure": "CLARIN", + "Access": { + "Download": "http://hdl.handle.net/11234/1-3126" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/linkedsaeima.json b/corpora/parliamentary-corpora/linkedsaeima.json new file mode 100644 index 0000000..70ba490 --- /dev/null +++ b/corpora/parliamentary-corpora/linkedsaeima.json @@ -0,0 +1,15 @@ +{ + "Name": "LinkedSAEIMA", + "URL": "http://dati.saeima.korpuss.lv/", + "Family": "Parliamentary corpora", + "Description": "The corpus contains Latvian parliamentary debates from 1993 to 2016.\nThe corpus is available through noSketchEngine.", + "Languages": ["lav"], + "License": "", + "Size": ["12.5 million tokens"], + "Annotation": ["tokenised", "lemmatised"], + "Infrastructure": "Other", + "Access": { + "Concordancer": "http://nosketch.korpuss.lv/run.cgi/first_form?corpname=saeima" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/lit-parla-attribution.json b/corpora/parliamentary-corpora/lit-parla-attribution.json new file mode 100644 index 0000000..8cf41c2 --- /dev/null +++ 
b/corpora/parliamentary-corpora/lit-parla-attribution.json @@ -0,0 +1,15 @@ +{ + "Name": "Lithuanian Parliament Corpus for Authorship Attribution", + "URL": "http://hdl.handle.net/20.500.11821/17", + "Family": "Parliamentary corpora", + "Description": "The corpus contains Lithuanian parliamentary debates from 1990 to 2013. It is annotated with Lemuoklis (morphological analyzer for lemmatization) and MaltParser (generation of dependency tags).\nThe corpus is available for download from the repository of CLARIN-LT.", + "Languages": ["lit"], + "License": "CLARIN PUB", + "Size": ["23.9 million tokens"], + "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], + "Infrastructure": "CLARIN", + "Access": { + "Download": "http://hdl.handle.net/20.500.11821/17" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/nor-parla-speech.json b/corpora/parliamentary-corpora/nor-parla-speech.json new file mode 100644 index 0000000..7c23634 --- /dev/null +++ b/corpora/parliamentary-corpora/nor-parla-speech.json @@ -0,0 +1,15 @@ +{ + "Name": "Norwegian Parliamentary Speech Corpus", + "URL": "https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-58/", + "Family": "Parliamentary corpora", + "Description": "This corpus consists of audio recordings of meetings in Stortinget (the Norwegian parliament), and corresponding orthographic transcriptions in either Norwegian Bokmål or Norwegian Nynorsk, as well as various metadata about the speakers. The official proceedings from the meetings are also included in the corpus for reference.\nTranscription was first done automatically; subsequently, the output of the automatic process was manually checked and corrected by trained linguists and philologists. Finally, all transcriptions were proofread to ensure consistency and accuracy. 
The audio files in the corpus contain the speech of entire days of plenary meetings from 2017 and 2018 (or, if a meeting lasts more than six hours, the first six hours of a day).\nThe corpus is available for download from the Norwegian Language Bank.", + "Languages": ["nor"], + "License": "CC-ZERO", + "Size": ["140 hours", "65,000 sentences", "1.2 million words"], + "Annotation": [], + "Infrastructure": "CLARIN", + "Access": { + "Download": "https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-58/" + }, + "Publication": "Solberg and Ortiz (2022)" +} diff --git a/corpora/parliamentary-corpora/parlameter-hr9.json b/corpora/parliamentary-corpora/parlameter-hr9.json new file mode 100644 index 0000000..1d3a446 --- /dev/null +++ b/corpora/parliamentary-corpora/parlameter-hr9.json @@ -0,0 +1,16 @@ +{ + "Name": "Croatian parliamentary corpus ParlaMeter-hr9 1.0", + "URL": "http://hdl.handle.net/11356/1209", + "Family": "Parliamentary corpora", + "Description": "The corpus contains minutes of the National Assembly of the Republic of Croatia and currently covers its VIth mandate from 15 November 2016 to 21 November 2018.
The corpus contains speaker metadata (gender, age, education, party affiliation).\nThe corpus is available for download from the CLARIN.SI repository and through the concordancers KonText and noSketchEngine, as well as through a dedicated webpage.", + "Languages": ["hrv"], + "License": "CC-BY", + "Size": ["14.1 million tokens"], + "Annotation": ["tokenised", "MSD-tagged", "lemmatised", "named entities"], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer": "http://www.clarin.si/kontext/first_form?corpname=parlameter_hr", + "Download": "http://hdl.handle.net/11356/1209" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/parlameter-sl.json b/corpora/parliamentary-corpora/parlameter-sl.json new file mode 100644 index 0000000..5a44a9f --- /dev/null +++ b/corpora/parliamentary-corpora/parlameter-sl.json @@ -0,0 +1,16 @@ +{ + "Name": "Slovenian parliamentary corpus ParlaMeter-sl 1.0", + "URL": "http://hdl.handle.net/11356/1208", + "Family": "Parliamentary corpora", + "Description": "The corpus contains minutes of the National Assembly of the Republic of Slovenia and currently covers the VIIth mandate from 1 August 2014 to 22 June 2018. The corpus contains speaker metadata (gender, age, education, party affiliation).\nThe corpus is available for download from the CLARIN.SI repository and through the concordancers KonText and noSketchEngine, as well as through a dedicated webpage.", + "Languages": ["slv"], + "License": "CC-BY", + "Size": ["41 million tokens"], + "Annotation": ["tokenised", "MSD-tagged", "lemmatised", "named entities"], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer": "https://www.clarin.si/kontext/first_form?corpname=parlameter_sl", + "Download": "http://hdl.handle.net/11356/1208" + }, + "Publication": "Ljubešić et al.
(2018)" +} diff --git a/corpora/parliamentary-corpora/parlamint-ana-30.json b/corpora/parliamentary-corpora/parlamint-ana-30.json new file mode 100644 index 0000000..9b254ae --- /dev/null +++ b/corpora/parliamentary-corpora/parlamint-ana-30.json @@ -0,0 +1,16 @@ +{ + "Name": "Linguistically annotated multilingual comparable corpora of parliamentary debates ParlaMint.ana 3.0", + "URL": "http://hdl.handle.net/11356/1488", + "Family": "Parliamentary corpora", + "Description": "ParlaMint is a multilingual set of comparable corpora containing parliamentary debates mostly starting at the end of 2015 and extending to mid 2022, with each corpus being between 9 and 125 million words in size. The sessions in the corpora are marked as belonging to the COVID-19 period (after October 2019), the pre-Covid period or the period after 24 February 2022.\nThe corpora have extensive meta-data about the speakers (name, gender, party affiliation, MP status), are structured into time-stamped terms, sessions and meetings, with each speech being marked by its speaker and their role (chair, regular speaker). The speeches also contain marked-up transcriber comments, such as gaps in the transcription, interruptions, applause, etc.\nThe corpus is available for download from the CLARIN.SI repository and through the concordancer noSketch Engine. 
Note that the version of the corpus without linguistic mark-up is available for download under a separate CLARIN.SI entry.", + "Languages": ["bos", "bul", "cat", "hrv", "ces", "dan", "nld", "eng", "est", "fra", "glg", "deu", "hun", "isl", "ita", "lav", "ell", "nor", "pol", "por", "rus", "srp", "slv", "spa", "swe", "tur", "ukr"], + "License": "CC BY 4.0", + "Size": ["7.5 million utterances", "1.1 billion words"], + "Annotation": ["tokenised", "MSD-tagged (Universal Dependencies)", "syntactically parsed (Universal Dependencies)", "named entities"], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer": "https://www.clarin.si/ske/#open", + "Download": "http://hdl.handle.net/11356/1488" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/parlamint-en-ana-30.json b/corpora/parliamentary-corpora/parlamint-en-ana-30.json new file mode 100644 index 0000000..3750a5a --- /dev/null +++ b/corpora/parliamentary-corpora/parlamint-en-ana-30.json @@ -0,0 +1,17 @@ +{ + "Name": "Linguistically annotated multilingual comparable corpora of parliamentary debates in English ParlaMint-en.ana 3.0", + "URL": "http://hdl.handle.net/11356/1810", + "Family": "Parliamentary corpora", + "Description": "This corpus comprises linguistically annotated multilingual comparable corpora of parliamentary debates ParlaMint.ana 3.0 which were machine translated to English and the translation linguistically annotated.\nExcept for the translation to English, small changes in the metadata and the absence of the British parliament corpus, the corpora included in this entry are in all respects identical to the source language corpora, i.e. the entry comprises the same 26 European parliamentary corpora, together with over 1.1 billion words. The translation to English was done with EasyNMT with OPUS-MT models. Machine translation was done on the sentence level, and includes both speeches and transcriber notes, including headings. The linguistic annotation of the speeches, i.e.
tokenisation, tagging with UD PoS and morphological features, lemmatisation, and NER annotation was done with Stanza, using the English language model. For NER the conll03 model with 4 NE classes was used.\nThe corpus is available for download from the CLARIN.SI repository and for browsing through concordancers noSketchEngine and KonText.", + "Languages": ["bos", "bul", "cat", "hrv", "ces", "dan", "nld", "eng", "est", "fra", "glg", "deu", "hun", "isl", "ita", "lav", "ell", "nor", "pol", "por", "rus", "srp", "slv", "spa", "swe", "tur", "ukr"], + "License": "CC BY 4.0", + "Size": ["1.1 billion words"], + "Annotation": ["tokenised", "MSD-tagged (Universal Dependencies)", "syntactically parsed (Universal Dependencies)", "named entities"], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer (noSketchEngine)": "https://www.clarin.si/ske/#dashboard?corpname=parlamint30_xx_en", + "Concordancer (KonText)": "https://www.clarin.si/kontext/query?corpname=parlamint30_xx_en", + "Download": "http://hdl.handle.net/11356/1810" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/parlasent-bcs.json b/corpora/parliamentary-corpora/parlasent-bcs.json new file mode 100644 index 0000000..f001ad9 --- /dev/null +++ b/corpora/parliamentary-corpora/parlasent-bcs.json @@ -0,0 +1,15 @@ +{ + "Name": "The sentiment corpus of parliamentary debates ParlaSent-BCS v1.0", + "URL": "http://hdl.handle.net/11356/1585", + "Family": "Parliamentary corpora", + "Description": "This corpus consists of mid-length sentences from the Bosnian, Croatian, and Serbian parliamentary proceedings that are annotated with a 6-level sentiment schema. The date of the speech and the speaker name are given as well.
If the speaker is an MP, information on party, gender and year of birth is available as well.\nThe corpus is available for download from the CLARIN.SI repository.", + "Languages": ["bos", "hrv", "srp"], + "License": "CC BY-SA 4.0", + "Size": ["2600 sentences"], + "Annotation": ["sentiment analysis"], + "Infrastructure": "CLARIN", + "Access": { + "Download": "http://hdl.handle.net/11356/1585" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/parlat-beta.json b/corpora/parliamentary-corpora/parlat-beta.json new file mode 100644 index 0000000..54415b5 --- /dev/null +++ b/corpora/parliamentary-corpora/parlat-beta.json @@ -0,0 +1,14 @@ +{ + "Name": "ParlAT beta", + "URL": "https://www.oeaw.ac.at/acdh/tools/parlat/", + "Family": "Parliamentary corpora", + "Description": "This corpus contains Austrian parliamentary proceedings from 1996 to 2017.\nCurrently in development, ParlAT is planned to be a monitor corpus with new material added over time.", + "Languages": ["German (Austrian)"], + "License": "", + "Size": ["75.2 million tokens"], + "Annotation": ["tokenised", "linked data (e.g., speaker information)"], + "Infrastructure": "CLARIN", + "Access": { + }, + "Publication": "Wissik and Pirker (2018)" +} diff --git a/corpora/parliamentary-corpora/parlspeech.json b/corpora/parliamentary-corpora/parlspeech.json new file mode 100644 index 0000000..5941288 --- /dev/null +++ b/corpora/parliamentary-corpora/parlspeech.json @@ -0,0 +1,15 @@ +{ + "Name": "The ParlSpeech V2 data set", + "URL": "https://doi.org/10.7910/DVN/L4OAKN", + "Family": "Parliamentary corpora", + "Description": "The corpus contains complete parliamentary speeches in the key legislative chambers of Austria, the Czech Republic, Germany, Denmark, the Netherlands, New Zealand, Spain, Sweden, and the United Kingdom, covering periods between 21 and 32 years.\nThe corpus is available for download from the Harvard Dataverse repository.", + "Languages": ["deu", "ces", "dan", "nld", "eng", "spa",
"swe"], + "License": "CC0", + "Size": ["6.3 million parliamentary speeches"], + "Annotation": ["date, speaker, party, agenda item metadata"], + "Infrastructure": "Other", + "Access": { + "Download": "http://doi.org/10.7910/DVN/L4OAKN" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/plenary-fin-parla.json b/corpora/parliamentary-corpora/plenary-fin-parla.json new file mode 100644 index 0000000..165aff6 --- /dev/null +++ b/corpora/parliamentary-corpora/plenary-fin-parla.json @@ -0,0 +1,15 @@ +{ + "Name": "Plenary Sessions of the Parliament of Finland", + "URL": "http://urn.fi/urn:nbn:fi:lb-2017020202", + "Family": "Parliamentary corpora", + "Description": "The corpus contains Finnish parliamentary debates from 2008 to 2016.\nThe corpus is available through the concordancer Korp.", + "Languages": ["fin"], + "License": "CC-BY", + "Size": ["22.4 million tokens"], + "Annotation": ["tokenised", "MSD-tagged", "lemmatised", "syntactically parsed"], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer": "http://urn.fi/urn:nbn:fi:lb-2017020201" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/pol-parla.json b/corpora/parliamentary-corpora/pol-parla.json new file mode 100644 index 0000000..99785f3 --- /dev/null +++ b/corpora/parliamentary-corpora/pol-parla.json @@ -0,0 +1,16 @@ +{ + "Name": "Polish Parliamentary Corpus", + "URL": "http://hdl.handle.net/11321/467", + "Family": "Parliamentary corpora", + "Description": "The corpus contains Polish parliamentary debates from 1991 to 2017. It is annotated with Morfeusz SGJP (morphological analyser), Pantera (disambiguating tagger), Spejd (shallow parser), Nerf (named entity recognizer).\nThe corpus is available for download from a dedicated webpage and through the concordancer NKJP. 
", + "Languages": ["pol"], + "License": "", + "Size": ["300 million tokens"], + "Annotation": ["tokenised, MSD-tagged, named entities, etc."], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer": "http://sejm.nlp.ipipan.waw.pl/", + "Download": "http://clip.ipipan.waw.pl/PSC" + }, + "Publication": "Ogrodniczuk (2018)" +} diff --git a/corpora/parliamentary-corpora/polminer.json b/corpora/parliamentary-corpora/polminer.json new file mode 100644 index 0000000..111c045 --- /dev/null +++ b/corpora/parliamentary-corpora/polminer.json @@ -0,0 +1,15 @@ +{ + "Name": "polmineR corpus", + "URL": "https://github.com/PolMine", + "Family": "Parliamentary corpora", + "Description": "A small sample is available for download from the GitHub webpage of the corpus.", + "Languages": ["deu"], + "License": "", + "Size": ["Only a small sample available"], + "Annotation": [], + "Infrastructure": "Other", + "Access": { + "Download": "http://github.com/PolMine" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/proceedings-nor-parla.json b/corpora/parliamentary-corpora/proceedings-nor-parla.json new file mode 100644 index 0000000..f7c61ec --- /dev/null +++ b/corpora/parliamentary-corpora/proceedings-nor-parla.json @@ -0,0 +1,15 @@ +{ + "Name": "Proceedings of Norwegian Parliamentary Debates", + "URL": "http://hdl.handle.net/11495/DA65-D02F-0EB0-9", + "Family": "Parliamentary corpora", + "Description": "The corpus contains Norwegian parliamentary debates from 2008 to 2015.\nThe corpus is available through the concordancer Corpuscle.", + "Languages": ["nor"], + "License": "NLOD", + "Size": ["29 million tokens"], + "Annotation": ["tokenised", "sentence segmentation", "speaker metadata (name, party, time, type of utterance)"], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer": "http://hdl.handle.net/11495/DA65-D02F-0EB0-9" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/ptparl.json b/corpora/parliamentary-corpora/ptparl.json new file 
mode 100644 index 0000000..99b71fa --- /dev/null +++ b/corpora/parliamentary-corpora/ptparl.json @@ -0,0 +1,15 @@ +{ + "Name": "PTPARL Corpus", + "URL": "https://hdl.handle.net/21.11129/0000-000B-D33C-4", + "Family": "Parliamentary corpora", + "Description": "The corpus contains Portuguese parliamentary debates from 1970 to 2008. It is annotated with LX-Tokenizer, LX-Tagger, MBT, MBLEM (lemmatisation).\nThe corpus is available for download from the CLARIN PORTUGAL repository.", + "Languages": ["por"], + "License": "CLARIN RES", + "Size": ["1 million tokens"], + "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], + "Infrastructure": "CLARIN", + "Access": { + "Download": "https://hdl.handle.net/21.11129/0000-000B-D33C-4" + }, + "Publication": "Généreux et al. (2012)" +} diff --git a/corpora/parliamentary-corpora/riigikogu.json b/corpora/parliamentary-corpora/riigikogu.json new file mode 100644 index 0000000..62e5b23 --- /dev/null +++ b/corpora/parliamentary-corpora/riigikogu.json @@ -0,0 +1,16 @@ +{ + "Name": "Transcripts of Riigikogu (Estonian Parliament)", + "URL": "http://www.cl.ut.ee/korpused/segakorpus/riigikogu/", + "Family": "Parliamentary corpora", + "Description": "The corpus contains Estonian parliamentary debates from 1995 to 2001.\nThe corpus is available for download from a dedicated webpage and through a concordancer on the same webpage.", + "Languages": ["est"], + "License": "CLARIN_ACA", + "Size": ["13 million tokens"], + "Annotation": ["tokenised"], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer": "http://www.keeleveeb.ee/", + "Download": "http://www.cl.ut.ee/korpused/segakorpus/riigikogu/" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/riksdag-open-data.json b/corpora/parliamentary-corpora/riksdag-open-data.json new file mode 100644 index 0000000..fcc2738 --- /dev/null +++ b/corpora/parliamentary-corpora/riksdag-open-data.json @@ -0,0 +1,16 @@ +{ + "Name": "Riksdag’s Open Data", + "URL": 
"https://spraakbanken.gu.se/eng/resources", + "Family": "Parliamentary corpora", + "Description": "The corpus contains Swedish parliamentary debates from 1971 to 2016. It is annotated with Sparv.\nThe corpus is available for download from Språkbanken (all entries with \"Riksdag's Open Data\" in the subtitle) and through the concordancer Korp.", + "Languages": ["swe"], + "License": "CC-BY", + "Size": ["1.25 billion tokens"], + "Annotation": ["tokenised", "lemmatised"], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer": "https://spraakbanken.gu.se/korp/#?stats_reduce=word&cqp=%5B%5D&corpus=rd-bet,rd-ds,rd-eun,rd-fpm,rd-frsrdg,rd-flista,rd-ip,rd-kammakt,rd-kom,rd-mot,rd-prop,rd-prot,rd-rskr,rd-samtr,rd-skfr,rd-sou,rd-tlista,rd-utr,rd-utsk,rd-yttr,rd-ovr&lang=en", + "Download": "http://spraakbanken.gu.se/eng/resources" + }, + "Publication": "Borin et al. (2016)" +} diff --git a/corpora/parliamentary-corpora/saeima.json b/corpora/parliamentary-corpora/saeima.json new file mode 100644 index 0000000..99a6120 --- /dev/null +++ b/corpora/parliamentary-corpora/saeima.json @@ -0,0 +1,15 @@ +{ + "Name": "Corpus of the Saeima (the Parliament of Latvia)", + "URL": "https://hdl.handle.net/20.500.12574/50", + "Family": "Parliamentary corpora", + "Description": "This corpus contains parliamentary debates from seven parliamentary terms (5th–12th Saeima) covering years 1993–2017. The available metadata for each utterance includes the date and type of the parliamentary session and speakers’ names and affiliations.\nThe corpus is available for online browsing through the noSketch Engine (CLARIN-LV) concordancer.", + "Languages": ["lav"], + "License": "", + "Size": ["21 million words"], + "Annotation": ["tokenised", "msd-tagged", "lemmatised", "syntactically parsed", "named entities"], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer": "http://nosketch.korpuss.lv/#dashboard?corpname=saeima" + }, + "Publication": "Darģis et al. 
(2018)" +} diff --git a/corpora/parliamentary-corpora/siparl.json b/corpora/parliamentary-corpora/siparl.json new file mode 100644 index 0000000..6ad9944 --- /dev/null +++ b/corpora/parliamentary-corpora/siparl.json @@ -0,0 +1,16 @@ +{ + "Name": "Slovenian parliamentary corpus siParl 3.0", + "URL": "http://hdl.handle.net/11356/1748", + "Family": "Parliamentary corpora", + "Description": "The corpus contains Slovenian parliamentary debates from 1990 to 2022. It differs from the SlovParl 2.0 corpus (listed below) in that it contains only basic meta-data about the speakers, a typology of sessions and structural and editorial annotations.\nThe corpus is available for download from the CLARIN.SI repository and through the concordancers KonText and noSketchEngine.", + "Languages": ["slv"], + "License": "CC-BY", + "Size": ["213 million words"], + "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer": "https://www.clarin.si/noske/run.cgi/corp_info?corpname=siparl30&struct_attr_stats=1", + "Download": "http://hdl.handle.net/11356/1748" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/slovparl.json b/corpora/parliamentary-corpora/slovparl.json new file mode 100644 index 0000000..d9078e8 --- /dev/null +++ b/corpora/parliamentary-corpora/slovparl.json @@ -0,0 +1,16 @@ +{ + "Name": "Slovenian parliamentary corpus SlovParl 2.0", + "URL": "http://hdl.handle.net/11356/1167", + "Family": "Parliamentary corpora", + "Description": "The SlovParl corpus contains minutes of the Assembly of the Republic of Slovenia for the legislative period 1990-1992, i.e. it covers the period before, during, and after Slovenia became an independent country in 1991. The corpus comprises 232 sessions, 58,813 speeches and 10.8 million words. The corpus contains extensive meta-data about the speakers, a typology of sessions etc.
and structural and editorial annotations.", + "Languages": ["slv"], + "License": "CC-BY", + "Size": ["3.2 million tokens"], + "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer": "https://www.clarin.si/kontext/first_form?corpname=slovparl", + "Download": "http://hdl.handle.net/11356/1167" + }, + "Publication": "Pančur and Šorn (2016)" +} diff --git a/corpora/parliamentary-corpora/speeches-greek-parla.json b/corpora/parliamentary-corpora/speeches-greek-parla.json new file mode 100644 index 0000000..1aec4b0 --- /dev/null +++ b/corpora/parliamentary-corpora/speeches-greek-parla.json @@ -0,0 +1,15 @@ +{ + "Name": "Speeches of Politicians in the Greek Parliament", + "URL": "http://hdl.grnet.gr/11500/AEGEAN-0000-0000-5808-5", + "Family": "Parliamentary corpora", + "Description": "This corpus contains speeches delivered by 5 members of parliament: Dimitris Anagnostakis, Nikos Tsoukalis, Paros Koukoulopoulos, Niki Founta, and Panayiotis Kammenos.\nThe corpus is available for download from the CLARIN:el repository.", + "Languages": ["ell"], + "License": "CC-BY-NC", + "Size": ["258,036 words"], + "Annotation": [], + "Infrastructure": "CLARIN", + "Access": { + "Download": "http://hdl.grnet.gr/11500/AEGEAN-0000-0000-5808-5" + }, + "Publication": "" +} diff --git a/corpora/parliamentary-corpora/talk-of-norway.json b/corpora/parliamentary-corpora/talk-of-norway.json new file mode 100644 index 0000000..baddbeb --- /dev/null +++ b/corpora/parliamentary-corpora/talk-of-norway.json @@ -0,0 +1,15 @@ +{ + "Name": "Talk of Norway", + "URL": "http://hdl.handle.net/11509/123", + "Family": "Parliamentary corpora", + "Description": "The corpus contains Norwegian parliamentary debates from 1998 to 2016.\nThe corpus is available for download from the CLARINO repository.", + "Languages": ["nor"], + "License": "NLOD", + "Size": ["63.8 million tokens"], + "Annotation": ["tokenised", "PoS-tagged", "lemmatised"], +
"Infrastructure": "CLARIN", + "Access": { + "Download": "http://hdl.handle.net/11509/123" + }, + "Publication": "Lapponi et al. (2018)" +} diff --git a/corpora/parliamentary-corpora/ukparl.json b/corpora/parliamentary-corpora/ukparl.json new file mode 100644 index 0000000..6ae13fd --- /dev/null +++ b/corpora/parliamentary-corpora/ukparl.json @@ -0,0 +1,15 @@ +{ + "Name": "UKParl Dataset", + "URL": "https://federiconanni.com/ukparl/", + "Family": "Parliamentary corpora", + "Description": "This corpus contains British parliamentary debates of the House of Commons from 2013 to 2016.\nThe corpus is available for download from Google Drive.", + "Languages": ["eng"], + "License": "", + "Size": ["354,400 tokens"], + "Annotation": ["fine-grained topic annotation", "additional semantic information (entity links)"], + "Infrastructure": "Other", + "Access": { + "Download": "https://drive.google.com/file/d/1XRsGl8HHW2DUsuRxfajT3ycRIJN_MrE8/view?usp=sharing" + }, + "Publication": "Nanni et al. (2018)" +} diff --git a/corpora/parliamentary-corpora/yu1parl.json b/corpora/parliamentary-corpora/yu1parl.json new file mode 100644 index 0000000..2218a6c --- /dev/null +++ b/corpora/parliamentary-corpora/yu1parl.json @@ -0,0 +1,17 @@ +{ + "Name": "Parliamentary corpus of first Yugoslavia (1919-1939) yu1Parl 1.0", + "URL": "http://hdl.handle.net/11356/1845", + "Family": "Parliamentary corpora", + "Description": "This historical parliamentary corpus contains meeting proceedings of the National Representation of the Kingdom of Yugoslavia from 1919 to 1939. The corpus comprises 714 sessions.\nThe source data (scanned images of printed Stenographic Minutes) come from the History of Slovenia - SIstory portal. The images were OCR processed and the results saved as pdf, docx and txt. The documents are multilingual, in Serbo-Croatian and Slovenian, depending on the speaker.
Serbo-Croatian is typeset in the Cyrillic (Serbian) or in the Latin (Croatian) alphabet.\nThe documents were automatically processed and the following data extracted: titles, agenda, attending, start and end of the session, speakers, and comments. Lingua was used for language detection on the sentence level. Roughly 59% of sentences are in Serbian (Cyrillic script), 38% in Croatian (Latin script) and 3% in Slovenian. Some sentences in German and French were also detected. Linguistic annotation (tokenisation, MSD tagging and lemmatisation) was added using CLASSLA for Serbian, Croatian and Slovenian. Words in Serbian (Cyrillic script) have lemmas in Latin script.\nThe corpus is available for download from the CLARIN.SI repository as well as for online browsing through the noSketch Engine and KonText concordancers.", + "Languages": ["hrv", "srp", "slv"], + "License": "CC BY 4.0", + "Size": ["34,542 utterances", "578,958 sentences", "13,271,885 words", "15,403 pages"], + "Annotation": ["tokenised", "MSD-tagged", "lemmatised"], + "Infrastructure": "CLARIN", + "Access": { + "Concordancer (noSketch)": "https://www.clarin.si/ske/#dashboard?corpname=yu1parl", + "Concordancer (KonText)": "https://www.clarin.si/kontext/query?corpname=yu1parl", + "Download": "http://hdl.handle.net/11356/1845" + }, + "Publication": "" +}