Skip to content

Commit

Permalink
added CLARIN/non-CLARIN to literary, newspaper and reference corpora
Browse files Browse the repository at this point in the history
  • Loading branch information
kreetrapper committed Sep 5, 2024
1 parent b64c0f7 commit 552e249
Show file tree
Hide file tree
Showing 123 changed files with 123 additions and 0 deletions.
1 change: 1 addition & 0 deletions corpora/literary-corpora/1000-novels.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY 4.0",
"Size": ["1000 texts"],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Download": "http://hdl.handle.net/11321/312"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/1000plus-novels.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY-SA 3.0",
"Size": ["1000 texts", "17,352,826 words"],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Download": "http://hdl.handle.net/11321/699"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/15c-castilian.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "",
"Size": [],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://cancionerovirtual.liv.ac.uk/"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/1920-polish.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY 3.0",
"Size": [],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Download": "http://hdl.handle.net/11321/57"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/aformes.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY-NC",
"Size": ["376,250 words"],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Download": "http://hdl.grnet.gr/11500/UOA-0000-0000-2575-3"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/anglosaxon.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "",
"Size": [],
"Annotation": ["none"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "https://www.sacred-texts.com/neu/ascp/"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/anth-me.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "Oxford Text Archive Licence",
"Size": ["4,000 words"],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Download": "http://hdl.handle.net/20.500.14106/1398"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/bonnier-one.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY 4.0",
"Size": ["6,578,675 tokens", "462,625 sentences"],
"Annotation": ["sentence scrambling"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "https://spraakbanken.gu.se/korp/#corpus=romi",
"Download": "http://hdl.handle.net/10794/115"
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/bonnier-two.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY 4.0",
"Size": ["4,304,271 tokens", "298,361 sentences"],
"Annotation": ["sentence scrambling"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "https://spraakbanken.gu.se/korp/#corpus=romii",
"Download": "http://hdl.handle.net/10794/116"
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/ceal.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CLARIN RES + NC",
"Size": ["3 novels", "484,010 tokens"],
"Annotation": ["MSD-tagged", "syntactically parsed"],
"Infrastructure": "CLARIN",
"Access": {
"Browse (original)": "http://urn.fi/urn:nbn:fi:lb-2018011201",
"Browse (scrambled)": "http://urn.fi/urn:nbn:fi:lb-2018011202"
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/classic-fin-lit.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "",
"Size": ["1,456,658 words"],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://kaino.kotus.fi/korpus/klassikot/meta/klassikot_coll_rdf.xml"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/classic-fin.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY",
"Size": [],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "https://www.kielipankki.fi/corpora/nlfcl-fi-authors/"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/early-fin-lit.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "",
"Size": [],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Download": ""
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/est-fiction.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CLARIN ACA - NC",
"Size": ["5,768,504 words"],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Download": "http://hdl.handle.net/10.15155/1-00-0000-0000-0000-0007EL"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/est-runic.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CLARIN ACA",
"Size": ["92,134 texts"],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://www.folklore.ee/regilaul/"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/etcsl.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "",
"Size": ["400 literary compositions"],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://etcsl.orinst.ox.ac.uk/"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/fin-folk.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY-NC",
"Size": ["7.1 million words"],
"Annotation": ["unannotated"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://urn.fi/urn:nbn:fi:lb-2014052711"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/fin-gutenberg.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY",
"Size": ["34,487,420 words"],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://urn.fi/urn:nbn:fi:lb-2014102101"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/fin-lit.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "EUPL v.1.1 SA",
"Size": ["1,500,000 words"],
"Annotation": ["syntactically parsed (TDT alpha)", "named entities (FiNER)", "MSD-tagged", "lemmatized"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://urn.fi/urn:nbn:fi:lb-2016081601"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/greek-medieval.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY-NC",
"Size": ["3,419,553 words"],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Download": "http://hdl.grnet.gr/11500/AEGEAN-0000-0000-251D-7"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/greek-thesaurus.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "proprietary",
"Size": ["1 million tokens"],
"Annotation": ["semantic"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://www.potheg.gr/Intro.aspx?lan=2"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/joh-jen.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC BY-SA 4.0",
"Size": ["1,760,093 words", "8,489 pages"],
"Annotation": ["unannotated"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://johannesvjensen.dk/jensenonline/liste-over-vaerker/",
"Download": "http://hdl.handle.net/20.500.12115/20"
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/kdsp.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY 4.0",
"Size": ["262 texts", "11 million words", "14 million tokens"],
"Annotation": ["MSD-tagged (MULTEXT-East & UD)", "lemmatised", "annotated with author and text metadata"],
"Infrastructure": "CLARIN",
"Access": {
"Browse (noSketchEngine)": "https://www.clarin.si/ske/#dashboard?corpname=kdsp",
"Browse (KonText)": "https://www.clarin.si/kontext/query?corpname=kdsp",
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/kivi.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY-NC",
"Size": ["413,735 words"],
"Annotation": ["MSD-tagged", "syntactically parsed"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://urn.fi/urn:nbn:fi:lb-2016121604"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/lat-lit-classic.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "",
"Size": [],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Download": ""
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/ltcorpus.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CLARIN RES",
"Size": ["1,781,083 words"],
"Annotation": ["PoS-tagged", "lemmatized"],
"Infrastructure": "CLARIN",
"Access": {
"Download": "http://hdl.handle.net/21.11115/0000-000B-D33D-3"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/m-agricola.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY-ND",
"Size": ["83,678 sentences", "428,314 tokens", "38,308 words"],
"Annotation": ["MSD-tagged", "syntactically parsed"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://urn.fi/urn:nbn:fi:lb-201803273"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/micro-pol.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "plWordNet",
"Size": [],
"Annotation": ["unannotated"],
"Infrastructure": "CLARIN",
"Access": {
"Download": "http://hdl.handle.net/11321/604"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/multext1984.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC BY-NC SA 4.0",
"Size": ["12 texts", "79,718 sentences", "1,064,424 words"],
"Annotation": ["sentence-alignment", "MSD tagging"],
"Infrastructure": "CLARIN",
"Access": {
"Download": "http://hdl.handle.net/11356/1043"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/norbok-children.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CLARIN ACA",
"Size": ["4,111,213 words", "389,564 sentences"],
"Annotation": ["syntactically parsed"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://clarino.uib.no/iness/landing-page?collection=NorGramBank"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/norbok-fiction.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CLARIN ACA",
"Size": ["26,903,637 words", "2,469,916 sentences"],
"Annotation": ["syntactically parsed"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://clarino.uib.no/iness/landing-page?collection=NorGramBank"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/nornyn-children.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CLARIN ACA",
"Size": ["1,043,260 words", "106,434 sentences"],
"Annotation": ["syntactically parsed"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://clarino.uib.no/iness/landing-page?collection=NorGramBank"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/nornyn-fiction.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CLARIN ACA",
"Size": ["2,884,376 words", "260,285 sentences"],
"Annotation": ["syntactically parsed"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://clarino.uib.no/iness/landing-page?collection=NorGramBank"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/north-saami.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CLARIN RES +NC +NORED +PLAN",
"Size": ["17,830 words"],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "https://lbr.csc.fi/apply-for?resource=urn:nbn:fi:lb-2014032620"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/old-fin-lit.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "",
"Size": ["3,428,618 words"],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://kaino.kotus.fi/korpus/vks/meta/vks_coll_rdf.xml"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/one-mil-cro.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "",
"Size": ["1 million tokens"],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Download": ""
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/orig-est.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CLARIN ACA",
"Size": ["173 texts"],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://krzwlive.kirmus.ee/et/"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/parfin.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CLARIN RES +NC +INF +ND",
"Size": ["2,044,172 tokens"],
"Annotation": ["MSD-tagged", "syntactically parsed"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://urn.fi/urn:nbn:fi:lb-2016121601"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/parrus.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CLARIN RES +NC +INF +ND",
"Size": ["5,900,000 tokens"],
"Annotation": ["MSD-tagged", "syntactically parsed"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://urn.fi/urn:nbn:fi:lb-2016121604"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/prilit.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC BY 4.0",
"Size": ["43 texts", "1,275,209 tokens"],
"Annotation": ["word modernisation", "lemmatisation", "syntactic annotation (<a href=\"https://universaldependencies.org/\">Universal Dependencies</a>)"],
"Infrastructure": "CLARIN",
"Access": {
"Download": "http://hdl.handle.net/11356/1319"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/rep-bastille.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY",
"Size": ["37,965 words"],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Download": "http://hdl.grnet.gr/11500/AUTH-0000-0000-24DC-0"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/sol.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY 4.0",
"Size": ["1,267,391 tokens", "69,270 sentences"],
"Annotation": ["sentence scrambled"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "https://spraakbanken.gu.se/korp/?mode=spanish#?corpus=one71",
"Download": "http://hdl.handle.net/10794/80"
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/strindberg.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY 4.0",
"Size": ["4,309,037 tokens", "321,759 sentences"],
"Annotation": ["sentence scrambling"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "https://spraakbanken.gu.se/korp/#?corpus=strindbergromaner",
"Download": "http://hdl.handle.net/10794/79"
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/uhlcs.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CLARIN RES",
"Size": ["68,425 words"],
"Annotation": ["tagged"],
"Infrastructure": "CLARIN",
"Access": {
"Browse": "http://lbr.csc.fi/apply-for?resource=urn:nbn:fi:lb-2014032622"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/literary-corpora/york-poetry.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "Restricted",
"Size": ["71,490 words"],
"Annotation": ["MSD-tagged", "syntactically parsed"],
"Infrastructure": "CLARIN",
"Access": {
"Download": "http://hdl.handle.net/20.500.14106/2425"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/newspaper-corpora/8-sidor.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY",
"Size": ["678,000 tokens"],
"Annotation": ["tokenised", "PoS-tagged", "parsed", "compounds"],
"Infrastructure": "CLARIN",
"Access": {
"Concordancer": "https://spraakbanken.gu.se/korp/#?corpus=attasidor&stats_reduce=word&cqp=%5B%5D",
"Download": "http://spraakbanken.gu.se/eng/resource/attasidor"
Expand Down
1 change: 1 addition & 0 deletions corpora/newspaper-corpora/accurat.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC BY",
"Size": ["23,820 sentences"],
"Annotation": [],
"Infrastructure": "CLARIN",
"Access": {
"Download": "http://hdl.grnet.gr/11500/ATHENA-0000-0000-23BF-2"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/newspaper-corpora/chronopress.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CLARIN PUB",
"Size": ["20 million tokens"],
"Annotation": ["tokenised", "PoS-tagged", "named entities"],
"Infrastructure": "CLARIN",
"Access": {
"Concordancer": "https://chronopress.clarin-pl.eu/"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/newspaper-corpora/contemp-serbian.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "CC-BY",
"Size": ["916 million tokens"],
"Annotation": ["tokenised", "PoS-tagged", "lemmatised"],
"Infrastructure": "Other",
"Access": {
"Special": "For access, contact the <a href=\"http://metashare.elda.org/repository/browse/corpus-of-contemporary-serbian-newpapers-and-magazines/210858448b2a11e2b539001517144592b76e35aee8794c51bd3016f1e57e765e/#\">resource manager.</a>"
},
Expand Down
1 change: 1 addition & 0 deletions corpora/newspaper-corpora/corp-news-texts.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"License": "under negotiation",
"Size": ["435 million tokens"],
"Annotation": ["tokenised"],
"Infrastructure": "CLARIN",
"Access": {
"Download": ""
},
Expand Down
Loading

0 comments on commit 552e249

Please sign in to comment.