From 78067be21bd6d7605eb3621a64e9a82dc9dd7cb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20K=C3=B6nig?= Date: Thu, 9 Jan 2025 14:18:29 +0100 Subject: [PATCH] added language models --- .../language-models/albertina-pt-br-base.json | 16 +++++++++++++++ .../albertina-pt-br-no-brwac.json | 16 +++++++++++++++ .../language-models/albertina-pt-br.json | 16 +++++++++++++++ .../language-models/albertina-pt-pt-base.json | 16 +++++++++++++++ .../language-models/albertina-pt-pt.json | 16 +++++++++++++++ .../language-models/bertimbau-base.json | 16 +++++++++++++++ .../language-models/bertimbau-large.json | 16 +++++++++++++++ .../language-models/ccgigafida-arpa.json | 16 +++++++++++++++ .../language-models/cered-base.json | 16 +++++++++++++++ .../language-models/clarin-si-embed.json | 20 +++++++++++++++++++ .../classla-stanford-lemma-slv.json | 16 +++++++++++++++ .../classla-stanford-ner-bul.json | 16 +++++++++++++++ .../classla-stanford-ner-hrv.json | 16 +++++++++++++++ .../classla-stanford-ner-non-std-hrv.json | 16 +++++++++++++++ .../classla-stanford-ner-non-std-slv.json | 16 +++++++++++++++ .../classla-stanford-ner-non-std-srp.json | 16 +++++++++++++++ .../classla-stanford-ner-slv.json | 16 +++++++++++++++ .../classla-stanford-ner-srp.json | 16 +++++++++++++++ .../language-models/classla-stanza-bul.json | 16 +++++++++++++++ .../language-models/classla-stanza-hrv.json | 16 +++++++++++++++ .../classla-stanza-jos-dep-slv.json | 16 +++++++++++++++ .../classla-stanza-lemma-bul.json | 16 +++++++++++++++ .../classla-stanza-lemma-hrv.json | 16 +++++++++++++++ .../classla-stanza-lemma-mkd.json | 16 +++++++++++++++ .../classla-stanza-lemma-non-std-hrv.json | 16 +++++++++++++++ .../classla-stanza-lemma-non-std-slv.json | 16 +++++++++++++++ .../classla-stanza-lemma-non-std-srp.json | 16 +++++++++++++++ .../classla-stanza-lemma-srp.json | 16 +++++++++++++++ .../language-models/classla-stanza-mkd.json | 16 +++++++++++++++ .../classla-stanza-non-std-hrv.json | 16 +++++++++++++++ .../classla-stanza-non-std-slv.json | 16 +++++++++++++++ .../classla-stanza-non-std-srp.json | 16 +++++++++++++++ .../classla-stanza-sem-roles-slv.json | 16 +++++++++++++++ .../language-models/classla-stanza-slv.json | 16 +++++++++++++++ .../language-models/classla-stanza-srp.json | 16 +++++++++++++++ .../classla-stanza-ud-dep-bul.json | 16 +++++++++++++++ .../classla-stanza-ud-dep-hrv.json | 16 +++++++++++++++ .../classla-stanza-ud-dep-slv.json | 16 +++++++++++++++ .../classla-stanza-ud-dep-srp.json | 16 +++++++++++++++ .../language-models/cnec-nametag.json | 16 +++++++++++++++ .../language-models/commonsense-reason.json | 16 +++++++++++++++ .../language-models/conll-2017-shared.json | 16 +++++++++++++++ .../language-models/conll-2018-shared.json | 16 +++++++++++++++ .../language-models/conll-nametag.json | 16 +++++++++++++++ .../language-models/crosloengual-bert.json | 16 +++++++++++++++ .../language-models/cubbitt-en-cs.json | 16 +++++++++++++++ .../language-models/cubbitt-en-fr.json | 16 +++++++++++++++ .../language-models/cubbitt-en-pl.json | 16 +++++++++++++++ .../language-models/czech-neural-monkeys.json | 16 +++++++++++++++ .../language-models/dep-parsing-pol.json | 16 +++++++++++++++ .../language-models/dep-parsing-stanza.json | 16 +++++++++++++++ .../language-models/elmo-embeddings.json | 16 +++++++++++++++ .../language-models/embeddings-eng-wiki.json | 16 +++++++++++++++ .../language-models/eng-mod-morphodita.json | 16 +++++++++++++++ .../language-models/face-domain-specific.json | 16 +++++++++++++++ 
.../language-models/finbert.json | 16 +++++++++++++++ .../language-models/frenk-mmc-rtv.json | 16 +++++++++++++++ .../language-models/g2p-icelandic.json | 16 +++++++++++++++ .../language-models/gervasio-pt-br-base.json | 16 +++++++++++++++ .../language-models/gervasio-pt-pt-base.json | 16 +++++++++++++++ .../language-models/greynir-mbart.json | 16 +++++++++++++++ .../language-models/greynir-t2t.json | 16 +++++++++++++++ .../language-models/korektor-czech.json | 16 +++++++++++++++ .../language-models/lemma-stanza.json | 16 +++++++++++++++ .../language-models/liner-events.json | 16 +++++++++++++++ .../language-models/liner-ner-nkjp.json | 16 +++++++++++++++ .../language-models/liner-ner.json | 16 +++++++++++++++ .../language-models/liner-timex.json | 16 +++++++++++++++ lexical-resources/language-models/liner.json | 16 +++++++++++++++ .../language-models/litlatbert.json | 16 +++++++++++++++ lexical-resources/language-models/lvbert.json | 16 +++++++++++++++ .../language-models/lx-dsemvectors.json | 16 +++++++++++++++ .../language-models/mcsq-tm-en-de.json | 16 +++++++++++++++ .../language-models/mcsq-tm-en-ru.json | 16 +++++++++++++++ .../language-models/morflex-cz-161115.json | 16 +++++++++++++++ .../language-models/nametag2.json | 16 +++++++++++++++ .../language-models/por-roberta.json | 16 +++++++++++++++ .../language-models/pos-lemma-ces.json | 16 +++++++++++++++ .../language-models/pos-tag-flair.json | 16 +++++++++++++++ .../language-models/pos-tag-marmot.json | 16 +++++++++++++++ .../language-models/pos-tag-stanza.json | 16 +++++++++++++++ .../language-models/pytorch-sloner.json | 16 +++++++++++++++ .../language-models/pytorch-slv.json | 16 +++++++++++++++ lexical-resources/language-models/ruv-di.json | 16 +++++++++++++++ .../language-models/sentiment-czech.json | 16 +++++++++++++++ .../language-models/slavic-forest.json | 16 +++++++++++++++ .../language-models/slk-morphodita.json | 16 +++++++++++++++ .../language-models/sloberta.json | 15 ++++++++++++++ .../language-models/trans-models-en-de.json | 16 +++++++++++++++ .../language-models/trans-models-en-ru.json | 16 +++++++++++++++ .../language-models/ud-parsito-models.json | 16 +++++++++++++++ .../language-models/udify-pre.json | 16 +++++++++++++++ .../language-models/udpipe-models.json | 16 +++++++++++++++ .../language-models/wmt-ca-oc-multi.json | 16 +++++++++++++++ .../language-models/wmt-ca-oc.json | 16 +++++++++++++++ .../language-models/wmt-ca-ro-it.json | 16 +++++++++++++++ .../language-models/wmt-tuning-cs-en.json | 16 +++++++++++++++ 97 files changed, 1555 insertions(+) create mode 100644 lexical-resources/language-models/albertina-pt-br-base.json create mode 100644 lexical-resources/language-models/albertina-pt-br-no-brwac.json create mode 100644 lexical-resources/language-models/albertina-pt-br.json create mode 100644 lexical-resources/language-models/albertina-pt-pt-base.json create mode 100644 lexical-resources/language-models/albertina-pt-pt.json create mode 100644 lexical-resources/language-models/bertimbau-base.json create mode 100644 lexical-resources/language-models/bertimbau-large.json create mode 100644 lexical-resources/language-models/ccgigafida-arpa.json create mode 100644 lexical-resources/language-models/cered-base.json create mode 100644 lexical-resources/language-models/clarin-si-embed.json create mode 100644 lexical-resources/language-models/classla-stanford-lemma-slv.json create mode 100644 lexical-resources/language-models/classla-stanford-ner-bul.json create mode 100644 
lexical-resources/language-models/classla-stanford-ner-hrv.json create mode 100644 lexical-resources/language-models/classla-stanford-ner-non-std-hrv.json create mode 100644 lexical-resources/language-models/classla-stanford-ner-non-std-slv.json create mode 100644 lexical-resources/language-models/classla-stanford-ner-non-std-srp.json create mode 100644 lexical-resources/language-models/classla-stanford-ner-slv.json create mode 100644 lexical-resources/language-models/classla-stanford-ner-srp.json create mode 100644 lexical-resources/language-models/classla-stanza-bul.json create mode 100644 lexical-resources/language-models/classla-stanza-hrv.json create mode 100644 lexical-resources/language-models/classla-stanza-jos-dep-slv.json create mode 100644 lexical-resources/language-models/classla-stanza-lemma-bul.json create mode 100644 lexical-resources/language-models/classla-stanza-lemma-hrv.json create mode 100644 lexical-resources/language-models/classla-stanza-lemma-mkd.json create mode 100644 lexical-resources/language-models/classla-stanza-lemma-non-std-hrv.json create mode 100644 lexical-resources/language-models/classla-stanza-lemma-non-std-slv.json create mode 100644 lexical-resources/language-models/classla-stanza-lemma-non-std-srp.json create mode 100644 lexical-resources/language-models/classla-stanza-lemma-srp.json create mode 100644 lexical-resources/language-models/classla-stanza-mkd.json create mode 100644 lexical-resources/language-models/classla-stanza-non-std-hrv.json create mode 100644 lexical-resources/language-models/classla-stanza-non-std-slv.json create mode 100644 lexical-resources/language-models/classla-stanza-non-std-srp.json create mode 100644 lexical-resources/language-models/classla-stanza-sem-roles-slv.json create mode 100644 lexical-resources/language-models/classla-stanza-slv.json create mode 100644 lexical-resources/language-models/classla-stanza-srp.json create mode 100644 lexical-resources/language-models/classla-stanza-ud-dep-bul.json create mode 100644 lexical-resources/language-models/classla-stanza-ud-dep-hrv.json create mode 100644 lexical-resources/language-models/classla-stanza-ud-dep-slv.json create mode 100644 lexical-resources/language-models/classla-stanza-ud-dep-srp.json create mode 100644 lexical-resources/language-models/cnec-nametag.json create mode 100644 lexical-resources/language-models/commonsense-reason.json create mode 100644 lexical-resources/language-models/conll-2017-shared.json create mode 100644 lexical-resources/language-models/conll-2018-shared.json create mode 100644 lexical-resources/language-models/conll-nametag.json create mode 100644 lexical-resources/language-models/crosloengual-bert.json create mode 100644 lexical-resources/language-models/cubbitt-en-cs.json create mode 100644 lexical-resources/language-models/cubbitt-en-fr.json create mode 100644 lexical-resources/language-models/cubbitt-en-pl.json create mode 100644 lexical-resources/language-models/czech-neural-monkeys.json create mode 100644 lexical-resources/language-models/dep-parsing-pol.json create mode 100644 lexical-resources/language-models/dep-parsing-stanza.json create mode 100644 lexical-resources/language-models/elmo-embeddings.json create mode 100644 lexical-resources/language-models/embeddings-eng-wiki.json create mode 100644 lexical-resources/language-models/eng-mod-morphodita.json create mode 100644 lexical-resources/language-models/face-domain-specific.json create mode 100644 lexical-resources/language-models/finbert.json create mode 100644 
lexical-resources/language-models/frenk-mmc-rtv.json create mode 100644 lexical-resources/language-models/g2p-icelandic.json create mode 100644 lexical-resources/language-models/gervasio-pt-br-base.json create mode 100644 lexical-resources/language-models/gervasio-pt-pt-base.json create mode 100644 lexical-resources/language-models/greynir-mbart.json create mode 100644 lexical-resources/language-models/greynir-t2t.json create mode 100644 lexical-resources/language-models/korektor-czech.json create mode 100644 lexical-resources/language-models/lemma-stanza.json create mode 100644 lexical-resources/language-models/liner-events.json create mode 100644 lexical-resources/language-models/liner-ner-nkjp.json create mode 100644 lexical-resources/language-models/liner-ner.json create mode 100644 lexical-resources/language-models/liner-timex.json create mode 100644 lexical-resources/language-models/liner.json create mode 100644 lexical-resources/language-models/litlatbert.json create mode 100644 lexical-resources/language-models/lvbert.json create mode 100644 lexical-resources/language-models/lx-dsemvectors.json create mode 100644 lexical-resources/language-models/mcsq-tm-en-de.json create mode 100644 lexical-resources/language-models/mcsq-tm-en-ru.json create mode 100644 lexical-resources/language-models/morflex-cz-161115.json create mode 100644 lexical-resources/language-models/nametag2.json create mode 100644 lexical-resources/language-models/por-roberta.json create mode 100644 lexical-resources/language-models/pos-lemma-ces.json create mode 100644 lexical-resources/language-models/pos-tag-flair.json create mode 100644 lexical-resources/language-models/pos-tag-marmot.json create mode 100644 lexical-resources/language-models/pos-tag-stanza.json create mode 100644 lexical-resources/language-models/pytorch-sloner.json create mode 100644 lexical-resources/language-models/pytorch-slv.json create mode 100644 lexical-resources/language-models/ruv-di.json create mode 100644 lexical-resources/language-models/sentiment-czech.json create mode 100644 lexical-resources/language-models/slavic-forest.json create mode 100644 lexical-resources/language-models/slk-morphodita.json create mode 100644 lexical-resources/language-models/sloberta.json create mode 100644 lexical-resources/language-models/trans-models-en-de.json create mode 100644 lexical-resources/language-models/trans-models-en-ru.json create mode 100644 lexical-resources/language-models/ud-parsito-models.json create mode 100644 lexical-resources/language-models/udify-pre.json create mode 100644 lexical-resources/language-models/udpipe-models.json create mode 100644 lexical-resources/language-models/wmt-ca-oc-multi.json create mode 100644 lexical-resources/language-models/wmt-ca-oc.json create mode 100644 lexical-resources/language-models/wmt-ca-ro-it.json create mode 100644 lexical-resources/language-models/wmt-tuning-cs-en.json diff --git a/lexical-resources/language-models/albertina-pt-br-base.json b/lexical-resources/language-models/albertina-pt-br-base.json new file mode 100644 index 0000000..0775e74 --- /dev/null +++ b/lexical-resources/language-models/albertina-pt-br-base.json @@ -0,0 +1,16 @@ +{ + "Name": "Albertina PT-BR base", + "URL": "https://hdl.handle.net/21.11129/0000-000F-FF45-5", + "Family": "Language Models", + "Description": "This model is for Portuguese spoken in Brazil. It is based on the Transformer neural architecture and is developed over the DeBERTa model. 
", + "Language": ["por"], + "Licence": "MIT", + "Size": [], + "Annotation": ["Baseline"], + "Infrastructure": "CLARIN", + "Group": "Baseline", + "Access": { + "Download": "https://huggingface.co/PORTULAN/albertina-ptbr-base" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/albertina-pt-br-no-brwac.json b/lexical-resources/language-models/albertina-pt-br-no-brwac.json new file mode 100644 index 0000000..b414e7b --- /dev/null +++ b/lexical-resources/language-models/albertina-pt-br-no-brwac.json @@ -0,0 +1,16 @@ +{ + "Name": "Albertina PT-BR No-brWaC", + "URL": "https://hdl.handle.net/21.11129/0000-000F-FF46-4 ", + "Family": "Language Models", + "Description": "This is a model for Portuguese spoken in Brazil trained on adta sets othan than brWaC. It is I developed over the DeBERTa model.\nThe model is available for download from Hugging Face.", + "Language": ["por"], + "Licence": "MIT", + "Size": [], + "Annotation": ["Baseline"], + "Infrastructure": "CLARIN", + "Group": "Baseline", + "Access": { + "Download": "https://huggingface.co/PORTULAN/albertina-ptbr-nobrwac" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/albertina-pt-br.json b/lexical-resources/language-models/albertina-pt-br.json new file mode 100644 index 0000000..9ef65ca --- /dev/null +++ b/lexical-resources/language-models/albertina-pt-br.json @@ -0,0 +1,16 @@ +{ + "Name": "Albertina PT-BR", + "URL": "https://hdl.handle.net/21.11129/0000-000F-FF43-7 ", + "Family": "Language Models", + "Description": "This model is an encoder of the BERT family and is based on the neural architecture Transformer and developed over the DeBERTa model. This model is for American Portuguese spoken in Brazil, is trained on the brWaC dataset, and is a larger version of the Albertina PT-BR base model.\nThis model is available for download through Hugging Face.", + "Language": ["por"], + "Licence": "MIT", + "Size": [], + "Annotation": ["Baseline"], + "Infrastructure": "CLARIN", + "Group": "Baseline", + "Access": { + "Download": "https://huggingface.co/PORTULAN/albertina-ptbr" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/albertina-pt-pt-base.json b/lexical-resources/language-models/albertina-pt-pt-base.json new file mode 100644 index 0000000..ee104a3 --- /dev/null +++ b/lexical-resources/language-models/albertina-pt-pt-base.json @@ -0,0 +1,16 @@ +{ + "Name": "Albertina PT-PT base", + "URL": "https://hdl.handle.net/21.11129/0000-000F-FF44-6", + "Family": "Language Models", + "Description": "This model is for European. It is based on the Transformer neural architecture and is developed over the DeBERTa model.\nThis model is available for download through Hugging Face.", + "Language": ["por"], + "Licence": "MIT", + "Size": [], + "Annotation": ["Baseline"], + "Infrastructure": "CLARIN", + "Group": "Baseline", + "Access": { + "Download": "https://huggingface.co/PORTULAN/albertina-ptpt-base" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/albertina-pt-pt.json b/lexical-resources/language-models/albertina-pt-pt.json new file mode 100644 index 0000000..dced56a --- /dev/null +++ b/lexical-resources/language-models/albertina-pt-pt.json @@ -0,0 +1,16 @@ +{ + "Name": "Albertina PT-PT", + "URL": "https://hdl.handle.net/21.11129/0000-000F-FF42-8", + "Family": "Language Models", + "Description": "This model is an encoder of the BERT family and is based on the neural architecture Transformer and developed over the DeBERTa model. 
This model is for European Portuguese and is a larger version of the Albertina PT-PT base model.\nThis model is available for download through Hugging Face.", + "Language": ["por"], + "Licence": "MIT", + "Size": [], + "Annotation": ["Baseline"], + "Infrastructure": "CLARIN", + "Group": "Baseline", + "Access": { + "Download": "https://huggingface.co/PORTULAN/albertina-ptpt" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/bertimbau-base.json b/lexical-resources/language-models/bertimbau-base.json new file mode 100644 index 0000000..f63fe38 --- /dev/null +++ b/lexical-resources/language-models/bertimbau-base.json @@ -0,0 +1,16 @@ +{ + "Name": "BERTimbau - Portuguese BERT-Base language model", + "URL": "https://hdl.handle.net/21.11129/0000-000E-6726-4", + "Family": "Language Models", + "Description": "This is a BERT model, trained on BrWaC (Brazilian Web as Corpus), a large Portuguese corpus, for 1,000,000 steps, using whole-word masking.\nThe model is available for download from the PORTULAN repository.", + "Language": ["por"], + "Licence": "Under negotiation", + "Size": [], + "Annotation": ["Baseline"], + "Infrastructure": "CLARIN", + "Group": "Baseline", + "Access": { + "Download": "https://github.com/neuralmind-ai/portuguese-bert/" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/bertimbau-large.json b/lexical-resources/language-models/bertimbau-large.json new file mode 100644 index 0000000..ebed339 --- /dev/null +++ b/lexical-resources/language-models/bertimbau-large.json @@ -0,0 +1,16 @@ +{ + "Name": "BERTimbau - Portuguese BERT-Large language model", + "URL": "https://hdl.handle.net/21.11129/0000-000E-6725-5", + "Family": "Language Models", + "Description": "This is a BERT model, trained on BrWaC (Brazilian Web as Corpus), a large Portuguese corpus, for 1,000,000 steps, using whole-word masking.\nThe model is available for download from the PORTULAN repository.", + "Language": ["por"], + "Licence": "Under negotiation", + "Size": [], + "Annotation": ["Baseline"], + "Infrastructure": "CLARIN", + "Group": "Baseline", + "Access": { + "Download": "https://github.com/neuralmind-ai/portuguese-bert/" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/ccgigafida-arpa.json b/lexical-resources/language-models/ccgigafida-arpa.json new file mode 100644 index 0000000..cf7b9db --- /dev/null +++ b/lexical-resources/language-models/ccgigafida-arpa.json @@ -0,0 +1,16 @@ +{ + "Name": "ccGigafida ARPA language model 1.0", + "URL": "http://hdl.handle.net/11356/1119", + "Family": "Language Models", + "Description": "This model was created from the ccGigafida written corpus of Slovenian using the KenLM algorithm in the Moses machine translation framework. 
It is a general language model of contemporary standard Slovenian that can be used in statistical machine translation systems.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["slv"], + "Licence": "CC BY 4.0", + "Size": [], + "Annotation": ["Baseline"], + "Infrastructure": "CLARIN", + "Group": "Baseline", + "Access": { + "Download": "http://hdl.handle.net/11356/1119" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/cered-base.json b/lexical-resources/language-models/cered-base.json new file mode 100644 index 0000000..e2af5b0 --- /dev/null +++ b/lexical-resources/language-models/cered-base.json @@ -0,0 +1,16 @@ +{ + "Name": "CERED baseline models", + "URL": "http://hdl.handle.net/11234/1-3266", + "Family": "Language Models", + "Description": "These models are trained on CERED, a dataset created by distant supervision on Czech Wikipedia and Wikidata, and recognize a subset of Wikidata relations.\nThe models are available for download from the LINDAT repository.", + "Language": ["ces"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["Baseline"], + "Infrastructure": "CLARIN", + "Group": "Baseline", + "Access": { + "Download": "http://hdl.handle.net/11234/1-3266" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/clarin-si-embed.json b/lexical-resources/language-models/clarin-si-embed.json new file mode 100644 index 0000000..93d4d0c --- /dev/null +++ b/lexical-resources/language-models/clarin-si-embed.json @@ -0,0 +1,20 @@ +{ + "Name": "Word embeddings CLARIN.SI-embed", + "URL": "http://hdl.handle.net/11356/1796", + "Family": "Language Models", + "Description": "This is a set of word embeddings for five languages: Bulgarian, Croatian, Macedonian, Serbian, and Slovenian.\nThe models are available for download from the CLARIN.SI repository.", + "Language": ["bul", "hrv", "mkd", "srp", "slv"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["word embeddings"], + "Infrastructure": "CLARIN", + "Group": "Contextual Word Embeddings", + "Access": { + "Download (Bulgarian)": "http://hdl.handle.net/11356/1796", + "Download (Croatian)": "http://hdl.handle.net/11356/1790", + "Download (Macedonian)": "http://hdl.handle.net/11356/1788", + "Download (Serbian)": "http://hdl.handle.net/11356/1789", + "Download (Slovenian)": "http://hdl.handle.net/11356/1791" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/classla-stanford-lemma-slv.json b/lexical-resources/language-models/classla-stanford-lemma-slv.json new file mode 100644 index 0000000..ae93e26 --- /dev/null +++ b/lexical-resources/language-models/classla-stanford-lemma-slv.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-StanfordNLP model for lemmatisation of standard Slovenian 2.0", + "URL": "http://hdl.handle.net/11356/1768", + "Family": "Language Models", + "Description": "The model for lemmatisation of standard Slovenian was built with the CLASSLA-Stanza tool by training on the SUK training corpus and using the CLARIN.SI-embed.sl word embeddings expanded with the MaCoCu-sl Slovene web corpus. 
The estimated F1 of the lemma annotations is ~99.7.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["slv"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["lemmatisation"], + "Infrastructure": "CLARIN", + "Group": "Lemmatisation", + "Access": { + "Download": "http://hdl.handle.net/11356/1768" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanford-ner-bul.json b/lexical-resources/language-models/classla-stanford-ner-bul.json new file mode 100644 index 0000000..6cb5461 --- /dev/null +++ b/lexical-resources/language-models/classla-stanford-ner-bul.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-StanfordNLP model for named entity recognition of standard Bulgarian 1.0", + "URL": "http://hdl.handle.net/11356/1329", + "Family": "Language Models", + "Description": "This model for named entity recognition of standard Bulgarian was built with the CLASSLA-StanfordNLP tool by training on the BulTreeBank training corpus and using the CoNLL2017 word embeddings.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["bul"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["named entity recognition"], + "Infrastructure": "CLARIN", + "Group": "Named Entity Recognition", + "Access": { + "Download": "http://hdl.handle.net/11356/1329" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanford-ner-hrv.json b/lexical-resources/language-models/classla-stanford-ner-hrv.json new file mode 100644 index 0000000..4c2445f --- /dev/null +++ b/lexical-resources/language-models/classla-stanford-ner-hrv.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-StanfordNLP model for named entity recognition of standard Croatian 1.0", + "URL": "http://hdl.handle.net/11356/1322", + "Family": "Language Models", + "Description": "This model for named entity recognition of standard Croatian was built with the CLASSLA-StanfordNLP tool by training on the hr500k training corpus and using the CLARIN.SI-embed.hr word embeddings.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["hrv"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["named entity recognition"], + "Infrastructure": "CLARIN", + "Group": "Named Entity Recognition", + "Access": { + "Download": "http://hdl.handle.net/11356/1322" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanford-ner-non-std-hrv.json b/lexical-resources/language-models/classla-stanford-ner-non-std-hrv.json new file mode 100644 index 0000000..73d1655 --- /dev/null +++ b/lexical-resources/language-models/classla-stanford-ner-non-std-hrv.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-StanfordNLP model for named entity recognition of non-standard Croatian 1.0", + "URL": "http://hdl.handle.net/11356/1340", + "Family": "Language Models", + "Description": "This model for named entity recognition of non-standard Croatian was built with the CLASSLA-StanfordNLP tool by training on the hr500k training corpus, the ReLDI-NormTagNER-hr corpus and the ReLDI-NormTagNER-sr corpus, using the CLARIN.SI-embed.hr word embeddings. 
The training corpora were additionally augmented for handling missing diacritics by repeating parts of the corpora with diacritics removed.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["Croatian (non-standard)"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["named entity recognition"], + "Infrastructure": "CLARIN", + "Group": "Named Entity Recognition", + "Access": { + "Download": "http://hdl.handle.net/11356/1340" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanford-ner-non-std-slv.json b/lexical-resources/language-models/classla-stanford-ner-non-std-slv.json new file mode 100644 index 0000000..80c5fe7 --- /dev/null +++ b/lexical-resources/language-models/classla-stanford-ner-non-std-slv.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-StanfordNLP model for named entity recognition of non-standard Slovenian 1.0", + "URL": "http://hdl.handle.net/11356/1339", + "Family": "Language Models", + "Description": "This model for named entity recognition of non-standard Slovenian was built with the CLASSLA-StanfordNLP tool by training on the ssj500k training corpus and the Janes-Tag training corpus, using the CLARIN.SI-embed.sl word embeddings. The training corpora were additionally augmented for handling missing diacritics by repeating parts of the corpora with diacritics removed.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["Slovenian (non-standard)"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["named entity recognition"], + "Infrastructure": "CLARIN", + "Group": "Named Entity Recognition", + "Access": { + "Download": "http://hdl.handle.net/11356/1339" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanford-ner-non-std-srp.json b/lexical-resources/language-models/classla-stanford-ner-non-std-srp.json new file mode 100644 index 0000000..f4fb7ad --- /dev/null +++ b/lexical-resources/language-models/classla-stanford-ner-non-std-srp.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-StanfordNLP model for named entity recognition of non-standard Serbian 1.0", + "URL": "http://hdl.handle.net/11356/1341", + "Family": "Language Models", + "Description": "This model for named entity recognition of non-standard Serbian was built with the CLASSLA-StanfordNLP tool by training on the SETimes.SR training corpus, the hr500k training corpus, the ReLDI-NormTagNER-sr corpus, and the ReLDI-NormTagNER-hr corpus, using the CLARIN.SI-embed.sr word embeddings. 
The training corpora were additionally augmented for handling missing diacritics by repeating parts of the corpora with diacritics removed.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["Serbian (non-standard)"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["named entity recognition"], + "Infrastructure": "CLARIN", + "Group": "Named Entity Recognition", + "Access": { + "Download": "http://hdl.handle.net/11356/1341" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanford-ner-slv.json b/lexical-resources/language-models/classla-stanford-ner-slv.json new file mode 100644 index 0000000..7b3bed5 --- /dev/null +++ b/lexical-resources/language-models/classla-stanford-ner-slv.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-StanfordNLP model for named entity recognition of standard Slovenian 1.0", + "URL": "http://hdl.handle.net/11356/1321", + "Family": "Language Models", + "Description": "This model for named entity recognition of standard Slovenian was built with the CLASSLA-StanfordNLP tool by training on the ssj500k training corpus and using the CLARIN.SI-embed.sl word embeddings.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["slv"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["named entity recognition"], + "Infrastructure": "CLARIN", + "Group": "Named Entity Recognition", + "Access": { + "Download": "http://hdl.handle.net/11356/1321" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanford-ner-srp.json b/lexical-resources/language-models/classla-stanford-ner-srp.json new file mode 100644 index 0000000..e180715 --- /dev/null +++ b/lexical-resources/language-models/classla-stanford-ner-srp.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-StanfordNLP model for named entity recognition of standard Serbian 1.0", + "URL": "http://hdl.handle.net/11356/1323", + "Family": "Language Models", + "Description": "This model for named entity recognition of standard Serbian was built with the CLASSLA-StanfordNLP tool by training on the SETimes.SR training corpus and using the CLARIN.SI-embed.sr word embeddings.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["srp"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["named entity recognition"], + "Infrastructure": "CLARIN", + "Group": "Named Entity Recognition", + "Access": { + "Download": "http://hdl.handle.net/11356/1323" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-bul.json b/lexical-resources/language-models/classla-stanza-bul.json new file mode 100644 index 0000000..c3c5cf1 --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-bul.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for morphosyntactic annotation of standard Bulgarian 2.1", + "URL": "http://hdl.handle.net/11356/1849", + "Family": "Language Models", + "Description": "The model for morphosyntactic annotation of standard Bulgarian was built with the CLASSLA-Stanza tool by training on the BulTreeBank training corpus and using the CLARIN.SI-embed.bg word embeddings. The model produces simultaneously UPOS, FEATS and XPOS (MULTEXT-East) labels. 
The estimated F1 of the XPOS annotations is ~96.83.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["bul"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["morphosyntax"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "http://hdl.handle.net/11356/1849" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-hrv.json b/lexical-resources/language-models/classla-stanza-hrv.json new file mode 100644 index 0000000..c3fb1aa --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-hrv.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for morphosyntactic annotation of standard Croatian 2.1", + "URL": "http://hdl.handle.net/11356/1832", + "Family": "Language Models", + "Description": "The model for morphosyntactic annotation of standard Croatian was built with the CLASSLA-Stanza tool by training on the hr500k training corpus and using the CLARIN.SI-embed.hr word embeddings. The model produces simultaneously UPOS, FEATS and XPOS (MULTEXT-East) labels. The estimated F1 of the XPOS annotations is ~94.87.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["hrv"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["morphosyntax"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "http://hdl.handle.net/11356/1832" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-jos-dep-slv.json b/lexical-resources/language-models/classla-stanza-jos-dep-slv.json new file mode 100644 index 0000000..ab78216 --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-jos-dep-slv.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for JOS dependency parsing of standard Slovenian 2.0", + "URL": "http://hdl.handle.net/11356/1764", + "Family": "Language Models", + "Description": "The model for JOS dependency parsing of standard Slovenian was built with the CLASSLA-Stanza tool by training on the SUK training corpus and using the CLARIN.SI-embed.sl word embeddings expanded with the MaCoCu-sl Slovene web corpus. The estimated LAS of the parser is ~93.89.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["slv"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["syntactic parsing"], + "Infrastructure": "CLARIN", + "Group": "Syntactic Parsing", + "Access": { + "Download": "http://hdl.handle.net/11356/1764" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-lemma-bul.json b/lexical-resources/language-models/classla-stanza-lemma-bul.json new file mode 100644 index 0000000..3161d0a --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-lemma-bul.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for lemmatisation of standard Bulgarian 2.1", + "URL": "http://hdl.handle.net/11356/1850", + "Family": "Language Models", + "Description": "The model for lemmatisation of standard Bulgarian was built with the CLASSLA-Stanza tool by training on the BulTreeBank training corpus and using the Bulgarian inflectional lexicon (Popov, Simov, and Vidinska 1998). 
The estimated F1 of the lemma annotations is ~98.93.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["bul"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["lemmatisation"], + "Infrastructure": "CLARIN", + "Group": "Lemmatisation", + "Access": { + "Download": "http://hdl.handle.net/11356/1850" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-lemma-hrv.json b/lexical-resources/language-models/classla-stanza-lemma-hrv.json new file mode 100644 index 0000000..0d2e4ec --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-lemma-hrv.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for lemmatisation of standard Croatian 2.1", + "URL": "http://hdl.handle.net/11356/1829", + "Family": "Language Models", + "Description": "The model for lemmatisation of standard Croatian was built with the CLASSLA-Stanza tool by training on the hr500k training corpus and using the hrLex inflectional lexicon. The estimated F1 of the lemma annotations is ~98.02.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["hrv"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["lemmatisation"], + "Infrastructure": "CLARIN", + "Group": "Lemmatisation", + "Access": { + "Download": "http://hdl.handle.net/11356/1829" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-lemma-mkd.json b/lexical-resources/language-models/classla-stanza-lemma-mkd.json new file mode 100644 index 0000000..69de92e --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-lemma-mkd.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for lemmatisation of standard Macedonian 2.1", + "URL": "http://hdl.handle.net/11356/1848", + "Family": "Language Models", + "Description": "The model for lemmatisation of standard Macedonian was built with the CLASSLA-Stanza tool by training on the 1984 training corpus expanded with the Macedonian SETimes corpus (to be published). The estimated F1 of the lemma annotations is ~98.81.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["mkd"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["lemmatisation"], + "Infrastructure": "CLARIN", + "Group": "Lemmatisation", + "Access": { + "Download": "http://hdl.handle.net/11356/1848" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-lemma-non-std-hrv.json b/lexical-resources/language-models/classla-stanza-lemma-non-std-hrv.json new file mode 100644 index 0000000..43af036 --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-lemma-non-std-hrv.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for lemmatisation of non-standard Croatian 2.1", + "URL": "http://hdl.handle.net/11356/1827", + "Family": "Language Models", + "Description": "The model for lemmatisation of non-standard Croatian was built with the CLASSLA-Stanza tool by training on the hr500k training corpus and the ReLDI-NormTagNER-hr corpus, using the hrLex inflectional lexicon. These corpora were additionally augmented for handling missing diacritics by repeating parts of the corpora with diacritics removed. 
The estimated F1 of the lemma annotations is ~94.23.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["Croatian (non-standard)"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["lemmatisation"], + "Infrastructure": "CLARIN", + "Group": "Lemmatisation", + "Access": { + "Download": "http://hdl.handle.net/11356/1827" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-lemma-non-std-slv.json b/lexical-resources/language-models/classla-stanza-lemma-non-std-slv.json new file mode 100644 index 0000000..8ff9c78 --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-lemma-non-std-slv.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for lemmatisation of non-standard Slovenian 2.1", + "URL": "http://hdl.handle.net/11356/1784", + "Family": "Language Models", + "Description": "The model for lemmatisation of non-standard Slovenian was built with the CLASSLA-Stanza tool by training on the SUK training corpus and on the Janes-Tag corpus using the CLARIN.SI-embed.sl word embeddings expanded with the MaCoCu-sl Slovene web corpus. These corpora were additionally augmented for handling missing diacritics by repeating parts of the corpora with diacritics removed. The estimated F1 of the lemma annotations is ~91.45.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["Slovenian (non-standard)"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["lemmatisation"], + "Infrastructure": "CLARIN", + "Group": "Lemmatisation", + "Access": { + "Download": "http://hdl.handle.net/11356/1784" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-lemma-non-std-srp.json b/lexical-resources/language-models/classla-stanza-lemma-non-std-srp.json new file mode 100644 index 0000000..2d57356 --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-lemma-non-std-srp.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for lemmatisation of non-standard Serbian 2.1", + "URL": "http://hdl.handle.net/11356/1828", + "Family": "Language Models", + "Description": "The model for lemmatisation of non-standard Serbian was built with the CLASSLA-Stanza tool by training on the SETimes.SR training corpus combined with the Serbian non-standard training corpus ReLDI-NormTagNER-sr and using the srLex inflectional lexicon. These corpora were additionally augmented for handling missing diacritics by repeating parts of the corpora with diacritics removed. 
The estimated F1 of the lemma annotations is ~94.92.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["Serbian (non-standard)"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["lemmatisation"], + "Infrastructure": "CLARIN", + "Group": "Lemmatisation", + "Access": { + "Download": "http://hdl.handle.net/11356/1828" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-lemma-srp.json b/lexical-resources/language-models/classla-stanza-lemma-srp.json new file mode 100644 index 0000000..dc68abe --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-lemma-srp.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for lemmatisation of standard Serbian 2.1", + "URL": "http://hdl.handle.net/11356/1830", + "Family": "Language Models", + "Description": "The model for lemmatisation of standard Serbian was built with the CLASSLA-Stanza tool by training on the SETimes.SR training corpus combined with the Serbian non-standard training corpus ReLDI-NormTagNER-sr and using the srLex inflectional lexicon. The estimated F1 of the lemma annotations is ~98.02.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["srp"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["lemmatisation"], + "Infrastructure": "CLARIN", + "Group": "Lemmatisation", + "Access": { + "Download": "http://hdl.handle.net/11356/1830" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-mkd.json b/lexical-resources/language-models/classla-stanza-mkd.json new file mode 100644 index 0000000..8cc18f2 --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-mkd.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for morphosyntactic annotation of standard Macedonian 2.1", + "URL": "http://hdl.handle.net/11356/1847", + "Family": "Language Models", + "Description": "The model for morphosyntactic annotation of standard Macedonian was built with the CLASSLA-Stanza tool by training on the 1984 training corpus expanded with the Macedonian SETimes corpus (to be published) and using the Macedonian CLARIN.SI word embeddings. The model produces simultaneously UPOS, FEATS and XPOS (MULTEXT-East) labels. The estimated F1 of the XPOS annotations is ~97.14.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["mkd"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["morphosyntax"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "http://hdl.handle.net/11356/1847" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-non-std-hrv.json b/lexical-resources/language-models/classla-stanza-non-std-hrv.json new file mode 100644 index 0000000..7cb4410 --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-non-std-hrv.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for morphosyntactic annotation of non-standard Croatian 2.1", + "URL": "http://hdl.handle.net/11356/1826", + "Family": "Language Models", + "Description": "The model for morphosyntactic annotation of non-standard Croatian was built with the CLASSLA-Stanza tool by training on the hr500k training corpus and the ReLDI-NormTagNER-hr corpus, using the CLARIN.SI-embed.hr word embeddings. 
These corpora were additionally augmented for handling missing diacritics by repeating parts of the corpora with diacritics removed. The model produces simultaneously UPOS, FEATS and XPOS (MULTEXT-East) labels. The estimated F1 of the XPOS annotations is ~92.49.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["Croatian (non-standard)"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["morphosyntax"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "http://hdl.handle.net/11356/1826" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-non-std-slv.json b/lexical-resources/language-models/classla-stanza-non-std-slv.json new file mode 100644 index 0000000..18fbbd2 --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-non-std-slv.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for morphosyntactic annotation of non-standard Slovenian 2.1", + "URL": "http://hdl.handle.net/11356/1786", + "Family": "Language Models", + "Description": "The model for morphosyntactic annotation of non-standard Slovenian was built with the CLASSLA-Stanza tool by training on the SUK training corpus and on the Janes-Tag corpus using the CLARIN.SI-embed.sl word embeddings expanded with the MaCoCu-sl Slovene web corpus. These corpora were additionally augmented for handling missing diacritics by repeating parts of the corpora with diacritics removed. The model produces simultaneously UPOS, FEATS and XPOS (MULTEXT-East) labels. The estimated F1 of the XPOS annotations is ~92.17.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["Slovenian (non-standard)"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["morphosyntax"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "http://hdl.handle.net/11356/1786" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-non-std-srp.json b/lexical-resources/language-models/classla-stanza-non-std-srp.json new file mode 100644 index 0000000..f14f66d --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-non-std-srp.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for morphosyntactic annotation of non-standard Serbian 2.1", + "URL": "http://hdl.handle.net/11356/1825", + "Family": "Language Models", + "Description": "The model for morphosyntactic annotation of non-standard Serbian was built with the CLASSLA-Stanza tool by training on the SETimes.SR training corpus combined with the Serbian non-standard training corpus ReLDI-NormTagNER-sr and the hr500k training corpus and using the CLARIN.SI-embed.sr word embeddings. These corpora were additionally augmented for handling missing diacritics by repeating parts of the corpora with diacritics removed. The model produces simultaneously UPOS, FEATS and XPOS (MULTEXT-East) labels. 
The estimated F1 of the XPOS annotations is ~92.64.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["Serbian (non-standard)"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["morphosyntax"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "http://hdl.handle.net/11356/1825" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-sem-roles-slv.json b/lexical-resources/language-models/classla-stanza-sem-roles-slv.json new file mode 100644 index 0000000..6681740 --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-sem-roles-slv.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for semantic role labeling of standard Slovenian 2.0", + "URL": "http://hdl.handle.net/11356/1770", + "Family": "Language Models", + "Description": "The model for semantic role labeling of standard Slovenian was built with the CLASSLA-Stanza tool by training on the SUK training corpus and using the CLARIN.SI-embed.sl word embeddings extended with the MaCoCu-sl Slovene web corpus. The estimated F1 of the semantic role annotations is ~76.24.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["slv"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["semantic role labeling"], + "Infrastructure": "CLARIN", + "Group": "Other", + "Access": { + "Download": "http://hdl.handle.net/11356/1770" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-slv.json b/lexical-resources/language-models/classla-stanza-slv.json new file mode 100644 index 0000000..08c318c --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-slv.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for morphosyntactic annotation of standard Slovenian 2.0", + "URL": "http://hdl.handle.net/11356/1767", + "Family": "Language Models", + "Description": "The model for morphosyntactic annotation of standard Slovenian was built with the CLASSLA-Stanza tool by training on the SUK training corpus and using the CLARIN.SI-embed.sl word embeddings expanded with the MaCoCu-sl Slovene web corpus. The model produces simultaneously UPOS, FEATS and XPOS (MULTEXT-East) labels. The estimated F1 of the XPOS annotations is ~98.27.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["slv"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["morphosyntax"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "http://hdl.handle.net/11356/1767" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-srp.json b/lexical-resources/language-models/classla-stanza-srp.json new file mode 100644 index 0000000..f976e49 --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-srp.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for morphosyntactic annotation of standard Serbian 2.1", + "URL": "http://hdl.handle.net/11356/1831", + "Family": "Language Models", + "Description": "The model for morphosyntactic annotation of standard Serbian was built with the CLASSLA-Stanza tool by training on the SETimes.SR training corpus combined with the Croatian hr500k training dataset to ensure sufficient representation of certain labels, and using the CLARIN.SI-embed.sr word embeddings. The model produces simultaneously UPOS, FEATS and XPOS (MULTEXT-East) labels. 
The estimated F1 of the XPOS annotations is ~96.19.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["srp"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["morphosyntax"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "http://hdl.handle.net/11356/1831" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-ud-dep-bul.json b/lexical-resources/language-models/classla-stanza-ud-dep-bul.json new file mode 100644 index 0000000..3c9c3a7 --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-ud-dep-bul.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for UD dependency parsing of standard Bulgarian 2.1", + "URL": "http://hdl.handle.net/11356/1851", + "Family": "Language Models", + "Description": "The model for UD dependency parsing of standard Bulgarian was built with the CLASSLA-Stanza tool by training on the UD-parsed portion of the BulTreeBank training corpus and using the CLARIN.SI-embed.bg word embeddings. The estimated LAS of the parser is ~91.18.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["bul"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["syntactic parsing"], + "Infrastructure": "CLARIN", + "Group": "Syntactic Parsing", + "Access": { + "Download": "http://hdl.handle.net/11356/1851" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-ud-dep-hrv.json b/lexical-resources/language-models/classla-stanza-ud-dep-hrv.json new file mode 100644 index 0000000..50cb01c --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-ud-dep-hrv.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for UD dependency parsing of standard Croatian 2.1", + "URL": "http://hdl.handle.net/11356/1836", + "Family": "Language Models", + "Description": "The model for UD dependency parsing of standard Croatian was built with the CLASSLA-Stanza tool by training on the UD-parsed portion of the hr500k training corpus and using the CLARIN.SI-embed.hr word embeddings. The estimated LAS of the parser is ~87.46.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["hrv"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["syntactic parsing"], + "Infrastructure": "CLARIN", + "Group": "Syntactic Parsing", + "Access": { + "Download": "http://hdl.handle.net/11356/1836" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-ud-dep-slv.json b/lexical-resources/language-models/classla-stanza-ud-dep-slv.json new file mode 100644 index 0000000..18aa345 --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-ud-dep-slv.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for UD dependency parsing of standard Slovenian 2.0", + "URL": "http://hdl.handle.net/11356/1769", + "Family": "Language Models", + "Description": "The model for UD dependency parsing of standard Slovenian was built with the CLASSLA-Stanza tool by training on the SUK training corpus and using the CLARIN.SI-embed.sl word embeddings expanded with the MaCoCu-sl Slovene web corpus. 
The estimated LAS of the parser is ~91.11.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["slv"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["syntactic parsing"], + "Infrastructure": "CLARIN", + "Group": "Syntactic Parsing", + "Access": { + "Download": "http://hdl.handle.net/11356/1769" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/classla-stanza-ud-dep-srp.json b/lexical-resources/language-models/classla-stanza-ud-dep-srp.json new file mode 100644 index 0000000..94e43f4 --- /dev/null +++ b/lexical-resources/language-models/classla-stanza-ud-dep-srp.json @@ -0,0 +1,16 @@ +{ + "Name": "The CLASSLA-Stanza model for UD dependency parsing of standard Serbian 2.1", + "URL": "http://hdl.handle.net/11356/1835", + "Family": "Language Models", + "Description": "The model for UD dependency parsing of standard Serbian was built with the CLASSLA-Stanza tool by training on the SETimes.SR training corpus and using the CLARIN.SI-embed.sr word embeddings. The estimated LAS of the parser is ~89.83.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["srp"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["syntactic parsing"], + "Infrastructure": "CLARIN", + "Group": "Syntactic Parsing", + "Access": { + "Download": "http://hdl.handle.net/11356/1835" + }, + "Publication": "Ljubešić and Dobrovoljc (2019)" +} diff --git a/lexical-resources/language-models/cnec-nametag.json b/lexical-resources/language-models/cnec-nametag.json new file mode 100644 index 0000000..571dca7 --- /dev/null +++ b/lexical-resources/language-models/cnec-nametag.json @@ -0,0 +1,16 @@ +{ + "Name": "Czech Models (CNEC) for NameTag", + "URL": "http://hdl.handle.net/11858/00-097C-0000-0023-7D42-8", + "Family": "Language Models", + "Description": "These are models for the named entity recognizer NameTag.\nThe models are available for download from the LINDAT repository.", + "Language": ["ces"], + "Licence": "CC BY-NC-SA 3.0", + "Size": [], + "Annotation": ["named entity recognition"], + "Infrastructure": "CLARIN", + "Group": "Named Entity Recognition", + "Access": { + "Download": "http://hdl.handle.net/11858/00-097C-0000-0023-7D42-8" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/commonsense-reason.json b/lexical-resources/language-models/commonsense-reason.json new file mode 100644 index 0000000..8504205 --- /dev/null +++ b/lexical-resources/language-models/commonsense-reason.json @@ -0,0 +1,16 @@ +{ + "Name": "Model weights for a study of commonsense reasoning", + "URL": "https://hdl.handle.net/21.11129/0000-000F-4869-B", + "Family": "Language Models", + "Description": "This resource contains model weights for five Transformer-based models: RoBERTa, GPT-2, T5, BART and COMET. These models were implemented using Hugging Face, and fine-tuned on the following four commonsense reasoning tasks: Argument Reasoning Comprehension Task (ARCT), AI2 Reasoning Challenge (ARC), Physical Interaction Question Answering (PIQA) and CommonsenseQA (CSQA).\nThe models are available for download from the PORTULAN repository.", + "Language": ["eng"], + "Licence": "MIT", + "Size": [], + "Annotation": ["commonsense reasoning"], + "Infrastructure": "CLARIN", + "Group": "Other", + "Access": { + "Download": "https://hdl.handle.net/21.11129/0000-000F-4869-B" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/conll-2017-shared.json 
b/lexical-resources/language-models/conll-2017-shared.json new file mode 100644 index 0000000..b834207 --- /dev/null +++ b/lexical-resources/language-models/conll-2017-shared.json @@ -0,0 +1,16 @@ +{ + "Name": "CoNLL 2017 Shared Task - UDPipe Baseline Models and Supplementary Materials", + "URL": "http://hdl.handle.net/11234/1-1990", + "Family": "Language Models", + "Description": "These are models for the dependency parser UDPipe, developed as part of the CoNLL 2017 Shared Task in UD Parsing.\nThe models are available for download from the LINDAT repository.", + "Language": ["Multiple languages"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["syntactic parsing"], + "Infrastructure": "CLARIN", + "Group": "Syntactic Parsing", + "Access": { + "Download": "http://hdl.handle.net/11234/1-1990" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/conll-2018-shared.json b/lexical-resources/language-models/conll-2018-shared.json new file mode 100644 index 0000000..a6e3749 --- /dev/null +++ b/lexical-resources/language-models/conll-2018-shared.json @@ -0,0 +1,16 @@ +{ + "Name": "CoNLL 2018 Shared Task - UDPipe Baseline Models and Supplementary Materials", + "URL": "http://hdl.handle.net/11234/1-2859", + "Family": "Language Models", + "Description": "This is a baseline model for UDPipe (version 1.2 and up), created for the CoNLL 2018 Shared Task in UD Parsing. The models were trained using a custom data split for treebanks where no development data is provided.\nThe model is available for download from the LINDAT repository.", + "Language": ["Multiple languages"], + "Licence": "License Universal Dependencies v2.2", + "Size": [], + "Annotation": ["syntactic parsing"], + "Infrastructure": "CLARIN", + "Group": "Syntactic Parsing", + "Access": { + "Download": "http://hdl.handle.net/11234/1-2859" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/conll-nametag.json b/lexical-resources/language-models/conll-nametag.json new file mode 100644 index 0000000..2255042 --- /dev/null +++ b/lexical-resources/language-models/conll-nametag.json @@ -0,0 +1,16 @@ +{ + "Name": "English Model (CoNLL-2003) for NameTag", + "URL": "http://hdl.handle.net/11234/1-3118", + "Family": "Language Models", + "Description": "This is an English model for NameTag, a named entity recognition tool. The model is trained on CoNLL-2003 training data and recognizes PER, ORG, LOC and MISC named entities. It achieves an F-measure 84.73 on the CoNLL-2003 test data.\nThe model is available for download from the LINDAT repository.", + "Language": ["eng"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["named entity recognition"], + "Infrastructure": "CLARIN", + "Group": "Named Entity Recognition", + "Access": { + "Download": "http://hdl.handle.net/11234/1-3118" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/crosloengual-bert.json b/lexical-resources/language-models/crosloengual-bert.json new file mode 100644 index 0000000..1e10455 --- /dev/null +++ b/lexical-resources/language-models/crosloengual-bert.json @@ -0,0 +1,16 @@ +{ + "Name": "CroSloEngual BERT 1.1", + "URL": "http://hdl.handle.net/11356/1330", + "Family": "Language Models", + "Description": "Trilingual BERT (Bidirectional Encoder Representations from Transformers) model, trained on Croatian, Slovenian, and English data. 
It is a state-of-the-art model that represents words/tokens as contextually dependent word embeddings and is used for various NLP classification tasks by fine-tuning the model end-to-end. CroSloEngual BERT is distributed as neural network weights and configuration files in PyTorch format (i.e. to be used with the PyTorch library).\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["hrv", "eng", "slv"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["word embeddings"], + "Infrastructure": "CLARIN", + "Group": "Contextual Word Embeddings", + "Access": { + "Download": "http://hdl.handle.net/11356/1330" + }, + "Publication": "Ulčar and Robnik-Šikonja (2020)" +} diff --git a/lexical-resources/language-models/cubbitt-en-cs.json b/lexical-resources/language-models/cubbitt-en-cs.json new file mode 100644 index 0000000..d962ea0 --- /dev/null +++ b/lexical-resources/language-models/cubbitt-en-cs.json @@ -0,0 +1,16 @@ +{ + "Name": "CUBBITT Translation Models (en-cs) (v1.0)", + "URL": "http://hdl.handle.net/11234/1-3733", + "Family": "Language Models", + "Description": "These English-Czech translation models are used by the LINDAT translation service.\nThe models are available for download from the LINDAT repository.", + "Language": ["ces", "eng"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["machine translation"], + "Infrastructure": "CLARIN", + "Group": "Machine Translation", + "Access": { + "Download": "http://hdl.handle.net/11234/1-3733" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/cubbitt-en-fr.json b/lexical-resources/language-models/cubbitt-en-fr.json new file mode 100644 index 0000000..0f24dda --- /dev/null +++ b/lexical-resources/language-models/cubbitt-en-fr.json @@ -0,0 +1,16 @@ +{ + "Name": "CUBBITT Translation Models (en-fr) (v1.0)", + "URL": "http://hdl.handle.net/11234/1-3743", + "Family": "Language Models", + "Description": "These are CUBBITT English-French translation models available in the LINDAT translation service.\nThe models are available for download from the LINDAT repository.", + "Language": ["eng", "fra"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["machine translation"], + "Infrastructure": "CLARIN", + "Group": "Machine Translation", + "Access": { + "Download": "http://hdl.handle.net/11234/1-3743" + }, + "Publication": "Popel et al. (2020)" +} diff --git a/lexical-resources/language-models/cubbitt-en-pl.json b/lexical-resources/language-models/cubbitt-en-pl.json new file mode 100644 index 0000000..32283f1 --- /dev/null +++ b/lexical-resources/language-models/cubbitt-en-pl.json @@ -0,0 +1,16 @@ +{ + "Name": "CUBBITT Translation Models (en-pl) (v1.0)", + "URL": "http://hdl.handle.net/11234/1-3742", + "Family": "Language Models", + "Description": "These are CUBBITT English-Polish translation models available in the LINDAT translation service.\nThe models are available for download from the LINDAT repository.", + "Language": ["eng", "pol"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["machine translation"], + "Infrastructure": "CLARIN", + "Group": "Machine Translation", + "Access": { + "Download": "http://hdl.handle.net/11234/1-3742" + }, + "Publication": "Popel et al. 
(2020)" +} diff --git a/lexical-resources/language-models/czech-neural-monkeys.json b/lexical-resources/language-models/czech-neural-monkeys.json new file mode 100644 index 0000000..37f42f9 --- /dev/null +++ b/lexical-resources/language-models/czech-neural-monkeys.json @@ -0,0 +1,16 @@ +{ + "Name": "Czech image captioning, machine translation, sentiment analysis and summarization (Neural Monkey models)", + "URL": "http://hdl.handle.net/11234/1-3145", + "Family": "Language Models", + "Description": "These models are for the Neural Monkey toolkit for Czech and English, solving four NLP tasks: machine translation, image captioning, sentiment analysis, and summarization. The models are trained on standard datasets and achieve state-of-the-art or near state-of-the-art performance in the tasks. The same models can also be invoked via an online demo.\nThis entry also includes models for automatic news summarization for Czech and English. The Czech models were trained using the SumeCzech dataset, while the English models were trained using the CNN-Daily Mail corpus, using the standard recurrent sequence-to-sequence architecture.\nThe models are available for download from the LINDAT repository.", + "Language": ["ces", "eng"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["machine translation"], + "Infrastructure": "CLARIN", + "Group": "Machine Translation", + "Access": { + "Download": "http://hdl.handle.net/11234/1-3145" + }, + "Publication": "Libovicky et al. (2018)" +} diff --git a/lexical-resources/language-models/dep-parsing-pol.json b/lexical-resources/language-models/dep-parsing-pol.json new file mode 100644 index 0000000..df8e56d --- /dev/null +++ b/lexical-resources/language-models/dep-parsing-pol.json @@ -0,0 +1,16 @@ +{ + "Name": "Dependency parsing models for Polish", + "URL": "http://hdl.handle.net/11321/552", + "Family": "Language Models", + "Description": "These models are trained on the 3.5 version of the Polish Dependency Treebank with the publicly available parsing systems: MaltParser, MateParser, and UDPipe.\nThe models are available for download from the CLARIN-PL repository.", + "Language": ["pol"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["syntactic parsing"], + "Infrastructure": "CLARIN", + "Group": "Syntactic Parsing", + "Access": { + "Download": "http://zil.ipipan.waw.pl/PDB/PDBparser" + }, + "Publication": "Wroblewska and Rybak (2019)" +} diff --git a/lexical-resources/language-models/dep-parsing-stanza.json b/lexical-resources/language-models/dep-parsing-stanza.json new file mode 100644 index 0000000..85e93ba --- /dev/null +++ b/lexical-resources/language-models/dep-parsing-stanza.json @@ -0,0 +1,16 @@ +{ + "Name": "Dependency parsing model: Stanza", + "URL": "https://spraakbanken.gu.se/index.php/en/resources/stanzasynt", + "Family": "Language Models", + "Description": "", + "Language": ["swe"], + "Licence": "CC BY 4.0", + "Size": [], + "Annotation": ["syntactic parsing"], + "Infrastructure": "CLARIN", + "Group": "Syntactic Parsing", + "Access": { + "Download": "https://spraakbanken.gu.se/index.php/en/resources/stanzasynt" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/elmo-embeddings.json b/lexical-resources/language-models/elmo-embeddings.json new file mode 100644 index 0000000..32e269d --- /dev/null +++ b/lexical-resources/language-models/elmo-embeddings.json @@ -0,0 +1,16 @@ +{ + "Name": "ELMo embeddings models for seven languages", + "URL": "http://hdl.handle.net/11356/1277", + "Family": "Language 
Models", + "Description": "This model is used to produce contextual word embeddings. It is trained on large monolingual corpora for 7 languages. Each language's model was trained for approximately 10 epochs. Corpora sizes used in training range from over 270 M tokens in Latvian to almost 2 B tokens in Croatian. About 1 million most common tokens were provided as vocabulary during the training for each language model. The model can also infer OOV words, since the neural network input is on the character level.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["hrv", "est", "fin", "lav", "lit", "slv", "swe"], + "Licence": "Apache License 2.0", + "Size": [], + "Annotation": ["word embeddings"], + "Infrastructure": "CLARIN", + "Group": "Contextual Word Embeddings", + "Access": { + "Download": "http://hdl.handle.net/11356/1277" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/embeddings-eng-wiki.json b/lexical-resources/language-models/embeddings-eng-wiki.json new file mode 100644 index 0000000..93e58e0 --- /dev/null +++ b/lexical-resources/language-models/embeddings-eng-wiki.json @@ -0,0 +1,16 @@ +{ + "Name": "Word Embeddings trained on English Wikipedia", + "URL": "https://spraakbanken.gu.se/en/resources/wikipedia-embeddings", + "Family": "Language Models", + "Description": "This is a set of contextual word embeddings.\nThe models are available for download from the Swedish Language Bank.", + "Language": ["swe"], + "Licence": "CC BY 4.0", + "Size": [], + "Annotation": ["word embeddings"], + "Infrastructure": "CLARIN", + "Group": "Contextual Word Embeddings", + "Access": { + "Download": "https://spraakbanken.gu.se/en/resources/wikipedia-embeddings" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/eng-mod-morphodita.json b/lexical-resources/language-models/eng-mod-morphodita.json new file mode 100644 index 0000000..1d76494 --- /dev/null +++ b/lexical-resources/language-models/eng-mod-morphodita.json @@ -0,0 +1,16 @@ +{ + "Name": "English Models (Morphium + WSJ) for MorphoDiTa", + "URL": "http://hdl.handle.net/11858/00-097C-0000-0023-68D9-0", + "Family": "Language Models", + "Description": "These models are for MorphoDiTa, which performs morphological analysis, morphological generation and part-of-speech tagging (see also the PoS-taggers and lemmatizers Resource Family).\nThe morphological dictionary is created from Morphium and SCOWL (Spell Checker Oriented Word Lists), the PoS tagger is trained on the Wall Street Journal.", + "Language": ["eng"], + "Licence": "CC BY-NC-SA 3.0", + "Size": [], + "Annotation": ["morphosyntax"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "http://hdl.handle.net/11858/00-097C-0000-0023-68D9-0" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/face-domain-specific.json b/lexical-resources/language-models/face-domain-specific.json new file mode 100644 index 0000000..3c38e20 --- /dev/null +++ b/lexical-resources/language-models/face-domain-specific.json @@ -0,0 +1,16 @@ +{ + "Name": "Face-domain-specific automatic speech recognition models", + "URL": "http://hdl.handle.net/11356/1749", + "Family": "Language Models", + "Description": "This model contains all the files required to implement face-domain-specific automatic speech recognition (ASR) applications using the Kaldi ASR toolkit, including the acoustic model, language model, and other relevant files. 
It also includes all the scripts and configuration files needed to use these models for implementing face-domain-specific automatic speech recognition.\nThe acoustic model was trained using the relevant Kaldi ASR tools and the Artur speech corpus (audio,transcriptions). The language model was trained using the domain-specific text data involving face descriptions obtained by translating the Face2Text English dataset into the Slovenian language. These models, combined with other necessary files like the HCLG.fst and decoding scripts, enable the implementation of face-domain-specific ASR applications.\nThis resource is available for download from the CLARIN.SI repository.", + "Language": ["slv"], + "Licence": "Apache License 2.0", + "Size": [], + "Annotation": ["face-domain-specific automatic speech recognition"], + "Infrastructure": "CLARIN", + "Group": "Other", + "Access": { + "Download": "http://hdl.handle.net/11356/1749" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/finbert.json b/lexical-resources/language-models/finbert.json new file mode 100644 index 0000000..a278eef --- /dev/null +++ b/lexical-resources/language-models/finbert.json @@ -0,0 +1,16 @@ +{ + "Name": "FinBERT", + "URL": "http://urn.fi/urn:nbn:fi:lb-202004212", + "Family": "Language Models", + "Description": "This BERT model can be fine-tuned to achieve state-of-the-art results for various Finnish natural language processing tasks.\nThe model is available for download from the Language Bank of Finland.", + "Language": ["fin"], + "Licence": "CC BY 4.0", + "Size": [], + "Annotation": ["morphosyntax"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "http://urn.fi/urn:nbn:fi:lb-202004212" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/frenk-mmc-rtv.json b/lexical-resources/language-models/frenk-mmc-rtv.json new file mode 100644 index 0000000..02cd118 --- /dev/null +++ b/lexical-resources/language-models/frenk-mmc-rtv.json @@ -0,0 +1,16 @@ +{ + "Name": "Dataset and baseline model of moderated content FRENK-MMC-RTV 1.0", + "URL": "http://hdl.handle.net/11356/1201", + "Family": "Language Models", + "Description": "FRENK-MMC-RTV is a dataset of moderated newspaper comments from the website rtvslo.si with metadata on the time of publishing, user identifier, thread identifier and whether the comment was deleted by the moderators or not. The full text of each comment is encrypted via a character-replacement method so that the comments are not readable by humans. Basic punctuation is not encrypted in order to enable tokenization. The main use of this dataset are experiments on automating comment moderation. For real-world usage, a fastText classification model trained on non-encrypted data is made available as well.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["slv"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["Baseline"], + "Infrastructure": "CLARIN", + "Group": "Baseline", + "Access": { + "Download": "http://hdl.handle.net/11356/1201" + }, + "Publication": "Ljubešić et al. 
(2018)" +} diff --git a/lexical-resources/language-models/g2p-icelandic.json b/lexical-resources/language-models/g2p-icelandic.json new file mode 100644 index 0000000..cdda283 --- /dev/null +++ b/lexical-resources/language-models/g2p-icelandic.json @@ -0,0 +1,16 @@ +{ + "Name": "Models for automatic g2p for Icelandic (20.10)", + "URL": "http://hdl.handle.net/20.500.12537/84", + "Family": "Language Models", + "Description": "These are grapheme-to-phoneme models for Icelandic, trained on an encoder-decoder LSTM neural network. The models are delivered with scripts for automatic transcription of Icelandic in the standard pronunciation variation, in the northern variation, north-east variation, and the south variation. To run the scripts the user needs to install Fairseq.", + "Language": ["isl"], + "Licence": "Apache License 2.0", + "Size": [], + "Annotation": ["phonemic transcription"], + "Infrastructure": "CLARIN", + "Group": "Other", + "Access": { + "Download": "http://hdl.handle.net/20.500.12537/84" + }, + "Publication": "Gorman et al. (2020)" +} diff --git a/lexical-resources/language-models/gervasio-pt-br-base.json b/lexical-resources/language-models/gervasio-pt-br-base.json new file mode 100644 index 0000000..80ccccc --- /dev/null +++ b/lexical-resources/language-models/gervasio-pt-br-base.json @@ -0,0 +1,16 @@ +{ + "Name": "Gervásio PT-BR base", + "URL": "https://hdl.handle.net/21.11129/0000-000F-FF48-2 ", + "Family": "Language Models", + "Description": "This model, which is for Portuguese spoken in Brazil, is a decoder of the GPT family that is based on the neural architecture Transformer and developed over the Pythia model.\nThe model is available for download from Hugging Face.", + "Language": ["por"], + "Licence": "MIT", + "Size": [], + "Annotation": ["Baseline"], + "Infrastructure": "CLARIN", + "Group": "Baseline", + "Access": { + "Download": "https://hdl.handle.net/21.11129/0000-000F-FF48-2 " + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/gervasio-pt-pt-base.json b/lexical-resources/language-models/gervasio-pt-pt-base.json new file mode 100644 index 0000000..1cced0c --- /dev/null +++ b/lexical-resources/language-models/gervasio-pt-pt-base.json @@ -0,0 +1,16 @@ +{ + "Name": "Gervásio PT-PT base", + "URL": "https://hdl.handle.net/21.11129/0000-000F-FF47-3", + "Family": "Language Models", + "Description": "This model, which is for European Portuguese, is a decoder of the GPT family that is based on the neural architecture Transformer and developed over the Pythia model.\nThe model is available for download from Hugging Face.", + "Language": ["por"], + "Licence": "MIT", + "Size": [], + "Annotation": ["Baseline"], + "Infrastructure": "CLARIN", + "Group": "Baseline", + "Access": { + "Download": "https://huggingface.co/PORTULAN/gervasio-ptpt-base" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/greynir-mbart.json b/lexical-resources/language-models/greynir-mbart.json new file mode 100644 index 0000000..f44ef43 --- /dev/null +++ b/lexical-resources/language-models/greynir-mbart.json @@ -0,0 +1,16 @@ +{ + "Name": "GreynirTranslate - mBART25 NMT (with layer drop) models for Translations between Icelandic and English (1.0)", + "URL": "http://hdl.handle.net/20.500.12537/128", + "Family": "Language Models", + "Description": "These are a variant of GreynirTranslate - mBART25 NMT models for Translations between Icelandic and English (1.0), trained with a 40% layer drop. 
They are suitable for inference using every other layer for optimized inference speed with lower translation performance.\nThese models are available for download from the repository of CLARIN-IS.", + "Language": ["isl", "eng"], + "Licence": "CC BY 4.0", + "Size": [], + "Annotation": ["machine translation"], + "Infrastructure": "CLARIN", + "Group": "Machine Translation", + "Access": { + "Download": "http://hdl.handle.net/20.500.12537/128" + }, + "Publication": "Simonarson et al. (2021)" +} diff --git a/lexical-resources/language-models/greynir-t2t.json b/lexical-resources/language-models/greynir-t2t.json new file mode 100644 index 0000000..c5572f5 --- /dev/null +++ b/lexical-resources/language-models/greynir-t2t.json @@ -0,0 +1,16 @@ +{ + "Name": "GreynirT2T Serving - En--Is NMT Inference and Pre-trained Models (1.0)", + "URL": "http://hdl.handle.net/20.500.12537/72", + "Family": "Language Models", + "Description": "This CLARIN-IS repository entry includes code and models required to run the GreynirT2T Transformer NMT system for translation between English and Icelandic.\nThe models along with the code are available for download from the CLARIN-IS repository.", + "Language": ["eng", "isl"], + "Licence": "The MIT License", + "Size": [], + "Annotation": ["machine translation"], + "Infrastructure": "CLARIN", + "Group": "Machine Translation", + "Access": { + "Download": "http://hdl.handle.net/20.500.12537/72" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/korektor-czech.json b/lexical-resources/language-models/korektor-czech.json new file mode 100644 index 0000000..f51e7b8 --- /dev/null +++ b/lexical-resources/language-models/korektor-czech.json @@ -0,0 +1,16 @@ +{ + "Name": "Czech Models for Korektor 2", + "URL": "http://hdl.handle.net/11234/1-1460", + "Family": "Language Models", + "Description": "These models are for the statistical spellchecker Korektor 2. 
The models can either perform spellchecking and grammar-checking, or only generate diacritical marks.\nThe models are available for download from the LINDAT repository.", + "Language": ["ces"], + "Licence": "CC BY-NC-SA 3.0", + "Size": [], + "Annotation": ["normalization"], + "Infrastructure": "CLARIN", + "Group": "Other", + "Access": { + "Download": "http://hdl.handle.net/11234/1-1460" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/lemma-stanza.json b/lexical-resources/language-models/lemma-stanza.json new file mode 100644 index 0000000..c03eea2 --- /dev/null +++ b/lexical-resources/language-models/lemma-stanza.json @@ -0,0 +1,16 @@ +{ + "Name": "Lemmatization model: Stanza", + "URL": "https://spraakbanken.gu.se/index.php/en/resources/stanzalem", + "Family": "Language Models", + "Description": "This model enables lemmatisation of Swedish text following the SUC3 standard.\nThe models are available for download from the Swedish Language Bank.", + "Language": ["swe"], + "Licence": "CC BY 4.0", + "Size": [], + "Annotation": ["lemmatisation"], + "Infrastructure": "CLARIN", + "Group": "Lemmatisation", + "Access": { + "Download": "https://spraakbanken.gu.se/index.php/en/resources/stanzalem" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/liner-events.json b/lexical-resources/language-models/liner-events.json new file mode 100644 index 0000000..d15ff6c --- /dev/null +++ b/lexical-resources/language-models/liner-events.json @@ -0,0 +1,16 @@ +{ + "Name": "Liner2.5 model Events", + "URL": "http://hdl.handle.net/11321/301", + "Family": "Language Models", + "Description": "This is a model for the Liner2.5 tool for the recognition of event mentions.\nThe model is available for download from the CLARIN-PL repository.", + "Language": ["pol"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["event mentions"], + "Infrastructure": "CLARIN", + "Group": "Other", + "Access": { + "Download": "http://hdl.handle.net/11321/301" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/liner-ner-nkjp.json b/lexical-resources/language-models/liner-ner-nkjp.json new file mode 100644 index 0000000..5b388ac --- /dev/null +++ b/lexical-resources/language-models/liner-ner-nkjp.json @@ -0,0 +1,16 @@ +{ + "Name": "Liner2.6 model NER NKJP", + "URL": "http://hdl.handle.net/11321/598", + "Family": "Language Models", + "Description": "This is a Liner2 model for the recognition of named entities. The model was trained on the NKJP corpus and evaluated in the PolEval 2018 Task 2.\nThe model is available for download from the CLARIN-PL repository.", + "Language": ["pol"], + "Licence": "GNU GPL3", + "Size": [], + "Annotation": ["named entity recognition"], + "Infrastructure": "CLARIN", + "Group": "Named Entity Recognition", + "Access": { + "Download": "http://hdl.handle.net/11321/598" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/liner-ner.json b/lexical-resources/language-models/liner-ner.json new file mode 100644 index 0000000..b768c17 --- /dev/null +++ b/lexical-resources/language-models/liner-ner.json @@ -0,0 +1,16 @@ +{ + "Name": "Liner2.5 model NER", + "URL": "http://hdl.handle.net/11321/263", + "Family": "Language Models", + "Description": "This is a model for the Liner 2.5 tool. 
\nThe model is available for download from the CLARIN-PL repository.", + "Language": ["pol"], + "Licence": "GNU LGPL 3.0", + "Size": [], + "Annotation": ["named entity recognition"], + "Infrastructure": "CLARIN", + "Group": "Named Entity Recognition", + "Access": { + "Download": "http://hdl.handle.net/11321/263" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/liner-timex.json b/lexical-resources/language-models/liner-timex.json new file mode 100644 index 0000000..d63cbf3 --- /dev/null +++ b/lexical-resources/language-models/liner-timex.json @@ -0,0 +1,16 @@ +{ + "Name": "Liner2.5 model Timex", + "URL": "http://hdl.handle.net/11321/302", + "Family": "Language Models", + "Description": "This is a model for the Liner2.5 tool for the recognition and normalization of temporal expressions.\nThe model is available for download from the CLARIN-PL repository.", + "Language": ["pol"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["temporal expressions"], + "Infrastructure": "CLARIN", + "Group": "Other", + "Access": { + "Download": "http://hdl.handle.net/11321/302" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/liner.json b/lexical-resources/language-models/liner.json new file mode 100644 index 0000000..76d6946 --- /dev/null +++ b/lexical-resources/language-models/liner.json @@ -0,0 +1,16 @@ +{ + "Name": "Liner2.5 model Minos", + "URL": "http://hdl.handle.net/11321/292", + "Family": "Language Models", + "Description": "This is a model for the Liner2.5 tool for the recognition of verbs without explicit subjects.\nThe model is available for download from the CLARIN-PL repository.", + "Language": ["pol"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["morphosyntax"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "http://hdl.handle.net/11321/292" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/litlatbert.json b/lexical-resources/language-models/litlatbert.json new file mode 100644 index 0000000..f894798 --- /dev/null +++ b/lexical-resources/language-models/litlatbert.json @@ -0,0 +1,16 @@ +{ + "Name": "LitLat BERT", + "URL": "http://hdl.handle.net/20.500.11821/42", + "Family": "Language Models", + "Description": "", + "Language": ["lit", "lav", "eng"], + "Licence": "PUB CLARIN-LT", + "Size": [], + "Annotation": ["Baseline"], + "Infrastructure": "CLARIN", + "Group": "Baseline", + "Access": { + "Download": "http://hdl.handle.net/20.500.11821/42" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/lvbert.json b/lexical-resources/language-models/lvbert.json new file mode 100644 index 0000000..a7d09ef --- /dev/null +++ b/lexical-resources/language-models/lvbert.json @@ -0,0 +1,16 @@ +{ + "Name": "LVBERT - Latvian BERT", + "URL": "http://hdl.handle.net/20.500.12574/43", + "Family": "Language Models", + "Description": "This model is trained on the original implementation of BERT on the TensorFlow machine-learning platform with the whole-word masking and the next sentence prediction objectives. 
This uses the BERT configuration with 12 layers, 768 hidden units, 12 heads, 128 sequence length, 128 mini-batch size and a 32,000 token vocabulary.\nThe model is available for download from the CLARIN-LV repository.", + "Language": ["lav"], + "Licence": "GNU GPL3", + "Size": [], + "Annotation": ["Baseline"], + "Infrastructure": "CLARIN", + "Group": "Baseline", + "Access": { + "Download": "http://hdl.handle.net/20.500.12574/43" + }, + "Publication": "Znotinš and Barzdinš (2020)" +} diff --git a/lexical-resources/language-models/lx-dsemvectors.json b/lexical-resources/language-models/lx-dsemvectors.json new file mode 100644 index 0000000..76b96e8 --- /dev/null +++ b/lexical-resources/language-models/lx-dsemvectors.json @@ -0,0 +1,16 @@ +{ + "Name": "LX-DSemVectors", + "URL": "https://hdl.handle.net/21.11129/0000-000B-D38A-B", + "Family": "Language Models", + "Description": "This model represents tokens as contextual word embeddings for Portuguese. It was trained on a corpus of 2 billion tokens and achieved state-of-the-art results on multiple lexical semantic tasks.\nThe model is available for download from the PORTULAN repository.", + "Language": ["por"], + "Licence": "CC-BY", + "Size": [], + "Annotation": ["word embeddings"], + "Infrastructure": "CLARIN", + "Group": "Contextual Word Embeddings", + "Access": { + "Download": "https://hdl.handle.net/21.11129/0000-000B-D38A-B" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/mcsq-tm-en-de.json b/lexical-resources/language-models/mcsq-tm-en-de.json new file mode 100644 index 0000000..e7457c2 --- /dev/null +++ b/lexical-resources/language-models/mcsq-tm-en-de.json @@ -0,0 +1,16 @@ +{ + "Name": "MCSQ Translation Models (en-de) (v1.0)", + "URL": "http://hdl.handle.net/11234/1-4680", + "Family": "Language Models", + "Description": "These are English-German translation models available in the LINDAT translation service. The models are trained using the MCSQ social surveys dataset (available here).\nThe models are available for download from the LINDAT repository.", + "Language": ["eng", "deu"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["machine translation"], + "Infrastructure": "CLARIN", + "Group": "Machine Translation", + "Access": { + "Download": "http://hdl.handle.net/11234/1-4680" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/mcsq-tm-en-ru.json b/lexical-resources/language-models/mcsq-tm-en-ru.json new file mode 100644 index 0000000..4a5e41e --- /dev/null +++ b/lexical-resources/language-models/mcsq-tm-en-ru.json @@ -0,0 +1,16 @@ +{ + "Name": "MCSQ Translation Models (en-ru) (v1.0)", + "URL": "http://hdl.handle.net/11234/1-4681", + "Family": "Language Models", + "Description": "These are English-Russian translation models available in the LINDAT translation service. 
The models are trained using the MCSQ social surveys dataset (available here ).\nThe models are available for download from the LINDAT repository.", + "Language": ["eng", "rus"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["machine translation"], + "Infrastructure": "CLARIN", + "Group": "Machine Translation", + "Access": { + "Download": "http://hdl.handle.net/11234/1-4681" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/morflex-cz-161115.json b/lexical-resources/language-models/morflex-cz-161115.json new file mode 100644 index 0000000..da7c653 --- /dev/null +++ b/lexical-resources/language-models/morflex-cz-161115.json @@ -0,0 +1,16 @@ +{ + "Name": "Czech Models (MorfFlex CZ 161115 + PDT 3.0) for MorphoDiTa 161115", + "URL": "http://hdl.handle.net/11234/1-1836", + "Family": "Language Models", + "Description": "These models were developed for MorphoDiTa, which performs morphological analysis, morphological generation and part-of-speech tagging (see also the PoS-taggers and lemmatizers Resource Family). The morphological dictionary is created from the 161115 version of the MorfFlex CZ lexicon and the 1.2 version of the DeriNet lexical network. The PoS tagger is trained on Prague Dependency Treebank 3.0.\nThe models are available for download from the LINDAT repository.", + "Language": ["ces"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["morphosyntax"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "http://hdl.handle.net/11234/1-1836" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/nametag2.json b/lexical-resources/language-models/nametag2.json new file mode 100644 index 0000000..0d937bc --- /dev/null +++ b/lexical-resources/language-models/nametag2.json @@ -0,0 +1,16 @@ +{ + "Name": "NameTag 2 Models", + "URL": "http://hdl.handle.net/11234/1-3773", + "Family": "Language Models", + "Description": "These models are for NameTag 2, a named entity recognition tool (see also the Named Entity Recognizers Resource Family). The documentation is available separately on the project webpage.\nThe models are available for download from the LINDAT repository.", + "Language": ["ces", "nld", "eng", "deu", "spa"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["named entity recognition"], + "Infrastructure": "CLARIN", + "Group": "Named Entity Recognition", + "Access": { + "Download": "http://hdl.handle.net/11234/1-3773" + }, + "Publication": "Straková et al. (2019)" +} diff --git a/lexical-resources/language-models/por-roberta.json b/lexical-resources/language-models/por-roberta.json new file mode 100644 index 0000000..e9fab43 --- /dev/null +++ b/lexical-resources/language-models/por-roberta.json @@ -0,0 +1,16 @@ +{ + "Name": "Portuguese RoBERTa language model", + "URL": "https://hdl.handle.net/21.11129/0000-000E-631E-2", + "Family": "Language Models", + "Description": "This is a pre-trained roBERTa model in Portuguese, with 6 layers and 12 attention-heads, totaling 68M parameters. 
Pre-training was done on 10 million Portuguese sentences and 10 million English sentences from the OSCAR corpus.\nThe model is available for download from the PORTULAN repository.", + "Language": ["por"], + "Licence": "CC-BY", + "Size": [], + "Annotation": ["Baseline"], + "Infrastructure": "CLARIN", + "Group": "Baseline", + "Access": { + "Download": "https://hdl.handle.net/21.11129/0000-000E-631E-2" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/pos-lemma-ces.json b/lexical-resources/language-models/pos-lemma-ces.json new file mode 100644 index 0000000..50e254b --- /dev/null +++ b/lexical-resources/language-models/pos-lemma-ces.json @@ -0,0 +1,16 @@ +{ + "Name": "POS Tagging and Lemmatization (Czech model)", + "URL": "http://hdl.handle.net/11234/1-4613", + "Family": "Language Models", + "Description": "This model is trained using RobeCzech, which is the Czech version of BERT. The model is trained on the Prague Dependency Treebank 3.5.\nThe model is available for download from the LINDAT repository.", + "Language": ["ces"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["morphosyntax and lemmatisation"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "http://hdl.handle.net/11234/1-4613" + }, + "Publication": "Vysušilová (2021)" +} diff --git a/lexical-resources/language-models/pos-tag-flair.json b/lexical-resources/language-models/pos-tag-flair.json new file mode 100644 index 0000000..84f58ef --- /dev/null +++ b/lexical-resources/language-models/pos-tag-flair.json @@ -0,0 +1,16 @@ +{ + "Name": "POS-tagging model: Flair", + "URL": "https://spraakbanken.gu.se/index.php/en/resources/flair", + "Family": "Language Models", + "Description": "This is a set of 2 models. flair_eval is trained on SUC3 with Talbanken_SBX_dev as dev set. The advantage of this model is that it can be evaluated using Talbanken_SBX_test or SIC2. flair_full is trained on SUC3, Talbanken_SBX_test, and SIC2, with Talbanken_SBX_dev as dev set.\nThe models are available for download from the Swedish Language Bank.", + "Language": ["swe"], + "Licence": "CC BY 4.0", + "Size": [], + "Annotation": ["morphosyntax"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "https://spraakbanken.gu.se/index.php/en/resources/flair" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/pos-tag-marmot.json b/lexical-resources/language-models/pos-tag-marmot.json new file mode 100644 index 0000000..a337915 --- /dev/null +++ b/lexical-resources/language-models/pos-tag-marmot.json @@ -0,0 +1,16 @@ +{ + "Name": "POS-tagging model: Marmot", + "URL": "https://spraakbanken.gu.se/index.php/en/resources/marmot", + "Family": "Language Models", + "Description": "This is a set of 2 models. marmot_eval is trained on SUC3 and the Talbanken_SBX_dev treebank, using Saldo as dictionary. 
marmot_full is trained on SUC3, the Talbanken_SBX_dev treebank, and SIC2 (with Saldo as dictionary).\nThe models are available for download from the Swedish Language Bank.", + "Language": ["swe"], + "Licence": "CC BY 4.0", + "Size": [], + "Annotation": ["morphosyntax"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "https://spraakbanken.gu.se/index.php/en/resources/marmot" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/pos-tag-stanza.json b/lexical-resources/language-models/pos-tag-stanza.json new file mode 100644 index 0000000..681186b --- /dev/null +++ b/lexical-resources/language-models/pos-tag-stanza.json @@ -0,0 +1,16 @@ +{ + "Name": "POS-tagging model: Stanza", + "URL": "https://spraakbanken.gu.se/index.php/en/resources/stanzamorph", + "Family": "Language Models", + "Description": "This is a set of 2 models. stanza_eval is trained on SUC3 and the Talbanken_SBX_dev treebank. stanza_full is trained on the SUC3, Talbanken_SBX_test, and SIC2 sets, with Talbanken_SBX_dev as dev set.\nThe models are available for download from the Swedish Language Bank.", + "Language": ["swe"], + "Licence": "CC BY 4.0", + "Size": [], + "Annotation": ["morphosyntax"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "https://spraakbanken.gu.se/index.php/en/resources/stanzamorph" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/pytorch-sloner.json b/lexical-resources/language-models/pytorch-sloner.json new file mode 100644 index 0000000..b26cf69 --- /dev/null +++ b/lexical-resources/language-models/pytorch-sloner.json @@ -0,0 +1,16 @@ +{ + "Name": "PyTorch model for Slovenian Named Entity Recognition SloNER 1.0", + "URL": "http://hdl.handle.net/11356/1758", + "Family": "Language Models", + "Description": "This is a model for Slovenian Named Entity Recognition. It is a PyTorch neural network model, intended for use with the HuggingFace transformers library.\nThe model is based on the Slovenian RoBERTa contextual embeddings model SloBERTa 2.0. The model was trained on the SUK 1.0 training corpus. The source code of the model is available in a GitHub repository.\nThe model is available for download from the CLARIN.SI repository.", + "Language": ["slv"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["named entity recognition"], + "Infrastructure": "CLARIN", + "Group": "Named Entity Recognition", + "Access": { + "Download": "http://hdl.handle.net/11356/1758" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/pytorch-slv.json b/lexical-resources/language-models/pytorch-slv.json new file mode 100644 index 0000000..0d6862b --- /dev/null +++ b/lexical-resources/language-models/pytorch-slv.json @@ -0,0 +1,16 @@ +{ + "Name": "PyTorch model for Slovenian Coreference Resolution", + "URL": "http://hdl.handle.net/11356/1773", + "Family": "Language Models", + "Description": "This is a Slovenian model for coreference resolution: a neural network based on a customized transformer architecture, usable with this code. The model is based on the Slovenian CroSloEngual BERT 1.1 model. 
It was trained on the SUK 1.0 training corpus, specifically the SentiCoref subcorpus.\nThis resource is available for download from the CLARIN.SI repository.", + "Language": ["slv"], + "Licence": "CC BY 4.0", + "Size": [], + "Annotation": ["coreference resolution"], + "Infrastructure": "CLARIN", + "Group": "Other", + "Access": { + "Download": "http://hdl.handle.net/11356/1773" + }, + "Publication": "Klemen & Žitnik (2022)" +} diff --git a/lexical-resources/language-models/ruv-di.json b/lexical-resources/language-models/ruv-di.json new file mode 100644 index 0000000..039ab31 --- /dev/null +++ b/lexical-resources/language-models/ruv-di.json @@ -0,0 +1,16 @@ +{ + "Name": "RÚV-DI Speaker Diarization v5 models (21.05)", + "URL": "http://hdl.handle.net/20.500.12537/109", + "Family": "Language Models", + "Description": "These models are trained on the Althingi Parliamentary Speech corpus hosted by CLARIN-IS. The models use MFCCs, x-vectors, PLDA and AHC.\nThe models are available for download from the CLARIN-IS repository.", + "Language": ["isl"], + "Licence": "CC BY 4.0", + "Size": [], + "Annotation": ["diarization"], + "Infrastructure": "CLARIN", + "Group": "Other", + "Access": { + "Download": "http://hdl.handle.net/20.500.12537/109" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/sentiment-czech.json b/lexical-resources/language-models/sentiment-czech.json new file mode 100644 index 0000000..466099f --- /dev/null +++ b/lexical-resources/language-models/sentiment-czech.json @@ -0,0 +1,16 @@ +{ + "Name": "Sentiment Analysis (Czech Model)", + "URL": "http://hdl.handle.net/11234/1-4601", + "Family": "Language Models", + "Description": "These models are trained on data from the following sources: Mall (product reviews), CSFD (movie reviews), and Facebook, as well as on joint data from all three datasets above, using RobeCzech, which is the Czech version of BERT.", + "Language": ["ces"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["sentiment analysis"], + "Infrastructure": "CLARIN", + "Group": "Other", + "Access": { + "Download": "http://hdl.handle.net/11234/1-4601" + }, + "Publication": "Vysušilová (2021)" +} diff --git a/lexical-resources/language-models/slavic-forest.json b/lexical-resources/language-models/slavic-forest.json new file mode 100644 index 0000000..b119d27 --- /dev/null +++ b/lexical-resources/language-models/slavic-forest.json @@ -0,0 +1,16 @@ +{ + "Name": "Slavic Forest, Norwegian Wood (models)", + "URL": "http://hdl.handle.net/11234/1-1971", + "Family": "Language Models", + "Description": "These are models for the dependency parser UDPipe used to produce the authors' final submission to the Vardial 2017 CLP shared task. The scripts and commands used to create the models are part of a separate LINDAT repository entry. The models were trained with UDPipe version 3e65d69 from 3 January 2017; their functionality with newer or older versions of UDPipe is not guaranteed.\nThe models are available for download from the LINDAT repository.", + "Language": ["hrv", "nor", "slk"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["syntactic parsing"], + "Infrastructure": "CLARIN", + "Group": "Syntactic Parsing", + "Access": { + "Download": "http://hdl.handle.net/11234/1-1971" + }, + "Publication": "Rosa et al. 
(2017)" +} diff --git a/lexical-resources/language-models/slk-morphodita.json b/lexical-resources/language-models/slk-morphodita.json new file mode 100644 index 0000000..2805d79 --- /dev/null +++ b/lexical-resources/language-models/slk-morphodita.json @@ -0,0 +1,16 @@ +{ + "Name": "Slovak MorphoDiTa Models 170914", + "URL": "http://hdl.handle.net/11234/1-3278", + "Family": "Language Models", + "Description": "These are Slovak models for MorphoDiTa, a tool which provides morphological analysis, morphological generation and part-of-speech tagging. The morphological dictionary is created from MorfFlex (SK 170914) and the PoS tagger is trained on automatic translations in Prague Dependency Treebank 3.0.\nThe models are available for download from the LINDAT repository.", + "Language": ["slk"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["morphosyntax"], + "Infrastructure": "CLARIN", + "Group": "Morphosyntax", + "Access": { + "Download": "http://hdl.handle.net/11234/1-3278" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/sloberta.json b/lexical-resources/language-models/sloberta.json new file mode 100644 index 0000000..9986aac --- /dev/null +++ b/lexical-resources/language-models/sloberta.json @@ -0,0 +1,15 @@ +{ + "Name": "Slovenian RoBERTa contextual embeddings model: SloBERTa 2.0", + "URL": "http://hdl.handle.net/11356/1397", + "Family": "Language Models", + "Description": "The monolingual Slovene RoBERTa (A Robustly Optimized Bidirectional Encoder Representations from Transformers) model is a state-of-the-art model representing words/tokens as contextually dependent word embeddings, used for various NLP tasks. Word embeddings can be extracted for every word occurrence and then used in training a model for an end task, but typically the whole RoBERTa model is fine-tuned end-to-end.", + "Language": ["slv"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["word embeddings"], + "Infrastructure": "CLARIN", + "Group": "Contextual Word Embeddings", + "Access": { + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/trans-models-en-de.json b/lexical-resources/language-models/trans-models-en-de.json new file mode 100644 index 0000000..c1e8992 --- /dev/null +++ b/lexical-resources/language-models/trans-models-en-de.json @@ -0,0 +1,16 @@ +{ + "Name": "Translation Models (English-German)", + "URL": "http://hdl.handle.net/11234/1-3732", + "Family": "Language Models", + "Description": "These English-German translation models are used by the Lindat translation service.\nThe models are available for download from the LINDAT repository.", + "Language": ["eng", "deu"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["machine translation"], + "Infrastructure": "CLARIN", + "Group": "Machine Translation", + "Access": { + "Download": "http://hdl.handle.net/11234/1-3732" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/trans-models-en-ru.json b/lexical-resources/language-models/trans-models-en-ru.json new file mode 100644 index 0000000..a15b46c --- /dev/null +++ b/lexical-resources/language-models/trans-models-en-ru.json @@ -0,0 +1,16 @@ +{ + "Name": "Translation Models (en-ru) (v1.0)", + "URL": "http://hdl.handle.net/11234/1-3744", + "Family": "Language Models", + "Description": "These are CUBBITT English-Russiantranslation models available in the LINDAT translation service.\nThe models are available for download from the LINDAT repository.", + "Language": ["eng", "rus"], + "Licence": "CC 
BY-NC-SA 4.0", + "Size": [], + "Annotation": ["machine translation"], + "Infrastructure": "CLARIN", + "Group": "Machine Translation", + "Access": { + "Download": "http://hdl.handle.net/11234/1-3744" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/ud-parsito-models.json b/lexical-resources/language-models/ud-parsito-models.json new file mode 100644 index 0000000..904e790 --- /dev/null +++ b/lexical-resources/language-models/ud-parsito-models.json @@ -0,0 +1,16 @@ +{ + "Name": "Universal Dependencies 1.2 Models for Parsito", + "URL": "http://hdl.handle.net/11234/1-1573", + "Family": "Language Models", + "Description": "These are models for the dependency parser Parsito. They are trained on Universal Dependencies 1.2 Treebanks.", + "Language": ["eng"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["syntactic parsing"], + "Infrastructure": "CLARIN", + "Group": "Syntactic Parsing", + "Access": { + "Download": "http://hdl.handle.net/11234/1-1573" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/udify-pre.json b/lexical-resources/language-models/udify-pre.json new file mode 100644 index 0000000..eb75389 --- /dev/null +++ b/lexical-resources/language-models/udify-pre.json @@ -0,0 +1,16 @@ +{ + "Name": "UDify Pretrained Model", + "URL": "http://hdl.handle.net/11234/1-3042", + "Family": "Language Models", + "Description": "UDify is a single model that parses Universal Dependencies (UPOS, UFeats, Lemmas, Deps) jointly, accepting any of 75 supported languages as input (trained on UD v2.3 with 124 treebanks). ", + "Language": ["afr", "akk", "amh", "grc", "ara", "hye", "bam", "eus", "bel", "bre", "bul", "cat", "zho", "Church Slavonic", "cop", "hrv", "ces", "dan", "nld", "eng", "myv", "est", "fao", "fin", "fra", "glg", "deu", "got", "heb", "hin", "hun", "ind", "gle", "ita", "jpn", "kaz", "kpv", "kor", "lat", "lav", "lit", "mlt", "mar", "ell", "pcm", "kmr", "sme", "nor", "fro", "fas", "pol", "por", "ron", "Buryat", "rus", "san", "srp", "slk", "slv", "spa", "swe", "swl", "tgl", "tam", "tel", "tha", "tur", "uig", "ukr", "hsb", "urd", "vie", "wbp", "yor", "yue"], + "Licence": "CC BY-SA 4.0", + "Size": [], + "Annotation": ["syntactic parsing"], + "Infrastructure": "CLARIN", + "Group": "Syntactic Parsing", + "Access": { + "Download": "http://hdl.handle.net/11234/1-3042" + }, + "Publication": "Kondratyuk and Straka (2019)" +} diff --git a/lexical-resources/language-models/udpipe-models.json b/lexical-resources/language-models/udpipe-models.json new file mode 100644 index 0000000..2162849 --- /dev/null +++ b/lexical-resources/language-models/udpipe-models.json @@ -0,0 +1,16 @@ +{ + "Name": "Universal Dependencies 2.5 Models for UDPipe", + "URL": "http://hdl.handle.net/11234/1-3131", + "Family": "Language Models", + "Description": "These models are for the Universal Dependencies 2.5 treebanks (94 treebanks of 61 languages). 
In addition to dependency parsing, the models also provide tokenisation, part-of-speech tagging and lemmatisation.\nThe models are available for download from the LINDAT repository.", + "Language": ["afr", "grc", "ara", "hye", "eus", "bel", "bul", "cat", "zho", "Church Slavonic", "cop", "hrv", "ces", "dan", "nld", "eng", "est", "fin", "fra", "glg", "wof", "deu", "got", "heb", "hin", "hun", "ind", "gle", "ita", "jpn", "kaz", "kor", "lat", "lav", "lzh", "lit", "mlt", "mar", "ell", "sme", "nob", "nno", "fro", "orv", "fas", "pol", "por", "ron", "rus", "san", "gla", "srp", "slk", "slv", "spa", "swe", "tam", "tel", "tur", "uig", "ukr", "urd", "vie", "wol"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["syntactic parsing"], + "Infrastructure": "CLARIN", + "Group": "Syntactic Parsing", + "Access": { + "Download": "http://hdl.handle.net/11234/1-3131" + }, + "Publication": "" +} diff --git a/lexical-resources/language-models/wmt-ca-oc-multi.json b/lexical-resources/language-models/wmt-ca-oc-multi.json new file mode 100644 index 0000000..f94e7d2 --- /dev/null +++ b/lexical-resources/language-models/wmt-ca-oc-multi.json @@ -0,0 +1,16 @@ +{ + "Name": "WMT21 Marian translation model (ca-oc multi-task)", + "URL": "http://hdl.handle.net/11234/1-3772", + "Family": "Language Models", + "Description": "This is a neural machine translation model for Catalan to Occitan translation. It is a multi-task model, also producing phonemic transcription of the Catalan source. The model was submitted to WMT21 Multilingual Low-Resource Translation for Indo-European Languages Shared Task as a CUNI-Contrastive system for Catalan to Occitan.\nThe model is available for download from the LINDAT repository.", + "Language": ["cat", "oci"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["machine translation"], + "Infrastructure": "CLARIN", + "Group": "Machine Translation", + "Access": { + "Download": "http://hdl.handle.net/11234/1-3772" + }, + "Publication": "Jon et al. (2021)" +} diff --git a/lexical-resources/language-models/wmt-ca-oc.json b/lexical-resources/language-models/wmt-ca-oc.json new file mode 100644 index 0000000..6074262 --- /dev/null +++ b/lexical-resources/language-models/wmt-ca-oc.json @@ -0,0 +1,16 @@ +{ + "Name": "WMT21 Marian translation model (ca-oc)", + "URL": "http://hdl.handle.net/11234/1-3770", + "Family": "Language Models", + "Description": "This is a neural machine translation model for Catalan to Occitan translation and constitutes the primary CUNI submission for WMT21 Multilingual Low-Resource Translation for Indo-European Languages Shared Task.\nThe model is available for download from the LINDAT repository.", + "Language": ["cat", "oci"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["machine translation"], + "Infrastructure": "CLARIN", + "Group": "Machine Translation", + "Access": { + "Download": "http://hdl.handle.net/11234/1-3770" + }, + "Publication": "Jon et al. 
(2021)" +} diff --git a/lexical-resources/language-models/wmt-ca-ro-it.json b/lexical-resources/language-models/wmt-ca-ro-it.json new file mode 100644 index 0000000..29104df --- /dev/null +++ b/lexical-resources/language-models/wmt-ca-ro-it.json @@ -0,0 +1,16 @@ +{ + "Name": "WMT21 Marian translation models (ca-ro,it,oc)", + "URL": "http://hdl.handle.net/11234/1-3769", + "Family": "Language Models", + "Description": "This is a translation model from Catalan into Romanian, Italian, and Occitan that was part of the submission for WMT21 Multilingual Low-Resource Translation for Indo-European Languages Shared Task.\nThe model is available for download from the LINDAT repository.", + "Language": ["cat", "ita", "oci", "ron"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["machine translation"], + "Infrastructure": "CLARIN", + "Group": "Machine Translation", + "Access": { + "Download": "http://hdl.handle.net/11234/1-3769" + }, + "Publication": "Jon et al. (2021)" +} diff --git a/lexical-resources/language-models/wmt-tuning-cs-en.json b/lexical-resources/language-models/wmt-tuning-cs-en.json new file mode 100644 index 0000000..2513825 --- /dev/null +++ b/lexical-resources/language-models/wmt-tuning-cs-en.json @@ -0,0 +1,16 @@ +{ + "Name": "WMT16 Tuning Shared Task Models (Czech-to-English)", + "URL": "http://hdl.handle.net/11372/LRT-1671", + "Family": "Language Models", + "Description": "These Czech to English translation models are trained on the parallel CzEng 1.6 corpus. The data is tokenized with Moses). Alignment is done using fast_align and the standard Moses pipeline is used for training.\nThe models are available for download from the LINDAT repository.", + "Language": ["ces", "eng"], + "Licence": "CC BY-NC-SA 4.0", + "Size": [], + "Annotation": ["machine translation"], + "Infrastructure": "CLARIN", + "Group": "Machine Translation", + "Access": { + "Download": "http://hdl.handle.net/11372/LRT-1671" + }, + "Publication": "" +}