From 65363e86c64231fbabf99af9533341ab57b1df27 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Fri, 3 May 2024 20:32:46 +0200 Subject: [PATCH 1/9] don't replace_additional_special_tokens --- flair/embeddings/transformer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/flair/embeddings/transformer.py b/flair/embeddings/transformer.py index 552bad798..0622a4e16 100644 --- a/flair/embeddings/transformer.py +++ b/flair/embeddings/transformer.py @@ -1139,7 +1139,9 @@ def is_supported_t5_model(config: PretrainedConfig) -> bool: # If we use a context separator, add a new special token self.use_context_separator = use_context_separator if use_context_separator: - added = self.tokenizer.add_special_tokens({"additional_special_tokens": [SENTENCE_BOUNDARY_TAG]}) + added = self.tokenizer.add_special_tokens( + {"additional_special_tokens": [SENTENCE_BOUNDARY_TAG]}, replace_additional_special_tokens=False + ) transformer_model.resize_token_embeddings(transformer_model.config.vocab_size + added) super().__init__(**self.to_args()) From abce58f93f093f69321e97a91d14bedafc6a804b Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 11 Mar 2024 13:59:07 +0100 Subject: [PATCH 2/9] datasets: add revision support for all Universal Dependencies classes --- flair/datasets/treebanks.py | 181 ++++++++++++++++++++++++------------ 1 file changed, 121 insertions(+), 60 deletions(-) diff --git a/flair/datasets/treebanks.py b/flair/datasets/treebanks.py index b216f15d1..05d8eccdf 100644 --- a/flair/datasets/treebanks.py +++ b/flair/datasets/treebanks.py @@ -223,6 +223,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -232,7 +233,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = 
"https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/{revision}" cached_path(f"{web_path}/en_ewt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/en_ewt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/en_ewt-ud-train.conllu", Path("datasets") / dataset_name) @@ -246,6 +247,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -255,7 +257,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Galician-TreeGal/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Galician-TreeGal/{revision}" cached_path(f"{web_path}/gl_treegal-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/gl_treegal-ud-train.conllu", Path("datasets") / dataset_name) @@ -268,6 +270,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -277,7 +280,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/{revision}" cached_path(f"{web_path}/grc_proiel-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/grc_proiel-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/grc_proiel-ud-train.conllu", Path("datasets") / dataset_name) @@ -291,6 
+294,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -300,7 +304,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Kazakh-KTB/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Kazakh-KTB/{revision}" cached_path(f"{web_path}/kk_ktb-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/kk_ktb-ud-train.conllu", Path("datasets") / dataset_name) @@ -313,6 +317,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -322,7 +327,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Old_Church_Slavonic-PROIEL/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Old_Church_Slavonic-PROIEL/{revision}" cached_path(f"{web_path}/cu_proiel-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/cu_proiel-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/cu_proiel-ud-train.conllu", Path("datasets") / dataset_name) @@ -336,6 +341,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -345,7 +351,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = 
"https://raw.githubusercontent.com/UniversalDependencies/UD_Armenian-ArmTDP/master/" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Armenian-ArmTDP/{revision}/" cached_path(f"{web_path}/hy_armtdp-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/hy_armtdp-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/hy_armtdp-ud-train.conllu", Path("datasets") / dataset_name) @@ -359,6 +365,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -368,7 +375,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Estonian-EDT/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Estonian-EDT/{revision}" cached_path(f"{web_path}/et_edt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/et_edt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/et_edt-ud-train.conllu", Path("datasets") / dataset_name) @@ -382,6 +389,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -391,7 +399,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_German-GSD/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_German-GSD/{revision}" cached_path(f"{ud_path}/de_gsd-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/de_gsd-ud-test.conllu", Path("datasets") / dataset_name) 
cached_path(f"{ud_path}/de_gsd-ud-train.conllu", Path("datasets") / dataset_name) @@ -405,6 +413,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = False, split_multiwords: bool = True, + revision: str = "dev", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -414,7 +423,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_German-HDT/dev" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_German-HDT/{revision}" cached_path(f"{ud_path}/de_hdt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/de_hdt-ud-test.conllu", Path("datasets") / dataset_name) @@ -447,6 +456,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -456,7 +466,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Dutch-Alpino/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Dutch-Alpino/{revision}" cached_path(f"{ud_path}/nl_alpino-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/nl_alpino-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/nl_alpino-ud-train.conllu", Path("datasets") / dataset_name) @@ -468,7 +478,7 @@ class UD_FAROESE(UniversalDependenciesCorpus): """This treebank includes the Faroese treebank dataset. 
The data is obtained from the following link: - https://github.com/UniversalDependencies/UD_Faroese-FarPaHC/tree/master + https://github.com/UniversalDependencies/UD_Faroese-FarPaHC/tree/master Faronese is a small Western Scandinavian language with 60.000-100.000, related to Icelandic and Old Norse. """ @@ -478,6 +488,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -487,7 +498,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Faroese-FarPaHC/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Faroese-FarPaHC/{revision}" cached_path(f"{web_path}/fo_farpahc-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/fo_farpahc-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/fo_farpahc-ud-train.conllu", Path("datasets") / dataset_name) @@ -501,6 +512,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -510,7 +522,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_French-GSD/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_French-GSD/{revision}" cached_path(f"{ud_path}/fr_gsd-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/fr_gsd-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/fr_gsd-ud-train.conllu", Path("datasets") / dataset_name) @@ -523,6 +535,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, 
in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -532,7 +545,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Italian-ISDT/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Italian-ISDT/{revision}" cached_path(f"{ud_path}/it_isdt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/it_isdt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/it_isdt-ud-train.conllu", Path("datasets") / dataset_name) @@ -545,6 +558,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -554,7 +568,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Latin-LLCT/master/" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Latin-LLCT/{revision}/" cached_path(f"{web_path}/la_llct-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/la_llct-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/la_llct-ud-train.conllu", Path("datasets") / dataset_name) @@ -568,6 +582,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -577,7 +592,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Spanish-GSD/master" + ud_path = 
f"https://raw.githubusercontent.com/UniversalDependencies/UD_Spanish-GSD/{revision}" cached_path(f"{ud_path}/es_gsd-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/es_gsd-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/es_gsd-ud-train.conllu", Path("datasets") / dataset_name) @@ -590,6 +605,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -599,7 +615,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Portuguese-Bosque/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Portuguese-Bosque/{revision}" cached_path(f"{ud_path}/pt_bosque-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/pt_bosque-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/pt_bosque-ud-train.conllu", Path("datasets") / dataset_name) @@ -612,6 +628,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -621,7 +638,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Romanian-RRT/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Romanian-RRT/{revision}" cached_path(f"{ud_path}/ro_rrt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ro_rrt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ro_rrt-ud-train.conllu", Path("datasets") / dataset_name) @@ -634,6 +651,7 @@ def __init__( base_path: 
Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -643,7 +661,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Catalan-AnCora/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Catalan-AnCora/{revision}" cached_path(f"{ud_path}/ca_ancora-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ca_ancora-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ca_ancora-ud-train.conllu", Path("datasets") / dataset_name) @@ -656,6 +674,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -665,7 +684,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Polish-LFG/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Polish-LFG/{revision}" cached_path(f"{ud_path}/pl_lfg-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/pl_lfg-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/pl_lfg-ud-train.conllu", Path("datasets") / dataset_name) @@ -679,6 +698,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = False, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -688,7 +708,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = 
"https://raw.githubusercontent.com/UniversalDependencies/UD_Czech-PDT/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Czech-PDT/{revision}" cached_path(f"{ud_path}/cs_pdt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/cs_pdt-ud-test.conllu", Path("datasets") / dataset_name) cached_path( @@ -732,6 +752,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -741,7 +762,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Slovak-SNK/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Slovak-SNK/{revision}" cached_path(f"{ud_path}/sk_snk-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/sk_snk-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/sk_snk-ud-train.conllu", Path("datasets") / dataset_name) @@ -755,6 +776,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -764,7 +786,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Swedish-Talbanken/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Swedish-Talbanken/{revision}" cached_path(f"{ud_path}/sv_talbanken-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/sv_talbanken-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/sv_talbanken-ud-train.conllu", Path("datasets") / dataset_name) @@ -778,6 +800,7 @@ def __init__( 
base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -787,7 +810,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Danish-DDT/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Danish-DDT/{revision}" cached_path(f"{ud_path}/da_ddt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/da_ddt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/da_ddt-ud-train.conllu", Path("datasets") / dataset_name) @@ -801,6 +824,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -810,7 +834,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Norwegian-Bokmaal/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Norwegian-Bokmaal/{revision}" cached_path(f"{ud_path}/no_bokmaal-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/no_bokmaal-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/no_bokmaal-ud-train.conllu", Path("datasets") / dataset_name) @@ -824,6 +848,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -833,7 +858,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = 
"https://raw.githubusercontent.com/UniversalDependencies/UD_Finnish-TDT/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Finnish-TDT/{revision}" cached_path(f"{ud_path}/fi_tdt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/fi_tdt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/fi_tdt-ud-train.conllu", Path("datasets") / dataset_name) @@ -847,6 +872,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -856,7 +882,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Slovenian-SSJ/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Slovenian-SSJ/{revision}" cached_path(f"{ud_path}/sl_ssj-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/sl_ssj-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/sl_ssj-ud-train.conllu", Path("datasets") / dataset_name) @@ -870,6 +896,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -879,7 +906,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Croatian-SET/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Croatian-SET/{revision}" cached_path(f"{ud_path}/hr_set-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/hr_set-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/hr_set-ud-train.conllu", Path("datasets") / 
dataset_name) @@ -893,6 +920,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -902,7 +930,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Serbian-SET/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Serbian-SET/{revision}" cached_path(f"{ud_path}/sr_set-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/sr_set-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/sr_set-ud-train.conllu", Path("datasets") / dataset_name) @@ -916,6 +944,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -925,7 +954,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Bulgarian-BTB/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Bulgarian-BTB/{revision}" cached_path(f"{ud_path}/bg_btb-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/bg_btb-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/bg_btb-ud-train.conllu", Path("datasets") / dataset_name) @@ -939,6 +968,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -948,7 +978,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = 
"https://raw.githubusercontent.com/UniversalDependencies/UD_Arabic-PADT/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Arabic-PADT/{revision}" cached_path(f"{ud_path}/ar_padt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ar_padt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ar_padt-ud-train.conllu", Path("datasets") / dataset_name) @@ -961,6 +991,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -970,7 +1001,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Hebrew-HTB/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Hebrew-HTB/{revision}" cached_path(f"{ud_path}/he_htb-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/he_htb-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/he_htb-ud-train.conllu", Path("datasets") / dataset_name) @@ -983,6 +1014,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -995,7 +1027,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Turkish-IMST/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Turkish-IMST/{revision}" cached_path(f"{ud_path}/tr_imst-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/tr_imst-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/tr_imst-ud-train.conllu", 
Path("datasets") / dataset_name) @@ -1005,7 +1037,11 @@ def __init__( class UD_UKRAINIAN(UniversalDependenciesCorpus): def __init__( - self, base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True + self, + base_path: Optional[Union[str, Path]] = None, + in_memory: bool = True, + split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1015,7 +1051,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Ukrainian-IU/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Ukrainian-IU/{revision}" cached_path(f"{ud_path}/uk_iu-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/uk_iu-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/uk_iu-ud-train.conllu", Path("datasets") / dataset_name) @@ -1029,6 +1065,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1041,7 +1078,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Persian-Seraji/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Persian-Seraji/{revision}" cached_path(f"{ud_path}/fa_seraji-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/fa_seraji-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/fa_seraji-ud-train.conllu", Path("datasets") / dataset_name) @@ -1055,6 +1092,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> 
None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1064,7 +1102,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/{revision}" cached_path(f"{ud_path}/ru_syntagrus-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ru_syntagrus-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ru_syntagrus-ud-train.conllu", Path("datasets") / dataset_name) @@ -1078,6 +1116,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1087,7 +1126,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Hindi-HDTB/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Hindi-HDTB/{revision}" cached_path(f"{ud_path}/hi_hdtb-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/hi_hdtb-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/hi_hdtb-ud-train.conllu", Path("datasets") / dataset_name) @@ -1101,6 +1140,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1110,7 +1150,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master" + ud_path = 
f"https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/{revision}" cached_path(f"{ud_path}/id_gsd-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/id_gsd-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/id_gsd-ud-train.conllu", Path("datasets") / dataset_name) @@ -1124,6 +1164,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1133,7 +1174,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Japanese-GSD/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Japanese-GSD/{revision}" cached_path(f"{ud_path}/ja_gsd-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ja_gsd-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ja_gsd-ud-train.conllu", Path("datasets") / dataset_name) @@ -1147,6 +1188,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1156,7 +1198,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Chinese-GSD/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Chinese-GSD/{revision}" cached_path(f"{ud_path}/zh_gsd-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/zh_gsd-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/zh_gsd-ud-train.conllu", Path("datasets") / dataset_name) @@ -1170,6 +1212,7 @@ def __init__( base_path: Optional[Union[str, 
Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1179,7 +1222,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Korean-Kaist/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Korean-Kaist/{revision}" cached_path(f"{ud_path}/ko_kaist-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ko_kaist-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ko_kaist-ud-train.conllu", Path("datasets") / dataset_name) @@ -1193,6 +1236,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1202,7 +1246,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Basque-BDT/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Basque-BDT/{revision}" cached_path(f"{ud_path}/eu_bdt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/eu_bdt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/eu_bdt-ud-train.conllu", Path("datasets") / dataset_name) @@ -1216,6 +1260,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1225,7 +1270,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = 
"https://raw.githubusercontent.com/UniversalDependencies/UD_Classical_Chinese-Kyoto/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Classical_Chinese-Kyoto/{revision}" cached_path(f"{web_path}/lzh_kyoto-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/lzh_kyoto-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/lzh_kyoto-ud-train.conllu", Path("datasets") / dataset_name) @@ -1239,6 +1284,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1248,7 +1294,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Greek-GDT/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Greek-GDT/{revision}" cached_path(f"{web_path}/el_gdt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/el_gdt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/el_gdt-ud-train.conllu", Path("datasets") / dataset_name) @@ -1262,6 +1308,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1271,7 +1318,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Naija-NSC/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Naija-NSC/{revision}" cached_path(f"{web_path}//pcm_nsc-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}//pcm_nsc-ud-test.conllu", Path("datasets") / dataset_name) 
cached_path(f"{web_path}//pcm_nsc-ud-train.conllu", Path("datasets") / dataset_name) @@ -1285,6 +1332,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1294,7 +1342,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Livvi-KKPP/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Livvi-KKPP/{revision}" cached_path(f"{web_path}/olo_kkpp-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/olo_kkpp-ud-train.conllu", Path("datasets") / dataset_name) @@ -1307,6 +1355,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1316,7 +1365,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Buryat-BDT/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Buryat-BDT/{revision}" cached_path(f"{web_path}/bxr_bdt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/bxr_bdt-ud-train.conllu", Path("datasets") / dataset_name) @@ -1329,6 +1378,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1338,7 +1388,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_North_Sami-Giella/master" + 
web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_North_Sami-Giella/{revision}" cached_path(f"{web_path}/sme_giella-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/sme_giella-ud-train.conllu", Path("datasets") / dataset_name) @@ -1351,6 +1401,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1360,7 +1411,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Marathi-UFAL/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Marathi-UFAL/{revision}" cached_path(f"{web_path}/mr_ufal-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/mr_ufal-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/mr_ufal-ud-train.conllu", Path("datasets") / dataset_name) @@ -1374,6 +1425,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1381,7 +1433,7 @@ def __init__( dataset_name = self.__class__.__name__.lower() data_folder = base_path / dataset_name - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Maltese-MUDT/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Maltese-MUDT/{revision}" cached_path(f"{web_path}/mt_mudt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/mt_mudt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/mt_mudt-ud-train.conllu", Path("datasets") / dataset_name) @@ -1395,6 +1447,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, 
in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1402,7 +1455,7 @@ def __init__( dataset_name = self.__class__.__name__.lower() data_folder = base_path / dataset_name - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Afrikaans-AfriBooms/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Afrikaans-AfriBooms/{revision}" cached_path(f"{web_path}/af_afribooms-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/af_afribooms-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/af_afribooms-ud-train.conllu", Path("datasets") / dataset_name) @@ -1416,6 +1469,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1425,7 +1479,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Gothic-PROIEL/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Gothic-PROIEL/{revision}" cached_path(f"{web_path}/got_proiel-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/got_proiel-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/got_proiel-ud-train.conllu", Path("datasets") / dataset_name) @@ -1439,6 +1493,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1448,7 +1503,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = 
"https://raw.githubusercontent.com/UniversalDependencies/UD_Old_French-SRCMF/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Old_French-SRCMF/{revision}" cached_path(f"{web_path}/fro_srcmf-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/fro_srcmf-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/fro_srcmf-ud-train.conllu", Path("datasets") / dataset_name) @@ -1462,6 +1517,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1469,7 +1525,7 @@ def __init__( dataset_name = self.__class__.__name__.lower() data_folder = base_path / dataset_name - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Wolof-WTB/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Wolof-WTB/{revision}" cached_path(f"{web_path}/wo_wtb-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/wo_wtb-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/wo_wtb-ud-train.conllu", Path("datasets") / dataset_name) @@ -1483,6 +1539,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1492,7 +1549,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Belarusian-HSE/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Belarusian-HSE/{revision}" cached_path(f"{web_path}/be_hse-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/be_hse-ud-test.conllu", Path("datasets") / dataset_name) 
cached_path(f"{web_path}/be_hse-ud-train.conllu", Path("datasets") / dataset_name) @@ -1506,6 +1563,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1515,7 +1573,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Coptic-Scriptorium/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Coptic-Scriptorium/{revision}" cached_path(f"{web_path}/cop_scriptorium-ud-dev.conllu", Path("datasets") / dataset_name) cached_path( f"{web_path}/cop_scriptorium-ud-test.conllu", @@ -1535,6 +1593,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1544,7 +1603,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Irish-IDT/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Irish-IDT/{revision}" cached_path(f"{web_path}/ga_idt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/ga_idt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/ga_idt-ud-train.conllu", Path("datasets") / dataset_name) @@ -1558,6 +1617,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1567,7 +1627,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = 
"https://raw.githubusercontent.com/UniversalDependencies/UD_Latvian-LVTB/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Latvian-LVTB/{revision}" cached_path(f"{web_path}/lv_lvtb-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/lv_lvtb-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/lv_lvtb-ud-train.conllu", Path("datasets") / dataset_name) @@ -1581,6 +1641,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1592,7 +1653,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Lithuanian-ALKSNIS/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Lithuanian-ALKSNIS/{revision}" cached_path(f"{web_path}/lt_alksnis-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/lt_alksnis-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/lt_alksnis-ud-train.conllu", Path("datasets") / dataset_name) From 61606728ddc61f19e3157391d5be7c5b78401dbb Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 18 Mar 2024 09:51:22 +0100 Subject: [PATCH 3/9] treebanks: add support for new Bavarian MaiBaam UD --- flair/datasets/treebanks.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/flair/datasets/treebanks.py b/flair/datasets/treebanks.py index 05d8eccdf..33a8ee8e9 100644 --- a/flair/datasets/treebanks.py +++ b/flair/datasets/treebanks.py @@ -35,7 +35,11 @@ def __init__( dev_file, test_file, train_file = find_train_dev_test_files(data_folder, dev_file, test_file, train_file) # get train data - train = UniversalDependenciesDataset(train_file, in_memory=in_memory, 
split_multiwords=split_multiwords) + train = ( + UniversalDependenciesDataset(train_file, in_memory=in_memory, split_multiwords=split_multiwords) + if train_file is not None + else None + ) # get test data test = ( @@ -1659,3 +1663,27 @@ def __init__( cached_path(f"{web_path}/lt_alksnis-ud-train.conllu", Path("datasets") / dataset_name) super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords) + + +class UD_BAVARIAN_MAIBAAM(UniversalDependenciesCorpus): + def __init__( + self, + base_path: Optional[Union[str, Path]] = None, + in_memory: bool = True, + split_multiwords: bool = True, + revision: str = "dev", + ) -> None: + base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + + data_folder = base_path / dataset_name + + # download data if necessary + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Bavarian-MaiBaam/{revision}" + cached_path(f"{web_path}/bar_maibaam-ud-test.conllu", Path("datasets") / dataset_name) + + super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords) From 8d9c67872d9191d81d1961f152706765e6d701a0 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 18 Mar 2024 09:51:56 +0100 Subject: [PATCH 4/9] datasets: globally register new UD_BAVARIAN_MAIBAAM --- flair/datasets/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 56c7e4dd0..af005db8f 100644 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -259,6 +259,7 @@ UD_ARABIC, UD_ARMENIAN, UD_BASQUE, + UD_BAVARIAN_MAIBAAM, UD_BELARUSIAN, UD_BULGARIAN, UD_CATALAN, @@ -533,6 +534,7 @@ "UD_ARABIC", "UD_ARMENIAN", "UD_BASQUE", + "UD_BAVARIAN_MAIBAAM", "UD_BELARUSIAN", "UD_BULGARIAN", "UD_CATALAN", From 736d19acb84112225bc482a3a764af7221edcaad Mon Sep 17 00:00:00 2001 From: Stefan 
Schweter Date: Mon, 11 Mar 2024 13:59:07 +0100 Subject: [PATCH 5/9] datasets: add revision support for all Universal Dependencies classes --- flair/datasets/treebanks.py | 181 ++++++++++++++++++++++++------------ 1 file changed, 121 insertions(+), 60 deletions(-) diff --git a/flair/datasets/treebanks.py b/flair/datasets/treebanks.py index b216f15d1..05d8eccdf 100644 --- a/flair/datasets/treebanks.py +++ b/flair/datasets/treebanks.py @@ -223,6 +223,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -232,7 +233,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/{revision}" cached_path(f"{web_path}/en_ewt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/en_ewt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/en_ewt-ud-train.conllu", Path("datasets") / dataset_name) @@ -246,6 +247,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -255,7 +257,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Galician-TreeGal/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Galician-TreeGal/{revision}" cached_path(f"{web_path}/gl_treegal-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/gl_treegal-ud-train.conllu", Path("datasets") / dataset_name) @@ -268,6 +270,7 @@ def 
__init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -277,7 +280,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/{revision}" cached_path(f"{web_path}/grc_proiel-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/grc_proiel-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/grc_proiel-ud-train.conllu", Path("datasets") / dataset_name) @@ -291,6 +294,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -300,7 +304,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Kazakh-KTB/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Kazakh-KTB/{revision}" cached_path(f"{web_path}/kk_ktb-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/kk_ktb-ud-train.conllu", Path("datasets") / dataset_name) @@ -313,6 +317,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -322,7 +327,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Old_Church_Slavonic-PROIEL/master" + 
web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Old_Church_Slavonic-PROIEL/{revision}" cached_path(f"{web_path}/cu_proiel-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/cu_proiel-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/cu_proiel-ud-train.conllu", Path("datasets") / dataset_name) @@ -336,6 +341,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -345,7 +351,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Armenian-ArmTDP/master/" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Armenian-ArmTDP/{revision}/" cached_path(f"{web_path}/hy_armtdp-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/hy_armtdp-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/hy_armtdp-ud-train.conllu", Path("datasets") / dataset_name) @@ -359,6 +365,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -368,7 +375,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Estonian-EDT/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Estonian-EDT/{revision}" cached_path(f"{web_path}/et_edt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/et_edt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/et_edt-ud-train.conllu", Path("datasets") / dataset_name) @@ -382,6 
+389,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -391,7 +399,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_German-GSD/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_German-GSD/{revision}" cached_path(f"{ud_path}/de_gsd-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/de_gsd-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/de_gsd-ud-train.conllu", Path("datasets") / dataset_name) @@ -405,6 +413,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = False, split_multiwords: bool = True, + revision: str = "dev", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -414,7 +423,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_German-HDT/dev" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_German-HDT/{revision}" cached_path(f"{ud_path}/de_hdt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/de_hdt-ud-test.conllu", Path("datasets") / dataset_name) @@ -447,6 +456,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -456,7 +466,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Dutch-Alpino/master" + ud_path = 
f"https://raw.githubusercontent.com/UniversalDependencies/UD_Dutch-Alpino/{revision}" cached_path(f"{ud_path}/nl_alpino-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/nl_alpino-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/nl_alpino-ud-train.conllu", Path("datasets") / dataset_name) @@ -468,7 +478,7 @@ class UD_FAROESE(UniversalDependenciesCorpus): """This treebank includes the Faroese treebank dataset. The data is obtained from the following link: - https://github.com/UniversalDependencies/UD_Faroese-FarPaHC/tree/master + https://github.com/UniversalDependencies/UD_Faroese-FarPaHC/tree/{revision} Faronese is a small Western Scandinavian language with 60.000-100.000, related to Icelandic and Old Norse. """ @@ -478,6 +488,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -487,7 +498,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Faroese-FarPaHC/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Faroese-FarPaHC/{revision}" cached_path(f"{web_path}/fo_farpahc-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/fo_farpahc-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/fo_farpahc-ud-train.conllu", Path("datasets") / dataset_name) @@ -501,6 +512,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -510,7 +522,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = 
"https://raw.githubusercontent.com/UniversalDependencies/UD_French-GSD/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_French-GSD/{revision}" cached_path(f"{ud_path}/fr_gsd-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/fr_gsd-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/fr_gsd-ud-train.conllu", Path("datasets") / dataset_name) @@ -523,6 +535,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -532,7 +545,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Italian-ISDT/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Italian-ISDT/{revision}" cached_path(f"{ud_path}/it_isdt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/it_isdt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/it_isdt-ud-train.conllu", Path("datasets") / dataset_name) @@ -545,6 +558,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -554,7 +568,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Latin-LLCT/master/" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Latin-LLCT/{revision}/" cached_path(f"{web_path}/la_llct-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/la_llct-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/la_llct-ud-train.conllu", 
Path("datasets") / dataset_name) @@ -568,6 +582,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -577,7 +592,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Spanish-GSD/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Spanish-GSD/{revision}" cached_path(f"{ud_path}/es_gsd-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/es_gsd-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/es_gsd-ud-train.conllu", Path("datasets") / dataset_name) @@ -590,6 +605,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -599,7 +615,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Portuguese-Bosque/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Portuguese-Bosque/{revision}" cached_path(f"{ud_path}/pt_bosque-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/pt_bosque-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/pt_bosque-ud-train.conllu", Path("datasets") / dataset_name) @@ -612,6 +628,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -621,7 +638,7 @@ def __init__( data_folder = base_path / dataset_name # download data if 
necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Romanian-RRT/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Romanian-RRT/{revision}" cached_path(f"{ud_path}/ro_rrt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ro_rrt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ro_rrt-ud-train.conllu", Path("datasets") / dataset_name) @@ -634,6 +651,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -643,7 +661,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Catalan-AnCora/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Catalan-AnCora/{revision}" cached_path(f"{ud_path}/ca_ancora-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ca_ancora-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ca_ancora-ud-train.conllu", Path("datasets") / dataset_name) @@ -656,6 +674,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -665,7 +684,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Polish-LFG/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Polish-LFG/{revision}" cached_path(f"{ud_path}/pl_lfg-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/pl_lfg-ud-test.conllu", Path("datasets") / dataset_name) 
cached_path(f"{ud_path}/pl_lfg-ud-train.conllu", Path("datasets") / dataset_name) @@ -679,6 +698,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = False, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -688,7 +708,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Czech-PDT/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Czech-PDT/{revision}" cached_path(f"{ud_path}/cs_pdt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/cs_pdt-ud-test.conllu", Path("datasets") / dataset_name) cached_path( @@ -732,6 +752,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -741,7 +762,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Slovak-SNK/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Slovak-SNK/{revision}" cached_path(f"{ud_path}/sk_snk-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/sk_snk-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/sk_snk-ud-train.conllu", Path("datasets") / dataset_name) @@ -755,6 +776,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -764,7 +786,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = 
"https://raw.githubusercontent.com/UniversalDependencies/UD_Swedish-Talbanken/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Swedish-Talbanken/{revision}" cached_path(f"{ud_path}/sv_talbanken-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/sv_talbanken-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/sv_talbanken-ud-train.conllu", Path("datasets") / dataset_name) @@ -778,6 +800,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -787,7 +810,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Danish-DDT/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Danish-DDT/{revision}" cached_path(f"{ud_path}/da_ddt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/da_ddt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/da_ddt-ud-train.conllu", Path("datasets") / dataset_name) @@ -801,6 +824,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -810,7 +834,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Norwegian-Bokmaal/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Norwegian-Bokmaal/{revision}" cached_path(f"{ud_path}/no_bokmaal-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/no_bokmaal-ud-test.conllu", Path("datasets") / dataset_name) 
cached_path(f"{ud_path}/no_bokmaal-ud-train.conllu", Path("datasets") / dataset_name) @@ -824,6 +848,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -833,7 +858,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Finnish-TDT/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Finnish-TDT/{revision}" cached_path(f"{ud_path}/fi_tdt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/fi_tdt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/fi_tdt-ud-train.conllu", Path("datasets") / dataset_name) @@ -847,6 +872,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -856,7 +882,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Slovenian-SSJ/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Slovenian-SSJ/{revision}" cached_path(f"{ud_path}/sl_ssj-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/sl_ssj-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/sl_ssj-ud-train.conllu", Path("datasets") / dataset_name) @@ -870,6 +896,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -879,7 +906,7 @@ def __init__( data_folder = base_path / 
dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Croatian-SET/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Croatian-SET/{revision}" cached_path(f"{ud_path}/hr_set-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/hr_set-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/hr_set-ud-train.conllu", Path("datasets") / dataset_name) @@ -893,6 +920,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -902,7 +930,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Serbian-SET/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Serbian-SET/{revision}" cached_path(f"{ud_path}/sr_set-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/sr_set-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/sr_set-ud-train.conllu", Path("datasets") / dataset_name) @@ -916,6 +944,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -925,7 +954,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Bulgarian-BTB/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Bulgarian-BTB/{revision}" cached_path(f"{ud_path}/bg_btb-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/bg_btb-ud-test.conllu", Path("datasets") / dataset_name) 
cached_path(f"{ud_path}/bg_btb-ud-train.conllu", Path("datasets") / dataset_name) @@ -939,6 +968,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -948,7 +978,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Arabic-PADT/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Arabic-PADT/{revision}" cached_path(f"{ud_path}/ar_padt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ar_padt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ar_padt-ud-train.conllu", Path("datasets") / dataset_name) @@ -961,6 +991,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -970,7 +1001,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Hebrew-HTB/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Hebrew-HTB/{revision}" cached_path(f"{ud_path}/he_htb-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/he_htb-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/he_htb-ud-train.conllu", Path("datasets") / dataset_name) @@ -983,6 +1014,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -995,7 +1027,7 @@ def __init__( data_folder = base_path / 
dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Turkish-IMST/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Turkish-IMST/{revision}" cached_path(f"{ud_path}/tr_imst-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/tr_imst-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/tr_imst-ud-train.conllu", Path("datasets") / dataset_name) @@ -1005,7 +1037,11 @@ def __init__( class UD_UKRAINIAN(UniversalDependenciesCorpus): def __init__( - self, base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True + self, + base_path: Optional[Union[str, Path]] = None, + in_memory: bool = True, + split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1015,7 +1051,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Ukrainian-IU/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Ukrainian-IU/{revision}" cached_path(f"{ud_path}/uk_iu-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/uk_iu-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/uk_iu-ud-train.conllu", Path("datasets") / dataset_name) @@ -1029,6 +1065,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1041,7 +1078,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Persian-Seraji/master" + ud_path = 
f"https://raw.githubusercontent.com/UniversalDependencies/UD_Persian-Seraji/{revision}" cached_path(f"{ud_path}/fa_seraji-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/fa_seraji-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/fa_seraji-ud-train.conllu", Path("datasets") / dataset_name) @@ -1055,6 +1092,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1064,7 +1102,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/{revision}" cached_path(f"{ud_path}/ru_syntagrus-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ru_syntagrus-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ru_syntagrus-ud-train.conllu", Path("datasets") / dataset_name) @@ -1078,6 +1116,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1087,7 +1126,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Hindi-HDTB/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Hindi-HDTB/{revision}" cached_path(f"{ud_path}/hi_hdtb-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/hi_hdtb-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/hi_hdtb-ud-train.conllu", Path("datasets") / dataset_name) @@ -1101,6 +1140,7 @@ def 
__init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1110,7 +1150,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/{revision}" cached_path(f"{ud_path}/id_gsd-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/id_gsd-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/id_gsd-ud-train.conllu", Path("datasets") / dataset_name) @@ -1124,6 +1164,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1133,7 +1174,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Japanese-GSD/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Japanese-GSD/{revision}" cached_path(f"{ud_path}/ja_gsd-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ja_gsd-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ja_gsd-ud-train.conllu", Path("datasets") / dataset_name) @@ -1147,6 +1188,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1156,7 +1198,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = 
"https://raw.githubusercontent.com/UniversalDependencies/UD_Chinese-GSD/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Chinese-GSD/{revision}" cached_path(f"{ud_path}/zh_gsd-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/zh_gsd-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/zh_gsd-ud-train.conllu", Path("datasets") / dataset_name) @@ -1170,6 +1212,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1179,7 +1222,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Korean-Kaist/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Korean-Kaist/{revision}" cached_path(f"{ud_path}/ko_kaist-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ko_kaist-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ko_kaist-ud-train.conllu", Path("datasets") / dataset_name) @@ -1193,6 +1236,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1202,7 +1246,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Basque-BDT/master" + ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Basque-BDT/{revision}" cached_path(f"{ud_path}/eu_bdt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/eu_bdt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/eu_bdt-ud-train.conllu", 
Path("datasets") / dataset_name) @@ -1216,6 +1260,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1225,7 +1270,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Classical_Chinese-Kyoto/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Classical_Chinese-Kyoto/{revision}" cached_path(f"{web_path}/lzh_kyoto-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/lzh_kyoto-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/lzh_kyoto-ud-train.conllu", Path("datasets") / dataset_name) @@ -1239,6 +1284,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1248,7 +1294,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Greek-GDT/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Greek-GDT/{revision}" cached_path(f"{web_path}/el_gdt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/el_gdt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/el_gdt-ud-train.conllu", Path("datasets") / dataset_name) @@ -1262,6 +1308,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1271,7 +1318,7 @@ def __init__( data_folder = base_path / 
dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Naija-NSC/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Naija-NSC/{revision}" cached_path(f"{web_path}//pcm_nsc-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}//pcm_nsc-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}//pcm_nsc-ud-train.conllu", Path("datasets") / dataset_name) @@ -1285,6 +1332,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1294,7 +1342,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Livvi-KKPP/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Livvi-KKPP/{revision}" cached_path(f"{web_path}/olo_kkpp-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/olo_kkpp-ud-train.conllu", Path("datasets") / dataset_name) @@ -1307,6 +1355,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1316,7 +1365,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Buryat-BDT/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Buryat-BDT/{revision}" cached_path(f"{web_path}/bxr_bdt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/bxr_bdt-ud-train.conllu", Path("datasets") / dataset_name) @@ -1329,6 +1378,7 @@ def __init__( base_path: Optional[Union[str, 
Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1338,7 +1388,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_North_Sami-Giella/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_North_Sami-Giella/{revision}" cached_path(f"{web_path}/sme_giella-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/sme_giella-ud-train.conllu", Path("datasets") / dataset_name) @@ -1351,6 +1401,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1360,7 +1411,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Marathi-UFAL/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Marathi-UFAL/{revision}" cached_path(f"{web_path}/mr_ufal-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/mr_ufal-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/mr_ufal-ud-train.conllu", Path("datasets") / dataset_name) @@ -1374,6 +1425,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1381,7 +1433,7 @@ def __init__( dataset_name = self.__class__.__name__.lower() data_folder = base_path / dataset_name - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Maltese-MUDT/master" + web_path = 
f"https://raw.githubusercontent.com/UniversalDependencies/UD_Maltese-MUDT/{revision}" cached_path(f"{web_path}/mt_mudt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/mt_mudt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/mt_mudt-ud-train.conllu", Path("datasets") / dataset_name) @@ -1395,6 +1447,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1402,7 +1455,7 @@ def __init__( dataset_name = self.__class__.__name__.lower() data_folder = base_path / dataset_name - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Afrikaans-AfriBooms/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Afrikaans-AfriBooms/{revision}" cached_path(f"{web_path}/af_afribooms-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/af_afribooms-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/af_afribooms-ud-train.conllu", Path("datasets") / dataset_name) @@ -1416,6 +1469,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1425,7 +1479,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Gothic-PROIEL/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Gothic-PROIEL/{revision}" cached_path(f"{web_path}/got_proiel-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/got_proiel-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/got_proiel-ud-train.conllu", Path("datasets") / 
dataset_name) @@ -1439,6 +1493,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1448,7 +1503,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Old_French-SRCMF/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Old_French-SRCMF/{revision}" cached_path(f"{web_path}/fro_srcmf-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/fro_srcmf-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/fro_srcmf-ud-train.conllu", Path("datasets") / dataset_name) @@ -1462,6 +1517,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1469,7 +1525,7 @@ def __init__( dataset_name = self.__class__.__name__.lower() data_folder = base_path / dataset_name - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Wolof-WTB/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Wolof-WTB/{revision}" cached_path(f"{web_path}/wo_wtb-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/wo_wtb-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/wo_wtb-ud-train.conllu", Path("datasets") / dataset_name) @@ -1483,6 +1539,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1492,7 +1549,7 @@ def __init__( data_folder = base_path / dataset_name # 
download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Belarusian-HSE/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Belarusian-HSE/{revision}" cached_path(f"{web_path}/be_hse-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/be_hse-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/be_hse-ud-train.conllu", Path("datasets") / dataset_name) @@ -1506,6 +1563,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1515,7 +1573,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Coptic-Scriptorium/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Coptic-Scriptorium/{revision}" cached_path(f"{web_path}/cop_scriptorium-ud-dev.conllu", Path("datasets") / dataset_name) cached_path( f"{web_path}/cop_scriptorium-ud-test.conllu", @@ -1535,6 +1593,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1544,7 +1603,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Irish-IDT/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Irish-IDT/{revision}" cached_path(f"{web_path}/ga_idt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/ga_idt-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/ga_idt-ud-train.conllu", Path("datasets") / dataset_name) @@ -1558,6 
+1617,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1567,7 +1627,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Latvian-LVTB/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Latvian-LVTB/{revision}" cached_path(f"{web_path}/lv_lvtb-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/lv_lvtb-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/lv_lvtb-ud-train.conllu", Path("datasets") / dataset_name) @@ -1581,6 +1641,7 @@ def __init__( base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, split_multiwords: bool = True, + revision: str = "master", ) -> None: base_path = Path(flair.cache_root) / "datasets" if not base_path else Path(base_path) @@ -1592,7 +1653,7 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Lithuanian-ALKSNIS/master" + web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Lithuanian-ALKSNIS/{revision}" cached_path(f"{web_path}/lt_alksnis-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/lt_alksnis-ud-test.conllu", Path("datasets") / dataset_name) cached_path(f"{web_path}/lt_alksnis-ud-train.conllu", Path("datasets") / dataset_name) From e561689d3f03edee05ef43eb242d533228e617dc Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 26 Mar 2024 14:34:14 +0100 Subject: [PATCH 6/9] datasets: fix train files for UD_CZECH and UD_RUSSIAN --- flair/datasets/treebanks.py | 57 ++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/flair/datasets/treebanks.py 
b/flair/datasets/treebanks.py index 05d8eccdf..1d9e025b9 100644 --- a/flair/datasets/treebanks.py +++ b/flair/datasets/treebanks.py @@ -711,30 +711,17 @@ def __init__( ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Czech-PDT/{revision}" cached_path(f"{ud_path}/cs_pdt-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/cs_pdt-ud-test.conllu", Path("datasets") / dataset_name) - cached_path( - f"{ud_path}/cs_pdt-ud-train-c.conllu", - Path("datasets") / dataset_name / "original", - ) - cached_path( - f"{ud_path}/cs_pdt-ud-train-l.conllu", - Path("datasets") / dataset_name / "original", - ) - cached_path( - f"{ud_path}/cs_pdt-ud-train-m.conllu", - Path("datasets") / dataset_name / "original", - ) - cached_path( - f"{ud_path}/cs_pdt-ud-train-v.conllu", - Path("datasets") / dataset_name / "original", - ) + + train_suffixes = ["ca", "ct", "la", "lt", "ma", "mt", "va"] + + for train_suffix in train_suffixes: + cached_path( + f"{ud_path}/cs_pdt-ud-train-{train_suffix}.conllu", + Path("datasets") / dataset_name / "original", + ) data_path = flair.cache_root / "datasets" / dataset_name - train_filenames = [ - "cs_pdt-ud-train-c.conllu", - "cs_pdt-ud-train-l.conllu", - "cs_pdt-ud-train-m.conllu", - "cs_pdt-ud-train-v.conllu", - ] + train_filenames = [f"cs_pdt-ud-train-{train_suffix}.conllu" for train_suffix in train_suffixes] new_train_file: Path = data_path / "cs_pdt-ud-train-all.conllu" @@ -1105,7 +1092,25 @@ def __init__( ud_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/{revision}" cached_path(f"{ud_path}/ru_syntagrus-ud-dev.conllu", Path("datasets") / dataset_name) cached_path(f"{ud_path}/ru_syntagrus-ud-test.conllu", Path("datasets") / dataset_name) - cached_path(f"{ud_path}/ru_syntagrus-ud-train.conllu", Path("datasets") / dataset_name) + + train_filenames = [ + "ru_syntagrus-ud-train-a.conllu", + "ru_syntagrus-ud-train-b.conllu", + "ru_syntagrus-ud-train-c.conllu", + ] + + for train_file 
in train_filenames: + cached_path(f"{ud_path}/{train_file}", Path("datasets") / dataset_name / "original") + + data_path = flair.cache_root / "datasets" / dataset_name + + new_train_file: Path = data_path / "ru_syntagrus-ud-train-all.conllu" + + if not new_train_file.is_file(): + with open(new_train_file, "w") as f_out: + for train_filename in train_filenames: + with open(data_path / "original" / train_filename) as f_in: + f_out.write(f_in.read()) super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords) @@ -1504,9 +1509,9 @@ def __init__( # download data if necessary web_path = f"https://raw.githubusercontent.com/UniversalDependencies/UD_Old_French-SRCMF/{revision}" - cached_path(f"{web_path}/fro_srcmf-ud-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{web_path}/fro_srcmf-ud-test.conllu", Path("datasets") / dataset_name) - cached_path(f"{web_path}/fro_srcmf-ud-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{web_path}/fro_profiterole-ud-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{web_path}/fro_profiterole-ud-test.conllu", Path("datasets") / dataset_name) + cached_path(f"{web_path}/fro_profiterole-ud-train.conllu", Path("datasets") / dataset_name) super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords) From db4a1b190519b7734598f75427e4390b1a564431 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 26 Mar 2024 14:34:55 +0100 Subject: [PATCH 7/9] datasets: register UD_BURYAT, UD_CHINESE_KYOTO and UD_NAIJA correctly --- flair/datasets/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 56c7e4dd0..b9e822770 100644 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -261,8 +261,10 @@ UD_BASQUE, UD_BELARUSIAN, UD_BULGARIAN, + UD_BURYAT, UD_CATALAN, UD_CHINESE, + UD_CHINESE_KYOTO, UD_COPTIC, UD_CROATIAN, UD_CZECH, @@ -292,6 +294,7 @@ UD_LIVVI, UD_MALTESE, 
UD_MARATHI, + UD_NAIJA, UD_NORTH_SAMI, UD_NORWEGIAN, UD_OLD_CHURCH_SLAVONIC, @@ -535,8 +538,10 @@ "UD_BASQUE", "UD_BELARUSIAN", "UD_BULGARIAN", + "UD_BURYAT", "UD_CATALAN", "UD_CHINESE", + "UD_CHINESE_KYOTO", "UD_COPTIC", "UD_CROATIAN", "UD_CZECH", @@ -566,6 +571,7 @@ "UD_LIVVI", "UD_MALTESE", "UD_MARATHI", + "UD_NAIJA", "UD_NORTH_SAMI", "UD_NORWEGIAN", "UD_OLD_CHURCH_SLAVONIC", From 60b9863b2c209582ccc23be1954b95717522f9e2 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Fri, 14 Jun 2024 15:06:38 +0200 Subject: [PATCH 8/9] load best model at end, even when there is no final evaluation --- flair/trainers/trainer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 2afb72bc8..6d9c3ec54 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -801,6 +801,9 @@ def train_custom( self.return_values["test_score"] = test_results.main_score else: + if (base_path / "best-model.pt").exists(): + log.info("Loading model from best epoch ...") + self.model.load_state_dict(self.model.load(base_path / "best-model.pt").state_dict()) self.return_values["test_score"] = 0 log.info("Test data not provided setting final score to 0") From 9303ee2584c0fba02955161d59affbb0be2497e9 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Thu, 6 Jun 2024 20:59:14 +0200 Subject: [PATCH 9/9] pip: bump min. required version of bpemb (see https://github.com/bheinzerling/bpemb/issues/66) --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 1186efb44..fdb507e44 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ boto3>=1.20.27 -bpemb>=0.3.2 +bpemb>=0.3.5 conllu>=4.0 deprecated>=1.2.13 ftfy>=6.1.0 @@ -26,4 +26,4 @@ transformers[sentencepiece]>=4.18.0,<5.0.0 urllib3<2.0.0,>=1.0.0 # pin below 2 to make dependency resolution faster. 
wikipedia-api>=0.5.7 semver<4.0.0,>=3.0.0 -bioc<3.0.0,>=2.0.0 \ No newline at end of file +bioc<3.0.0,>=2.0.0