# project.yml (forked from explosion/projects)
# Project title.
title: 'Dependency Parsing (Penn Treebank)'

# Values interpolated into the commands of this file as ${vars.<key>}.
vars:
  # Selects configs/<name>.cfg and names the training/ and metrics/ outputs.
  # Pick between "transformer" or "cnn_glove_small".
  name: 'cnn_glove_small'
  # Passed as --gpu-id to the train and evaluate commands.
  gpu: 0
# Directories required by the project. The project CLI makes sure each of
# them exists.
directories:
- 'assets'
- 'training'
- 'configs'
- 'metrics'
- 'corpus'
# Data files the project expects under assets/. The three PTB files carry no
# `url` because they are not publicly available; add them by hand at `dest`.
assets:
- dest: 'assets/PTB_SD_3_3_0/train.gold.conll'
  description: 'Training data (not available publicly so you have to add the file yourself)'
- dest: 'assets/PTB_SD_3_3_0/dev.gold.conll'
  description: 'Development data (not available publicly so you have to add the file yourself)'
- dest: 'assets/PTB_SD_3_3_0/test.gold.conll'
  description: 'Test data (not available publicly so you have to add the file yourself)'
# Fetched over HTTPS rather than plain HTTP.
- url: 'https://nlp.stanford.edu/data/glove.840B.300d.zip'
  dest: 'assets/vectors.zip'
  description: 'GloVe vectors'
# Raw-content URL: the github.com/.../blob/... address serves the HTML file
# viewer, so downloading it would not produce a JSON file.
- url: 'https://raw.githubusercontent.com/explosion/spacy-lookups-data/master/spacy_lookups_data/data/en_orth_variants.json'
  dest: 'assets/orth_variants.json'
  description: 'A file containing orth variants for data augmentation'
# Named sequences of commands. `all` lists every step from installing the
# dependencies through evaluation, in the order given.
workflows:
  all:
  - install
  - vectors
  - corpus
  - train
  - evaluate
# Individual project commands; each entry has a name, help text and script.
commands:
# Installs the Python packages listed in requirements.txt.
- name: install
  help: 'Install dependencies'
  script:
  - 'python -m pip install -r requirements.txt'
  deps:
  - 'requirements.txt'
# Runs scripts/fix_conll_format.py over each raw PTB file, writing the results
# to assets/PTB_SD_3_3_0_pos_fix, then converts that fixed copy to .spacy files.
- name: corpus
  help: "Convert the data to spaCy's format"
  script:
  - 'mkdir -p assets/PTB_SD_3_3_0_pos_fix'
  - 'python scripts/fix_conll_format.py assets/PTB_SD_3_3_0/train.gold.conll assets/PTB_SD_3_3_0_pos_fix/train.conll'
  - 'python scripts/fix_conll_format.py assets/PTB_SD_3_3_0/dev.gold.conll assets/PTB_SD_3_3_0_pos_fix/dev.conll'
  - 'python scripts/fix_conll_format.py assets/PTB_SD_3_3_0/test.gold.conll assets/PTB_SD_3_3_0_pos_fix/test.conll'
  # Convert the fixed directory, not the raw one: otherwise the output of the
  # three steps above is never used. The fixed files are named
  # {train,dev,test}.conll, which lines up with the {train,dev,test}.spacy
  # files declared under `outputs`.
  - 'python -m spacy convert assets/PTB_SD_3_3_0_pos_fix corpus/'
  deps:
  # The fix script is a dependency so that editing it re-runs this command.
  - 'scripts/fix_conll_format.py'
  - 'assets/PTB_SD_3_3_0/train.gold.conll'
  - 'assets/PTB_SD_3_3_0/dev.gold.conll'
  - 'assets/PTB_SD_3_3_0/test.gold.conll'
  outputs:
  - 'corpus/train.spacy'
  - 'corpus/dev.spacy'
  - 'corpus/test.spacy'
# Builds corpus/en_vectors from the downloaded assets/vectors.zip archive
# using `spacy init vectors`.
- name: vectors
  help: 'Convert, truncate and prune the vectors.'
  script:
  - 'python -m spacy init vectors en assets/vectors.zip corpus/en_vectors -n en_glove840b_vectors_md'
  deps:
  - 'assets/vectors.zip'
  outputs:
  - 'corpus/en_vectors'
# Trains with configs/<vars.name>.cfg and writes the run to
# training/<vars.name>; the GPU id comes from vars.gpu.
- name: train
  help: 'Train the full pipeline'
  script:
  - 'python -m spacy train configs/${vars.name}.cfg -o training/${vars.name} --gpu-id ${vars.gpu}'
  deps:
  - 'corpus/train.spacy'
  - 'corpus/dev.spacy'
  - 'configs/${vars.name}.cfg'
  - 'corpus/en_vectors'
  outputs:
  - 'training/${vars.name}/model-best'
# Scores training/<vars.name>/model-best on corpus/test.spacy and stores the
# result as metrics/<vars.name>.json.
- name: evaluate
  help: 'Evaluate on the test data and save the metrics'
  script:
  - 'python -m spacy evaluate ./training/${vars.name}/model-best ./corpus/test.spacy --output ./metrics/${vars.name}.json --gpu-id ${vars.gpu} --gold-preproc'
  deps:
  - 'training/${vars.name}/model-best'
  - 'corpus/test.spacy'
  outputs:
  - 'metrics/${vars.name}.json'
# Deletes everything the other commands generate: trained models, metrics,
# converted corpora/vectors, and the fixed CoNLL copies.
- name: clean
  help: 'Remove intermediate files'
  script:
  - 'rm -rf training/*'
  - 'rm -rf metrics/*'
  - 'rm -rf corpus/*'
  # Generated by the corpus command from the raw PTB files; the hand-added
  # originals in assets/PTB_SD_3_3_0 are left untouched.
  - 'rm -rf assets/PTB_SD_3_3_0_pos_fix'