Commit

update code
chenzhihao committed Sep 23, 2021
1 parent abdd2ef commit e3b3d42
Showing 9 changed files with 209 additions and 151 deletions.
181 changes: 125 additions & 56 deletions EventExtraction/event_extractor.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion EventExtraction/utils/arguments.py
@@ -63,7 +63,7 @@ class DataAndTrainArguments:
eval_all_checkpoints: bool = Field(default=False,
description="Evaluate all checkpoints starting with the same prefix "
"as model_name ending and ending with step number", )
do_lower_case: bool = Field(default=True, description="Set this flag if you are using an uncased model.")
do_lower_case: bool = Field(default=False, description="Set this flag if you are using an uncased model.")
use_lstm: bool = False
from_scratch: bool = True
from_last_checkpoint: bool = Field(default=False, description="Only if 'from_scratch' was set 'False'")
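For reference, a minimal sketch (using a plain pydantic v1 BaseModel, not this repository's actual DataAndTrainArguments, which carries many more fields) of how the flipped do_lower_case default behaves when the field is omitted versus overridden:

from pydantic import BaseModel, Field

class TinyArguments(BaseModel):
    # Same Field pattern as above; this toy model is only for illustration.
    do_lower_case: bool = Field(default=False, description="Set this flag if you are using an uncased model.")

print(TinyArguments().do_lower_case)                    # False: the new default
print(TinyArguments(do_lower_case=True).do_lower_case)  # True: explicit override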
7 changes: 4 additions & 3 deletions EventExtraction/utils/ee_seq.py
@@ -111,10 +111,11 @@ def tokenize(text, vocab, do_lower_case=False):


def convert_examples_to_features(examples, label2id, max_seq_length, tokenizer, cls_token="[CLS]", sep_token="[SEP]",
pad_token=0, pad_token_segment_id=0, mask_padding_with_zero=True, data_type="train"):
pad_token=0, pad_token_segment_id=0, mask_padding_with_zero=True, data_type="train",
do_lower_case=False):
features = []
for (ex_index, example) in tqdm(enumerate(examples), desc=f"Convert {data_type} examples to features"):
tokens = tokenize(example.text_a, tokenizer.vocab)
tokens = tokenize(example.text_a, tokenizer.vocab, do_lower_case)
if len(tokens) > max_seq_length - 2:
tokens = tokens[: max_seq_length - 2]
tokens = [cls_token] + tokens + [sep_token]
@@ -136,7 +137,7 @@ def convert_examples_to_features(examples, label2id, max_seq_length, tokenizer,
for argument, (event_type, role) in arguments.items():
if (event_type, role) not in label2id and event_type == "OTHER":
continue
a_token = tokenize(argument, tokenizer.vocab)
a_token = tokenize(argument, tokenizer.vocab, do_lower_case)
a_token_id = tokenizer.convert_tokens_to_ids(a_token)
start_index = search(a_token_id, token_ids)
if start_index != -1:
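The body of tokenize itself is collapsed in this diff; a minimal sketch, assuming a character-level vocab lookup as is typical for Chinese BERT vocabularies, of how the do_lower_case flag threaded through above would be applied:

def tokenize(text, vocab, do_lower_case=False):
    # Assumed behaviour, not the repository's exact implementation: optionally
    # lower-case each character before the vocab lookup, falling back to [UNK].
    tokens = []
    for char in text:
        if do_lower_case:
            char = char.lower()
        tokens.append(char if char in vocab else "[UNK]")
    return tokens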
70 changes: 32 additions & 38 deletions README.md
@@ -14,8 +14,9 @@
3. transformers==4.10.0
4. pydantic==1.8.2
5. accelerate==0.4.0
6. tensorflow==2.6.0
7. torch==1.9.0
6. tensorboard==2.6.0
7. spanner==3.3.8
8. torch==1.9.0

# Training data format

@@ -42,42 +43,41 @@
from EventExtraction import EventExtractor, DataAndTrainArguments
config = {
'task_name': 'ee',
'data_dir': '../data/normal_data/news2',
'model_type': 'bert',
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext',
'model_type': 'bert', # bert, nezha
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext', # nezha-base-wwm
'model_sate_dict_path': '../data/output/bert/best_model', # path to a saved checkpoint, used to resume training
'output_dir': '../data/output/', # main directory for intermediate results, models, and logs produced during training
'do_lower_case': False, # mainly controls whether text is lower-cased during tokenization
'cache_dir': '', # where downloaded pretrained models are cached
'evaluate_during_training': True,
'do_eval_per_epoch': True,
'use_lstm': True,
'from_scratch': True,
'from_last_checkpoint': False,
'evaluate_during_training': True, # whether to evaluate the model during training; default True
'use_lstm': True, # default False, meaning the model architecture is bert_crf
'from_scratch': True, # whether to train from scratch; default True
'from_last_checkpoint': False, # whether to resume from the latest checkpoint; default False
'early_stop': False,
'overwrite_output_dir': True,
'overwrite_cache': True,
'no_cuda': False,
'overwrite_cache': True, # whether to rebuild cached features; default True; if False, features are loaded from the cached file
'no_cuda': False, # whether to disable CUDA and run on CPU only; default False
'fp16': True,
'train_max_seq_length': 128,
'eval_max_seq_length': 128,
'train_max_seq_length': 128, # default 512
'eval_max_seq_length': 128, # default 512
'per_gpu_train_batch_size': 16,
'per_gpu_eval_batch_size': 16,
'gradient_accumulation_steps': 1,
'learning_rate': 5e-05,
'learning_rate': 5e-05, # learning rate for the BERT and LSTM parameters
'crf_learning_rate': 5e-05,
'weight_decay': 0.01,
'adam_epsilon': 1e-08,
'warmup_proportion': 0.1,
'num_train_epochs': 50.0,
'max_steps': -1,
'max_steps': -1, # if set to a positive value, 'num_train_epochs' is ignored
'tolerance': 5, # number of epochs tolerated by early stopping
'logging_steps': 500,
'save_steps': 500,
'logging_steps': 500, # how often (in steps) tensorboard logs are written
'save_steps': 500, # how often (in steps) intermediate checkpoints are saved
'scheduler_type': 'linear', # ["linear","cosine","cosine_with_restarts","polynomial","constant","constant_with_warmup"]
'cuda_number': '0', # '0,1,2,3'
'cuda_number': '0', # '0,1,2,3'; GPU card id(s) must be specified when running on GPU
'seed': 2333,
'local_rank': -1,
'dropout_rate': 0.3
}
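The lines that follow the config in the README are collapsed here; based on the calls visible elsewhere in this commit, the training section continues roughly as:

# Sketch of the follow-up usage (DataAndTrainArguments(**config), EventExtractor(args)
# and train_and_valid() all appear elsewhere in this diff).
args = DataAndTrainArguments(**config)
extractor = EventExtractor(args)
extractor.train_and_valid()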
@@ -103,18 +103,17 @@ extractor.train_and_valid()
from EventExtraction import EventExtractor, DataAndTrainArguments
config = {
'task_name': 'ee',
'task_name': 'ee', # ee, ner
'data_dir': '../data/normal_data/news2',
'model_type': 'bert',
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext',
'model_type': 'bert', # bert, nezha
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext', # nezha-base-wwm
'output_dir': '../data/output/', # main directory for intermediate results, models, and logs produced during training
'do_lower_case': False,
'use_lstm': False,
'no_cuda': False,
'eval_max_seq_length': 128,
'do_lower_case': False, # mainly controls whether text is lower-cased during tokenization
'use_lstm': False, # default False, meaning the model architecture is bert_crf
'no_cuda': False, # whether to disable CUDA and run on CPU only; default False
'eval_max_seq_length': 128, # default 512
'per_gpu_eval_batch_size': 8,
'cuda_number': '0', # '0,1,2,3'
'local_rank': -1,
'cuda_number': '0', # '0,1,2,3'; GPU card id(s) must be specified when running on GPU
}
args = DataAndTrainArguments(**config)
@@ -140,18 +139,13 @@ from EventExtraction import EventExtractor, DataAndTrainArguments
config = {
'task_name': 'ee',
'data_dir': '../data/normal_data/news2',
'model_type': 'bert',
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext',
'output_dir': '../data/output/', # main directory for intermediate results, models, and logs produced during training
'do_lower_case': True,
'use_lstm': False,
'no_cuda': False,
'eval_max_seq_length': 128,
'use_lstm': True, # default is False
'eval_max_seq_length': 512,
}
args = DataAndTrainArguments(**config)
extractor = EventExtractor(args, state='pred')
extractor = EventExtractor(args, state='pred', model_path='../data/model')
# data_type: either 'test' or None; 'test' means predict on the test dataset
# input_texts: if not empty, predict on the new texts provided
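Putting the two prediction modes together (the input_texts form appears verbatim in experiments/predict.py below; the data_type form is inferred from the comment above and is an assumption about the call signature):

# Predict on the bundled test split (assumed form, per the data_type comment):
for res in extractor.predict(data_type='test'):
    print(res)

# Predict on new texts:
for res in extractor.predict(input_texts=["百炼智能完成A轮一亿元融资,由今日头条领投"]):
    print(res)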
19 changes: 9 additions & 10 deletions experiments/eval_model.py
@@ -12,21 +12,20 @@
from EventExtraction import EventExtractor, DataAndTrainArguments

config = {
'task_name': 'ee',
'task_name': 'ee', # ee
'data_dir': '../data/normal_data/news2',
'model_type': 'bert',
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext',
'model_type': 'bert', # bert, nezha
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext', # nezha-base-wwm
'output_dir': '../data/output/', # main directory for intermediate results, models, and logs produced during training
'do_lower_case': False,
'use_lstm': False,
'no_cuda': False,
'eval_max_seq_length': 128,
'do_lower_case': False, # mainly controls whether text is lower-cased during tokenization
'use_lstm': False, # default False, meaning the model architecture is bert_crf
'no_cuda': False, # whether to disable CUDA and run on CPU only; default False
'eval_max_seq_length': 128, # default 512
'per_gpu_eval_batch_size': 8,
'cuda_number': '0', # '0,1,2,3'
'local_rank': -1,
'cuda_number': '0', # '0,1,2,3'; GPU card id(s) must be specified when running on GPU
}

args = DataAndTrainArguments(**config)
args = DataAndTrainArguments(**config) # noqa
extractor = EventExtractor(args)

# evaluate all checkpoint files on the dev dataset
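The evaluation calls themselves are collapsed in this diff. As a purely hypothetical illustration (not code from this repository) of what evaluating all checkpoints typically involves, given the eval_all_checkpoints description earlier in this commit (checkpoints named by step number under output_dir):

import glob
import os

def list_checkpoints(output_dir):
    # Hypothetical helper: collect checkpoint-<step> directories, ordered by step.
    paths = glob.glob(os.path.join(output_dir, "checkpoint-*"))
    return sorted(paths, key=lambda p: int(p.rsplit("-", 1)[-1]))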
16 changes: 6 additions & 10 deletions experiments/predict.py
@@ -9,22 +9,18 @@
Change Activity:
======================================
"""
import json
from EventExtraction import EventExtractor, DataAndTrainArguments

config = {
'task_name': 'ee',
'data_dir': '../data/normal_data/news2',
'model_type': 'bert',
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext',
'output_dir': '../data/output/', # main directory for intermediate results, models, and logs produced during training
'do_lower_case': True,
'use_lstm': False,
'no_cuda': False,
'eval_max_seq_length': 128,
'use_lstm': True, # default is False
'eval_max_seq_length': 512,
}

args = DataAndTrainArguments(**config)
extractor = EventExtractor(args, state='pred')
args = DataAndTrainArguments(**config) # noqa
extractor = EventExtractor(args, state='pred', model_path='../data/model')

# data_type: either 'test' or None; 'test' means predict on the test dataset
# input_texts: if not empty, predict on the new texts provided
@@ -43,4 +39,4 @@
texts = ["博盛医疗完成Pre-A轮融资澳银资本重点参与",
"百炼智能完成A轮一亿元融资,由今日头条领投"]
for res in extractor.predict(input_texts=texts):
print(res)
print(json.dumps(res, ensure_ascii=False, indent=2))
55 changes: 27 additions & 28 deletions experiments/train_model.py
@@ -10,47 +10,46 @@
======================================
"""
import sys
sys.path.append('/data/chenzhihao/EventExtraction')
sys.path.append('/data/chenzhihao/EventExtraction/')
from EventExtraction import EventExtractor, DataAndTrainArguments

config = {
'task_name': 'ee',
'data_dir': '../data/normal_data/news2',
'model_type': 'bert',
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext',
'model_sate_dict_path': '../data/output/bert/best_model', # path to a saved checkpoint, used to resume training
'output_dir': '../data/output/', # main directory for intermediate results, models, and logs produced during training
'cache_dir': '', # where downloaded pretrained models are cached
'evaluate_during_training': True,
'do_eval_per_epoch': True,
'use_lstm': False,
'from_scratch': True,
'from_last_checkpoint': False,
'task_name': 'ner', # ner
'data_dir': '../data/normal_data/ner',
'model_type': 'bert', # bert, nezha
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext', # '/data/chenzhihao/nezha-base-www'
'model_sate_dict_path': '../data/output/bert/best_model', # path to a saved checkpoint, used to resume training
'output_dir': '../data/output/', # main directory for intermediate results, models, and logs produced during training
'do_lower_case': False, # mainly controls whether text is lower-cased during tokenization
'cache_dir': '', # where downloaded pretrained models are cached
'evaluate_during_training': True, # whether to evaluate the model during training; default True
'use_lstm': False, # default False, meaning the model architecture is bert_crf
'from_scratch': True, # whether to train from scratch; default True
'from_last_checkpoint': False, # whether to resume from the latest checkpoint; default False
'early_stop': False,
'overwrite_output_dir': True,
'overwrite_cache': True,
'no_cuda': False,
'overwrite_cache': True, # whether to rebuild cached features; default True; if False, features are loaded from the cached file
'no_cuda': False, # whether to disable CUDA and run on CPU only; default False
'fp16': True,
'train_max_seq_length': 128,
'eval_max_seq_length': 128,
'per_gpu_train_batch_size': 32,
'per_gpu_eval_batch_size': 32,
'train_max_seq_length': 32, # default 512
'eval_max_seq_length': 32, # default 512
'per_gpu_train_batch_size': 16,
'per_gpu_eval_batch_size': 16,
'gradient_accumulation_steps': 1,
'learning_rate': 5e-05,
'learning_rate': 5e-05, # learning rate for the BERT and LSTM parameters
'crf_learning_rate': 5e-05,
'weight_decay': 0.01,
'adam_epsilon': 1e-08,
'warmup_proportion': 0.1,
'num_train_epochs': 30.0,
'max_steps': -1,
'tolerance': 5, # number of epochs tolerated by early stopping
'logging_steps': 500,
'save_steps': 500,
'scheduler_type': 'cosine',
'num_train_epochs': 3.0,
'max_steps': -1, # if set to a positive value, 'num_train_epochs' is ignored
'tolerance': 5, # number of epochs tolerated by early stopping
'logging_steps': 500, # how often (in steps) tensorboard logs are written
'save_steps': 500, # how often (in steps) intermediate checkpoints are saved
# ["linear","cosine","cosine_with_restarts","polynomial","constant","constant_with_warmup"]
'cuda_number': '0', # '0,1,2,3'
'scheduler_type': 'linear',
'cuda_number': '3', # '0,1,2,3'; GPU card id(s) must be specified when running on GPU
'seed': 2333,
'local_rank': -1,
'dropout_rate': 0.3
}
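As a small illustration (not code from this repository) of the step accounting implied by the max_steps and num_train_epochs comments above, where a positive max_steps takes precedence:

def total_training_steps(num_batches_per_epoch, gradient_accumulation_steps,
                         num_train_epochs, max_steps):
    # Assumed convention, matching the config comments: a positive max_steps wins.
    if max_steps > 0:
        return max_steps
    return (num_batches_per_epoch // gradient_accumulation_steps) * int(num_train_epochs)

# e.g. 1000 batches per epoch, accumulation 1, 3 epochs, max_steps -1 -> 3000 steps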

2 changes: 1 addition & 1 deletion requirements.txt
@@ -5,4 +5,4 @@ accelerate==0.4.0
numpy==1.21.2
tqdm==4.62.2
torch==1.9.0
tensorflow==2.6.0
tensorboard==2.6.0
8 changes: 4 additions & 4 deletions setup.py
@@ -24,9 +24,9 @@
version = f.read().strip()

# change this to your own project name; ideally it matches the remote repository name
proj_name = "EventExtraction"
proj_name = "event_extraction_pytorch"
keywords = "event_extraction, bert_crf"
git_url = f"https://gitlab.bailian-ai.com/ai_algo/{proj_name}"
git_url = f"https://github.com/zhihao-chen/{proj_name}"


def load_requirements(file_name="requirements.txt", comment_char="#"):
@@ -47,8 +47,8 @@ def load_requirements(file_name="requirements.txt", comment_char="#"):

setup(
name=proj_name,
author="bailian.ai",
author_email="[email protected]",
author="andrew chen",
author_email="[email protected]",
description=long_description,
version=version,
url=git_url,
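The remainder of setup() and the body of load_requirements are collapsed in this diff. A plausible sketch of what load_requirements usually does (read requirements.txt and drop comments and blank lines), not necessarily the repository's exact implementation:

def load_requirements(file_name="requirements.txt", comment_char="#"):
    # Illustrative sketch only; the real body is not shown in this commit view.
    requirements = []
    with open(file_name, "r", encoding="utf-8") as f:
        for line in f:
            line = line.split(comment_char, 1)[0].strip()  # drop inline comments
            if line:
                requirements.append(line)
    return requirements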
