Commit

update code
chenzhihao committed Sep 23, 2021
1 parent abdd2ef commit e3b3d42
Showing 9 changed files with 209 additions and 151 deletions.
181 changes: 125 additions & 56 deletions EventExtraction/event_extractor.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion EventExtraction/utils/arguments.py
@@ -63,7 +63,7 @@ class DataAndTrainArguments:
eval_all_checkpoints: bool = Field(default=False,
description="Evaluate all checkpoints starting with the same prefix "
"as model_name ending and ending with step number", )
do_lower_case: bool = Field(default=True, description="Set this flag if you are using an uncased model.")
do_lower_case: bool = Field(default=False, description="Set this flag if you are using an uncased model.")
use_lstm: bool = False
from_scratch: bool = True
from_last_checkpoint: bool = Field(default=False, description="Only if 'from_scratch' was set 'False'")
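For reference, a minimal sketch (using a plain pydantic v1 BaseModel, not this repository's actual DataAndTrainArguments, which carries many more fields) of how the flipped do_lower_case default behaves when the field is omitted versus overridden:

from pydantic import BaseModel, Field

class TinyArguments(BaseModel):
    # Same Field pattern as above; this toy model is only for illustration.
    do_lower_case: bool = Field(default=False, description="Set this flag if you are using an uncased model.")

print(TinyArguments().do_lower_case)                    # False: the new default
print(TinyArguments(do_lower_case=True).do_lower_case)  # True: explicit override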
7 changes: 4 additions & 3 deletions EventExtraction/utils/ee_seq.py
@@ -111,10 +111,11 @@ def tokenize(text, vocab, do_lower_case=False):


def convert_examples_to_features(examples, label2id, max_seq_length, tokenizer, cls_token="[CLS]", sep_token="[SEP]",
pad_token=0, pad_token_segment_id=0, mask_padding_with_zero=True, data_type="train"):
pad_token=0, pad_token_segment_id=0, mask_padding_with_zero=True, data_type="train",
do_lower_case=False):
features = []
for (ex_index, example) in tqdm(enumerate(examples), desc=f"Convert {data_type} examples to features"):
tokens = tokenize(example.text_a, tokenizer.vocab)
tokens = tokenize(example.text_a, tokenizer.vocab, do_lower_case)
if len(tokens) > max_seq_length - 2:
tokens = tokens[: max_seq_length - 2]
tokens = [cls_token] + tokens + [sep_token]
@@ -136,7 +137,7 @@ def convert_examples_to_features(examples, label2id, max_seq_length, tokenizer,
for argument, (event_type, role) in arguments.items():
if (event_type, role) not in label2id and event_type == "OTHER":
continue
a_token = tokenize(argument, tokenizer.vocab)
a_token = tokenize(argument, tokenizer.vocab, do_lower_case)
a_token_id = tokenizer.convert_tokens_to_ids(a_token)
start_index = search(a_token_id, token_ids)
if start_index != -1:
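The body of tokenize itself is collapsed in this diff; a minimal sketch, assuming a character-level vocab lookup as is typical for Chinese BERT vocabularies, of how the do_lower_case flag threaded through above would be applied:

def tokenize(text, vocab, do_lower_case=False):
    # Assumed behaviour, not the repository's exact implementation: optionally
    # lower-case each character before the vocab lookup, falling back to [UNK].
    tokens = []
    for char in text:
        if do_lower_case:
            char = char.lower()
        tokens.append(char if char in vocab else "[UNK]")
    return tokens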
70 changes: 32 additions & 38 deletions README.md
@@ -14,8 +14,9 @@
3. transformers==4.10.0
4. pydantic==1.8.2
5. accelerate==0.4.0
6. tensorflow==2.6.0
7. torch==1.9.0
6. tensorboard==2.6.0
7. spanner==3.3.8
8. torch==1.9.0

# Training data format

@@ -42,42 +43,41 @@
from EventExtraction import EventExtractor, DataAndTrainArguments
config = {
'task_name': 'ee',
'data_dir': '../data/normal_data/news2',
'model_type': 'bert',
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext',
'model_type': 'bert', # bert, nezha
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext', # nezha-base-wwm
'model_sate_dict_path': '../data/output/bert/best_model', # path to a saved checkpoint, used to resume training
'output_dir': '../data/output/', # main directory for intermediate results, models, and logs produced during training
'do_lower_case': False, # mainly controls whether text is lower-cased during tokenization
'cache_dir': '', # where downloaded pretrained models are cached
'evaluate_during_training': True,
'do_eval_per_epoch': True,
'use_lstm': True,
'from_scratch': True,
'from_last_checkpoint': False,
'evaluate_during_training': True, # whether to evaluate the model during training; default True
'use_lstm': True, # default False, meaning the model architecture is bert_crf
'from_scratch': True, # whether to train from scratch; default True
'from_last_checkpoint': False, # whether to resume from the latest checkpoint; default False
'early_stop': False,
'overwrite_output_dir': True,
'overwrite_cache': True,
'no_cuda': False,
'overwrite_cache': True, # whether to rebuild cached features; default True; if False, features are loaded from the cached file
'no_cuda': False, # whether to disable CUDA and run on CPU only; default False
'fp16': True,
'train_max_seq_length': 128,
'eval_max_seq_length': 128,
'train_max_seq_length': 128, # default 512
'eval_max_seq_length': 128, # default 512
'per_gpu_train_batch_size': 16,
'per_gpu_eval_batch_size': 16,
'gradient_accumulation_steps': 1,
'learning_rate': 5e-05,
'learning_rate': 5e-05, # learning rate for the BERT and LSTM parameters
'crf_learning_rate': 5e-05,
'weight_decay': 0.01,
'adam_epsilon': 1e-08,
'warmup_proportion': 0.1,
'num_train_epochs': 50.0,
'max_steps': -1,
'max_steps': -1, # if set to a positive value, 'num_train_epochs' is ignored
'tolerance': 5, # number of epochs tolerated by early stopping
'logging_steps': 500,
'save_steps': 500,
'logging_steps': 500, # how often (in steps) tensorboard logs are written
'save_steps': 500, # how often (in steps) intermediate checkpoints are saved
'scheduler_type': 'linear', # ["linear","cosine","cosine_with_restarts","polynomial","constant","constant_with_warmup"]
'cuda_number': '0', # '0,1,2,3'
'cuda_number': '0', # '0,1,2,3'; GPU card id(s) must be specified when running on GPU
'seed': 2333,
'local_rank': -1,
'dropout_rate': 0.3
}
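The lines that follow the config in the README are collapsed here; based on the calls visible elsewhere in this commit, the training section continues roughly as:

# Sketch of the follow-up usage (DataAndTrainArguments(**config), EventExtractor(args)
# and train_and_valid() all appear elsewhere in this diff).
args = DataAndTrainArguments(**config)
extractor = EventExtractor(args)
extractor.train_and_valid()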
@@ -103,18 +103,17 @@ extractor.train_and_valid()
from EventExtraction import EventExtractor, DataAndTrainArguments
config = {
'task_name': 'ee',
'task_name': 'ee', # ee, ner
'data_dir': '../data/normal_data/news2',
'model_type': 'bert',
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext',
'model_type': 'bert', # bert, nezha
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext', # nezha-base-wwm
'output_dir': '../data/output/', # main directory for intermediate results, models, and logs produced during training
'do_lower_case': False,
'use_lstm': False,
'no_cuda': False,
'eval_max_seq_length': 128,
'do_lower_case': False, # mainly controls whether text is lower-cased during tokenization
'use_lstm': False, # default False, meaning the model architecture is bert_crf
'no_cuda': False, # whether to disable CUDA and run on CPU only; default False
'eval_max_seq_length': 128, # default 512
'per_gpu_eval_batch_size': 8,
'cuda_number': '0', # '0,1,2,3'
'local_rank': -1,
'cuda_number': '0', # '0,1,2,3'; GPU card id(s) must be specified when running on GPU
}
args = DataAndTrainArguments(**config)
@@ -140,18 +139,13 @@ from EventExtraction import EventExtractor, DataAndTrainArguments
config = {
'task_name': 'ee',
'data_dir': '../data/normal_data/news2',
'model_type': 'bert',
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext',
'output_dir': '../data/output/', # main directory for intermediate results, models, and logs produced during training
'do_lower_case': True,
'use_lstm': False,
'no_cuda': False,
'eval_max_seq_length': 128,
'use_lstm': True, # default is False
'eval_max_seq_length': 512,
}
args = DataAndTrainArguments(**config)
extractor = EventExtractor(args, state='pred')
extractor = EventExtractor(args, state='pred', model_path='../data/model')
# data_type: either 'test' or None; 'test' means predict on the test dataset
# input_texts: if not empty, predict on the new texts provided
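Putting the two prediction modes together (the input_texts form appears verbatim in experiments/predict.py below; the data_type form is inferred from the comment above and is an assumption about the call signature):

# Predict on the bundled test split (assumed form, per the data_type comment):
for res in extractor.predict(data_type='test'):
    print(res)

# Predict on new texts:
for res in extractor.predict(input_texts=["百炼智能完成A轮一亿元融资,由今日头条领投"]):
    print(res)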
19 changes: 9 additions & 10 deletions experiments/eval_model.py
@@ -12,21 +12,20 @@
from EventExtraction import EventExtractor, DataAndTrainArguments

config = {
'task_name': 'ee',
'task_name': 'ee', # ee
'data_dir': '../data/normal_data/news2',
'model_type': 'bert',
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext',
'model_type': 'bert', # bert, nezha
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext', # nezha-base-wwm
'output_dir': '../data/output/', # main directory for intermediate results, models, and logs produced during training
'do_lower_case': False,
'use_lstm': False,
'no_cuda': False,
'eval_max_seq_length': 128,
'do_lower_case': False, # mainly controls whether text is lower-cased during tokenization
'use_lstm': False, # default False, meaning the model architecture is bert_crf
'no_cuda': False, # whether to disable CUDA and run on CPU only; default False
'eval_max_seq_length': 128, # default 512
'per_gpu_eval_batch_size': 8,
'cuda_number': '0', # '0,1,2,3'
'local_rank': -1,
'cuda_number': '0', # '0,1,2,3'; GPU card id(s) must be specified when running on GPU
}

args = DataAndTrainArguments(**config)
args = DataAndTrainArguments(**config) # noqa
extractor = EventExtractor(args)

# evaluate all checkpoint files on the dev dataset
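The evaluation calls themselves are collapsed in this diff. As a purely hypothetical illustration (not code from this repository) of what evaluating all checkpoints typically involves, given the eval_all_checkpoints description earlier in this commit (checkpoints named by step number under output_dir):

import glob
import os

def list_checkpoints(output_dir):
    # Hypothetical helper: collect checkpoint-<step> directories, ordered by step.
    paths = glob.glob(os.path.join(output_dir, "checkpoint-*"))
    return sorted(paths, key=lambda p: int(p.rsplit("-", 1)[-1]))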
16 changes: 6 additions & 10 deletions experiments/predict.py
@@ -9,22 +9,18 @@
Change Activity:
======================================
"""
import json
from EventExtraction import EventExtractor, DataAndTrainArguments

config = {
'task_name': 'ee',
'data_dir': '../data/normal_data/news2',
'model_type': 'bert',
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext',
'output_dir': '../data/output/', # main directory for intermediate results, models, and logs produced during training
'do_lower_case': True,
'use_lstm': False,
'no_cuda': False,
'eval_max_seq_length': 128,
'use_lstm': True, # default is False
'eval_max_seq_length': 512,
}

args = DataAndTrainArguments(**config)
extractor = EventExtractor(args, state='pred')
args = DataAndTrainArguments(**config) # noqa
extractor = EventExtractor(args, state='pred', model_path='../data/model')

# data_type: either 'test' or None; 'test' means predict on the test dataset
# input_texts: if not empty, predict on the new texts provided
@@ -43,4 +39,4 @@
texts = ["博盛医疗完成Pre-A轮融资澳银资本重点参与",
"百炼智能完成A轮一亿元融资,由今日头条领投"]
for res in extractor.predict(input_texts=texts):
print(res)
print(json.dumps(res, ensure_ascii=False, indent=2))
55 changes: 27 additions & 28 deletions experiments/train_model.py
@@ -10,47 +10,46 @@
======================================
"""
import sys
sys.path.append('/data/chenzhihao/EventExtraction')
sys.path.append('/data/chenzhihao/EventExtraction/')
from EventExtraction import EventExtractor, DataAndTrainArguments

config = {
'task_name': 'ee',
'data_dir': '../data/normal_data/news2',
'model_type': 'bert',
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext',
'model_sate_dict_path': '../data/output/bert/best_model', # path to a saved checkpoint, used to resume training
'output_dir': '../data/output/', # main directory for intermediate results, models, and logs produced during training
'cache_dir': '', # where downloaded pretrained models are cached
'evaluate_during_training': True,
'do_eval_per_epoch': True,
'use_lstm': False,
'from_scratch': True,
'from_last_checkpoint': False,
'task_name': 'ner', # ner
'data_dir': '../data/normal_data/ner',
'model_type': 'bert', # bert, nezha
'model_name_or_path': 'hfl/chinese-roberta-wwm-ext', # '/data/chenzhihao/nezha-base-www'
'model_sate_dict_path': '../data/output/bert/best_model', # path to a saved checkpoint, used to resume training
'output_dir': '../data/output/', # main directory for intermediate results, models, and logs produced during training
'do_lower_case': False, # mainly controls whether text is lower-cased during tokenization
'cache_dir': '', # where downloaded pretrained models are cached
'evaluate_during_training': True, # whether to evaluate the model during training; default True
'use_lstm': False, # default False, meaning the model architecture is bert_crf
'from_scratch': True, # whether to train from scratch; default True
'from_last_checkpoint': False, # whether to resume from the latest checkpoint; default False
'early_stop': False,
'overwrite_output_dir': True,
'overwrite_cache': True,
'no_cuda': False,
'overwrite_cache': True, # whether to rebuild cached features; default True; if False, features are loaded from the cached file
'no_cuda': False, # whether to disable CUDA and run on CPU only; default False
'fp16': True,
'train_max_seq_length': 128,
'eval_max_seq_length': 128,
'per_gpu_train_batch_size': 32,
'per_gpu_eval_batch_size': 32,
'train_max_seq_length': 32, # default 512
'eval_max_seq_length': 32, # default 512
'per_gpu_train_batch_size': 16,
'per_gpu_eval_batch_size': 16,
'gradient_accumulation_steps': 1,
'learning_rate': 5e-05,
'learning_rate': 5e-05, # learning rate for the BERT and LSTM parameters
'crf_learning_rate': 5e-05,
'weight_decay': 0.01,
'adam_epsilon': 1e-08,
'warmup_proportion': 0.1,
'num_train_epochs': 30.0,
'max_steps': -1,
'tolerance': 5, # number of epochs tolerated by early stopping
'logging_steps': 500,
'save_steps': 500,
'scheduler_type': 'cosine',
'num_train_epochs': 3.0,
'max_steps': -1, # if set to a positive value, 'num_train_epochs' is ignored
'tolerance': 5, # number of epochs tolerated by early stopping
'logging_steps': 500, # how often (in steps) tensorboard logs are written
'save_steps': 500, # how often (in steps) intermediate checkpoints are saved
# ["linear","cosine","cosine_with_restarts","polynomial","constant","constant_with_warmup"]
'cuda_number': '0', # '0,1,2,3'
'scheduler_type': 'linear',
'cuda_number': '3', # '0,1,2,3'; GPU card id(s) must be specified when running on GPU
'seed': 2333,
'local_rank': -1,
'dropout_rate': 0.3
}
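As a small illustration (not code from this repository) of the step accounting implied by the max_steps and num_train_epochs comments above, where a positive max_steps takes precedence:

def total_training_steps(num_batches_per_epoch, gradient_accumulation_steps,
                         num_train_epochs, max_steps):
    # Assumed convention, matching the config comments: a positive max_steps wins.
    if max_steps > 0:
        return max_steps
    return (num_batches_per_epoch // gradient_accumulation_steps) * int(num_train_epochs)

# e.g. 1000 batches per epoch, accumulation 1, 3 epochs, max_steps -1 -> 3000 steps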

2 changes: 1 addition & 1 deletion requirements.txt
@@ -5,4 +5,4 @@ accelerate==0.4.0
numpy==1.21.2
tqdm==4.62.2
torch==1.9.0
tensorflow==2.6.0
tensorboard==2.6.0
8 changes: 4 additions & 4 deletions setup.py
@@ -24,9 +24,9 @@
version = f.read().strip()

# change this to your own project name; ideally it matches the remote repository name
proj_name = "EventExtraction"
proj_name = "event_extraction_pytorch"
keywords = "event_extraction, bert_crf"
git_url = f"https://gitlab.bailian-ai.com/ai_algo/{proj_name}"
git_url = f"https://github.com/zhihao-chen/{proj_name}"


def load_requirements(file_name="requirements.txt", comment_char="#"):
@@ -47,8 +47,8 @@ def load_requirements(file_name="requirements.txt", comment_char="#"):

setup(
name=proj_name,
author="bailian.ai",
author_email="[email protected]",
author="andrew chen",
author_email="[email protected]",
description=long_description,
version=version,
url=git_url,
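The remainder of setup() and the body of load_requirements are collapsed in this diff. A plausible sketch of what load_requirements usually does (read requirements.txt and drop comments and blank lines), not necessarily the repository's exact implementation:

def load_requirements(file_name="requirements.txt", comment_char="#"):
    # Illustrative sketch only; the real body is not shown in this commit view.
    requirements = []
    with open(file_name, "r", encoding="utf-8") as f:
        for line in f:
            line = line.split(comment_char, 1)[0].strip()  # drop inline comments
            if line:
                requirements.append(line)
    return requirements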
