diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fd3193c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+__pycache__/utils.cpython-38.pyc
+__pycache__/evaluate.cpython-38.pyc
+__pycache__/data_loader.cpython-38.pyc
+__pycache__/metrics.cpython-38.pyc
diff --git a/README.md b/README.md
index 4d97332..d98f91c 100644
--- a/README.md
+++ b/README.md
@@ -48,9 +48,8 @@ We randomly select 3000 samples from the training set as the validation set, and
 
 This repo was tested on Python 3.5+ and PyTorch 0.4.1/1.0.0. The requirements are:
 
-- tensorflow >= 1.11.0
-- torch >= 0.4.1
-- pytorch-pretrained-bert == 0.4.0
+- torch >= 1.10.0
+- transformers >= 4.12.0
 - tqdm
 - apex
 
@@ -84,34 +83,7 @@ Based on the best model on the validation set, we can get the recognition effect
 
 1. **Get BERT model for PyTorch**
 
-   There are two ways to get the pre-trained BERT model in a PyTorch dump for your experiments :
-
-   - **Direct download of the converted pytorch version of the BERT model**
-
-     You can download the pytorch dump I converted from the tensorflow checkpont from my Google Cloud Drive folder [`bert-base-chinese-pytorch`](https://drive.google.com/drive/folders/1K_xCYMCEfjpPjedSnMyL9zMVzqbanQX9), including the BERT parameters file `bert_config.json`, the model file `pytorch_model.bin` and the vocabulary file `vocab.txt`.
-
-   - **Convert the TensorFlow checkpoint to a PyTorch dump by yourself**
-
-     - Download the Google's BERT base model for Chinese from **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)** (Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters), and decompress it.
-
-     - Execute the following command, convert the TensorFlow checkpoint to a PyTorch dump.
-
-       ```shell
-       export TF_BERT_BASE_DIR=/path/to/chinese_L-12_H-768_A-12
-       export PT_BERT_BASE_DIR=/path/to/NER-BERT-pytorch/bert-base-chinese-pytorch
-
-       pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch \
-         $TF_BERT_BASE_DIR/bert_model.ckpt \
-         $TF_BERT_BASE_DIR/bert_config.json \
-         $PT_BERT_BASE_DIR/pytorch_model.bin
-       ```
-
-     - Copy the BERT parameters file `bert_config.json` and dictionary file `vocab.txt` to the directory `$PT_BERT_BASE_DIR`.
-
-       ```shell
-       cp $TF_BERT_BASE_DIR/bert_config.json $PT_BERT_BASE_DIR/bert_config.json
-       cp $TF_BERT_BASE_DIR/vocab.txt $PT_BERT_BASE_DIR/vocab.txt
-       ```
+   Nothing to do here: the `transformers` library downloads `bert-base-chinese` automatically the first time the model is loaded.
 
 2. **Build dataset and tags**
 
diff --git a/data_loader.py b/data_loader.py
index 9462ace..28f5b50 100644
--- a/data_loader.py
+++ b/data_loader.py
@@ -7,7 +7,7 @@
 
 import torch
-from pytorch_pretrained_bert import BertTokenizer
+from transformers import BertTokenizer
 
 import utils
diff --git a/evaluate.py b/evaluate.py
index fcf81bb..336b01d 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -8,7 +8,7 @@
 import numpy as np
 import torch
-from pytorch_pretrained_bert import BertForTokenClassification, BertConfig
+from transformers import BertForTokenClassification, BertConfig
 
 from metrics import f1_score
 from metrics import classification_report
@@ -45,6 +45,7 @@ def evaluate(model, data_iterator, params, mark='Eval', verbose=False):
         batch_masks = batch_data.gt(0)
 
         loss = model(batch_data, token_type_ids=None, attention_mask=batch_masks, labels=batch_tags)
+        loss = loss[0]
         if params.n_gpu > 1 and params.multi_gpu:
             loss = loss.mean()
         loss_avg.update(loss.item())
diff --git a/experiments/base_model/evaluate.log b/experiments/base_model/evaluate.log
index 41c2ccd..d1fb89b 100644
--- a/experiments/base_model/evaluate.log
+++ b/experiments/base_model/evaluate.log
@@ -28,3 +28,4 @@
 
 avg / total     94.54     94.73     94.63      5449
 
+2022-02-05 14:26:42,389:INFO: Loading the dataset...
diff --git a/experiments/base_model/train.log b/experiments/base_model/train.log
index 6dfea67..30623af 100644
--- a/experiments/base_model/train.log
+++ b/experiments/base_model/train.log
@@ -69,3 +69,19 @@
 2019-01-25 05:48:44,861:INFO: - Train metrics: loss: 00.00; f1: 99.90
 2019-01-25 05:49:07,503:INFO: - Val metrics: loss: 00.03; f1: 95.88
 2019-01-25 05:49:12,045:INFO: Best val f1: 95.90
+2022-02-05 14:27:16,705:INFO: device: cuda, n_gpu: 1, 16-bits training: False
+2022-02-05 14:27:16,705:INFO: Loading the datasets...
+2022-02-05 14:27:45,285:INFO: device: cuda, n_gpu: 1, 16-bits training: False
+2022-02-05 14:27:45,285:INFO: Loading the datasets...
+2022-02-05 14:28:39,517:INFO: Starting training for 20 epoch(s)
+2022-02-05 14:28:39,517:INFO: Epoch 1/20
+2022-02-05 14:34:00,740:INFO: device: cuda, n_gpu: 1, 16-bits training: False
+2022-02-05 14:34:00,741:INFO: Loading the datasets...
+2022-02-05 14:34:54,650:INFO: Starting training for 20 epoch(s)
+2022-02-05 14:34:54,651:INFO: Epoch 1/20
+2022-02-05 14:40:08,016:INFO: device: cuda, n_gpu: 1, 16-bits training: False
+2022-02-05 14:40:08,017:INFO: Loading the datasets...
+2022-02-05 14:41:01,202:INFO: Starting training for 20 epoch(s)
+2022-02-05 14:41:01,202:INFO: Epoch 1/20
+2022-02-05 14:46:01,254:INFO: device: cuda, n_gpu: 1, 16-bits training: False
+2022-02-05 14:46:01,254:INFO: Loading the datasets...
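Note (not part of the patch): the `loss = loss[0]` lines added to `evaluate.py` and `train.py` are needed because `pytorch_pretrained_bert.BertForTokenClassification` returned the loss tensor directly when `labels` were passed, whereas `transformers` returns a tuple-like `TokenClassifierOutput` whose first element is the loss and whose second is the per-token logits. A minimal sketch of that behaviour, using a tiny randomly initialised config so it runs without downloading any weights (the sizes and `num_labels=7` are illustrative assumptions, not values taken from this repo):

```python
import torch
from transformers import BertConfig, BertForTokenClassification

# Tiny randomly initialised model so the sketch runs offline;
# the repo itself loads bert-base-chinese instead.
config = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                    num_attention_heads=2, intermediate_size=64, num_labels=7)
model = BertForTokenClassification(config)

input_ids = torch.randint(0, 100, (2, 8))      # dummy batch of token ids
attention_mask = torch.ones_like(input_ids)
labels = torch.zeros_like(input_ids)           # dummy tag ids

outputs = model(input_ids, token_type_ids=None,
                attention_mask=attention_mask, labels=labels)
loss = outputs[0]    # what the added `loss = loss[0]` lines pick out
logits = outputs[1]  # per-token scores, shape (2, 8, 7)
print(loss.item(), logits.shape)
```

The same indexing works whether the model returns a plain tuple (`return_dict=False`) or a `ModelOutput` object, which is why the single `loss[0]` line suffices in both training and evaluation.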
diff --git a/requirements.txt b/requirements.txt
index 2ae8242..47d048e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,8 @@
-# TensorFlow
-tensorflow >= 1.11.0
 # PyTorch
-torch >= 0.4.1
+torch >= 1.10.0
 # progress bars in model download and training scripts
 tqdm
 # A PyTorch implementation of Google AI's BERT model
-pytorch-pretrained-bert == 0.4.0
+transformers >= 4.12.0
 # A tool for easy mixed precision and distributed training in Pytorch, https://github.com/NVIDIA/apex
-apex
\ No newline at end of file
+apex
diff --git a/train.py b/train.py
index 5c28104..e2fedf6 100644
--- a/train.py
+++ b/train.py
@@ -11,7 +11,7 @@
 from torch.optim.lr_scheduler import LambdaLR
 from tqdm import trange
 
-from pytorch_pretrained_bert import BertForTokenClassification
+from transformers import BertForTokenClassification
 
 from data_loader import DataLoader
 from evaluate import evaluate
@@ -20,7 +20,7 @@
 parser = argparse.ArgumentParser()
 parser.add_argument('--data_dir', default='data/msra', help="Directory containing the dataset")
-parser.add_argument('--bert_model_dir', default='bert-base-chinese-pytorch', help="Directory containing the BERT model in PyTorch")
+parser.add_argument('--bert_model_dir', default='bert-base-chinese', help="Directory containing the BERT model in PyTorch")
 parser.add_argument('--model_dir', default='experiments/base_model', help="Directory containing params.json")
 parser.add_argument('--seed', type=int, default=2019, help="random seed for initialization")
 parser.add_argument('--restore_file', default=None,
@@ -51,7 +51,7 @@ def train(model, data_iterator, optimizer, scheduler, params):
         # compute model output and loss
         loss = model(batch_data, token_type_ids=None, attention_mask=batch_masks, labels=batch_tags)
-
+        loss = loss[0]
         if params.n_gpu > 1 and args.multi_gpu:
             loss = loss.mean()  # mean() to average on multi-gpu
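Closing note on the `bert-base-chinese` default (an illustration, not part of the patch): because `--bert_model_dir` is now a Hugging Face Hub model id rather than a local directory, `from_pretrained` downloads and caches the checkpoint on first use, which is why the README's manual download and conversion steps could be dropped. Roughly what the loading now amounts to, where `num_labels=7` stands in for the size of the repo's tag set and is an assumption here:

```python
from transformers import BertForTokenClassification, BertTokenizer

# First call downloads bert-base-chinese from the Hugging Face Hub (~400 MB)
# and caches it locally; later runs reuse the cache, so no manual
# TF-checkpoint conversion or Google Drive download is needed.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForTokenClassification.from_pretrained('bert-base-chinese',
                                                    num_labels=7)

print(tokenizer.vocab_size)  # 21128 entries in the Chinese WordPiece vocab
print(model.num_labels)      # 7
```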