-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel_training.py
136 lines (115 loc) · 4.33 KB
/
model_training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import pandas as pd
import numpy as np
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer,
DataCollatorWithPadding
)
from datasets import Dataset
import torch
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
def prepare_dataset(df, tokenizer, max_length=512, padding='max_length'):
    """Build a tokenized dataset from a review/sentiment table.

    Args:
        df: Table exposing a ``'review'`` column (text, must support
            ``.tolist()``) and a ``'sentiment'`` column whose values are the
            strings ``'negative'`` or ``'positive'``. Matching ignores case
            and surrounding whitespace.
        tokenizer: Callable applied to batches of text with ``truncation``,
            ``max_length`` and ``padding`` keyword arguments.
        max_length: Maximum sequence length forwarded to the tokenizer.
        padding: Padding strategy forwarded to the tokenizer. The default
            ``'max_length'`` keeps the historical behaviour of padding every
            example to ``max_length``. Pass ``False`` to leave examples
            unpadded when a batch-level collator pads them later.

    Returns:
        The result of ``Dataset.map`` over the ``'text'``/``'label'`` dataset,
        i.e. the dataset extended with the tokenizer's outputs.

    Raises:
        KeyError: If a sentiment value is not ``'negative'`` or ``'positive'``
            after normalisation.
        AttributeError: If a sentiment value is not a string (string methods
            are called on each value).
    """
    # Convert sentiment labels to numeric ids (case/whitespace-insensitive).
    label_map = {'negative': 0, 'positive': 1}
    labels = []
    for raw_label in df['sentiment']:
        key = raw_label.strip().lower()
        if key not in label_map:
            # Fail with the offending value and the accepted set instead of a
            # bare KeyError that only echoes the lowered key.
            raise KeyError(
                f"Unsupported sentiment label {raw_label!r}; "
                f"expected one of {sorted(label_map)}"
            )
        labels.append(label_map[key])

    # Create dataset
    dataset = Dataset.from_dict({
        'text': df['review'].tolist(),
        'label': labels,
    })

    # Tokenize dataset
    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            truncation=True,
            max_length=max_length,
            padding=padding,
        )

    return dataset.map(tokenize_function, batched=True)
def compute_metrics(eval_pred):
    """Turn an evaluation prediction pair into summary metrics.

    Args:
        eval_pred: A two-item iterable ``(predictions, labels)``. The
            predictions are reduced to class ids with ``argmax`` over axis 1.

    Returns:
        A dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; the last three are the macro averages taken from
        scikit-learn's classification report.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    # A single report call yields both the accuracy and the macro averages.
    summary = classification_report(references, predicted_ids, output_dict=True)
    macro = summary['macro avg']

    metrics = {'accuracy': summary['accuracy']}
    metrics['f1'] = macro['f1-score']
    metrics['precision'] = macro['precision']
    metrics['recall'] = macro['recall']
    return metrics
def plot_confusion_matrix(y_true, y_pred, save_path='confusion_matrix.png'):
    """Plot the confusion matrix of ``y_pred`` against ``y_true`` and save it.

    Args:
        y_true: Ground-truth labels, passed to ``confusion_matrix``.
        y_pred: Predicted labels, passed to ``confusion_matrix``.
        save_path: Destination passed to ``plt.savefig``.

    Returns:
        None. The figure is written to ``save_path`` and then closed.
    """
    cm = confusion_matrix(y_true, y_pred)

    # Keep a handle on the figure so exactly this one is closed afterwards.
    fig = plt.figure(figsize=(8, 6))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.savefig(save_path)
    finally:
        # Close even when drawing or saving fails, so a failed call does not
        # leave an open figure behind in pyplot's global state.
        plt.close(fig)
def main(final_budget):
    """Fine-tune, evaluate and save a sentiment classifier.

    Loads the training CSV selected for ``final_budget`` and the fixed test
    CSV, fine-tunes the pretrained checkpoint with ``Trainer``, prints the
    evaluation metrics and a classification report, writes a confusion-matrix
    image, and saves the model and tokenizer.

    Args:
        final_budget: Value interpolated into the training CSV file name and
            into the directory the final model is saved to.

    Returns:
        None.
    """
    # Load the datasets
    train_df = pd.read_csv(
        f'informative_samples/selected_informative_samples_financial_news_{final_budget}.csv'
    )
    test_df = pd.read_csv('datasets/financial_news_test.csv')

    # Initialize model and tokenizer
    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Prepare datasets
    train_dataset = prepare_dataset(train_df, tokenizer)
    test_dataset = prepare_dataset(test_df, tokenizer)

    # Define training arguments
    # NOTE(review): warmup_steps=500 may exceed the total number of optimizer
    # steps when the training CSV is small (3 epochs at batch size 8) --
    # confirm the resulting learning-rate schedule is intended.
    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` -- confirm against the
    # installed version.
    training_args = TrainingArguments(
        output_dir="./results_financial_news",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
    )

    # Initialize trainer
    # NOTE(review): the test split doubles as the evaluation set used to pick
    # the best checkpoint, so the reported test metrics are not from held-out
    # data -- confirm this is acceptable for the experiment.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )

    # Train the model
    print("Training the model...")
    trainer.train()

    # Evaluate on test set
    print("\nEvaluating on test set...")
    eval_results = trainer.evaluate()
    print("\nEvaluation Results:")
    for metric, value in eval_results.items():
        print(f"{metric}: {value:.4f}")

    # Get predictions for confusion matrix
    predictions = trainer.predict(test_dataset)
    y_pred = np.argmax(predictions.predictions, axis=1)
    y_true = predictions.label_ids

    # Plot and save confusion matrix
    plot_confusion_matrix(y_true, y_pred)

    # Print detailed classification report
    report = classification_report(y_true, y_pred)
    print("\nDetailed Classification Report:")
    print(report)

    # Save the model under a directory named after the budget actually used,
    # so runs with different budgets do not overwrite each other (the path
    # previously always ended in "_300").
    model_dir = f"./final_model_financial_news_{final_budget}"
    trainer.save_model(model_dir)
    tokenizer.save_pretrained(model_dir)
if __name__ == "__main__":
    # Script entry point: run the full pipeline with a budget of 500.
    main(final_budget=500)