breast_cancer_pred_ml.py

# -*- coding: utf-8 -*-
"""breast_cancer_pred_ML.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1sN3wKcfoRo-vxxHmrT5rV1qcQLqtIPGs
"""

# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from random import shuffle

# Preprocessing and sampling
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE

# Model selection and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, make_scorer

# Machine learning models
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

# Deep learning imports
import tensorflow as tf
from sklearn.datasets import load_breast_cancer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras import regularizers

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")

# Load breast cancer dataset

breast_cancer_dataset = sklearn.datasets.load_breast_cancer()

print(breast_cancer_dataset)

# Create DataFrame from dataset
df = pd.DataFrame(breast_cancer_dataset.data, columns=breast_cancer_dataset.feature_names)

df.head(10)

# Add target labels to DataFrame

df['label'] = breast_cancer_dataset.target

"""# Data Exploration

"""

df.head(10)

# Print shape of DataFrame
df.shape

# Print information about DataFrame
df.info()

# Check for missing values
df.isnull().sum()

# Descriptive statistics of DataFrame
df.describe()

#Count of each class

df['label'].value_counts()

# Mean of features by class

df.groupby('label').mean()

description = df.describe()
print(f'description of the dataset  : {description}')

corr=df.corr()
corr

df.std()

# Visualizations

selected_features = ['area error', 'worst perimeter', 'worst area', 'mean area',
                    'mean perimeter']

sns.pairplot(df , vars=selected_features, hue='label', markers=['o','s'], palette='husl')
plt.show()

sns.countplot(x='label', data=df)

sns.distplot(df['label'])

sns.displot(df['label'])

plt.figure(figsize=(10,10))
sns.heatmap(corr , cbar=True , square=True , fmt='.1f', annot=True,
           annot_kws={'size':8}, cmap='Blues')

X = df.drop(columns='label', axis=1)
y = df['label']

print(X)
print(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_train.unique()

param_grid = {'n_estimators': [100,200,250,300,350,400,500]}
model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(model,param_grid,cv=5,scoring='accuracy')
grid_search.fit(X_train,y_train)
print('grid search reasult:')
print('best parameters : ', grid_search.best_params_)
print('best cross-validated accuracy: ',grid_search.best_score_)

param_grid = {'n_estimators': [100,200,250,300,350,400,500]}
model = AdaBoostClassifier(random_state=42)
grid_search = GridSearchCV(model,param_grid,cv=5,scoring='accuracy')
grid_search.fit(X_train,y_train)
print('grid search reasult:')
print('best parameters : ', grid_search.best_params_)
print('best cross-validated accuracy: ',grid_search.best_score_)

param_grid = {'n_estimators': [100,200,250,300,350,400,500]}
model = XGBClassifier(random_state=42)
grid_search = GridSearchCV(model,param_grid,cv=5,scoring='accuracy')
grid_search.fit(X_train,y_train)
print('grid search reasult:')
print('best parameters : ', grid_search.best_params_)
print('best cross-validated accuracy: ',grid_search.best_score_)

!pip install  catboost

'''param_grid = {'n_estimators': [100,200,250,300,350,400,500]}
model = CatBoostClassifier(random_state=42)
grid_search = GridSearchCV(model,param_grid,cv=5,scoring='accuracy')
grid_search.fit(X_train,y_train)
print('grid search reasult:')
print('best parameters : ', grid_search.best_params_)
print('best cross-validated accuracy: ',grid_search.best_score_)'''

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

X_train_predict = model.predict(X_train)

training = accuracy_score(y_train, X_train_predict)

print(training)

model = RandomForestClassifier(n_estimators=200 , random_state=42)
model.fit(X_train,y_train)
y_perd = model.predict(X_test)
accuracy = accuracy_score(y_test,y_perd)
precision = precision_score(y_test,y_perd)
recall = recall_score(y_test,y_perd)
f1 = f1_score(y_test,y_perd)

print(f'Accuracy :  {accuracy:.2f}%')
print(f'precision :  {precision:.2f}%')
print(f'recall :  {recall:.2f}%')
print(f'F1 score :  {f1:.2f}%')

model = AdaBoostClassifier(n_estimators=300 , random_state=42)
model.fit(X_train,y_train)
y_perd = model.predict(X_test)
accuracy = accuracy_score(y_test,y_perd)
precision = precision_score(y_test,y_perd)
recall = recall_score(y_test,y_perd)
f1 = f1_score(y_test,y_perd)

print(f'Accuracy :  {accuracy:.2f}%')
print(f'precision :  {precision:.2f}%')
print(f'recall :  {recall:.2f}%')
print(f'F1 score :  {f1:.2f}%')

from xgboost import XGBClassifier
model = XGBClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test,y_perd)
precision = precision_score(y_test,y_perd)
recall = recall_score(y_test,y_perd)
f1 = f1_score(y_test,y_perd)

print(f'Accuracy :  {accuracy:.2f}%')
print(f'precision :  {precision:.2f}%')
print(f'recall :  {recall:.2f}%')
print(f'F1 score :  {f1:.2f}%')

from catboost import CatBoostClassifier
model = CatBoostClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test,y_perd)
precision = precision_score(y_test,y_perd)
recall = recall_score(y_test,y_perd)
f1 = f1_score(y_test,y_perd)

print(f'Accuracy :  {accuracy:.2f}%')
print(f'precision :  {precision:.2f}%')
print(f'recall :  {recall:.2f}%')
print(f'F1 score :  {f1:.2f}%')

model = SVC(kernel='linear', random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test,y_perd)
precision = precision_score(y_test,y_perd)
recall = recall_score(y_test,y_perd)
f1 = f1_score(y_test,y_perd)

print(f'Accuracy :  {accuracy:.2f}%')
print(f'precision :  {precision:.2f}%')
print(f'recall :  {recall:.2f}%')
print(f'F1 score :  {f1:.2f}%')

# Define models for ROC curve plotting

from sklearn.metrics import roc_curve, auc

models = [
    RandomForestClassifier(n_estimators=200, random_state=42),
    AdaBoostClassifier(n_estimators=300, random_state=42),
    SVC(kernel='linear', probability=True, random_state=42),
    CatBoostClassifier(n_estimators=200, random_state=42, verbose=False),
    XGBClassifier(n_estimators=200, random_state=42),
    LogisticRegression()
]
# Plot ROC curve for each model

plt.figure(figsize=(10, 8))
for model in models:
    model.fit(X_train, y_train)
    y_score = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%s (AUC = %0.2f)' % (type(model).__name__, roc_auc))

plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
plt.show()

# Load breast cancer dataset
breast_cancer_dataset = load_breast_cancer()
X, y = breast_cancer_dataset.data, breast_cancer_dataset.target

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Shuffle and split dataset into train and test sets
X, y = shuffle(X, y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define advanced neural network architecture
model = Sequential([
    Dense(256, input_shape=(X_train.shape[1],), kernel_regularizer=regularizers.l2(0.001)),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.5),
    Dense(128, kernel_regularizer=regularizers.l2(0.001)),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.5),
    Dense(64, kernel_regularizer=regularizers.l2(0.001)),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model with advanced optimizer and metrics
optimizer = Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Define learning rate scheduler
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-5)

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model with callbacks for learning rate scheduling and early stopping
history = model.fit(X_train, y_train, epochs=100, batch_size=32,
                    validation_data=(X_test, y_test),
                    callbacks=[lr_scheduler, early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')

# Predict classes
y_pred = (model.predict(X_test) > 0.5).astype("int32")

# Calculate evaluation metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')
print('Confusion Matrix:')
print(conf_matrix)

# Plot training history
plt.figure(figsize=(12, 6))

# Plot loss
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Plot accuracy
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()

# Receiver Operating Characteristic (ROC) Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='ROC Curve')
plt.plot([0, 1], [0, 1], linestyle='--', color='red')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

# Distribution of Predictions for each Class
predictions = model.predict(X_test)
plt.hist(predictions[y_test == 0], bins=20, color='blue', alpha=0.5, label='Class 0')
plt.hist(predictions[y_test == 1], bins=20, color='red', alpha=0.5, label='Class 1')
plt.title('Distribution of Predictions for each Class')
plt.xlabel('Predictions')
plt.ylabel('Frequency')
plt.legend()
plt.show()