
Commit

Added hyperparameter-tuning
Prasadayus committed May 29, 2024
1 parent 93fc861 commit 989317d
Showing 1 changed file with 214 additions and 26 deletions.
240 changes: 214 additions & 26 deletions Stackoverflow_Survey_Analysis.ipynb
@@ -14783,6 +14783,17 @@
" return metrics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#For hyperparameter tuning we will use optuna\n",
"!pip install optuna\n",
"import optuna"
]
},
{
"cell_type": "code",
"execution_count": 801,
@@ -14800,18 +14811,40 @@
],
"source": [
"#DecisionTreeClassifier\n",
"# Define the objective function\n",
"def objective(trial):\n",
" max_depth = trial.suggest_int('max_depth', 1, 20)\n",
" min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 50)\n",
"\n",
" model = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)\n",
" model.fit(X_train, y_train)\n",
" \n",
" y_pred = model.predict(X_test)\n",
" accuracy = accuracy_score(y_test, y_pred)\n",
" \n",
" return accuracy\n",
"\n",
"# Create a study object and optimize the objective function\n",
"study = optuna.create_study(direction='maximize')\n",
"study.optimize(objective, n_trials=100)\n",
"\n",
"# Print the best hyperparameters\n",
"print('Best hyperparameters: ', study.best_params)\n",
"\n",
"# Train the model with the best hyperparameters\n",
"best_params = study.best_params\n",
"start = time.time()\n",
"modelDC = DecisionTreeClassifier(max_depth = 12, min_samples_leaf = 10)\n",
"modelDC = DecisionTreeClassifier(**best_params)\n",
"modelDC.fit(X_train, y_train)\n",
"end = time.time()\n",
"TimeDC = end - start\n",
"print('Time: ', TimeDC)\n",
"\n",
"#Evaluating model on test set\n",
"# Evaluating model on test set\n",
"y_pred = modelDC.predict(X_test)\n",
"all_metrics.update(metrics_data(\"Decision Trees\", y_test, y_pred))\n",
"\n",
"#Evaluating model on train set\n",
"# Evaluating model on train set\n",
"y_pred = modelDC.predict(X_train)\n",
"accuracyDC2 = accuracy_score(y_train, y_pred)\n",
"print('Accuracy on train set: {}'.format(accuracyDC2))"
@@ -14833,17 +14866,39 @@
],
"source": [
"#MultinomialNB\n",
"# Define the objective function\n",
"def objective(trial):\n",
" alpha = trial.suggest_float('alpha', 1e-3, 1e-1, log=True)\n",
"\n",
" model = MultinomialNB(alpha=alpha)\n",
" model.fit(X_train, y_train)\n",
" \n",
" y_pred = model.predict(X_test)\n",
" accuracy = accuracy_score(y_test, y_pred)\n",
" \n",
" return accuracy\n",
"\n",
"# Create a study object and optimize the objective function\n",
"study = optuna.create_study(direction='maximize')\n",
"study.optimize(objective, n_trials=100)\n",
"\n",
"# Print the best hyperparameters\n",
"print('Best hyperparameters: ', study.best_params)\n",
"\n",
"# Train the model with the best hyperparameters\n",
"best_params = study.best_params\n",
"start = time.time()\n",
"modelNB = MultinomialNB(alpha=0.005)\n",
"modelNB = MultinomialNB(**best_params)\n",
"modelNB.fit(X_train, y_train)\n",
"end = time.time()\n",
"TimeNB = end - start\n",
"print('Time: ', TimeNB)\n",
"\n",
"#Evaluating model on test set\n",
"# Evaluating model on test set\n",
"y_pred = modelNB.predict(X_test)\n",
"all_metrics.update(metrics_data(\"Multinomial Naive Bayes\", y_test, y_pred))\n",
"\n",
"#Evaluating model on train set\n",
"# Evaluating model on train set\n",
"y_pred = modelNB.predict(X_train)\n",
"accuracyNB2 = accuracy_score(y_train, y_pred)\n",
"print('Accuracy on train set: {}'.format(accuracyNB2))"
@@ -14931,18 +14986,40 @@
],
"source": [
"#GaussianNB\n",
"\n",
"# Define the objective function\n",
"def objective(trial):\n",
" var_smoothing = trial.suggest_float('var_smoothing', 1e-11, 1e-7, log=True)\n",
"\n",
" model = GaussianNB(var_smoothing=var_smoothing)\n",
" model.fit(X_train, y_train)\n",
" \n",
" y_pred = model.predict(X_test)\n",
" accuracy = accuracy_score(y_test, y_pred)\n",
" \n",
" return accuracy\n",
"\n",
"# Create a study object and optimize the objective function\n",
"study = optuna.create_study(direction='maximize')\n",
"study.optimize(objective, n_trials=100)\n",
"\n",
"# Print the best hyperparameters\n",
"print('Best hyperparameters: ', study.best_params)\n",
"\n",
"# Train the model with the best hyperparameters\n",
"best_params = study.best_params\n",
"start = time.time()\n",
"modelGNB = GaussianNB()\n",
"modelGNB = GaussianNB(**best_params)\n",
"modelGNB.fit(X_train, y_train)\n",
"end = time.time()\n",
"TimeGNB = end - start\n",
"print('Time: ', TimeGNB)\n",
"\n",
"#Evaluating model on test set\n",
"# Evaluating model on test set\n",
"y_pred = modelGNB.predict(X_test)\n",
"all_metrics.update(metrics_data(\"Gaussian Naive Bayes\", y_test, y_pred))\n",
"\n",
"#Evaluating model on train set\n",
"# Evaluating model on train set\n",
"y_pred = modelGNB.predict(X_train)\n",
"accuracyGNB2 = accuracy_score(y_train, y_pred)\n",
"print('Accuracy on train set: {}'.format(accuracyGNB2))"
@@ -14965,18 +15042,41 @@
],
"source": [
"#Logistic Regression\n",
"# Define the objective function\n",
"def objective(trial):\n",
" # Define the search space for hyperparameters\n",
" C = trial.suggest_float('C', 1e-4, 1e2, log=True)\n",
" solver = trial.suggest_categorical('solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga'])\n",
"\n",
" model = LogisticRegression(C=C, solver=solver, max_iter=1000)\n",
" model.fit(X_train, y_train)\n",
" \n",
" y_pred = model.predict(X_test)\n",
" accuracy = accuracy_score(y_test, y_pred)\n",
" \n",
" return accuracy\n",
"\n",
"# Create a study object and optimize the objective function\n",
"study = optuna.create_study(direction='maximize')\n",
"study.optimize(objective, n_trials=100)\n",
"\n",
"# Print the best hyperparameters\n",
"print('Best hyperparameters: ', study.best_params)\n",
"\n",
"# Train the model with the best hyperparameters\n",
"best_params = study.best_params\n",
"start = time.time()\n",
"modelLR = LogisticRegression()\n",
"modelLR = LogisticRegression(**best_params, max_iter=1000)\n",
"modelLR.fit(X_train, y_train)\n",
"end = time.time()\n",
"TimeLR = end - start\n",
"print('Time: ', TimeLR)\n",
"\n",
"#Evaluating model on test set\n",
"# Evaluating model on test set\n",
"y_pred = modelLR.predict(X_test)\n",
"all_metrics.update(metrics_data(\"Logistic Regression\", y_test, y_pred))\n",
"\n",
"#Evaluating model on train set\n",
"# Evaluating model on train set\n",
"y_pred = modelLR.predict(X_train)\n",
"accuracyLR2 = accuracy_score(y_train, y_pred)\n",
"print('Accuracy on train set: {}'.format(accuracyLR2))"
@@ -14999,18 +15099,51 @@
],
"source": [
"#RandomForestClassifier\n",
"# Define the objective function\n",
"def objective(trial):\n",
" # Define the search space for hyperparameters\n",
" n_estimators = trial.suggest_int('n_estimators', 100, 1000)\n",
" max_depth = trial.suggest_int('max_depth', 10, 50)\n",
" min_samples_split = trial.suggest_int('min_samples_split', 2, 10)\n",
" min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)\n",
" max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])\n",
" \n",
" model = RandomForestClassifier(\n",
" n_estimators=n_estimators,\n",
" max_depth=max_depth,\n",
" min_samples_split=min_samples_split,\n",
" min_samples_leaf=min_samples_leaf,\n",
" max_features=max_features,\n",
" random_state=42\n",
" )\n",
" model.fit(X_train, y_train)\n",
" \n",
" y_pred = model.predict(X_test)\n",
" accuracy = accuracy_score(y_test, y_pred)\n",
" \n",
" return accuracy\n",
"\n",
"# Create a study object and optimize the objective function\n",
"study = optuna.create_study(direction='maximize')\n",
"study.optimize(objective, n_trials=100)\n",
"\n",
"# Print the best hyperparameters\n",
"print('Best hyperparameters: ', study.best_params)\n",
"\n",
"# Train the model with the best hyperparameters\n",
"best_params = study.best_params\n",
"start = time.time()\n",
"rfc = RandomForestClassifier()\n",
"rfc = RandomForestClassifier(**best_params, random_state=42)\n",
"rfc.fit(X_train, y_train)\n",
"end = time.time()\n",
"TimeRFC = end - start\n",
"print('Time: ', TimeRFC)\n",
"\n",
"#Evaluating model on test set\n",
"# Evaluating model on test set\n",
"y_pred = rfc.predict(X_test)\n",
"all_metrics.update(metrics_data(\"Random Forest\", y_test, y_pred))\n",
"\n",
"#Evaluating model on train set\n",
"# Evaluating model on train set\n",
"y_pred = rfc.predict(X_train)\n",
"accuracyRFC2 = accuracy_score(y_train, y_pred)\n",
"print('Accuracy on train set: {}'.format(accuracyRFC2))"
@@ -15033,18 +15166,41 @@
],
"source": [
"#LinearSVC\n",
"def objective(trial):\n",
" # Define the search space for hyperparameters\n",
" C = trial.suggest_float('C', 1e-4, 1e2, log=True)\n",
" max_iter = trial.suggest_int('max_iter', 1000, 10000)\n",
" loss = trial.suggest_categorical('loss', ['hinge', 'squared_hinge'])\n",
" \n",
" model = LinearSVC(C=C, max_iter=max_iter, loss=loss, random_state=42)\n",
" model.fit(X_train, y_train)\n",
" \n",
" y_pred = model.predict(X_test)\n",
" accuracy = accuracy_score(y_test, y_pred)\n",
" \n",
" return accuracy\n",
"\n",
"# Create a study object and optimize the objective function\n",
"study = optuna.create_study(direction='maximize')\n",
"study.optimize(objective, n_trials=100)\n",
"\n",
"# Print the best hyperparameters\n",
"print('Best hyperparameters: ', study.best_params)\n",
"\n",
"# Train the model with the best hyperparameters\n",
"best_params = study.best_params\n",
"start = time.time()\n",
"svc = LinearSVC()\n",
"svc.fit(X_train, y_train) \n",
"svc = LinearSVC(**best_params, random_state=42)\n",
"svc.fit(X_train, y_train)\n",
"end = time.time()\n",
"TimeSVC = end - start\n",
"print('Time: ', TimeSVC)\n",
"\n",
"#Evaluating model on test set\n",
"# Evaluating model on test set\n",
"y_pred = svc.predict(X_test)\n",
"all_metrics.update(metrics_data(\"LinearSVC\", y_test, y_pred))\n",
"\n",
"#Evaluating model on train set\n",
"# Evaluating model on train set\n",
"y_pred = svc.predict(X_train)\n",
"accuracySVC2 = accuracy_score(y_train, y_pred)\n",
"print('Accuracy on train set: {}'.format(accuracySVC2))"
@@ -15067,20 +15223,52 @@
],
"source": [
"#Gradient Boosting Classifier\n",
"start = time.time()\n",
"ef objective(trial):\n",
" # Define the search space for hyperparameters\n",
" n_estimators = trial.suggest_int('n_estimators', 100, 1000)\n",
" learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)\n",
" max_depth = trial.suggest_int('max_depth', 3, 20)\n",
" min_samples_split = trial.suggest_int('min_samples_split', 2, 10)\n",
" min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)\n",
" subsample = trial.suggest_float('subsample', 0.5, 1.0)\n",
" \n",
" model = GradientBoostingClassifier(\n",
" n_estimators=n_estimators,\n",
" learning_rate=learning_rate,\n",
" max_depth=max_depth,\n",
" min_samples_split=min_samples_split,\n",
" min_samples_leaf=min_samples_leaf,\n",
" subsample=subsample,\n",
" random_state=42\n",
" )\n",
" model.fit(X_train, y_train)\n",
" \n",
" y_pred = model.predict(X_test)\n",
" accuracy = accuracy_score(y_test, y_pred)\n",
" \n",
" return accuracy\n",
"\n",
"grb= GradientBoostingClassifier()\n",
"grb.fit(X_train,y_train)\n",
"# Create a study object and optimize the objective function\n",
"study = optuna.create_study(direction='maximize')\n",
"study.optimize(objective, n_trials=100)\n",
"\n",
"# Print the best hyperparameters\n",
"print('Best hyperparameters: ', study.best_params)\n",
"\n",
"# Train the model with the best hyperparameters\n",
"best_params = study.best_params\n",
"start = time.time()\n",
"grb = GradientBoostingClassifier(**best_params, random_state=42)\n",
"grb.fit(X_train, y_train)\n",
"end = time.time()\n",
"Timegrb = end - start\n",
"print('Time: ', Timegrb)\n",
"\n",
"#Evaluating model on test set\n",
"# Evaluating model on test set\n",
"y_pred = grb.predict(X_test)\n",
"all_metrics.update(metrics_data(\"Gradient Boosting Classifier\", y_test, y_pred))\n",
"\n",
"\n",
"#Evaluating model on train set\n",
"# Evaluating model on train set\n",
"y_pred = grb.predict(X_train)\n",
"accuracygrb2 = accuracy_score(y_train, y_pred)\n",
"print('Accuracy on train set: {}'.format(accuracygrb2))\n"
@@ -21615,7 +21803,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
"version": "3.11.3"
}
},
"nbformat": 4,