Refactor print statements to use logger #3322 #3348

Closed · wants to merge 8 commits
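The diff below converts ad-hoc `print()` calls to the standard library `logging` module. As a minimal sketch of the pattern being applied (illustrative names, not code from this PR):

```python
# Sketch of the print -> logger refactor pattern used throughout this PR.
from logging import INFO, basicConfig, getLogger

basicConfig(level=INFO)       # install a root handler once
logger = getLogger(__name__)  # one logger per module

def report(defect_count: int) -> None:
    # Before: print(f"{defect_count} defects")
    # After: %-style lazy formatting; the string is interpolated only
    # if the record passes the logger's level check.
    logger.info("%d defects", defect_count)
```

Passing the arguments separately instead of pre-formatting with an f-string also keeps records groupable by their format string in log aggregators.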
71 changes: 43 additions & 28 deletions bugbug/model.py
@@ -5,6 +5,7 @@

import pickle
from collections import defaultdict
from logging import INFO, basicConfig, getLogger
from typing import Any

import matplotlib
@@ -29,6 +30,9 @@
from bugbug.nlp import SpacyVectorizer
from bugbug.utils import split_tuple_generator, to_array

basicConfig(level=INFO)
logger = getLogger(__name__)


def classification_report_imbalanced_values(
y_true, y_pred, labels, target_names=None, sample_weight=None, digits=2, alpha=0.1
@@ -103,7 +107,7 @@ def print_labeled_confusion_matrix(confusion_matrix, labels, is_multilabel=False

for num, table in enumerate(confusion_matrix_table):
if is_multilabel:
print(f"label: {labels[num]}")
logger.info("label: %d", labels[num])
table_labels = [0, 1]
else:
table_labels = labels
@@ -117,9 +121,9 @@
)
for i in range(len(table)):
table[i].insert(0, f"{table_labels[i]} (Actual)")
print(
logger.info(
"\n%s\n",
tabulate(table, headers=confusion_matrix_header, tablefmt="fancy_grid"),
end="\n\n",
)


@@ -287,18 +291,18 @@ def print_feature_importances(self, important_features, class_probabilities=None

# allow maximum of 3 columns in a row to fit the page better
COLUMNS = 3
print("Top {} features:".format(len(top_feature_names)))
logger.info("Top %d features:", len(top_feature_names))
for i in range(0, len(top_feature_names), COLUMNS):
table = []
for item in shap_val:
table.append(item[i : i + COLUMNS])
print(
logger.info(
"\n%s\n\n",
tabulate(
table,
headers=(["classes"] + top_feature_names)[i : i + COLUMNS],
tablefmt="grid",
),
end="\n\n",
)

def save_feature_importances(self, important_features, feature_names):
@@ -351,7 +355,7 @@ def train(self, importance_cutoff=0.15, limit=None):
X = X[:limit]
y = y[:limit]

print(f"X: {X.shape}, y: {y.shape}")
logger.info("X: %s, y: %s", str(X.shape), str(y.shape))

is_multilabel = isinstance(y[0], np.ndarray)
is_binary = len(self.class_names) == 2
@@ -375,30 +379,35 @@
pipeline, X_train, self.le.transform(y_train), scoring=scorings, cv=5
)

print("Cross Validation scores:")
logger.info("Cross Validation scores:")
for scoring in scorings:
score = scores[f"test_{scoring}"]
tracking_metrics[f"test_{scoring}"] = {
"mean": score.mean(),
"std": score.std() * 2,
}
print(
f"{scoring.capitalize()}: f{score.mean()} (+/- {score.std() * 2})"
logger.info(
"%s: f%.3f (+/- %.3f)",
scoring.capitalize(),
score.mean(),
score.std() * 2,
)

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
logger.info("X_train: %s, y_train: %s", X_train.shape, y_train.shape)

# Training on the resampled dataset if sampler is provided.
if self.sampler is not None:
X_train, y_train = self.sampler.fit_resample(X_train, y_train)

print(f"resampled X_train: {X_train.shape}, y_train: {y_train.shape}")
logger.info(
"resampled X_train: %s, y_train: %s", X_train.shape, y_train.shape
)

print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")
logger.info("X_test: %s, y_test: %s", X_test.shape, y_test.shape)

self.clf.fit(X_train, self.le.transform(y_train))

print("Model trained")
logger.info("Model trained")

feature_names = self.get_human_readable_feature_names()
if self.calculate_importance and len(feature_names):
@@ -440,17 +449,18 @@ def train(self, importance_cutoff=0.15, limit=None):

tracking_metrics["feature_report"] = feature_report

print("Training Set scores:")
logger.info("Training Set scores:")
y_pred = self.clf.predict(X_train)
y_pred = self.le.inverse_transform(y_pred)
if not is_multilabel:
print(
logger.info(
"\n%s",
classification_report_imbalanced(
y_train, y_pred, labels=self.class_names
)
),
)

print("Test Set scores:")
logger.info("Test Set scores:")
# Evaluate results on the test set.
y_pred = self.clf.predict(X_test)
y_pred = self.le.inverse_transform(y_pred)
@@ -460,17 +470,19 @@
y_pred[0], np.ndarray
), "The predictions should be multilabel"

print(f"No confidence threshold - {len(y_test)} classified")
logger.info("No confidence threshold - %d classified", len(y_test))
if is_multilabel:
confusion_matrix = metrics.multilabel_confusion_matrix(y_test, y_pred)
else:
confusion_matrix = metrics.confusion_matrix(
y_test, y_pred, labels=self.class_names
)

print(
classification_report_imbalanced(
y_test, y_pred, labels=self.class_names
logger.info(
"\n%s",
classification_report_imbalanced(
y_test, y_pred, labels=self.class_names
),
)
report = classification_report_imbalanced_values(
@@ -522,8 +534,10 @@ def train(self, importance_cutoff=0.15, limit=None):

classified_num = sum(1 for v in y_pred_filter if v != "__NOT_CLASSIFIED__")

print(
f"\nConfidence threshold > {confidence_threshold} - {classified_num} classified"
logger.info(
"\nConfidence threshold > %d - %d classified",
confidence_threshold,
classified_num,
)
if is_multilabel:
confusion_matrix = metrics.multilabel_confusion_matrix(
@@ -535,12 +549,13 @@
y_pred_filter.astype(str),
labels=confidence_class_names,
)
print(
logger.info(
"\n%s",
classification_report_imbalanced(
y_test.astype(str),
y_pred_filter.astype(str),
labels=confidence_class_names,
)
),
)
print_labeled_confusion_matrix(
confusion_matrix, confidence_class_names, is_multilabel=is_multilabel
@@ -549,15 +564,15 @@
self.evaluation()

if self.entire_dataset_training:
print("Retraining on the entire dataset...")
logger.info("Retraining on the entire dataset...")

if self.sampler is not None:
X_train, y_train = self.sampler.fit_resample(X, y)
else:
X_train = X
y_train = y

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
logger.info("X_train: %s, y_train: %s", X_train.shape, y_train.shape)

self.clf.fit(X_train, self.le.transform(y_train))

22 changes: 11 additions & 11 deletions bugbug/models/defect_enhancement_task.py
@@ -3,10 +3,14 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

from logging import INFO, basicConfig, getLogger
from typing import Any

from bugbug.models.defect import DefectModel

basicConfig(level=INFO)
logger = getLogger(__name__)


class DefectEnhancementTaskModel(DefectModel):
def __init__(self, lemmatization=False, historical=False):
@@ -17,19 +21,15 @@ def __init__(self, lemmatization=False, historical=False):
def get_labels(self) -> tuple[dict[int, Any], list[Any]]:
classes = self.get_bugbug_labels("defect_enhancement_task")

print(
"{} defects".format(
sum(1 for label in classes.values() if label == "defect")
)
)
print(
"{} enhancements".format(
sum(1 for label in classes.values() if label == "enhancement")
)
logger.info(
"%d defects", sum(1 for label in classes.values() if label == "defect")
)
print(
"{} tasks".format(sum(1 for label in classes.values() if label == "task"))

logger.info(
"%d enhancements",
sum(1 for label in classes.values() if label == "enhancement"),
)
logger.info("%d tasks", sum(1 for label in classes.values() if label == "task"))

return classes, ["defect", "enhancement", "task"]

16 changes: 8 additions & 8 deletions bugbug/models/regression.py
@@ -3,10 +3,14 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

from logging import INFO, basicConfig, getLogger
from typing import Any

from bugbug.models.defect import DefectModel

basicConfig(level=INFO)
logger = getLogger(__name__)


class RegressionModel(DefectModel):
def __init__(self, lemmatization=False, historical=False):
@@ -16,15 +20,11 @@ def __init__(self, lemmatization=False, historical=False):
def get_labels(self) -> tuple[dict[int, Any], list[int]]:
classes = self.get_bugbug_labels("regression")

print(
"{} regression bugs".format(
sum(1 for label in classes.values() if label == 1)
)
logger.info(
"%d regression bugs", sum(1 for label in classes.values() if label == 1)
)
print(
"{} non-regression bugs".format(
sum(1 for label in classes.values() if label == 0)
)
logger.info(
"%d non-regression bugs", sum(1 for label in classes.values() if label == 0)
)

return classes, [0, 1]
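A note on the configuration side: calling `basicConfig(level=INFO)` at import time in each module, as this diff does, works because `basicConfig()` is a no-op once the root logger already has handlers. A common alternative (a general Python logging convention, not what this PR implements) is to create loggers in library modules and configure handlers once at the entry point:

```python
# Alternative sketch: library modules only create loggers...
import logging

logger = logging.getLogger(__name__)  # no side effects at import time

def main() -> None:
    # ...and the program entry point configures handlers exactly once.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )
    logger.info("logging configured in one place")

if __name__ == "__main__":
    main()
```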