Refactor print statements to use logger #3322 #3348

Closed · wants to merge 8 commits
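The diff below converts ad-hoc `print()` calls to the standard library `logging` module. As a minimal sketch of the pattern being applied (illustrative names, not code from this PR):

```python
# Sketch of the print -> logger refactor pattern used throughout this PR.
from logging import INFO, basicConfig, getLogger

basicConfig(level=INFO)       # install a root handler once
logger = getLogger(__name__)  # one logger per module

def report(defect_count: int) -> None:
    # Before: print(f"{defect_count} defects")
    # After: %-style lazy formatting; the string is interpolated only
    # if the record passes the logger's level check.
    logger.info("%d defects", defect_count)
```

Passing the arguments separately instead of pre-formatting with an f-string also keeps records groupable by their format string in log aggregators.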
71 changes: 43 additions & 28 deletions bugbug/model.py
@@ -5,6 +5,7 @@

import pickle
from collections import defaultdict
from logging import INFO, basicConfig, getLogger
from typing import Any

import matplotlib
@@ -29,6 +30,9 @@
from bugbug.nlp import SpacyVectorizer
from bugbug.utils import split_tuple_generator, to_array

basicConfig(level=INFO)
logger = getLogger(__name__)


def classification_report_imbalanced_values(
y_true, y_pred, labels, target_names=None, sample_weight=None, digits=2, alpha=0.1
@@ -103,7 +107,7 @@ def print_labeled_confusion_matrix(confusion_matrix, labels, is_multilabel=False

for num, table in enumerate(confusion_matrix_table):
if is_multilabel:
print(f"label: {labels[num]}")
logger.info("label: %d", labels[num])
table_labels = [0, 1]
else:
table_labels = labels
@@ -117,9 +121,9 @@
)
for i in range(len(table)):
table[i].insert(0, f"{table_labels[i]} (Actual)")
print(
logger.info(
"\n%s\n",
tabulate(table, headers=confusion_matrix_header, tablefmt="fancy_grid"),
end="\n\n",
)


@@ -287,18 +291,18 @@ def print_feature_importances(self, important_features, class_probabilities=None

# allow maximum of 3 columns in a row to fit the page better
COLUMNS = 3
print("Top {} features:".format(len(top_feature_names)))
logger.info("Top %d features:", len(top_feature_names))
for i in range(0, len(top_feature_names), COLUMNS):
table = []
for item in shap_val:
table.append(item[i : i + COLUMNS])
print(
logger.info(
"\n%s\n\n",
tabulate(
table,
headers=(["classes"] + top_feature_names)[i : i + COLUMNS],
tablefmt="grid",
),
end="\n\n",
)

def save_feature_importances(self, important_features, feature_names):
@@ -351,7 +355,7 @@ def train(self, importance_cutoff=0.15, limit=None):
X = X[:limit]
y = y[:limit]

print(f"X: {X.shape}, y: {y.shape}")
logger.info("X: %s, y: %s", str(X.shape), str(y.shape))

is_multilabel = isinstance(y[0], np.ndarray)
is_binary = len(self.class_names) == 2
@@ -375,30 +379,35 @@
pipeline, X_train, self.le.transform(y_train), scoring=scorings, cv=5
)

print("Cross Validation scores:")
logger.info("Cross Validation scores:")
for scoring in scorings:
score = scores[f"test_{scoring}"]
tracking_metrics[f"test_{scoring}"] = {
"mean": score.mean(),
"std": score.std() * 2,
}
print(
f"{scoring.capitalize()}: f{score.mean()} (+/- {score.std() * 2})"
logger.info(
"%s: f%.3f (+/- %.3f)",
scoring.capitalize(),
score.mean(),
score.std() * 2,
)

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
logger.info("X_train: %s, y_train: %s", X_train.shape, y_train.shape)

# Training on the resampled dataset if sampler is provided.
if self.sampler is not None:
X_train, y_train = self.sampler.fit_resample(X_train, y_train)

print(f"resampled X_train: {X_train.shape}, y_train: {y_train.shape}")
logger.info(
"resampled X_train: %s, y_train: %s", X_train.shape, y_train.shape
)

print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")
logger.info("X_test: %s, y_test: %s", X_test.shape, y_test.shape)

self.clf.fit(X_train, self.le.transform(y_train))

print("Model trained")
logger.info("Model trained")

feature_names = self.get_human_readable_feature_names()
if self.calculate_importance and len(feature_names):
@@ -440,17 +449,18 @@ def train(self, importance_cutoff=0.15, limit=None):

tracking_metrics["feature_report"] = feature_report

print("Training Set scores:")
logger.info("Training Set scores:")
y_pred = self.clf.predict(X_train)
y_pred = self.le.inverse_transform(y_pred)
if not is_multilabel:
print(
logger.info(
"\n%s",
classification_report_imbalanced(
y_train, y_pred, labels=self.class_names
)
),
)

print("Test Set scores:")
logger.info("Test Set scores:")
# Evaluate results on the test set.
y_pred = self.clf.predict(X_test)
y_pred = self.le.inverse_transform(y_pred)
@@ -460,17 +470,19 @@
y_pred[0], np.ndarray
), "The predictions should be multilabel"

print(f"No confidence threshold - {len(y_test)} classified")
logger.info("No confidence threshold - %d classified", len(y_test))
if is_multilabel:
confusion_matrix = metrics.multilabel_confusion_matrix(y_test, y_pred)
else:
confusion_matrix = metrics.confusion_matrix(
y_test, y_pred, labels=self.class_names
)

print(
classification_report_imbalanced(
y_test, y_pred, labels=self.class_names
logger.info(
"\n%s",
classification_report_imbalanced(
y_test, y_pred, labels=self.class_names
),
)
report = classification_report_imbalanced_values(
@@ -522,8 +534,10 @@ def train(self, importance_cutoff=0.15, limit=None):

classified_num = sum(1 for v in y_pred_filter if v != "__NOT_CLASSIFIED__")

print(
f"\nConfidence threshold > {confidence_threshold} - {classified_num} classified"
logger.info(
"\nConfidence threshold > %d - %d classified",
confidence_threshold,
classified_num,
)
if is_multilabel:
confusion_matrix = metrics.multilabel_confusion_matrix(
@@ -535,12 +549,13 @@
y_pred_filter.astype(str),
labels=confidence_class_names,
)
print(
logger.info(
"\n%s",
classification_report_imbalanced(
y_test.astype(str),
y_pred_filter.astype(str),
labels=confidence_class_names,
)
),
)
print_labeled_confusion_matrix(
confusion_matrix, confidence_class_names, is_multilabel=is_multilabel
@@ -549,15 +564,15 @@
self.evaluation()

if self.entire_dataset_training:
print("Retraining on the entire dataset...")
logger.info("Retraining on the entire dataset...")

if self.sampler is not None:
X_train, y_train = self.sampler.fit_resample(X, y)
else:
X_train = X
y_train = y

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
logger.info("X_train: %s, y_train: %s", X_train.shape, y_train.shape)

self.clf.fit(X_train, self.le.transform(y_train))

22 changes: 11 additions & 11 deletions bugbug/models/defect_enhancement_task.py
@@ -3,10 +3,14 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

from logging import INFO, basicConfig, getLogger
from typing import Any

from bugbug.models.defect import DefectModel

basicConfig(level=INFO)
logger = getLogger(__name__)


class DefectEnhancementTaskModel(DefectModel):
def __init__(self, lemmatization=False, historical=False):
@@ -17,19 +21,15 @@ def __init__(self, lemmatization=False, historical=False):
def get_labels(self) -> tuple[dict[int, Any], list[Any]]:
classes = self.get_bugbug_labels("defect_enhancement_task")

print(
"{} defects".format(
sum(1 for label in classes.values() if label == "defect")
)
)
print(
"{} enhancements".format(
sum(1 for label in classes.values() if label == "enhancement")
)
logger.info(
"%d defects", sum(1 for label in classes.values() if label == "defect")
)
print(
"{} tasks".format(sum(1 for label in classes.values() if label == "task"))

logger.info(
"%d enhancements",
sum(1 for label in classes.values() if label == "enhancement"),
)
logger.info("%d tasks", sum(1 for label in classes.values() if label == "task"))

return classes, ["defect", "enhancement", "task"]

16 changes: 8 additions & 8 deletions bugbug/models/regression.py
@@ -3,10 +3,14 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

from logging import INFO, basicConfig, getLogger
from typing import Any

from bugbug.models.defect import DefectModel

basicConfig(level=INFO)
logger = getLogger(__name__)


class RegressionModel(DefectModel):
def __init__(self, lemmatization=False, historical=False):
@@ -16,15 +20,11 @@ def __init__(self, lemmatization=False, historical=False):
def get_labels(self) -> tuple[dict[int, Any], list[int]]:
classes = self.get_bugbug_labels("regression")

print(
"{} regression bugs".format(
sum(1 for label in classes.values() if label == 1)
)
logger.info(
"%d regression bugs", sum(1 for label in classes.values() if label == 1)
)
print(
"{} non-regression bugs".format(
sum(1 for label in classes.values() if label == 0)
)
logger.info(
"%d non-regression bugs", sum(1 for label in classes.values() if label == 0)
)

return classes, [0, 1]
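A note on the configuration side: calling `basicConfig(level=INFO)` at import time in each module, as this diff does, works because `basicConfig()` is a no-op once the root logger already has handlers. A common alternative (a general Python logging convention, not what this PR implements) is to create loggers in library modules and configure handlers once at the entry point:

```python
# Alternative sketch: library modules only create loggers...
import logging

logger = logging.getLogger(__name__)  # no side effects at import time

def main() -> None:
    # ...and the program entry point configures handlers exactly once.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )
    logger.info("logging configured in one place")

if __name__ == "__main__":
    main()
```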