Merge pull request #4161 from gamesh411/crash-clustering
[debug][scripts] Add experimental script for clustering similar crashes
dkrupp authored Jan 24, 2025
2 parents eb42b5d + b70a85c commit 81f3a93
Showing 3 changed files with 204 additions and 0 deletions.
29 changes: 29 additions & 0 deletions scripts/debug_tools/crash_clustering/README.md
@@ -0,0 +1,29 @@
# Failzip Crash Clustering

## Overview

This script processes and analyzes text data contained in zip files, focusing on extracting, filtering, and analyzing specific parts of the content. It supports text vectorization, dimensionality reduction, similarity matrix calculation, and data visualization. It is primarily intended for analyzing stack dumps or other structured text data with text-analytics and visualization techniques.

### Key Features

- **Zip File Handling**: The `FailZip` class works with nested zip archives, including content extraction and length determination.
- **Text Filtering and Analysis**: Applies filtering criteria to extract and preprocess the relevant parts of the text data.
- **Dimensionality Reduction and Vectorization**: Uses TF-IDF vectorization and truncated SVD to process and analyze the textual data (see the sketch below).
- **Data Visualization**: Leverages matplotlib and t-SNE to generate plots visualizing the results of the analysis.
- **JSON Handling**: Encodes JSON data through the `NumpyArrayEncoder` class and related helpers, ensuring JSON file structure and custom encoding of numpy arrays.
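
The embedding pipeline these features describe can be sketched in a few lines. This is a minimal, illustrative example on toy strings (assuming scikit-learn is installed), not the script itself:

```python
# Minimal sketch of the embedding pipeline on toy crash texts
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE

texts = [
    "Stack dump: frame #0 in foo()",
    "Stack dump: frame #0 in bar()",
    "assertion failed in baz()",
]
tfidf = TfidfVectorizer().fit_transform(texts)               # sparse TF-IDF matrix
reduced = TruncatedSVD(n_components=2).fit_transform(tfidf)  # dense, low-rank
points = TSNE(init="random", perplexity=2).fit_transform(reduced)
print(points.shape)  # (3, 2): one 2D point per crash text
```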

## Requirements

- Python 3.x
- Libraries: json, logging, pathlib, re, zipfile (standard library); matplotlib, numpy, scipy, scikit-learn, tqdm (third-party)

Install the third-party dependencies with pip (pinned versions are listed in `requirements.txt`):

```bash
pip install matplotlib numpy scipy scikit-learn tqdm
```

## Usage

Copy a fail-zip into the script's working directory, make sure it is named `archive.zip`, and run the script `cluster_crashes.py`.
The cluster plot opens interactively when possible; an image named `tsne.png` is also written to the current working directory.
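
For example (the source archive path below is illustrative):

```bash
cp /path/to/some_failzip.zip archive.zip
python3 cluster_crashes.py
```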

170 changes: 170 additions & 0 deletions scripts/debug_tools/crash_clustering/cluster_crashes.py
@@ -0,0 +1,170 @@
#!/usr/bin/env python3
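"""Cluster similar analyzer crashes from a fail-zip archive and visualize them with t-SNE."""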

import json
import logging
import pathlib
import re
from pathlib import Path
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import save_npz
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)


class FailZip:
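    """Iterable view of the crash reports nested inside a fail-zip archive.

    The outer zip holds one inner zip per failed analysis; the captured
    stderr of each inner zip is used as the similarity context.
    """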
def __init__(self, file: pathlib.Path):
self.file = file

def __len__(self):
with ZipFile(self.file) as outer_zipfile:
return len(outer_zipfile.namelist())

def __iter__(self):
with ZipFile(self.file) as outer_zipfile:
for outer_filename in outer_zipfile.namelist():
                # Expected entry path: archive/<project>/<run>/failed/<file>_<analyzer>_...
                parts = re.search(
                    r"archive/(?P<project_name>.*)/(?P<run_name>.*)/failed/(?P<file_name>[^_]*)_(?P<analyzer>[^_]*)_",
                    outer_filename,
                )
                if parts is None:
                    logging.warning("Skipping unrecognized entry: %s", outer_filename)
                    continue
with ZipFile(outer_zipfile.open(outer_filename)) as inner_zipfile:
stderr = inner_zipfile.open("stderr").read().decode("utf-8")
yield {
"project_name": parts["project_name"],
"run_name": parts["run_name"],
"analyzer": parts["analyzer"],
"file_name": parts["file_name"],
"similarity_context": stderr,
}


def cleanup_crash_text(context: str):
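    """Keep only the stack dump portion of a crash text and strip pointer values."""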
    # The stack dump is the only relevant part; if it is found, skip everything before it
stackdump_filter = re.search(r"Stack dump:([\s\S]*)", context)
if stackdump_filter:
context = stackdump_filter.group(1)
# Remove all pointer info
de_pointered_context = re.sub(r"0x[0-9a-fA-F]+", "", context)
return de_pointered_context


def ensure_json_file(filename, data, force=True, encoder=None):
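    """Write `data` as JSON to `filename`; with force=False, an existing file is kept."""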
file = Path(filename)
if force or not file.exists():
file.write_text(json.dumps(data, indent=2, cls=encoder))


class NumpyArrayEncoder(json.JSONEncoder):
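    """JSON encoder that serializes numpy arrays as plain lists."""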
def default(self, obj):
if isinstance(obj, np.ndarray):
return obj.tolist()
return json.JSONEncoder.default(self, obj)


def calculate_similarity_matrix(contexts):
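    """Embed the crash texts and return their 2D t-SNE projection (one row per context)."""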
    # Embedding pipeline:
    #   1. TF-IDF vectorization
    #   2. Truncated SVD dimensionality reduction
    #   3. t-SNE projection to 2D

    # Create a sparse matrix of the TF-IDF embedding
    tfidf_embedding_path = Path("tfidf_embedding.npz")
tfidf = TfidfVectorizer()
tfidf_embedding = tfidf.fit_transform(
[cleanup_crash_text(c["similarity_context"]) for c in contexts]
)

# Reduce dimensionality of the TF-IDF embedding
truncated_svd = TruncatedSVD(n_components=50)
truncated_svd_embedding = truncated_svd.fit_transform(tfidf_embedding)

# Create a TSNE embedding
tsne = TSNE(init="random")
tsne_embedding = tsne.fit_transform(truncated_svd_embedding)
    # Save the sparse TF-IDF matrix to disk
    with open(tfidf_embedding_path, "wb") as f:
        save_npz(f, tfidf_embedding)

return tsne_embedding


def plot_results(tsne_embedding, contexts):
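    """Scatter-plot the t-SNE embedding colored by project, annotating well-separated points."""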
    projects = [c["project_name"] for c in contexts]
unique_projects = set(projects)
colors = plt.cm.Set1(np.linspace(0, 1, len(unique_projects)))
colormap = dict(zip(unique_projects, colors))

_, ax = plt.subplots()

legend_handles = {}

    # Maintain a list of already-annotated points; if a new point is too close
    # to one of them, skip its annotation to avoid cluttering the plot.
    # FIXME: We should do a real clustering instead of this hack
    seen_points = []

for i, c in enumerate(contexts):
project = c["project_name"]
color = colormap[project]
        # Plot and capture the scatter plot handle
scatter = ax.scatter(
tsne_embedding[i, 0],
tsne_embedding[i, 1],
c=[color],
s=50,
edgecolor="k",
label=project,
alpha=0.8,
)

legend_handles[project] = scatter

# find the closest point in seen_points to this point
closest_distance = float("inf")
for seen_point in seen_points:
distance = np.linalg.norm(tsne_embedding[i] - seen_point)
if distance < closest_distance:
closest_distance = distance
# FIXME: arbitrary distance here...
if closest_distance < 0.5:
continue

seen_points.append(tsne_embedding[i])

ax.annotate(
c["file_name"],
tsne_embedding[i],
textcoords="offset points",
xytext=(0, 5),
ha="center",
)

ax.legend(
handles=list(legend_handles.values()),
labels=list(legend_handles.keys()),
title="Project Names",
loc="lower left",
)

ax.set_title("Files")

plt.savefig("tsne.png")
plt.show()


def main():
contexts = list(
tqdm(FailZip(pathlib.Path("archive.zip")), desc="Reading failzips...")
)
tsne_embedding = calculate_similarity_matrix(contexts)
plot_results(tsne_embedding, contexts)


if __name__ == "__main__":
main()
5 changes: 5 additions & 0 deletions scripts/debug_tools/crash_clustering/requirements.txt
@@ -0,0 +1,5 @@
matplotlib==3.8.2
numpy==1.25.2
scikit-learn==1.3.0
scipy==1.11.1
tqdm==4.66.1
