Merge pull request #4161 from gamesh411/crash-clustering
[debug][scripts] Add experimental script for clustering similar crashes
dkrupp authored Jan 24, 2025
2 parents eb42b5d + b70a85c commit 81f3a93
Showing 3 changed files with 204 additions and 0 deletions.
29 changes: 29 additions & 0 deletions scripts/debug_tools/crash_clustering/README.md
@@ -0,0 +1,29 @@
# Failzip Crash Clustering

## Overview

This script processes and analyzes text data contained in zip files, focusing on extracting, filtering, and analyzing specific parts of the content. It supports text vectorization, dimensionality reduction, similarity matrix calculation, and data visualization. It is primarily intended for analyzing stack dumps or other structured text data with text-analytics and visualization techniques.

### Key Features

- **Zip File Handling**: The `FailZip` class works with nested zip archives, including content extraction and length determination.
- **Text Filtering and Analysis**: Applies filtering criteria to extract and preprocess the relevant parts of the text data.
- **Dimensionality Reduction and Vectorization**: Uses TF-IDF vectorization and truncated SVD to process and analyze the textual data (see the sketch below).
- **Data Visualization**: Leverages matplotlib and t-SNE to generate plots visualizing the results of the analysis.
- **JSON Handling**: Encodes JSON data through the `NumpyArrayEncoder` class and related helpers, ensuring JSON file structure and custom encoding of numpy arrays.
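
The embedding pipeline these features describe can be sketched in a few lines. This is a minimal, illustrative example on toy strings (assuming scikit-learn is installed), not the script itself:

```python
# Minimal sketch of the embedding pipeline on toy crash texts
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE

texts = [
    "Stack dump: frame #0 in foo()",
    "Stack dump: frame #0 in bar()",
    "assertion failed in baz()",
]
tfidf = TfidfVectorizer().fit_transform(texts)               # sparse TF-IDF matrix
reduced = TruncatedSVD(n_components=2).fit_transform(tfidf)  # dense, low-rank
points = TSNE(init="random", perplexity=2).fit_transform(reduced)
print(points.shape)  # (3, 2): one 2D point per crash text
```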

## Requirements

- Python 3.x
- Libraries: json, logging, pathlib, re, zipfile (standard library); matplotlib, numpy, scipy, scikit-learn, tqdm (third-party)

Install the third-party dependencies with pip (pinned versions are listed in `requirements.txt`):

```bash
pip install matplotlib numpy scipy scikit-learn tqdm
```

## Usage

Copy a fail-zip into the script's working directory, make sure it is named `archive.zip`, and run the script `cluster_crashes.py`.
The cluster plot opens interactively when possible; an image named `tsne.png` is also written to the current working directory.
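
For example (the source archive path below is illustrative):

```bash
cp /path/to/some_failzip.zip archive.zip
python3 cluster_crashes.py
```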

170 changes: 170 additions & 0 deletions scripts/debug_tools/crash_clustering/cluster_crashes.py
@@ -0,0 +1,170 @@
#!/usr/bin/env python3
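"""Cluster similar analyzer crashes from a fail-zip archive and visualize them with t-SNE."""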

import json
import logging
import pathlib
import re
from pathlib import Path
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import save_npz
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)


class FailZip:
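    """Iterable view of the crash reports nested inside a fail-zip archive.

    The outer zip holds one inner zip per failed analysis; the captured
    stderr of each inner zip is used as the similarity context.
    """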
def __init__(self, file: pathlib.Path):
self.file = file

def __len__(self):
with ZipFile(self.file) as outer_zipfile:
return len(outer_zipfile.namelist())

def __iter__(self):
with ZipFile(self.file) as outer_zipfile:
for outer_filename in outer_zipfile.namelist():
                # Expected entry path: archive/<project>/<run>/failed/<file>_<analyzer>_...
                parts = re.search(
                    r"archive/(?P<project_name>.*)/(?P<run_name>.*)/failed/(?P<file_name>[^_]*)_(?P<analyzer>[^_]*)_",
                    outer_filename,
                )
                if parts is None:
                    logging.warning("Skipping unrecognized entry: %s", outer_filename)
                    continue
with ZipFile(outer_zipfile.open(outer_filename)) as inner_zipfile:
stderr = inner_zipfile.open("stderr").read().decode("utf-8")
yield {
"project_name": parts["project_name"],
"run_name": parts["run_name"],
"analyzer": parts["analyzer"],
"file_name": parts["file_name"],
"similarity_context": stderr,
}


def cleanup_crash_text(context: str):
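    """Keep only the stack dump portion of a crash text and strip pointer values."""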
    # The stack dump is the only relevant part; if it is found, skip everything before it
stackdump_filter = re.search(r"Stack dump:([\s\S]*)", context)
if stackdump_filter:
context = stackdump_filter.group(1)
# Remove all pointer info
de_pointered_context = re.sub(r"0x[0-9a-fA-F]+", "", context)
return de_pointered_context


def ensure_json_file(filename, data, force=True, encoder=None):
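    """Write `data` as JSON to `filename`; with force=False, an existing file is kept."""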
file = Path(filename)
if force or not file.exists():
file.write_text(json.dumps(data, indent=2, cls=encoder))


class NumpyArrayEncoder(json.JSONEncoder):
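    """JSON encoder that serializes numpy arrays as plain lists."""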
def default(self, obj):
if isinstance(obj, np.ndarray):
return obj.tolist()
return json.JSONEncoder.default(self, obj)


def calculate_similarity_matrix(contexts):
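    """Embed the crash texts and return their 2D t-SNE projection (one row per context)."""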
    # Embedding pipeline:
    #   1. TF-IDF vectorization
    #   2. Truncated SVD dimensionality reduction
    #   3. t-SNE projection to 2D

    # Create a sparse matrix of the TF-IDF embedding
    tfidf_embedding_path = Path("tfidf_embedding.npz")
tfidf = TfidfVectorizer()
tfidf_embedding = tfidf.fit_transform(
[cleanup_crash_text(c["similarity_context"]) for c in contexts]
)

# Reduce dimensionality of the TF-IDF embedding
truncated_svd = TruncatedSVD(n_components=50)
truncated_svd_embedding = truncated_svd.fit_transform(tfidf_embedding)

# Create a TSNE embedding
tsne = TSNE(init="random")
tsne_embedding = tsne.fit_transform(truncated_svd_embedding)
    # Save the sparse TF-IDF matrix to disk
    with open(tfidf_embedding_path, "wb") as f:
        save_npz(f, tfidf_embedding)

return tsne_embedding


def plot_results(tsne_embedding, contexts):
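    """Scatter-plot the t-SNE embedding colored by project, annotating well-separated points."""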
    projects = [c["project_name"] for c in contexts]
unique_projects = set(projects)
colors = plt.cm.Set1(np.linspace(0, 1, len(unique_projects)))
colormap = dict(zip(unique_projects, colors))

_, ax = plt.subplots()

legend_handles = {}

    # Maintain a list of already-annotated points; if a new point is too close
    # to one of them, skip its annotation to avoid cluttering the plot.
    # FIXME: We should do a real clustering instead of this hack
    seen_points = []

for i, c in enumerate(contexts):
project = c["project_name"]
color = colormap[project]
        # Plot and capture the scatter plot handle
scatter = ax.scatter(
tsne_embedding[i, 0],
tsne_embedding[i, 1],
c=[color],
s=50,
edgecolor="k",
label=project,
alpha=0.8,
)

legend_handles[project] = scatter

# find the closest point in seen_points to this point
closest_distance = float("inf")
for seen_point in seen_points:
distance = np.linalg.norm(tsne_embedding[i] - seen_point)
if distance < closest_distance:
closest_distance = distance
# FIXME: arbitrary distance here...
if closest_distance < 0.5:
continue

seen_points.append(tsne_embedding[i])

ax.annotate(
c["file_name"],
tsne_embedding[i],
textcoords="offset points",
xytext=(0, 5),
ha="center",
)

ax.legend(
handles=list(legend_handles.values()),
labels=list(legend_handles.keys()),
title="Project Names",
loc="lower left",
)

ax.set_title("Files")

plt.savefig("tsne.png")
plt.show()


def main():
contexts = list(
tqdm(FailZip(pathlib.Path("archive.zip")), desc="Reading failzips...")
)
tsne_embedding = calculate_similarity_matrix(contexts)
plot_results(tsne_embedding, contexts)


if __name__ == "__main__":
main()
5 changes: 5 additions & 0 deletions scripts/debug_tools/crash_clustering/requirements.txt
@@ -0,0 +1,5 @@
matplotlib==3.8.2
numpy==1.25.2
scikit-learn==1.3.0
scipy==1.11.1
tqdm==4.66.1
