[fix] Pylint fails on cluster_crashes #4436

Merged
merged 1 commit on Jan 24, 2025
33 changes: 17 additions & 16 deletions scripts/debug_tools/crash_clustering/cluster_crashes.py
@@ -9,8 +9,7 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
-from matplotlib.colors import ListedColormap
-from scipy.sparse import load_npz, save_npz
+from scipy.sparse import save_npz
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.manifold import TSNE
@@ -31,7 +30,10 @@ def __iter__(self):
         with ZipFile(self.file) as outer_zipfile:
             for outer_filename in outer_zipfile.namelist():
                 parts = re.search(
-                    r"archive/(?P<project_name>.*)/(?P<run_name>.*)/failed/(?P<file_name>[^_]*)_(?P<analyzer>[^_]*)_",
+                    (
+                        r"archive/(?P<project_name>.*)/(?P<run_name>.*)/"
+                        r"failed/(?P<file_name>[^_]*)_(?P<analyzer>[^_]*)_"
+                    ),
                     outer_filename,
                 )
                 with ZipFile(outer_zipfile.open(outer_filename)) as inner_zipfile:
@@ -46,7 +48,8 @@ def __iter__(self):
 
 
 def cleanup_crash_text(context: str):
-    # Stackdump is the only relevant part, if this is found, skip everything before it
+    # Stackdump is the only relevant part,
+    # if this is found, skip everything before it
     stackdump_filter = re.search(r"Stack dump:([\s\S]*)", context)
     if stackdump_filter:
         context = stackdump_filter.group(1)
@@ -55,17 +58,17 @@ def cleanup_crash_text(context: str):
     return de_pointered_context
 
 
-def ensure_json_file(filename, data, force=True, encoder=None):
+def ensure_json_file(filename, data, force=True, encoder="utf-8"):
     file = Path(filename)
     if force or not file.exists():
-        file.write_text(json.dumps(data, indent=2, cls=encoder))
+        file.write_text(json.dumps(data, indent=2), encoding=encoder)
 
 
 class NumpyArrayEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, np.ndarray):
-            return obj.tolist()
-        return json.JSONEncoder.default(self, obj)
+    def default(self, o):
+        if isinstance(o, np.ndarray):
+            return o.tolist()
+        return json.JSONEncoder.default(self, o)
 
 
 def calculate_similarity_matrix(contexts):
@@ -107,7 +110,7 @@ def plot_results(tsne_embedding, contexts):
     # maintain a set of seen points, and if the new point is close, then do not annotate it
     # this is to avoid cluttering the plot
     # FIXME: We should do a real clustering instead of this hack
-    seen_points = list()
+    seen_points = []
 
     for i, c in enumerate(contexts):
         project = c["project_name"]
@@ -126,11 +129,9 @@ def plot_results(tsne_embedding, contexts):
             legend_handles[project] = scatter
 
         # find the closest point in seen_points to this point
-        closest_distance = float("inf")
-        for seen_point in seen_points:
-            distance = np.linalg.norm(tsne_embedding[i] - seen_point)
-            if distance < closest_distance:
-                closest_distance = distance
+        closest_distance = min(np.linalg.norm(
+            tsne_embedding[i] - seen_point) for seen_point in seen_points)
 
         # FIXME: arbitrary distance here...
         if closest_distance < 0.5:
             continue
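
A quick note on the regex change above: Python concatenates adjacent string literals at compile time, so the wrapped pattern matches exactly what the original one-line pattern did; the split presumably only addresses the line-length warning. A minimal, self-contained sketch (the sample archive entry name below is made up for illustration):

import re

# Same pattern as in the diff; the adjacent raw-string literals are joined
# into a single regex at compile time.
pattern = (
    r"archive/(?P<project_name>.*)/(?P<run_name>.*)/"
    r"failed/(?P<file_name>[^_]*)_(?P<analyzer>[^_]*)_"
)

# Hypothetical entry name, only to show the named groups being extracted.
match = re.search(pattern, "archive/myproject/run1/failed/foo.c_clang_00.zip")
if match:
    print(match.group("project_name"), match.group("analyzer"))  # myproject clang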
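Similarly, the closest-distance refactor in plot_results replaces a hand-rolled running-minimum loop with min() over a generator expression; both compute the same value. A small sketch with made-up 2-D points (note that min() raises ValueError on an empty iterable, so this form assumes seen_points holds at least one point when it runs):

import numpy as np

point = np.array([0.0, 1.0])
seen_points = [np.array([1.0, 1.0]), np.array([3.0, 4.0])]

# Old form: track the running minimum explicitly.
closest_loop = float("inf")
for seen_point in seen_points:
    distance = np.linalg.norm(point - seen_point)
    if distance < closest_loop:
        closest_loop = distance

# New form: let min() consume a generator of distances.
closest_gen = min(np.linalg.norm(point - seen_point) for seen_point in seen_points)

assert closest_loop == closest_gen == 1.0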