Merge pull request #4436 from cservakt/fix-cluster-crashes-pylint
[fix] Pylint fails on cluster_crashes
bruntib authored Jan 24, 2025
2 parents 81f3a93 + fd6915d commit 66efe25
Showing 1 changed file with 17 additions and 16 deletions.
scripts/debug_tools/crash_clustering/cluster_crashes.py (33 changes: 17 additions & 16 deletions)
@@ -9,8 +9,7 @@

 import matplotlib.pyplot as plt
 import numpy as np
-from matplotlib.colors import ListedColormap
-from scipy.sparse import load_npz, save_npz
+from scipy.sparse import save_npz
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.manifold import TSNE
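
These imports presumably back a TF-IDF / TruncatedSVD / t-SNE embedding flow (see calculate_similarity_matrix and plot_results below). A minimal sketch of that kind of pipeline; the corpus and parameter values are illustrative assumptions, not the script's own settings:

    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.manifold import TSNE

    # Toy corpus standing in for cleaned crash texts.
    texts = ["Stack dump: foo bar", "Stack dump: foo baz", "Stack dump: qux"]
    tfidf = TfidfVectorizer().fit_transform(texts)  # sparse document-term matrix
    reduced = TruncatedSVD(n_components=2).fit_transform(tfidf)
    embedding = TSNE(n_components=2, perplexity=2.0).fit_transform(reduced)
    print(embedding.shape)  # (3, 2)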
@@ -31,7 +30,10 @@ def __iter__(self):
         with ZipFile(self.file) as outer_zipfile:
             for outer_filename in outer_zipfile.namelist():
                 parts = re.search(
-                    r"archive/(?P<project_name>.*)/(?P<run_name>.*)/failed/(?P<file_name>[^_]*)_(?P<analyzer>[^_]*)_",
+                    (
+                        r"archive/(?P<project_name>.*)/(?P<run_name>.*)/"
+                        r"failed/(?P<file_name>[^_]*)_(?P<analyzer>[^_]*)_"
+                    ),
                     outer_filename,
                 )
                 with ZipFile(outer_zipfile.open(outer_filename)) as inner_zipfile:
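
The wrapped pattern relies on Python's implicit concatenation of adjacent string literals, so it compiles to exactly the same regex as the removed one-liner; only the line length changes. A quick standalone check against a made-up archive member name (the path is illustrative, not taken from a real archive):

    import re

    PATTERN = (
        r"archive/(?P<project_name>.*)/(?P<run_name>.*)/"
        r"failed/(?P<file_name>[^_]*)_(?P<analyzer>[^_]*)_"
    )

    name = "archive/myproj/run1/failed/main.c_clangsa_1.zip"
    parts = re.search(PATTERN, name)
    print(parts.group("project_name"), parts.group("run_name"),
          parts.group("file_name"), parts.group("analyzer"))
    # myproj run1 main.c clangsa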
@@ -46,7 +48,8 @@ def __iter__(self):


 def cleanup_crash_text(context: str):
-    # Stackdump is the only relevant part, if this is found, skip everything before it
+    # Stackdump is the only relevant part,
+    # if this is found, skip everything before it
     stackdump_filter = re.search(r"Stack dump:([\s\S]*)", context)
     if stackdump_filter:
         context = stackdump_filter.group(1)
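
For reference, the regex this comment describes keeps everything after the first "Stack dump:" marker; the [\s\S] class matches across newlines without needing re.DOTALL. A small illustration on invented crash output:

    import re

    context = "clang invocation...\nStack dump:\n0.  Program arguments\n1.  parser"
    stackdump_filter = re.search(r"Stack dump:([\s\S]*)", context)
    if stackdump_filter:
        context = stackdump_filter.group(1)
    print(context)  # "\n0.  Program arguments\n1.  parser"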
@@ -55,17 +58,17 @@ def cleanup_crash_text(context: str):
     return de_pointered_context


-def ensure_json_file(filename, data, force=True, encoder=None):
+def ensure_json_file(filename, data, force=True, encoder="utf-8"):
     file = Path(filename)
     if force or not file.exists():
-        file.write_text(json.dumps(data, indent=2, cls=encoder))
+        file.write_text(json.dumps(data, indent=2), encoding=encoder)


 class NumpyArrayEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, np.ndarray):
-            return obj.tolist()
-        return json.JSONEncoder.default(self, obj)
+    def default(self, o):
+        if isinstance(o, np.ndarray):
+            return o.tolist()
+        return json.JSONEncoder.default(self, o)


 def calculate_similarity_matrix(contexts):
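
With this change, ensure_json_file's encoder argument is a text encoding passed to Path.write_text, no longer a json.JSONEncoder subclass passed as cls, so NumPy-bearing data would need NumpyArrayEncoder at a json.dumps call site instead. A standalone sketch of how the encoder class plugs into the stdlib hook (the sample dict is invented):

    import json

    import numpy as np

    class NumpyArrayEncoder(json.JSONEncoder):
        def default(self, o):
            # json.dumps only calls default() for objects it cannot
            # serialize natively, such as ndarrays.
            if isinstance(o, np.ndarray):
                return o.tolist()
            return json.JSONEncoder.default(self, o)

    data = {"embedding": np.array([[0.0, 1.5], [2.0, 3.5]])}
    print(json.dumps(data, indent=2, cls=NumpyArrayEncoder))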
@@ -107,7 +110,7 @@ def plot_results(tsne_embedding, contexts):
     # maintain a set of seen points, and if the new point is close, then do not annotate it
     # this is to avoid cluttering the plot
     # FIXME: We should do a real clustering instead of this hack
-    seen_points = list()
+    seen_points = []

     for i, c in enumerate(contexts):
         project = c["project_name"]
@@ -126,11 +129,9 @@ def plot_results(tsne_embedding, contexts):
             legend_handles[project] = scatter

         # find the closest point in seen_points to this point
-        closest_distance = float("inf")
-        for seen_point in seen_points:
-            distance = np.linalg.norm(tsne_embedding[i] - seen_point)
-            if distance < closest_distance:
-                closest_distance = distance
+        closest_distance = min(np.linalg.norm(
+            tsne_embedding[i] - seen_point) for seen_point in seen_points)
+
         # FIXME: arbitrary distance here...
         if closest_distance < 0.5:
             continue
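
The replacement collapses the hand-rolled minimum loop into min() over a generator expression. One behavioral caveat: min() on an empty iterable raises ValueError, while the removed float("inf")-seeded loop tolerated an empty seen_points; whether seen_points can be empty at this point depends on code outside this hunk. A standalone sketch with invented points, using default= to restore the old empty-case behavior:

    import numpy as np

    point = np.array([0.0, 0.0])
    seen_points = [np.array([1.0, 1.0]), np.array([0.3, 0.1])]

    # default= makes the empty case behave like the old loop,
    # which started from float("inf").
    closest_distance = min(
        (np.linalg.norm(point - p) for p in seen_points),
        default=float("inf"),
    )
    print(round(closest_distance, 3))  # 0.316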
