Skip to content

Commit

Permalink
add esig opt dependency and cdf plotting
Browse files Browse the repository at this point in the history
  • Loading branch information
rchan26 committed Nov 24, 2023
1 parent c23f575 commit 3f43f7e
Show file tree
Hide file tree
Showing 3 changed files with 447 additions and 309 deletions.
665 changes: 358 additions & 307 deletions paper-examples/examples.ipynb

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ dependencies = [
]

[project.optional-dependencies]
signatures = [
'esig; python_version < "3.10"',
]
test = [
"pytest >=6",
"pytest-cov >=3",
Expand Down
88 changes: 86 additions & 2 deletions src/signature_mahalanobis_knn/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ def plot_roc_curve(
roc_auc = roc_auc_score(y_true, y_score)

plt.title(f"Receiver Operating Characteristic {title}")
plt.plot(fp_rate, tp_rate, "b", label=f"AUC = {round(roc_auc, 2)}")
plt.plot(fp_rate, tp_rate, "b", label=f"AUC = {round(roc_auc, 2)}", linewidth=2)
plt.legend(loc="lower right")
plt.plot([0, 1], [0, 1], "r--")
plt.plot([0, 1], [0, 1], "r--", linewidth=2)
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel("True Positive Rate")
Expand Down Expand Up @@ -162,3 +162,87 @@ def compute_auc(
plot=plot,
title=title,
)


def plot_cdf_given_dists(
    distances_in: np.ndarray,
    distances_out: np.ndarray,
    bins: int | np.ndarray = 50,
    xrange: tuple[float, float] | None = None,
    xticks: list[float] | None = None,
    xlog: bool = False,
    xlog_base: int = 10,
    title: str = "",
) -> None:
    """
    Plot the empirical cumulative distribution functions (ECDFs) of the
    distances of inliers and outliers on a single figure.

    Parameters
    ----------
    distances_in : np.ndarray
        KNN distances for the inlier data points.
    distances_out : np.ndarray
        KNN distances for the outlier data points.
    bins : int | np.ndarray, optional
        Number of evaluation points or the points themselves, by default 50.
        If int, the ECDFs are evaluated at that many equally spaced points
        between the smallest and largest distance.
        If array, the ECDFs are evaluated at exactly those x values.
    xrange : tuple[float, float] | None, optional
        Limits of the x-axis, by default None.
        If None, the limits are left to matplotlib.
    xticks : list[float] | None, optional
        Tick values of the x-axis, by default None.
        If None, then the ticks will be automatically generated.
    xlog : bool, optional
        Whether to use log scale for the x-axis, by default False.
    xlog_base : int, optional
        Base of the log scale, by default 10.
        Only used when xlog is True.
    title : str, optional
        Title for the CDF plot, by default "".
        No title is set when it is the empty string.

    Raises
    ------
    ValueError
        If either ``distances_in`` or ``distances_out`` is empty.
    """
    if np.size(distances_in) == 0 or np.size(distances_out) == 0:
        msg = "distances_in and distances_out must both be non-empty"
        raise ValueError(msg)

    # obtain the empirical cumulative distribution functions:
    # the i-th smallest of n values is mapped to the probability i / n
    sorted_inlier = np.sort(distances_in)
    cumulative_inliers = np.arange(1, len(sorted_inlier) + 1) / len(sorted_inlier)
    sorted_outlier = np.sort(distances_out)
    cumulative_outliers = np.arange(1, len(sorted_outlier) + 1) / len(sorted_outlier)

    # define the empirical cumulative distribution functions
    # (0 below the smallest observation, 1 above the largest)
    def empirical_cdf_inliers(x):
        return np.interp(x, sorted_inlier, cumulative_inliers, left=0.0, right=1.0)

    def empirical_cdf_outliers(x):
        return np.interp(x, sorted_outlier, cumulative_outliers, left=0.0, right=1.0)

    # obtain the x values at which to evaluate the ECDFs
    if np.ndim(bins) == 0:
        # scalar: equally spaced grid spanning both samples
        xmin = min(np.min(distances_in), np.min(distances_out))
        xmax = max(np.max(distances_in), np.max(distances_out))
        x = np.linspace(xmin, xmax, bins)
    else:
        # array-like: np.linspace cannot take an array as ``num``,
        # so use the user-supplied values directly
        x = np.asarray(bins, dtype=float)

    # plot the cumulative functions
    plt.plot(
        x,
        empirical_cdf_inliers(x),
        c="blue",
        label="inliers",
        linestyle="--",
        linewidth=2,
    )
    plt.plot(x, empirical_cdf_outliers(x), c="orange", label="anomalies", linewidth=2)
    plt.grid()
    if title != "":
        plt.title(title)
    plt.legend(loc="lower right")
    if xrange is not None:
        plt.xlim(xrange)
    if xticks is not None:
        plt.xticks(xticks)
    if xlog:
        plt.xscale("log", base=xlog_base)
    plt.ylim([0, 1])
    plt.ylabel("Cumulative probability")
    plt.xlabel("Conformance")
    plt.show()

0 comments on commit 3f43f7e

Please sign in to comment.