
Transform euclidean #11

Merged · 26 commits · Nov 24, 2023
c088e0b
transformation first attempt
rchan26 Oct 18, 2023
3ba0077
update docs for _train and _test
rchan26 Oct 18, 2023
5256ccc
take min distance after subspace threshold computations
rchan26 Oct 19, 2023
2eebbf6
store U from svd calculation in Mahalanobis
rchan26 Oct 19, 2023
f06ce72
subtract signatures by mean in modified signature comp
rchan26 Oct 24, 2023
e2e0dac
fix some errors and convert signatures to numpy arrays
rchan26 Oct 24, 2023
75ca5b7
Merge branch 'pynndescent' into transform-euclidean
rchan26 Oct 24, 2023
d11083b
add examples in repo
rchan26 Oct 24, 2023
a943e07
fix a bug where nans are set to zeros instead of the desired value
sz85512678 Oct 25, 2023
3c52832
added some diagnosis information
sz85512678 Oct 25, 2023
5114b49
fix #12: signature default args
rchan26 Nov 2, 2023
c210c43
Merge branch 'transform-euclidean' into paper-examples
rchan26 Nov 2, 2023
816558b
update paper examples using signature args fix
rchan26 Nov 2, 2023
47d002a
run with updated code
rchan26 Nov 2, 2023
b50d954
run last example
rchan26 Nov 2, 2023
303c3de
added new notebook for bug testing
jh83775 Nov 21, 2023
53d6682
add transform-euclidean
rchan26 Nov 22, 2023
3575f1f
Merge branch 'pynndescent' into transform-euclidean
rchan26 Nov 22, 2023
38234ef
Merge branch 'transform-euclidean' into paper-examples
rchan26 Nov 22, 2023
a15e4a6
check reproduce old output
rchan26 Nov 23, 2023
a1c363f
check reproduce old output
rchan26 Nov 23, 2023
3d5a07b
run examples notebook fully
rchan26 Nov 24, 2023
c23f575
change default type to float64 and add backend argument
rchan26 Nov 24, 2023
3f43f7e
add esig opt dependency and cdf plotting
rchan26 Nov 24, 2023
c944883
apply pre-commit
rchan26 Nov 24, 2023
56fff8f
rename debug to return_indices
rchan26 Nov 24, 2023
.pre-commit-config.yaml (5 changes: 0 additions & 5 deletions)

@@ -50,11 +50,6 @@ repos:
- id: ruff
args: ["--fix", "--show-fixes"]

-  - repo: https://github.com/codespell-project/codespell
-    rev: "v2.2.6"
-    hooks:
-      - id: codespell

- repo: https://github.com/shellcheck-py/shellcheck-py
rev: "v0.9.0.6"
hooks:
paper-examples/Data.py (235 changes: 235 additions & 0 deletions)

@@ -0,0 +1,235 @@
from __future__ import annotations

import pickle
import random
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn.preprocessing
from scipy.io import arff
from sklearn.model_selection import train_test_split

DATA_DIR = "data/"


def get_corpus_and_outlier_paths(df, desired_class):
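    """
    Split the rows of a dataframe into corpus paths (rows whose target equals
    desired_class) and outlier paths, converting each row's values into a 2-D
    path of (time index, value) points.
    """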
corpus_paths = []
outlier_paths = []

    for i in range(df.shape[0]):
        path = np.column_stack([list(range(df.shape[1] - 1)), df.iloc[i][:-1]])
        if df.iloc[i]["target"] == desired_class:
            corpus_paths.append(path)
        else:
            outlier_paths.append(path)

return corpus_paths, outlier_paths


def normalise(streams):
return [
sklearn.preprocessing.MinMaxScaler().fit_transform(stream) for stream in streams
]
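# Example with hypothetical values: each stream's channels are min-max scaled
# to [0, 1] independently, e.g. normalise([np.array([[0.0, 2.0], [1.0, 4.0]])])
# returns [array([[0., 0.], [1., 1.]])].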


class Data:
"""
Hold time-series data and allow augmentations
"""

    def __init__(self, if_sample=True, n_samples=(800, 10, 10), random_seed=1):
        # unlabelled corpus: a collection of streams, each a numpy array
        self.corpus = None
        # test sets: numpy arrays of inlier and outlier streams
        self.test_inlier = None
        self.test_outlier = None
        self.if_sample = if_sample
        self.n_samples = n_samples
        self.random_seed = random_seed

def sample(self):
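        """
        Subsample the corpus and both test sets down to self.n_samples streams
        each, sampling with replacement via random.choices.
        """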
random.seed(self.random_seed)
self.corpus = random.choices(list(self.corpus), k=self.n_samples[0])
self.test_inlier = random.choices(list(self.test_inlier), k=self.n_samples[1])
self.test_outlier = random.choices(list(self.test_outlier), k=self.n_samples[2])

def load_pen_digit(self, digit: int = 1):
"""
        Load the pen digit dataset, using a specific digit as the training set.

        :param digit: digit (0-9) to use as the "normality" training corpus
:return: None
"""
train_df = pd.read_pickle(DATA_DIR + "pen_digit_train.pkl")
test_df = pd.read_pickle(DATA_DIR + "pen_digit_test.pkl")
self.corpus = train_df[train_df["Digit"] == digit]["Stream"].to_numpy()
self.test_inlier = test_df[test_df["Digit"] == digit]["Stream"].to_numpy()
self.test_outlier = test_df[test_df["Digit"] != digit]["Stream"].to_numpy()

if self.if_sample:
self.sample()
self.corpus, self.test_inlier, self.test_outlier = map(
normalise, (self.corpus, self.test_inlier, self.test_outlier)
)

def load_language_data(self):
"""
Load language data set with English and German words.
:return: None
"""
paths = np.load(DATA_DIR + "paths_en_de.npy")
labels = np.load(DATA_DIR + "labels_en_de.npy")
(paths_train, paths_test, labels_train, labels_test) = train_test_split(
paths, labels, random_state=1, test_size=0.2
)
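        # keep only label-0 streams as the unlabelled training corpus;
        # label-1 streams are treated as outliers below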
paths_train = paths_train[labels_train == 0]

self.corpus = paths_train
self.test_inlier = paths_test[labels_test == 0]
self.test_outlier = paths_test[labels_test == 1]
if self.if_sample:
self.sample()
self.corpus, self.test_inlier, self.test_outlier = map(
normalise, (self.corpus, self.test_inlier, self.test_outlier)
)

def load_ship_movements(self, thres_distance=32000, n_samples=5000):
"""

:param thres_distance: Must be one of [4000, 8000, 16000, 32000]
:param n_samples: samples taken for each of the train, test_in and test_out
:return:
"""

        # convert each row's LAT/LON/TimeDiff sequences into a single array
        # of (lat, lon, time-diff) points
        def process_data(data_frame):
data_frame["TimeDiff"] = data_frame["BaseDateTime"].apply(
lambda x: np.append(0, np.diff(x))
)
data_frame = data_frame[["LAT", "LON", "TimeDiff"]]
            res = []
            for i in range(len(data_frame)):
                lat, lon, time_diff = data_frame.iloc[i].to_numpy()
                res.append(np.array(list(zip(lat, lon, time_diff))))
            return res

        # draw n_samples substreams per split, weighted by SUBSTREAM_WEIGHT,
        # sampling with replacement
def sample_data(ais_by_vessel_split, random_state):
return ais_by_vessel_split.sample(
n=n_samples,
weights="SUBSTREAM_WEIGHT",
replace=True,
random_state=random_state,
)

with Path(DATA_DIR + "inlier_mmsis_train.pkl").open("rb") as f:
inlier_mmsis_train = pickle.load(f)
with Path(DATA_DIR + "inlier_mmsis_test.pkl").open("rb") as f:
inlier_mmsis_test = pickle.load(f)
with Path(DATA_DIR + "outlier_mmsis.pkl").open("rb") as f:
outlier_mmsis = pickle.load(f)

if thres_distance not in [4000, 8000, 16000, 32000]:
msg = "thres_distance needs to be in [4000, 8000, 16000, 32000]"
raise ValueError(msg)
ais_by_vessel_split_local = pd.read_pickle(
DATA_DIR + "substreams_" + str(thres_distance) + ".pkl"
)

self.corpus = process_data(
sample_data(
ais_by_vessel_split_local.loc[inlier_mmsis_train],
random_state=self.random_seed,
)
)
self.test_inlier = process_data(
sample_data(
ais_by_vessel_split_local.loc[inlier_mmsis_test],
random_state=self.random_seed,
)
)
self.test_outlier = process_data(
sample_data(
ais_by_vessel_split_local.loc[outlier_mmsis],
random_state=self.random_seed,
)
)
if self.if_sample:
self.sample()
self.corpus, self.test_inlier, self.test_outlier = map(
normalise, (self.corpus, self.test_inlier, self.test_outlier)
)

def load_ucr_dataset(
self, data_set_name="Adiac", anomaly_level=0.001, random_state=1
):
"""

:param data_set_name: Must be one of ['Adiac', 'ArrowHead', 'Beef', 'BeetleFly', 'BirdChicken', 'CBF',
'ChlorineConcentration', 'Coffee', 'ECG200', 'ECGFiveDays', 'FaceFour',
'GunPoint', 'Ham', 'Herring', 'Lightning2', 'Lightning7', 'Meat',
'MedicalImages', 'MoteStrain', 'Plane', 'Strawberry', 'Symbols',
'ToeSegmentation1', 'ToeSegmentation2', 'Trace', 'TwoLeadECG', 'Wafer',
'Wine']
:param anomaly_level: Must be one of [0.001, 0.05]
:param random_state:
:return:
"""
        # map anomaly level to the corresponding column of the Beggel et al.
        # (2019) results table
        level_to_column = {0.001: "Atra", 0.05: "A5tra"}
comparisons = pd.read_csv(
DATA_DIR + "results_beggel_et_al_2019_tables_2_and_4.csv"
)
comparisons = comparisons.set_index("Dataset")
        if data_set_name not in comparisons.index:
            msg = f"data_set_name must be one of {list(comparisons.index)}"
            raise ValueError(msg)
DATASET_PATH = DATA_DIR + "Univariate_arff"
datatrain = arff.loadarff(
Path(DATASET_PATH) / data_set_name / (data_set_name + "_TRAIN.arff")
)
datatest = arff.loadarff(
Path(DATASET_PATH) / data_set_name / (data_set_name + "_TEST.arff")
)
alldata = pd.concat(
[pd.DataFrame(datatrain[0]), pd.DataFrame(datatest[0])], ignore_index=True
)
alldata["target"] = pd.to_numeric(alldata["target"])
corpus_paths, outlier_paths = get_corpus_and_outlier_paths(
alldata, comparisons.loc[data_set_name].normal
)
corpus_train, corpus_test = train_test_split(
corpus_paths,
test_size=comparisons.loc[data_set_name].Ntes.astype("int"),
random_state=random_state,
)
        outliers_injection = comparisons.loc[data_set_name][
            level_to_column[anomaly_level]
        ].astype("int")
if outliers_injection != 0:
outlier_paths, outlier_paths_to_train = train_test_split(
outlier_paths, test_size=outliers_injection, random_state=random_state
)
            # inject the held-out anomalies into the training corpus
            corpus_train = corpus_train + outlier_paths_to_train

self.corpus = corpus_train
self.test_inlier = corpus_test
self.test_outlier = outlier_paths
if self.if_sample:
self.sample()
self.corpus, self.test_inlier, self.test_outlier = map(
normalise, (self.corpus, self.test_inlier, self.test_outlier)
)
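
A minimal usage sketch (hypothetical, assuming the pickled data files exist
under data/ and that Data.py is importable from the working directory):

    from Data import Data

    data = Data(if_sample=True, n_samples=(800, 10, 10), random_seed=1)
    data.load_pen_digit(digit=1)  # digit 1 becomes the "normal" corpus
    print(len(data.corpus), len(data.test_inlier), len(data.test_outlier))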