Skip to content

Commit

Permalink
7. adding of eeg dataset with bayesian tests
Browse files Browse the repository at this point in the history
  • Loading branch information
qnater committed Sep 27, 2024
1 parent 26c9b3f commit cd1906a
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 22 deletions.
32 changes: 15 additions & 17 deletions .idea/workspace.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

49 changes: 45 additions & 4 deletions imputegap/contamination/contamination.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ def format_selection(ts, selection):
else:
return selection

def scenario_mcar(ts, series_impacted=0.2, missing_rate=0.2, block_size=10, protection=0.1, use_seed=True, seed=42, explainer=False):
def scenario_mcar(ts, series_impacted=0.2, missing_rate=0.2, block_size=10, protection=0.1, use_seed=True, seed=42,
explainer=False):
"""
Contamination of time series base on the Missing Completely at Random scenario
Expand Down Expand Up @@ -82,7 +83,7 @@ def scenario_mcar(ts, series_impacted=0.2, missing_rate=0.2, block_size=10, prot
for series in series_selected:
S = int(series)
N = len(ts_contaminated[S]) # number of values in the series
P = int(N * protection) # values to protect in the begining of the series
P = int(N * protection) # values to protect in the beginning of the series
W = int((N - P) * missing_rate) # number of data to remove
B = int(W / block_size) # number of block to remove

Expand All @@ -100,11 +101,51 @@ def scenario_mcar(ts, series_impacted=0.2, missing_rate=0.2, block_size=10, prot
position = P + (position - N) # Wrap around to the start after protection

while np.isnan(ts_contaminated[S, position]):
position = position+1
position = position + 1

if position >= N: # If block exceeds the series length
position = P + (position - N) # Wrap around to the start after protection

ts_contaminated[S, position] = np.nan

return ts_contaminated
return ts_contaminated

def scenario_missing_percentage(ts, series_impacted=0.2, missing_rate=0.2, protection=0.1, use_seed=True, seed=42):
    """
    Contamination of time series based on the missing percentage scenario.

    The first ``ceil(M * series_impacted)`` series (rows) each lose one contiguous
    block of values that starts right after the protected prefix of the series.

    :param ts: 2D matrix of time series, one series per row; it is copied, never modified in place
    :param series_impacted: percentage of series contaminated | default 0.2
    :param missing_rate: percentage of missing values by series | default 0.2
    :param protection: size in the beginning of the time series where contamination is not proceeded | default 0.1
    :param use_seed: use a seed to reproduce the test | default true
    :param seed: value of the seed | default 42
    :return: the contaminated time series
    """

    if use_seed:
        # Nothing random is drawn in this scenario; the call is kept so that the
        # global NumPy RNG state seen by callers after this function is unchanged.
        np.random.seed(seed)

    ts_contaminated = ts.copy()
    M, N = ts_contaminated.shape  # M series of N values each

    nbr_series_impacted = int(np.ceil(M * series_impacted))

    print("\n\nMISSING PERCENTAGE contamination has been called with :"
          "\n\ta number of series impacted ", series_impacted * 100, "%",
          "\n\ta missing rate of ", missing_rate * 100, "%",
          "\n\ta starting position at ", protection,
          "\n\twith a seed option set to ", use_seed,
          "\n\tshape of the set ", ts_contaminated.shape,
          "\n\tthis selection of series 0 to ", nbr_series_impacted, "\n\n")

    P = int(N * protection)  # values to protect in the beginning of the series
    W = int((N - P) * missing_rate)  # number of data to remove

    # Guard against non-positive counts: a negative bound would otherwise be
    # read as "count from the end" by the slice below.
    if nbr_series_impacted > 0 and W > 0:
        # `.iat` only exists on pandas objects and raised AttributeError on the
        # NumPy arrays used by the rest of this module. Index NumPy arrays
        # directly and fall back to the positional indexer for pandas objects.
        cells = ts_contaminated.iloc if hasattr(ts_contaminated, "iloc") else ts_contaminated
        cells[:nbr_series_impacted, P:P + W] = np.nan

    return ts_contaminated
2 changes: 1 addition & 1 deletion imputegap/imputation/imputation.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def load_parameters(query="default", algorithm="cdrec"):
print("Query not found for this function ('optimal' or 'default')")

if not os.path.exists(filepath):
filepath = filepath[:1]
filepath = filepath[1:]

with open(filepath, "r") as _:
config = toml.load(filepath)
Expand Down
Binary file not shown.
97 changes: 97 additions & 0 deletions tests/test_contamination_mp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import os
import unittest
import numpy as np

from imputegap.contamination.contamination import Contamination
from imputegap.manager.manager import TimeSeries


def resolve_path(local_path, github_actions_path):
    """
    Find the accurate path for tests
    :param local_path: path of local code
    :param github_actions_path: path on GitHub action
    :return: the first of the two paths that exists, local_path being checked first
    :raises FileNotFoundError: if neither of the two paths exists
    """
    for candidate in (local_path, github_actions_path):
        if os.path.exists(candidate):
            return candidate

    # Build one message string: several positional arguments given to an OSError
    # subclass are read as (errno, strerror, filename, ...), which garbles the
    # text and drops the second path from it.
    raise FileNotFoundError(f"File not found in both: {local_path} and {github_actions_path}")


def get_file_path(set_name="test"):
    """
    Locate the dataset file used by the tests
    :param set_name: name of the dataset file, without its ".txt" extension | default "test"
    :return: the path returned by resolve_path for the two candidate locations
    """
    local_candidate = f'../imputegap/dataset/{set_name}.txt'
    github_actions_candidate = f'./imputegap/dataset/{set_name}.txt'
    return resolve_path(local_candidate, github_actions_candidate)


class TestContamination(unittest.TestCase):
    """
    Tests of the missing percentage contamination scenario.

    The method names still carry the "mcar" prefix they were created with; they
    are kept unchanged so existing test selections keep working.
    """

    def test_mcar_selection(self):
        """
        the goal is to test if only the selected values are contaminated
        """
        impute_gap = TimeSeries(get_file_path("test"))

        series_impacted = [0.4]
        missing_rates = [0.4]
        seeds_start, seeds_end = 42, 43
        protection = 0.1
        nbr_series = impute_gap.ts.shape[0]

        for seed_value in range(seeds_start, seeds_end):
            for series_sel in series_impacted:
                for missing_rate in missing_rates:

                    ts_contaminate = Contamination.scenario_missing_percentage(ts=impute_gap.ts,
                                                                               series_impacted=series_sel,
                                                                               missing_rate=missing_rate,
                                                                               protection=protection, use_seed=True,
                                                                               seed=seed_value)

                    # The scenario contaminates the first ceil(M * series_impacted)
                    # series, so the expected selection is derived from the data
                    # instead of a hard-coded list of series names.
                    nbr_impacted = int(np.ceil(nbr_series * series_sel))

                    # NOTE(review): assumes the "test" dataset holds no NaN before
                    # contamination and is long enough for at least one value to
                    # be removed per impacted series - confirm against the file.
                    for series, data in enumerate(ts_contaminate):
                        with self.subTest(seed=seed_value, series_impacted=series_sel,
                                          missing_rate=missing_rate, series=series):
                            has_nan = bool(np.isnan(data).any())
                            if series < nbr_impacted:
                                self.assertTrue(has_nan, "a selected series has not been contaminated")
                            else:
                                self.assertFalse(has_nan, "a series outside the selection has been contaminated")

    def test_mcar_position(self):
        """
        the goal is to test if the starting position is always guaranteed
        """
        impute_gap = TimeSeries(get_file_path("test"))

        series_impacted = [0.4, 1]
        missing_rates = [0.1, 0.4, 0.6]
        protection = 0.1
        ten_percent_index = int(impute_gap.ts.shape[1] * protection)
        seeds_start, seeds_end = 42, 43

        for seed_value in range(seeds_start, seeds_end):
            for series_sel in series_impacted:
                for missing_rate in missing_rates:

                    ts_contaminate = Contamination.scenario_missing_percentage(ts=impute_gap.ts,
                                                                               series_impacted=series_sel,
                                                                               missing_rate=missing_rate,
                                                                               protection=protection,
                                                                               use_seed=True, seed=seed_value)

                    with self.subTest(seed=seed_value, series_impacted=series_sel, missing_rate=missing_rate):
                        # No value of the protected prefix may have been removed.
                        protected_has_nan = bool(np.isnan(ts_contaminate[:, :ten_percent_index]).any())
                        self.assertFalse(protected_has_nan, "the protected beginning of a series has been contaminated")

0 comments on commit cd1906a

Please sign in to comment.