7. adding of eeg dataset with bayesian tests

eXascaleInfolab · Sep 27, 2024 · cd1906a · cd1906a
1 parent 26c9b3f
commit cd1906a
Show file tree

Hide file tree

Showing 5 changed files with 158 additions and 22 deletions.
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
diff --git a/imputegap/contamination/contamination.py b/imputegap/contamination/contamination.py
@@ -41,7 +41,8 @@ def format_selection(ts, selection):
         else:
             return selection
 
-    def scenario_mcar(ts, series_impacted=0.2, missing_rate=0.2, block_size=10, protection=0.1, use_seed=True, seed=42, explainer=False):
+    def scenario_mcar(ts, series_impacted=0.2, missing_rate=0.2, block_size=10, protection=0.1, use_seed=True, seed=42,
+                      explainer=False):
         """
         Contamination of time series base on the Missing Completely at Random scenario
 
@@ -82,7 +83,7 @@ def scenario_mcar(ts, series_impacted=0.2, missing_rate=0.2, block_size=10, prot
         for series in series_selected:
             S = int(series)
             N = len(ts_contaminated[S])  # number of values in the series
-            P = int(N * protection)  # values to protect in the begining of the series
+            P = int(N * protection)  # values to protect in the beginning of the series
             W = int((N - P) * missing_rate)  # number of data to remove
             B = int(W / block_size)  # number of block to remove
 
@@ -100,11 +101,51 @@ def scenario_mcar(ts, series_impacted=0.2, missing_rate=0.2, block_size=10, prot
                         position = P + (position - N)  # Wrap around to the start after protection
 
                     while np.isnan(ts_contaminated[S, position]):
-                        position = position+1
+                        position = position + 1
 
                         if position >= N:  # If block exceeds the series length
                             position = P + (position - N)  # Wrap around to the start after protection
 
                     ts_contaminated[S, position] = np.nan
 
-        return ts_contaminated
+        return ts_contaminated
+
+    def scenario_missing_percentage(ts, series_impacted=0.2, missing_rate=0.2, protection=0.1, use_seed=True, seed=42):
+        """
+        Contamination of time series based on the missing percentage scenario
+
+        The first ceil(M * series_impacted) series (M = number of series) each receive one contiguous
+        block of NaN values that starts right after the protected prefix of the series.
+
+        :param ts: 2D numpy array of time series, one series per row (a copy is contaminated, the input is untouched)
+        :param series_impacted: percentage of series contaminated | default 0.2
+        :param missing_rate: percentage of missing values by series  | default 0.2
+        :param protection: size in the beginning of the time series where contamination is not proceeded  | default 0.1
+        :param use_seed: use a seed to reproduce the test | default true
+        :param seed: value of the seed | default 42
+        :return: the contaminated time series
+        """
+
+        # No random draw happens in this scenario; the seeding is kept so that the side effect
+        # on numpy's global random state stays the same for existing callers.
+        if use_seed:
+            np.random.seed(seed)
+
+        ts_contaminated = ts.copy()
+        M, _ = ts_contaminated.shape
+
+        nbr_series_impacted = int(np.ceil(M * series_impacted))
+
+        print("\n\nMISSING PERCENTAGE contamination has been called with :"
+              "\n\ta number of series impacted ", series_impacted * 100, "%",
+              "\n\ta missing rate of ", missing_rate * 100, "%",
+              "\n\ta starting position at ", protection,
+              "\n\twith a seed option set to ", use_seed,
+              "\n\tshape of the set ", ts_contaminated.shape,
+              "\n\tthis selection of series 0 to ", nbr_series_impacted, "\n\n")
+
+        for series in range(nbr_series_impacted):
+            N = len(ts_contaminated[series])  # number of values in the series
+            P = int(N * protection)  # values to protect in the beginning of the series
+            W = int((N - P) * missing_rate)  # number of data to remove
+
+            # numpy arrays have no pandas-style ".iat" accessor (it raised AttributeError);
+            # a slice assignment removes the W values that follow the protected prefix.
+            ts_contaminated[series, P:P + W] = np.nan
+
+        return ts_contaminated
diff --git a/imputegap/imputation/imputation.py b/imputegap/imputation/imputation.py
@@ -30,7 +30,7 @@ def load_parameters(query="default", algorithm="cdrec"):
             print("Query not found for this function ('optimal' or 'default')")
 
         if not os.path.exists(filepath):
-            filepath = filepath[:1]
+            filepath = filepath[1:]
 
         with open(filepath, "r") as _:
             config = toml.load(filepath)

diff --git a/tests/__pycache__/test_opti_bayesian_stmvl.cpython-312.pyc b/tests/__pycache__/test_opti_bayesian_stmvl.cpython-312.pyc
diff --git a/tests/test_contamination_mp.py b/tests/test_contamination_mp.py
@@ -0,0 +1,97 @@
+import os
+import unittest
+import numpy as np
+
+from imputegap.contamination.contamination import Contamination
+from imputegap.manager.manager import TimeSeries
+
+
+def resolve_path(local_path, github_actions_path):
+    """
+    Find the accurate path for tests
+
+    :param local_path: path of local code
+    :param github_actions_path: path on GitHub action
+    :return: the first of the two paths that exists (the local one is checked first)
+    :raises FileNotFoundError: if neither path exists
+    """
+    for candidate in (local_path, github_actions_path):
+        if os.path.exists(candidate):
+            return candidate
+
+    # A single formatted message is used on purpose: several positional arguments would be
+    # read by OSError as (errno, strerror, filename, ...) and produce a garbled message.
+    raise FileNotFoundError(f"File not found in both: {local_path} and {github_actions_path}")
+
+
+def get_file_path(set_name="test"):
+    """
+    Locate the dataset file used by the tests
+
+    :param set_name: name of the dataset file, without the ".txt" extension | default "test"
+    :return: the path returned by resolve_path for the local and the GitHub Actions locations
+    """
+    local_candidate = f'../imputegap/dataset/{set_name}.txt'
+    github_candidate = f'./imputegap/dataset/{set_name}.txt'
+    return resolve_path(local_candidate, github_candidate)
+
+
+class TestContamination(unittest.TestCase):
+    """
+    Tests of the missing percentage contamination scenario
+
+    The method names still mention "mcar"; they are kept unchanged so that existing test
+    selections keep working.
+    """
+
+    def test_mcar_selection(self):
+        """
+        the goal is to test if only the selected series are contaminated
+
+        The scenario contaminates the first ceil(M * series_impacted) series, so the expected
+        selection is derived from that rule instead of a hard-coded list of series.
+        NOTE(review): assumes the "test" dataset contains no NaN before contamination - confirm.
+        """
+        impute_gap = TimeSeries(get_file_path("test"))
+
+        series_impacted = [0.4]
+        missing_rates = [0.4]
+        seeds_start, seeds_end = 42, 43
+        protection = 0.1
+
+        for seed_value in range(seeds_start, seeds_end):
+            for series_sel in series_impacted:
+                for missing_rate in missing_rates:
+
+                    ts_contaminate = Contamination.scenario_missing_percentage(ts=impute_gap.ts,
+                                                                 series_impacted=series_sel,
+                                                                 missing_rate=missing_rate,
+                                                                 protection=protection, use_seed=True,
+                                                                 seed=seed_value)
+
+                    nbr_expected = int(np.ceil(ts_contaminate.shape[0] * series_sel))
+
+                    for series, data in enumerate(ts_contaminate):
+                        has_nan = bool(np.isnan(data).any())
+
+                        if series < nbr_expected:
+                            self.assertTrue(has_nan, f"series {series} should be contaminated")
+                        else:
+                            self.assertFalse(has_nan, f"series {series} should not be contaminated")
+
+    def test_mcar_position(self):
+        """
+        the goal is to test if the starting position is always guaranteed
+        """
+        impute_gap = TimeSeries(get_file_path("test"))
+
+        series_impacted = [0.4, 1]
+        missing_rates = [0.1, 0.4, 0.6]
+        protection = 0.1
+        # same rounding as the scenario: int(N * protection) values are protected
+        protected_index = int(impute_gap.ts.shape[1] * protection)
+        seeds_start, seeds_end = 42, 43
+
+        for seed_value in range(seeds_start, seeds_end):
+            for series_sel in series_impacted:
+                for missing_rate in missing_rates:
+
+                    ts_contaminate = Contamination.scenario_missing_percentage(ts=impute_gap.ts,
+                                                                 series_impacted=series_sel,
+                                                                 missing_rate=missing_rate,
+                                                                 protection=protection,
+                                                                 use_seed=True, seed=seed_value)
+
+                    protected_has_nan = bool(np.isnan(ts_contaminate[:, :protected_index]).any())
+
+                    self.assertFalse(protected_has_nan, "the protected beginning of the series contains NaN")