Skip to content

Commit

Permalink
Add method to create a more complete data file for model training
Browse files Browse the repository at this point in the history
  • Loading branch information
ChronoBoot committed Jan 7, 2024
1 parent 3c8b541 commit 26c6015
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 2 deletions.
18 changes: 16 additions & 2 deletions backend/src/data_processing/simple_read_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,21 @@ def read_data(self, files_path: str, concat: bool, sampling_frequency: int) -> p
temp_data = pd.read_csv(f"{files_path}/{file_name}")
temp_data = temp_data[temp_data['SK_ID_CURR'].isin(train_data['SK_ID_CURR'])]
aggregated_data = aggregation_method(temp_data)
data = pd.merge(data, aggregated_data, on="SK_ID_CURR")
data = pd.merge(data, aggregated_data, on="SK_ID_CURR", how="outer")

# Concatenate all the data into a single DataFrame
return data
return data

def write_data_for_model(self, files_path: str, filename: str):
    """
    Write the consolidated data file used for model training.

    The data is the merge of the training data and the aggregated data
    from the other tables, as produced by `read_data`.

    Parameters:
        files_path (str): The directory where the input files are located
            and where the output file will be written.
        filename (str): The name of the CSV file to write.
    """
    data = self.read_data(files_path, concat=True, sampling_frequency=1)

    # Bug fix: the output path previously ignored `filename` and used a
    # hard-coded value; write to the caller-supplied file name instead.
    data.to_csv(f"{files_path}/{filename}", index=False)
6 changes: 6 additions & 0 deletions backend/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# Shared reader instance used by the route handlers below.
reader = SimpleReadData()

# Directory containing the input CSV files.
FILES_FOLDER = 'data'
# File name for the consolidated model-training dataset.
DATA_FILE_MODEL = 'data_for_model.csv'

@app.route('/test', methods=['GET'])
def test():
Expand Down Expand Up @@ -46,6 +47,11 @@ def most_important_features():
features = predictor.get_most_important_features(nb_features)
return jsonify({'features': features.to_dict()}), 200

@app.route('/write_model_data', methods=['GET'])
def write_model_data():
    """Generate the consolidated model-training data file on disk."""
    reader.write_data_for_model(FILES_FOLDER, DATA_FILE_MODEL)
    payload = {'message': 'Model data written successfully'}
    return jsonify(payload), 200

# Start the Flask development server when this module is run directly.
# NOTE(review): debug=True enables the interactive debugger and auto-reload;
# it must not be used in production deployments.
if __name__ == '__main__':
    app.run(debug=True)

21 changes: 21 additions & 0 deletions backend/tests/data_processing/test_simple_read_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,27 @@ def test_read_data_concat(self, mock_pos, mock_previous, mock_installments, mock

self.assertTrue(isinstance(result, pd.DataFrame))
pd.testing.assert_frame_equal(result, expected_result)

@patch('pandas.DataFrame.to_csv')
@patch('backend.src.data_processing.simple_read_data.SimpleReadData.read_data')
def test_write_data_for_model(self, mock_read_data, mock_to_csv):
    """write_data_for_model fetches the merged data once and writes it as CSV."""
    # Stub out read_data so no real files are touched.
    stub_frame = pd.DataFrame({
        'SK_ID_CURR': [1, 2, 3],
        'DATA': ['A', 'B', 'C'],
    })
    mock_read_data.return_value = stub_frame

    path = 'mock_path'
    name = 'mock_file'

    self.reader.write_data_for_model(path, name)

    # The data must be read with the expected options and written to
    # the caller-specified location without the index column.
    mock_read_data.assert_called_once_with(path, concat=True, sampling_frequency=1)
    stub_frame.to_csv.assert_called_once_with(f"{path}/{name}", index=False)



if __name__ == '__main__':
Expand Down

0 comments on commit 26c6015

Please sign in to comment.