diff --git a/backend/src/data_processing/simple_read_data.py b/backend/src/data_processing/simple_read_data.py index ec75da2..a0f3005 100644 --- a/backend/src/data_processing/simple_read_data.py +++ b/backend/src/data_processing/simple_read_data.py @@ -315,7 +315,21 @@ def read_data(self, files_path: str, concat: bool, sampling_frequency: int) -> p temp_data = pd.read_csv(f"{files_path}/{file_name}") temp_data = temp_data[temp_data['SK_ID_CURR'].isin(train_data['SK_ID_CURR'])] aggregated_data = aggregation_method(temp_data) - data = pd.merge(data, aggregated_data, on="SK_ID_CURR") + data = pd.merge(data, aggregated_data, on="SK_ID_CURR", how="outer") # Concatenate all the data into a single DataFrame - return data \ No newline at end of file + return data + + def write_data_for_model(self, files_path : str, filename: str): + """ + Write the data for the model. + It is a merge of the training data and the aggregated data from the other tables. + + Parameters: + files_path (str): The path where the file are located. + filename (str): The name of the file to write. 
+ """ + + data = self.read_data(files_path, concat=True, sampling_frequency=1) + + data.to_csv(f"{files_path}/{filename}", index=False) diff --git a/backend/src/main.py b/backend/src/main.py index 1053555..aab07d7 100644 --- a/backend/src/main.py +++ b/backend/src/main.py @@ -10,6 +10,7 @@ reader = SimpleReadData() FILES_FOLDER = 'data' +DATA_FILE_MODEL = 'data_for_model.csv' @app.route('/test', methods=['GET']) def test(): @@ -46,6 +47,11 @@ def most_important_features(): features = predictor.get_most_important_features(nb_features) return jsonify({'features': features.to_dict()}), 200 +@app.route('/write_model_data', methods=['GET']) +def write_model_data(): + reader.write_data_for_model(FILES_FOLDER, DATA_FILE_MODEL) + return jsonify({'message': 'Model data written successfully'}), 200 + if __name__ == '__main__': app.run(debug=True) diff --git a/backend/tests/data_processing/test_simple_read_data.py b/backend/tests/data_processing/test_simple_read_data.py index 6ab8096..9df55e2 100644 --- a/backend/tests/data_processing/test_simple_read_data.py +++ b/backend/tests/data_processing/test_simple_read_data.py @@ -549,6 +549,27 @@ def test_read_data_concat(self, mock_pos, mock_previous, mock_installments, mock self.assertTrue(isinstance(result, pd.DataFrame)) pd.testing.assert_frame_equal(result, expected_result) + + @patch('pandas.DataFrame.to_csv') + @patch('backend.src.data_processing.simple_read_data.SimpleReadData.read_data') + def test_write_data_for_model(self, mock_read_data, mock_to_csv): + # Create a mock DataFrame to return from read_data + mock_df = pd.DataFrame({ + 'SK_ID_CURR': [1, 2, 3], + 'DATA': ['A', 'B', 'C'] + }) + mock_read_data.return_value = mock_df + + mock_path = 'mock_path' + mock_file = 'mock_file' + + # Create an instance of the class and call the method + self.reader.write_data_for_model(mock_path, mock_file) + + # Check that the result is as expected + mock_read_data.assert_called_once_with(mock_path, concat = True, sampling_frequency = 
1) + mock_df.to_csv.assert_called_once_with(f"{mock_path}/{mock_file}", index=False) + if __name__ == '__main__':