From a4b662b30a4a4b79e9d8a19202306af45928d381 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Fri, 23 Feb 2024 14:59:38 +0100 Subject: [PATCH] Feat/147 update calls to write manifest (#149) Update run_data_preparation to use changed function in https://github.com/RMI-PACTA/pacta.data.preparation/pull/341. Also simplifies somewhat the logic around the list of input filepaths. Should be merged after that, but coordinated closely. **Note:** as-written, this includes #150, but reverting back to [35fd720](https://github.com/RMI-PACTA/workflow.data.preparation/pull/149/commits/35fd7200fac09024d96b93aa568b0b9316cf3af5) removes that dependency. Also as written, depends on #153, as this does a similar simplification from `list.files()` to explicit filepath lists for the archive export feature defined in that PR. Reverting [1b8cd49](https://github.com/RMI-PACTA/workflow.data.preparation/pull/149/commits/1b8cd496ed4ffac7743dfb6030182fe5a80fff18) removes that connection. - [x] Depends on #150 - [x] Depends on #153 Closes #147 --------- Co-authored-by: CJ Yetman --- DESCRIPTION | 2 +- run_pacta_data_preparation.R | 91 +++++++++++++++++++++--------------- 2 files changed, 55 insertions(+), 38 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4ba0c28..ec88448 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,7 +33,7 @@ Imports: dbplyr, dplyr, logger, - pacta.data.preparation (>= 0.1.0.9002), + pacta.data.preparation (>= 0.1.0.9003), pacta.data.scraping, pacta.scenario.preparation, readr, diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index e6dcca4..6ea8e07 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -155,24 +155,39 @@ factset_timestamp <- # check that everything is ready to go ----------------------------------------- -stopifnot(file.exists(masterdata_ownership_path)) -stopifnot(file.exists(masterdata_debt_path)) -stopifnot(file.exists(ar_company_id__factset_entity_id_path)) -stopifnot(file.exists(factset_financial_data_path)) -stopifnot(file.exists(factset_entity_info_path)) -stopifnot(file.exists(factset_entity_financing_data_path)) -stopifnot(file.exists(factset_fund_data_path)) -stopifnot(file.exists(factset_isin_to_fund_table_path)) -stopifnot(file.exists(factset_iss_emissions_data_path)) -stopifnot(file.exists(factset_issue_code_bridge_path)) -stopifnot(file.exists(factset_industry_map_bridge_path)) -stopifnot(file.exists(factset_manual_pacta_sector_override_path)) -stopifnot(file.exists(data_prep_outputs_path)) +input_filepaths <- c( + masterdata_ownership_path = masterdata_ownership_path, + masterdata_debt_path = masterdata_debt_path, + ar_company_id__factset_entity_id_path = ar_company_id__factset_entity_id_path, + factset_financial_data_path = factset_financial_data_path, + factset_entity_info_path = factset_entity_info_path, + factset_entity_financing_data_path = factset_entity_financing_data_path, + factset_fund_data_path = factset_fund_data_path, + factset_isin_to_fund_table_path = factset_isin_to_fund_table_path, + factset_iss_emissions_data_path = factset_iss_emissions_data_path, + factset_issue_code_bridge_path = factset_issue_code_bridge_path, + factset_industry_map_bridge_path = factset_industry_map_bridge_path, + factset_manual_pacta_sector_override_path = factset_manual_pacta_sector_override_path +) if (!update_currencies) { - stopifnot(file.exists(currencies_data_path)) + input_filepaths <- c( + input_filepaths, + currencies_preflight_data_path = currencies_preflight_data_path + ) } +missing_input_files <- input_filepaths[!file.exists(input_filepaths)] + +if (length(missing_input_files) > 0L) { + logger::log_error( + "Input file cannot be found: {names(missing_input_files)} ({missing_input_files})." + ) + stop( + "Input files are missing: ", + toString(missing_input_files) + ) +} # pre-flight ------------------------------------------------------------------- @@ -180,6 +195,10 @@ logger::log_info("Fetching pre-flight data.") if (update_currencies) { logger::log_info("Fetching currency data.") + input_filepaths <- c( + input_filepaths, + currencies_preflight_data_path = currencies_preflight_data_path + ) currencies <- pacta.data.scraping::get_currency_exchange_rates( quarter = imf_quarter_timestamp ) @@ -191,6 +210,10 @@ if (update_currencies) { } logger::log_info("Scraping index regions.") +input_filepaths <- c( + input_filepaths, + index_regions_preflight_data_path = index_regions_preflight_data_path +) index_regions <- pacta.data.scraping::get_index_regions() saveRDS(index_regions, index_regions_preflight_data_path) @@ -856,23 +879,7 @@ parameters <- list( config_name = config_name, config = unclass(config), - input_filepaths = list( - masterdata_ownership_path = masterdata_ownership_path, - masterdata_debt_path = masterdata_debt_path, - ar_company_id__factset_entity_id_path = ar_company_id__factset_entity_id_path, - factset_financial_data_path = factset_financial_data_path, - factset_entity_info_path = factset_entity_info_path, - factset_entity_financing_data_path = factset_entity_financing_data_path, - factset_fund_data_path = factset_fund_data_path, - factset_isin_to_fund_table_path = factset_isin_to_fund_table_path, - factset_iss_emissions_data_path = factset_iss_emissions_data_path, - factset_issue_code_bridge_path = factset_issue_code_bridge_path, - factset_industry_map_bridge_path = factset_industry_map_bridge_path, - factset_manual_pacta_sector_override_path = factset_manual_pacta_sector_override_path - ), - preflight_filepaths = list( - currencies_data_path = currencies_data_path - ), + input_filepaths = as.list(input_filepaths), timestamps = list( imf_quarter_timestamp = imf_quarter_timestamp, factset_data_identifier = factset_timestamp, @@ -901,14 +908,24 @@ parameters <- package_news = package_news ) +logger::log_trace("Getting list of output files.") +output_files <- normalizePath( + list.files( + data_prep_outputs_path, + full.names = TRUE, + recursive = TRUE + ) +) + +manifest_path <- file.path(data_prep_outputs_path, "manifest.json") +logger::log_trace("Writing manifest file: \"{manifest_path}\".") pacta.data.preparation::write_manifest( - path = file.path(data_prep_outputs_path, "manifest.json"), + path = manifest_path, parameters = parameters, - asset_impact_data_path = asset_impact_data_path, - factset_data_path = factset_data_path, - data_prep_outputs_path = data_prep_outputs_path + input_files = input_filepaths, + output_files = output_files ) - +output_files <- c(output_files, manifest_path = manifest_path) # copy in NEWs.md files from relevant PACTA packages --------------------------- @@ -945,7 +962,7 @@ if (export_archives) { logger::log_trace("Zip file path: \"{outputs_zip_file_path}\".") zip( zipfile = outputs_zip_file_path, - files = list.files(data_prep_outputs_path, full.names = TRUE, recursive = TRUE), + files = output_files, extras = c( "--junk-paths", # do not preserve paths "--no-dir-entries", # do not include directory entries