From 362596416e99e5e9b95b0542a4cb858a55a4c633 Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Thu, 15 Feb 2024 10:53:37 +0100 Subject: [PATCH 01/39] add default 2023Q4 config --- config.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/config.yml b/config.yml index d677f38..bf9a8dd 100644 --- a/config.yml +++ b/config.yml @@ -112,6 +112,26 @@ default: scenario_raw_data_to_include: ["geco_2022", "ipr_2021", "isf_2021", "weo_2022"] global_aggregate_scenario_sources_list: ["WEO2022"] + +2023Q4: + masterdata_ownership_filename: "" + masterdata_debt_filename: "" + ar_company_id__factset_entity_id_filename: "" + factset_financial_data_filename: "" + factset_entity_info_filename: "" + factset_entity_financing_data_filename: "" + factset_fund_data_filename: "" + factset_isin_to_fund_table_filename: "" + factset_iss_emissions_data_filename: "" + factset_issue_code_bridge_filename: "" + imf_quarter_timestamp: "2023-Q4" + pacta_financial_timestamp: "2023Q4" + market_share_target_reference_year: 2023 + scenario_sources_list: ["GECO2023", "ISF2023", "WEO2023"] + scenario_raw_data_to_include: ["geco_2023", "isf_2023", "weo_2023"] + global_aggregate_scenario_sources_list: ["WEO2023"] + + desktop: inherits: 2022Q4 data_prep_outputs_path: "./outputs" From de2bd6000f417d349a05da041fe2692b765b5105 Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Thu, 15 Feb 2024 11:17:44 +0100 Subject: [PATCH 02/39] add `factset_industry_map_bridge_filename` --- config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/config.yml b/config.yml index d6b32e6..e1d7121 100644 --- a/config.yml +++ b/config.yml @@ -124,6 +124,7 @@ default: factset_isin_to_fund_table_filename: "" factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" + factset_industry_map_bridge_filename: "" imf_quarter_timestamp: "2023-Q4" pacta_financial_timestamp: "2023Q4" market_share_target_reference_year: 2023 From 58e365c5e3569e9d9190542fd082f1158683bde4 Mon Sep 17 00:00:00 2001 
From: CJ Yetman Date: Thu, 15 Feb 2024 20:33:22 +0100 Subject: [PATCH 03/39] add `factset_manual_pacta_sector_override` --- config.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/config.yml b/config.yml index da9318a..5b5a353 100644 --- a/config.yml +++ b/config.yml @@ -13,6 +13,7 @@ default: factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" factset_industry_map_bridge_filename: "" + factset_manual_pacta_sector_override: "" update_currencies: TRUE export_sqlite_files: TRUE imf_quarter_timestamp: "2021-Q4" @@ -43,6 +44,7 @@ default: factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" factset_industry_map_bridge_filename: "" + factset_manual_pacta_sector_override: "" imf_quarter_timestamp: "2021-Q4" pacta_financial_timestamp: "2021Q4" market_share_target_reference_year: 2021 @@ -83,6 +85,7 @@ default: factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" factset_industry_map_bridge_filename: "" + factset_manual_pacta_sector_override: "" imf_quarter_timestamp: "2022-Q2" pacta_financial_timestamp: "2022Q2" market_share_target_reference_year: 2022 @@ -110,6 +113,7 @@ default: factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_iss_emissions.rds" factset_issue_code_bridge_filename: "test-from-fds-test-20240207-03-postgres_factset_issue_code_bridge.rds" factset_industry_map_bridge_filename: "timestamp-20230123T000000Z_pulled-20000101T000001_factset_industry_map_bridge.rds" + factset_manual_pacta_sector_override: "" imf_quarter_timestamp: "2022-Q4" pacta_financial_timestamp: "2022Q4" market_share_target_reference_year: 2022 @@ -130,6 +134,7 @@ default: factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" factset_industry_map_bridge_filename: "" + factset_manual_pacta_sector_override: "" imf_quarter_timestamp: "2023-Q4" pacta_financial_timestamp: "2023Q4" market_share_target_reference_year: 2023 From 
5a2ec24a858a40ca9b4f2049e56dde34413e7fd9 Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Thu, 15 Feb 2024 20:35:50 +0100 Subject: [PATCH 04/39] add `_filename` suffix --- config.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/config.yml b/config.yml index 5b5a353..fefa772 100644 --- a/config.yml +++ b/config.yml @@ -13,7 +13,7 @@ default: factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" factset_industry_map_bridge_filename: "" - factset_manual_pacta_sector_override: "" + factset_manual_pacta_sector_override_filename: "" update_currencies: TRUE export_sqlite_files: TRUE imf_quarter_timestamp: "2021-Q4" @@ -44,7 +44,7 @@ default: factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" factset_industry_map_bridge_filename: "" - factset_manual_pacta_sector_override: "" + factset_manual_pacta_sector_override_filename: "" imf_quarter_timestamp: "2021-Q4" pacta_financial_timestamp: "2021Q4" market_share_target_reference_year: 2021 @@ -85,7 +85,7 @@ default: factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" factset_industry_map_bridge_filename: "" - factset_manual_pacta_sector_override: "" + factset_manual_pacta_sector_override_filename: "" imf_quarter_timestamp: "2022-Q2" pacta_financial_timestamp: "2022Q2" market_share_target_reference_year: 2022 @@ -113,7 +113,7 @@ default: factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_iss_emissions.rds" factset_issue_code_bridge_filename: "test-from-fds-test-20240207-03-postgres_factset_issue_code_bridge.rds" factset_industry_map_bridge_filename: "timestamp-20230123T000000Z_pulled-20000101T000001_factset_industry_map_bridge.rds" - factset_manual_pacta_sector_override: "" + factset_manual_pacta_sector_override_filename: "" imf_quarter_timestamp: "2022-Q4" pacta_financial_timestamp: "2022Q4" market_share_target_reference_year: 2022 @@ -134,7 +134,7 @@ default: 
factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" factset_industry_map_bridge_filename: "" - factset_manual_pacta_sector_override: "" + factset_manual_pacta_sector_override_filename: "" imf_quarter_timestamp: "2023-Q4" pacta_financial_timestamp: "2023Q4" market_share_target_reference_year: 2023 From a96f286fce743c0f651cf106d8cc3f97cd8496e3 Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Thu, 15 Feb 2024 21:19:55 +0100 Subject: [PATCH 05/39] add AI dataset filenames --- config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config.yml b/config.yml index fefa772..31eb699 100644 --- a/config.yml +++ b/config.yml @@ -123,9 +123,9 @@ default: 2023Q4: - masterdata_ownership_filename: "" - masterdata_debt_filename: "" - ar_company_id__factset_entity_id_filename: "" + masterdata_ownership_filename: "2024-02-14_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2023Q4.csv" + masterdata_debt_filename: "2024-02-14_AI_RMI Bespoke_Company Data Products_masterdata_debt_2023Q4.csv" + ar_company_id__factset_entity_id_filename: "2024-02-14_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2023Q4.csv" factset_financial_data_filename: "" factset_entity_info_filename: "" factset_entity_financing_data_filename: "" From 6e34fde39c55ac41b9d2fe88dceeebdc88884fd6 Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Thu, 15 Feb 2024 21:38:29 +0100 Subject: [PATCH 06/39] change default "desktop" config to use 2023Q4 --- config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.yml b/config.yml index 31eb699..ab4cdfa 100644 --- a/config.yml +++ b/config.yml @@ -144,7 +144,7 @@ default: desktop: - inherits: 2022Q4 + inherits: 2023Q4 data_prep_outputs_path: "./outputs" asset_impact_data_path: "./ai_inputs" factset_data_path: "./factset_inputs" From 965c2da51e25a3305e131e5ccfa9000dd01508ba Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Fri, 16 Feb 2024 07:50:21 +0100 Subject: [PATCH 07/39] Update from main --- 
config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.yml b/config.yml index ab4cdfa..9fdfb1e 100644 --- a/config.yml +++ b/config.yml @@ -113,7 +113,7 @@ default: factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_iss_emissions.rds" factset_issue_code_bridge_filename: "test-from-fds-test-20240207-03-postgres_factset_issue_code_bridge.rds" factset_industry_map_bridge_filename: "timestamp-20230123T000000Z_pulled-20000101T000001_factset_industry_map_bridge.rds" - factset_manual_pacta_sector_override_filename: "" + factset_manual_pacta_sector_override_filename: "timestamp-20230123T000000Z_pulled-20000101T000002_factset_manual_sector_override.rds" imf_quarter_timestamp: "2022-Q4" pacta_financial_timestamp: "2022Q4" market_share_target_reference_year: 2022 From 00c26bc10f53f1f6790e9e1649564aa7ef15f7ab Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Sat, 17 Feb 2024 19:27:39 +0100 Subject: [PATCH 08/39] add more parameters to review --- config.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/config.yml b/config.yml index 9fdfb1e..b82d30e 100644 --- a/config.yml +++ b/config.yml @@ -141,6 +141,13 @@ default: scenario_sources_list: ["GECO2023", "ISF2023", "WEO2023"] scenario_raw_data_to_include: ["geco_2023", "isf_2023", "weo_2023"] global_aggregate_scenario_sources_list: ["WEO2023"] + sector_list: [] + other_sector_list: [] + zero_emission_factor_techs: [] + green_techs: [] + tech_exclude: [] + scenario_geographies_list: [] + global_aggregate_sector_list: [] desktop: From 58d78fa148578394067bcd17f4208495eafdc053 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sat, 17 Feb 2024 19:39:06 +0100 Subject: [PATCH 09/39] docs(deploy): Define prerequisites Define the prerequisite steps prior to running data prep --- README.md | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ac5d44a..561b699 100644 --- a/README.md +++ 
b/README.md @@ -1,6 +1,6 @@ # workflow.data.preparation -`workflow.data.preparation` orchestrates the PACTA data preparation process, combining production, financial, scenario, and currency data into a format suitable for use in a PACTA for investors analysis. Assuming that the computing resource being used has sufficient memory (which can be >16gb depending on the inputs), storage space, and access to the necessary inputs, this is intended to work on a desktop or laptop using RStudio or run using the included [Dockerfile](https://github.com/RMI-PACTA/workflow.data.preparation/blob/main/Dockerfile) and [docker-compose.yml](https://github.com/RMI-PACTA/workflow.data.preparation/blob/main/docker-compose.yml). +`workflow.data.preparation` orchestrates the PACTA data preparation process, combining production, financial, scenario, and currency data into a format suitable for use in a PACTA for investors analysis. Assuming that the computing resource being used has sufficient memory (which can be >16Gb depending on the inputs), storage space, and access to the necessary inputs, this is intended to work on a desktop or laptop using RStudio or run using the included [Dockerfile](https://github.com/RMI-PACTA/workflow.data.preparation/blob/main/Dockerfile) and [docker-compose.yml](https://github.com/RMI-PACTA/workflow.data.preparation/blob/main/docker-compose.yml). ## Running in RStudio @@ -12,7 +12,7 @@ Running workflow.data.preparation has a number of R package dependencies that ar To make things easier, the recommended way to specify the desired config set when running locally in RStudio is by setting the active config set to `desktop` and modifying/adding only a few of the properties in the `desktop` config set. By doing so, you benefit from inheriting many of the appropriate configuration values without having to explicitly specify each one. -You will need to set the `inherits` parameter, e.g. 
`inherits: 2022Q4`, to select which of the config sets specified in the config.yml file that is desired. +You will need to set the `inherits` parameter, e.g. `inherits: 2022Q4`, to select which of the config sets specified in the config.yml file that is desired. You will need to set `data_prep_outputs_path` to an *existing* directory where you want the outputs to be saved, e.g. `data_prep_outputs_path: "./outputs"` to point to an existing directory named `outputs` in the working directory of the R session you will be running data.prep in. This directory must exist before running data.prep (and ideally be empty). The script will throw an error early on if it does not exist. @@ -57,6 +57,26 @@ Run `docker-compose up` from the root directory, and docker will build the image Use `docker-compose build --no-cache` to force a rebuild of the Docker image. +## Running Data Preparation interactively on Azure VM + +*Instructions specific to the RMI-PACTA team's Azure instance are in Italics.* + +0. **Prerequisites:** + - Set up Storage Accounts containing the [required files](#required-input-files). + While all the files can exist on a single file share, in a single storage account, the workflow can access different storage accounts, to allow for read-only access to raw data, to prevent accidental manipulation of source data. + The recommended structure (used by RMI) is: + - Storage Account: `pactadatadev`: (read/write) *RMI QAs datasets prior to moving them to PROD with[ `workflow.pacta.data.qa`](https://github.com/RMI-PACTA/workflow.pacta.data.qa)* + - File Share `workflow-data-preparation-outputs`: Outputs from this workflow. + - Storage Account: `pactarawdata` (read-only) + - File Share `factset-extracted`: Outputs from [`workflow.factset`](https://github.com/RMI-PACTA/workflow.factset) + - File Share `AssetImpact` Raw data files from [Asset Impact](https://asset-impact.gresb.com/) + - (Optional, but recommended) Create a User Assigned Managed Identity.
+ Alternately, after creating the VM with a system-managed identity, you can assign all appropriate permissions. + * **RMI:** The `workflow-data-preparation` Identity exists with all the appropriate permissions.* + - Grant Appropriate permissions to the Identity: + - `pactadatadev`: "Storage File Data SMB Share Contributor" + - `pactarawdata`: "Storage File Data SMB Share Reader" + ## Required Input Files All required files must exist at `$HOST_INPUTS_PATH`, in a single directory (no subdirectories). From 032459e05f91b1327793f15e3a3e9dab8c5d702f Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sat, 17 Feb 2024 20:55:37 +0100 Subject: [PATCH 10/39] docs(deploy): Instructions up through connecting Everything works up through creating and connecting to VM --- README.md | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 561b699..641e340 100644 --- a/README.md +++ b/README.md @@ -62,21 +62,86 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. *Instructions specific to the RMI-PACTA team's Azure instance are in Italics.* 0. **Prerequisites:** + *These steps have been completed on the RMI Azure instance.* + - Ensure a Virtual Network with a Gateway has been set up, permitting SSH (Port 22) access. + Details of setting this up are out of scope for these instructions. + Talk to your network coordinator for help. - Set up Storage Accounts containing the [required files](#required-input-files). While all the files can exist on a single file share, in a single storage account, the workflow can access different storage accounts, to allow for read-only access to raw data, to prevent accident manipulation of source data. 
- The recommended structure (used by RMI) is: - - Storage Account: `pactadatadev`: (read/write) *RMI QAs datasets prior to moving them to PROD with[ `workflow.pacta.data.qa`](https://github.com/RMI-PACTA/workflow.pacta.data.qa)* + The recommended structure (*used by RMI*) is: + - Storage Account: `pactadatadev`: (read/write). + Naming note: *RMI QAs datasets prior to moving them to PROD with [`workflow.pacta.data.qa`](https://github.com/RMI-PACTA/workflow.pacta.data.qa)*. - File Share `workflow-data-preparation-outputs`: Outputs from this workflow. - Storage Account: `pactarawdata` (read-only) - File Share `factset-extracted`: Outputs from [`workflow.factset`](https://github.com/RMI-PACTA/workflow.factset) - File Share `AssetImpact` Raw data files from [Asset Impact](https://asset-impact.gresb.com/) - (Optional, but recommended) Create a User Assigned Managed Identity. - Alternately, after creating the VM with a system-managed identity, you can assign all appropriate permissions. - * **RMI:** The `workflow-data-preparation` Identity exists with all the appropriate permissions.* + Alternately, after creating the VM with a system-managed identity, you can assign all appropriate permissions. ***RMI:** The `workflow-data-preparation` Identity exists with all the appropriate permissions.* - Grant Appropriate permissions to the Identity: - `pactadatadev`: "Storage File Data SMB Share Contributor" - `pactarawdata`: "Storage File Data SMB Share Reader" +1. Start a VM. + While the machine can be deployed via the Portal (WebUI), for simplicity, the following code block is provided which ensures consistency: + + ```sh + # The options here work with the RMI-PACTA team's Azure setup. + # Change values for your own instance as needed. + + # Get Network details. 
+ VNET_RESOURCE_GROUP="RMI-PROD-EU-VNET-RG" + VNET_NAME="RMI-PROD-EU-VNET" + SUBNET_NAME="RMI-SP-PACTA-DEV-VNET" + SUBNET_ID=$(az network vnet subnet show --resource-group $VNET_RESOURCE_GROUP --name $SUBNET_NAME --vnet-name $VNET_NAME --query id -o tsv) + + # Use the identity previously setup (see Prerequisites) + MACHINEIDENTITY="/subscriptions/feef729b-4584-44af-a0f9-4827075512f9/resourceGroups/RMI-SP-PACTA-PROD/providers/Microsoft.ManagedIdentity/userAssignedIdentities/workflow-data-preparation" + # This size has 2 vCPU, and 32GiB memory, recommended settings. + MACHINE_SIZE="Standard_E4-2as_v4" + # Using epoch to give machine a (probably) unique name + MACHINE_NAME="dataprep-runner-$(date +%s)" + # NOTE: Change this to your own RG as needed. + VM_RESOURCE_GROUP="RMI-SP-PACTA-DEV" + + # **NOTE: Check these options prior to running** + # Non-RMI users may choose to omit the --public-ip-address line for public SSH Access. + + az vm create \ + --admin-username azureuser \ + --assign-identity "$MACHINEIDENTITY" \ + --generate-ssh-keys \ + --image Ubuntu2204 \ + --name "$MACHINE_NAME" \ + --nic-delete-option delete \ + --os-disk-delete-option delete \ + --public-ip-address "" \ + --resource-group "$VM_RESOURCE_GROUP" \ + --size "$MACHINE_SIZE" \ + --subnet "$SUBNETID" + + ``` + + If this command successfully runs, it will output a JSON block describing the resource (VM) created. + +2. **Connect to the Network.** (Optional) + ***RMI:** Connecting to the VPN will enable SSH access.* + Connect to the Virtual Network specified above, as the command above does not create a Public IP Address. + Details for this are out of scope for these instructions. + Contact your network coordinator for help. + +2. Connect to the newly created VM via SSH. + + ```sh + This connects to the VM created above via SSH.
+ + az ssh vm \ + --local-user azureuser \ + --name "$MACHINE_NAME" \ + --prefer-private-ip \ + --resource-group "$VM_RESOURCE_GROUP" + + ``` + ## Required Input Files All required files must exist at `$HOST_INPUTS_PATH`, in a single directory (no subdirectories). From 080ad61552d766a5b2d7b886a11c5b24d7a40a4f Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sat, 17 Feb 2024 21:19:05 +0100 Subject: [PATCH 11/39] feat(deploy): Add mount_afs script Add a helper script to mount Azure File Shares --- scripts/mount_afs.sh | 107 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100755 scripts/mount_afs.sh diff --git a/scripts/mount_afs.sh b/scripts/mount_afs.sh new file mode 100755 index 0000000..350388d --- /dev/null +++ b/scripts/mount_afs.sh @@ -0,0 +1,107 @@ +#! /bin/sh + +# mount an Azure File Share at a given location. +# Requires az cli to be installed and logged in. + +usage() { + echo "Usage: mount_afs.sh [-h] [-v] -r -a -f -m " + echo " -h: help (this message)" + echo " -v: verbose" + echo " -r: resource group (Required)" + echo " -a: storage account name (Required)" + echo " -f: file share name (Required)" + echo " -m: mount point (Required)" + echo " -?: help" + exit 1 +} + +while getopts "h?vr:a:f:m:" opt; do + case "$opt" in + h|\?) 
+ usage + ;; + v) VERBOSE=1 + ;; + r) RESOURCEGROUP=$OPTARG + ;; + a) STORAGEACCOUNTNAME=$OPTARG + ;; + f) FILESHARENAME=$OPTARG + ;; + m) MOUNTPOINT=$OPTARG + ;; + *) + usage + ;; + esac +done + +missing_opts=0 +if [ -z "$RESOURCEGROUP" ]; then + echo "ERROR: Resource group is required" + missing_opts=1 +fi + +if [ -z "$STORAGEACCOUNTNAME" ]; then + echo "ERROR: Storage account name is required" + missing_opts=1 +fi + +if [ -z "$FILESHARENAME" ]; then + echo "ERROR: File share name is required" + missing_opts=1 +fi + +if [ -z "$MOUNTPOINT" ]; then + echo "ERROR: Mount point is required" + missing_opts=1 +fi + +if [ $missing_opts -eq 1 ]; then + usage +fi + +if [ -n "$VERBOSE" ]; then + echo "RESOURCEGROUP: $RESOURCEGROUP" + echo "STORAGEACCOUNTNAME: $STORAGEACCOUNTNAME" + echo "FILESHARENAME: $FILESHARENAME" + echo "MOUNTPOINT: $MOUNTPOINT" +fi + +# This command assumes you have logged in with az login + +if [ -n "$VERBOSE" ]; then + echo "Getting https endpoint for storage account $STORAGEACCOUNTNAME" +fi + +httpEndpoint=$(az storage account show \ + --resource-group "$RESOURCEGROUP" \ + --name "$STORAGEACCOUNTNAME" \ + --query "primaryEndpoints.file" --output tsv | tr -d '"') +smbPath=$(echo "$httpEndpoint" | cut -c7-${#httpEndpoint})$FILESHARENAME +fileHost=$(echo "$httpEndpoint" | cut -c7-${#httpEndpoint}| tr -d "/") +nc -zvw3 "$fileHost" 445 + +if [ -n "$VERBOSE" ]; then + echo "httpEndpoint: $httpEndpoint" + echo "smbPath: $smbPath" + echo "fileHost: $fileHost" +fi + +if [ -n "$VERBOSE" ]; then + echo "Getting storage account key" +fi +storageAccountKey=$(az storage account keys list \ + --resource-group "$RESOURCEGROUP" \ + --account-name "$STORAGEACCOUNTNAME" \ + --query "[0].value" --output tsv | tr -d '"') + +if [ -n "$VERBOSE" ]; then + echo "Creating mount path: $MOUNTPOINT" +fi +sudo mkdir -p "$MOUNTPOINT" + +if [ -n "$VERBOSE" ]; then + echo "Mounting $smbPath to $MOUNTPOINT" +fi +sudo mount -t cifs "$smbPath" "$MOUNTPOINT" -o 
username="$STORAGEACCOUNTNAME",password="$storageAccountKey",serverino,nosharesock,actimeo=30,file_mode=0777,nobrl,dir_mode=0777,vers=3.1.1 From 222375e6087a9608880258f662513351d01016e6 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 00:11:13 +0100 Subject: [PATCH 12/39] fix(deploy): Update script to default to read-only Mounting an Azure File Share to a Linux OS via SMB defaults to read/write access. This change makes the mount script default to read-only file permissions (`0555`). Update docs accordingly. --- README.md | 36 +++++++++++++++++++++++++++++++----- scripts/mount_afs.sh | 14 ++++++++++++-- 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 641e340..8859125 100644 --- a/README.md +++ b/README.md @@ -78,10 +78,12 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. - (Optional, but recommended) Create a User Assigned Managed Identity. Alternately, after creating the VM with a system-managed identity, you can assign all appropriate permissions. ***RMI:** The `workflow-data-preparation` Identity exists with all the appropriate permissions.* - Grant Appropriate permissions to the Identity: - - `pactadatadev`: "Storage File Data SMB Share Contributor" - - `pactarawdata`: "Storage File Data SMB Share Reader" + - `pactadatadev`: "Reader and Data Access". + - `pactarawdata`: "Reader and Data Access" + Note that this gives read/write access to the Storage Account via the Storage Account Key. + To grant read-only access to the VM, use the `mount_afs` script without the `-w` flag, as shown below. -1. Start a VM. +1. **Start a VM** While the machine can be deployed via the Portal (WebUI), for simplicity, the following code block is provided which ensures consistency: ```sh @@ -123,13 +125,13 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. If this command successfully runs, it will output a JSON block describing the resource (VM) created. -2.
**Connect to the Network.** (Optional) +2. **Connect to the Network.** (Optional) ***RMI:** Connecting to the VPN will enable SSH access.* Connect to the Virtual Network specified above, as the command above does not create a Public IP Address. Details for this are out of scope for these instructions. Contact your network coordinator for help. -2. Connect to the newly created VM via SSH. +3. **Connect to the newly created VM via SSH.** ```sh This connects to the VM created above via SSH. @@ -142,6 +144,30 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. ``` +4. **Connect the VM to required resources** + Clone this repo, install the `az` cli utility, and mount the appropriate Azure File Shares. + + ```sh + # Clone this repo through https to avoid need for an SSH key + git clone https://github.com/RMI-PACTA/workflow.data.preparation.git ~/workflow.data.preparation + + # Install az cli + sudo apt update + # See https://aka.ms/installcli for alternate instructions + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + + # Login to azure with assigned identity + az login --identity + + # Use script from this repo to connect to file shares + ~/workflow.data.preparation/scripts/mount_afs.sh -r "RMI-SP-PACTA-PROD" -a "pactarawdata" -f "factset-extracted" -m "/mnt/factset-extracted" + ~/workflow.data.preparation/scripts/mount_afs.sh -r "RMI-SP-PACTA-PROD" -a "pactarawdata" -f "asset-impact" -m "/mnt/asset-impact" + + # Note the outputs directory has the -w flag, meaning write permissions are enabled. + ~/workflow.data.preparation/scripts/mount_afs.sh -r "RMI-SP-PACTA-DEV" -a "pactadatadev" -f "workflow-data-preparation-outputs" -m "/mnt/workflow-data-preparation-outputs" -w + + ``` + ## Required Input Files All required files must exist at `$HOST_INPUTS_PATH`, in a single directory (no subdirectories).
diff --git a/scripts/mount_afs.sh b/scripts/mount_afs.sh index 350388d..d0f58c9 100755 --- a/scripts/mount_afs.sh +++ b/scripts/mount_afs.sh @@ -7,6 +7,7 @@ usage() { echo "Usage: mount_afs.sh [-h] [-v] -r -a -f -m " echo " -h: help (this message)" echo " -v: verbose" + echo " -w: Allow write access to the file share (default is read-only)" echo " -r: resource group (Required)" echo " -a: storage account name (Required)" echo " -f: file share name (Required)" @@ -15,13 +16,15 @@ usage() { exit 1 } -while getopts "h?vr:a:f:m:" opt; do +while getopts "h?vwr:a:f:m:" opt; do case "$opt" in h|\?) usage ;; v) VERBOSE=1 ;; + w) ALLOW_WRITE=1 + ;; r) RESOURCEGROUP=$OPTARG ;; a) STORAGEACCOUNTNAME=$OPTARG @@ -104,4 +107,11 @@ sudo mkdir -p "$MOUNTPOINT" if [ -n "$VERBOSE" ]; then echo "Mounting $smbPath to $MOUNTPOINT" fi -sudo mount -t cifs "$smbPath" "$MOUNTPOINT" -o username="$STORAGEACCOUNTNAME",password="$storageAccountKey",serverino,nosharesock,actimeo=30,file_mode=0777,nobrl,dir_mode=0777,vers=3.1.1 + +if [ -n "$ALLOW_WRITE" ]; then + permissions="file_mode=0777,dir_mode=0777" +else + permissions="file_mode=0555,dir_mode=0555" +fi + +sudo mount -t cifs "$smbPath" "$MOUNTPOINT" -o username="$STORAGEACCOUNTNAME",password="$storageAccountKey",serverino,nosharesock,actimeo=30,nobrl,"$permissions",vers=3.1.1 From c2e798c93c4b3e69cad869f90f8b6a7ff17eb9f2 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 02:09:15 +0100 Subject: [PATCH 13/39] feat(deploy): Use new split inputs in docker-compose --- config.yml | 2 ++ docker-compose.yml | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/config.yml b/config.yml index 0680490..b342d56 100644 --- a/config.yml +++ b/config.yml @@ -102,6 +102,8 @@ default: 2022Q4: + asset_impact_data_path: "/mnt/factset-extracted" + factset_data_path: "/mnt/asset-impact" masterdata_ownership_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" masterdata_debt_filename: 
"2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" ar_company_id__factset_entity_id_filename: "2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" diff --git a/docker-compose.yml b/docker-compose.yml index e8baf92..f0b3f7d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,8 +6,11 @@ services: context: . volumes: - type: bind - source: ${HOST_INPUTS_PATH} - target: /inputs + source: ${HOST_FACTSET_EXTRACTED_PATH} + target: /mnt/factset-extracted + - type: bind + source: ${HOST_ASSET_IMPACT_PATH} + target: /asset-impact - type: bind source: ${HOST_OUTPUTS_PATH} target: /outputs From 4f42a33aae6e3b8afcc019f6db45bdb34d707670 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 04:17:25 +0100 Subject: [PATCH 14/39] feat(deploy): Change AI File paths Reflect actual Azure FIles structure --- config.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/config.yml b/config.yml index b342d56..9af590a 100644 --- a/config.yml +++ b/config.yml @@ -102,11 +102,11 @@ default: 2022Q4: - asset_impact_data_path: "/mnt/factset-extracted" - factset_data_path: "/mnt/asset-impact" - masterdata_ownership_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" - masterdata_debt_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" - ar_company_id__factset_entity_id_filename: "2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" + asset_impact_data_path: "/mnt/asset-impact" + factset_data_path: "/mnt/factset-extracted" + masterdata_ownership_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" + masterdata_debt_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" + ar_company_id__factset_entity_id_filename: "2023-02-15/2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" 
factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_financial_data.rds" factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_info.rds" factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_financing_data.rds" From 13d2d3c39df951a63a6716451e2487832b67e9cb Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 04:22:40 +0100 Subject: [PATCH 15/39] feat(deploy): Update Factset file paths for 2022Q4 --- config.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/config.yml b/config.yml index 9af590a..00d3e01 100644 --- a/config.yml +++ b/config.yml @@ -103,19 +103,19 @@ default: 2022Q4: asset_impact_data_path: "/mnt/asset-impact" - factset_data_path: "/mnt/factset-extracted" + factset_data_path: "/mnt/factset-extracted/factset-pacta_timestamp-20221231T000000Z_pulled-20240217T134528Z" masterdata_ownership_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" masterdata_debt_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" ar_company_id__factset_entity_id_filename: "2023-02-15/2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" - factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_financial_data.rds" - factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_info.rds" - factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_financing_data.rds" - factset_fund_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_fund_data.rds" - factset_isin_to_fund_table_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_isin_to_fund_table.rds" - factset_iss_emissions_data_filename: 
"timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_iss_emissions.rds" - factset_issue_code_bridge_filename: "test-from-fds-test-20240207-03-postgres_factset_issue_code_bridge.rds" - factset_industry_map_bridge_filename: "timestamp-20230123T000000Z_pulled-20000101T000001_factset_industry_map_bridge.rds" - factset_manual_pacta_sector_override_filename: "timestamp-20230123T000000Z_pulled-20000101T000002_factset_manual_sector_override.rds" + factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_financial_data.rds" + factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_info.rds" + factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_financing_data.rds" + factset_fund_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_fund_data.rds" + factset_isin_to_fund_table_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_isin_to_fund_table.rds" + factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_iss_emissions.rds" + factset_issue_code_bridge_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_issue_code_bridge.rds" + factset_industry_map_bridge_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_industry_map_bridge.rds" + factset_manual_pacta_sector_override_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_manual_sector_override.rds" imf_quarter_timestamp: "2022-Q4" pacta_financial_timestamp: "2022Q4" market_share_target_reference_year: 2022 From dfc4825c4e8a66c11f265ca1d3b4aba43293e6f5 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 04:23:37 +0100 Subject: [PATCH 16/39] ci(deploy): Add verbose logging for remote environment --- docker-compose.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index f0b3f7d..a25de22 100644 --- 
a/docker-compose.yml +++ b/docker-compose.yml @@ -4,6 +4,8 @@ services: data_prep: build: context: . + environment: + - LOG_LEVEL=TRACE volumes: - type: bind source: ${HOST_FACTSET_EXTRACTED_PATH} From 12499023a5f408622af6a9d0d473b8b4c56c82ac Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 04:25:14 +0100 Subject: [PATCH 17/39] docs(deploy): Update README instructions --- README.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8859125..f666945 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,8 @@ Running the workflow requires a file `.env` to exist in the root directory, that ```sh # .env -HOST_INPUTS_PATH=/PATH/TO/inputs +HOST_FACTSET_EXTRACTED_PATH=/PATH/TO/factset-extracted +HOST_ASSET_IMPACT_PATH=/PATH/TO/asset-impact HOST_OUTPUTS_PATH=/PATH/TO/YYYYQQ_pacta_analysis_inputs_YYYY-MM-DD/YYYYQQ GITHUB_PAT=ghp_XXXXxxXxXXXxXxxX R_CONFIG_ACTIVE=YYYYQQ @@ -119,7 +120,7 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. --public-ip-address "" \ --resource-group "$VM_RESOURCE_GROUP" \ --size "$MACHINE_SIZE" \ - --subnet "$SUBNETID" + --subnet "$SUBNET_ID" ``` @@ -134,7 +135,8 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. 3. **Connect to the newly created VM via SSH.** ```sh - This connects to the VM created above via SSH. + # This connects to the VM created above via SSH. + # See above block for envvars referenced here. az ssh vm \ --local-user azureuser \ @@ -168,6 +170,49 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. ``` +5. **Install Docker** + + ```sh + # install docker + sudo apt -y install \ + docker-compose \ + docker.io + + # Allow azureuser to run docker without sudo + sudo usermod -aG docker azureuser + ``` + + At this point, you need to log out of the shell to reevaluate group memberships (add the `docker` group to `azureuser`). 
+ You can log back in with the `az ssh` command from step 3. + When you are back into the shell, you can run `docker run --rm hello-world` to confirm that docker is working correctly, and you are able to run as a non-root user. + +6. **Prepare `.env` file** + The `ubuntu2204` image used for the VM includes both `vim` and `nano`. + Create a `.env` file in the `workflow.data.preparation` directory, according to the instructions in the [running locally](running-locally-with-docker-compose) section of this file. + +7. **Build Docker image** + The cloned git repo in the home directory, and mounted directories should sill be in place after logging in again. + Additionally, `azureuser` should be part of the `docker` group. + you can confirm this with + + ```sh + groups + ls ~ + ls /mnt + ``` + + With that in place, you are ready to build the `workflow.data.preparation` docker image. + + ```sh + # navigate to the workflow.data.preparation repo + cd ~/workflow.data.preparation + + docker-compose build + + docker-compose up + + ``` + ## Required Input Files All required files must exist at `$HOST_INPUTS_PATH`, in a single directory (no subdirectories). 
From e730ab70870090fff7ea7e5e3981307ef65af0c8 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 04:41:45 +0100 Subject: [PATCH 18/39] fix(deploy): fix path in docker volume mount --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index a25de22..b0fc14f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,7 +12,7 @@ services: target: /mnt/factset-extracted - type: bind source: ${HOST_ASSET_IMPACT_PATH} - target: /asset-impact + target: /mnt/asset-impact - type: bind source: ${HOST_OUTPUTS_PATH} target: /outputs From 90b32936c1bcf07eea6cef091c270d7cf611e1b8 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 04:46:15 +0100 Subject: [PATCH 19/39] feat(deploy): make docker-compose mounts read-only --- docker-compose.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index b0fc14f..4e5deb5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -10,9 +10,11 @@ services: - type: bind source: ${HOST_FACTSET_EXTRACTED_PATH} target: /mnt/factset-extracted + read_only: true - type: bind source: ${HOST_ASSET_IMPACT_PATH} target: /mnt/asset-impact + read_only: true - type: bind source: ${HOST_OUTPUTS_PATH} target: /outputs From fa02e0139e0796ff33f5246727abf45598484a38 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 04:47:29 +0100 Subject: [PATCH 20/39] docs(deploy): update Readme --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f666945..834e447 100644 --- a/README.md +++ b/README.md @@ -193,7 +193,7 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. 7. **Build Docker image** The cloned git repo in the home directory, and mounted directories should sill be in place after logging in again. Additionally, `azureuser` should be part of the `docker` group. 
- you can confirm this with + you can confirm this with: ```sh groups @@ -202,11 +202,14 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. ``` With that in place, you are ready to build the `workflow.data.preparation` docker image. + **To ensure that a dropped network connection does not kill the process, you should run this in `tmux`.** ```sh # navigate to the workflow.data.preparation repo cd ~/workflow.data.preparation + tmux + docker-compose build docker-compose up From 3020e58357433cc9ff83c16629b39986c1821b8a Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 11:35:41 +0100 Subject: [PATCH 21/39] Add current working config for 2022q4 --- config.yml | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/config.yml b/config.yml index 0680490..00d3e01 100644 --- a/config.yml +++ b/config.yml @@ -102,18 +102,20 @@ default: 2022Q4: - masterdata_ownership_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" - masterdata_debt_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" - ar_company_id__factset_entity_id_filename: "2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" - factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_financial_data.rds" - factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_info.rds" - factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_financing_data.rds" - factset_fund_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_fund_data.rds" - factset_isin_to_fund_table_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_isin_to_fund_table.rds" - factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_iss_emissions.rds" - 
factset_issue_code_bridge_filename: "test-from-fds-test-20240207-03-postgres_factset_issue_code_bridge.rds" - factset_industry_map_bridge_filename: "timestamp-20230123T000000Z_pulled-20000101T000001_factset_industry_map_bridge.rds" - factset_manual_pacta_sector_override_filename: "timestamp-20230123T000000Z_pulled-20000101T000002_factset_manual_sector_override.rds" + asset_impact_data_path: "/mnt/asset-impact" + factset_data_path: "/mnt/factset-extracted/factset-pacta_timestamp-20221231T000000Z_pulled-20240217T134528Z" + masterdata_ownership_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" + masterdata_debt_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" + ar_company_id__factset_entity_id_filename: "2023-02-15/2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" + factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_financial_data.rds" + factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_info.rds" + factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_financing_data.rds" + factset_fund_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_fund_data.rds" + factset_isin_to_fund_table_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_isin_to_fund_table.rds" + factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_iss_emissions.rds" + factset_issue_code_bridge_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_issue_code_bridge.rds" + factset_industry_map_bridge_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_industry_map_bridge.rds" + factset_manual_pacta_sector_override_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_manual_sector_override.rds" 
imf_quarter_timestamp: "2022-Q4" pacta_financial_timestamp: "2022Q4" market_share_target_reference_year: 2022 From 665a67698c08700cca67637e2d087be641f9e403 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 11:37:34 +0100 Subject: [PATCH 22/39] return config to `main` don't touch config in this PR. --- config.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/config.yml b/config.yml index 00d3e01..0680490 100644 --- a/config.yml +++ b/config.yml @@ -102,20 +102,18 @@ default: 2022Q4: - asset_impact_data_path: "/mnt/asset-impact" - factset_data_path: "/mnt/factset-extracted/factset-pacta_timestamp-20221231T000000Z_pulled-20240217T134528Z" - masterdata_ownership_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" - masterdata_debt_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" - ar_company_id__factset_entity_id_filename: "2023-02-15/2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" - factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_financial_data.rds" - factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_info.rds" - factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_financing_data.rds" - factset_fund_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_fund_data.rds" - factset_isin_to_fund_table_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_isin_to_fund_table.rds" - factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_iss_emissions.rds" - factset_issue_code_bridge_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_issue_code_bridge.rds" - factset_industry_map_bridge_filename: 
"timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_industry_map_bridge.rds" - factset_manual_pacta_sector_override_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_manual_sector_override.rds" + masterdata_ownership_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" + masterdata_debt_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" + ar_company_id__factset_entity_id_filename: "2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" + factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_financial_data.rds" + factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_info.rds" + factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_financing_data.rds" + factset_fund_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_fund_data.rds" + factset_isin_to_fund_table_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_isin_to_fund_table.rds" + factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_iss_emissions.rds" + factset_issue_code_bridge_filename: "test-from-fds-test-20240207-03-postgres_factset_issue_code_bridge.rds" + factset_industry_map_bridge_filename: "timestamp-20230123T000000Z_pulled-20000101T000001_factset_industry_map_bridge.rds" + factset_manual_pacta_sector_override_filename: "timestamp-20230123T000000Z_pulled-20000101T000002_factset_manual_sector_override.rds" imf_quarter_timestamp: "2022-Q4" pacta_financial_timestamp: "2022Q4" market_share_target_reference_year: 2022 From b3ddeedd758f9b8ae668446e8d349ff4b4430b53 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 11:46:57 +0100 Subject: [PATCH 23/39] Add temporary step for targeting this branch --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) 
diff --git a/README.md b/README.md index 834e447..5f0c3f1 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,12 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. # Clone this repo through https to avoid need for an SSH key git clone https://github.com/RMI-PACTA/workflow.data.preparation.git ~/workflow.data.preparation + # **Temporary Step: change to develop-vm branch + cd ~/workflow.data.preparation + git fetch + git checkout develop-vm + cd ~ + # Install az cli sudo apt update # See https://aka.ms/installcli for alternate instructions From 2f6320362f8e6fc2f90b3b502d74f28ebfa7d208 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 13:49:53 +0100 Subject: [PATCH 24/39] feat(package): #147 Update dependency update required version of `pacta.data.preparation` --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4ba0c28..ec88448 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,7 +33,7 @@ Imports: dbplyr, dplyr, logger, - pacta.data.preparation (>= 0.1.0.9002), + pacta.data.preparation (>= 0.1.0.9003), pacta.data.scraping, pacta.scenario.preparation, readr, From 88c1f7df4de014a7086c60c1fa4a93efdd2e57c9 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 14:12:27 +0100 Subject: [PATCH 25/39] hack(docker): Use dev version of pacta.data.prep Update Description to use unmerged branch --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4ba0c28..4568734 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,7 +42,7 @@ Imports: stringr, tidyr Remotes: - RMI-PACTA/pacta.data.preparation, + RMI-PACTA/pacta.data.preparation#341, RMI-PACTA/pacta.data.scraping, RMI-PACTA/pacta.scenario.preparation Depends: From 2866159941132fa29f902e0069e45bf82f70a486 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 16:29:13 +0100 Subject: [PATCH 26/39] feat(app): #147 convert checks to use 
input_filepaths Create an input_filepaths object, rather than use individual filepaths --- run_pacta_data_preparation.R | 60 ++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index 42406d8..585ac2a 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -128,24 +128,40 @@ factset_timestamp <- # check that everything is ready to go ----------------------------------------- -stopifnot(file.exists(masterdata_ownership_path)) -stopifnot(file.exists(masterdata_debt_path)) -stopifnot(file.exists(ar_company_id__factset_entity_id_path)) -stopifnot(file.exists(factset_financial_data_path)) -stopifnot(file.exists(factset_entity_info_path)) -stopifnot(file.exists(factset_entity_financing_data_path)) -stopifnot(file.exists(factset_fund_data_path)) -stopifnot(file.exists(factset_isin_to_fund_table_path)) -stopifnot(file.exists(factset_iss_emissions_data_path)) -stopifnot(file.exists(factset_issue_code_bridge_path)) -stopifnot(file.exists(factset_industry_map_bridge_path)) -stopifnot(file.exists(factset_manual_pacta_sector_override_path)) -stopifnot(file.exists(data_prep_outputs_path)) +input_filepaths <- c( + masterdata_ownership_path = masterdata_ownership_path, + masterdata_debt_path = masterdata_debt_path, + ar_company_id__factset_entity_id_path = ar_company_id__factset_entity_id_path, + factset_financial_data_path = factset_financial_data_path, + factset_entity_info_path = factset_entity_info_path, + factset_entity_financing_data_path = factset_entity_financing_data_path, + factset_fund_data_path = factset_fund_data_path, + factset_isin_to_fund_table_path = factset_isin_to_fund_table_path, + factset_iss_emissions_data_path = factset_iss_emissions_data_path, + factset_issue_code_bridge_path = factset_issue_code_bridge_path, + factset_industry_map_bridge_path = factset_industry_map_bridge_path, + factset_manual_pacta_sector_override_path = 
factset_manual_pacta_sector_override_path, + data_prep_outputs_path = data_prep_outputs_path +) if (!update_currencies) { - stopifnot(file.exists(currencies_data_path)) + input_filepaths <- c( + input_filepaths, + currencies_data_path = currencies_data_path + ) } +missing_input_files <- input_filepaths[!file.exists(input_filepaths)] + +if (length(missing_input_files) > 0L) { + logger::log_error( + "Input file cannot be found: {names(missing_input_files)} ({missing_input_files})." + ) + stop( + "Input files are missing: ", + toString(missing_input_files) + ) +} # pre-flight ------------------------------------------------------------------- @@ -156,6 +172,10 @@ if (update_currencies) { currencies <- pacta.data.scraping::get_currency_exchange_rates( quarter = imf_quarter_timestamp ) + input_filepaths <- c( + input_filepaths, + currencies_data_path = currencies_data_path + ) } logger::log_info("Scraping index regions.") @@ -862,12 +882,18 @@ parameters <- package_news = package_news ) +output_files <- normalizePath( + list.files( + data_prep_outputs_path, + full.names = TRUE + ) +) + pacta.data.preparation::write_manifest( path = file.path(data_prep_outputs_path, "manifest.json"), parameters = parameters, - asset_impact_data_path = asset_impact_data_path, - factset_data_path = factset_data_path, - data_prep_outputs_path = data_prep_outputs_path + input_files = input_filepaths, + output_files = output_files ) From e9f5e18aab6f0a0beae484bd49cd0e510bb42fe2 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 16:45:54 +0100 Subject: [PATCH 27/39] fix(app): #147 Move data prep outputs path out of input files Closes: #147 --- run_pacta_data_preparation.R | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index 585ac2a..7691953 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -140,8 +140,7 @@ input_filepaths <- c( 
factset_iss_emissions_data_path = factset_iss_emissions_data_path, factset_issue_code_bridge_path = factset_issue_code_bridge_path, factset_industry_map_bridge_path = factset_industry_map_bridge_path, - factset_manual_pacta_sector_override_path = factset_manual_pacta_sector_override_path, - data_prep_outputs_path = data_prep_outputs_path + factset_manual_pacta_sector_override_path = factset_manual_pacta_sector_override_path ) if (!update_currencies) { @@ -163,6 +162,16 @@ if (length(missing_input_files) > 0L) { ) } +if (dir.exists(data_prep_outputs_path)) { + logger::log_trace("data_prep_outputs_path exists: \"{data_prep_outputs_path}\".") +} else { + logger::log_warn( + "data_prep_outputs_path ({data_prep_outputs_path}) does not exist. Creating." + ) + warning("creating data_prep_outputs_path") + dir.create(data_prep_outputs_path, recursive = TRUE) +} + # pre-flight ------------------------------------------------------------------- logger::log_info("Fetching pre-flight data.") @@ -882,13 +891,16 @@ parameters <- package_news = package_news ) +logger::log_trace("Getting list of output files.") output_files <- normalizePath( list.files( data_prep_outputs_path, - full.names = TRUE + full.names = TRUE, + recursive = TRUE ) ) +logger::log_trace("Writing manifest file.") pacta.data.preparation::write_manifest( path = file.path(data_prep_outputs_path, "manifest.json"), parameters = parameters, From 32b7c231a1aa20a78baff95261d546ed8cf183df Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 16:49:51 +0100 Subject: [PATCH 28/39] feat(app): #147 Use `input_filepaths` in `parameters` --- run_pacta_data_preparation.R | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index 7691953..fc6f12b 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -846,20 +846,7 @@ parameters <- list( config_name = config_name, config = unclass(config), - 
input_filepaths = list( - masterdata_ownership_path = masterdata_ownership_path, - masterdata_debt_path = masterdata_debt_path, - ar_company_id__factset_entity_id_path = ar_company_id__factset_entity_id_path, - factset_financial_data_path = factset_financial_data_path, - factset_entity_info_path = factset_entity_info_path, - factset_entity_financing_data_path = factset_entity_financing_data_path, - factset_fund_data_path = factset_fund_data_path, - factset_isin_to_fund_table_path = factset_isin_to_fund_table_path, - factset_iss_emissions_data_path = factset_iss_emissions_data_path, - factset_issue_code_bridge_path = factset_issue_code_bridge_path, - factset_industry_map_bridge_path = factset_industry_map_bridge_path, - factset_manual_pacta_sector_override_path = factset_manual_pacta_sector_override_path - ), + input_filepaths = as.list(input_filepaths), preflight_filepaths = list( currencies_data_path = currencies_data_path ), From 2d2c72f28a071300a3dc9b680b812a0fe964df96 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 17:15:44 +0100 Subject: [PATCH 29/39] feat(app): #148 Export data from scraped files Closes: #148 --- config.yml | 2 ++ run_pacta_data_preparation.R | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/config.yml b/config.yml index 0680490..962e848 100644 --- a/config.yml +++ b/config.yml @@ -2,6 +2,7 @@ default: data_prep_outputs_path: "/outputs" asset_impact_data_path: "/inputs" factset_data_path: "/inputs" + preflight_data_path: "" masterdata_ownership_filename: "" masterdata_debt_filename: "" ar_company_id__factset_entity_id_filename: "" @@ -127,3 +128,4 @@ desktop: data_prep_outputs_path: "./outputs" asset_impact_data_path: "./ai_inputs" factset_data_path: "./factset_inputs" + preflight_data_path: "" diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index 42406d8..45f32d3 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -96,8 +96,15 @@ 
factset_manual_pacta_sector_override_path <- # pre-flight filepaths --------------------------------------------------------- +preflight_data_path <- config$preflight_data_path +if (preflight_data_path == "") { + preflight_data_path <- data_prep_outputs_path +} + +currencies_preflight_data_path <- file.path(preflight_data_path, "currencies.rds") currencies_data_path <- file.path(data_prep_outputs_path, "currencies.rds") +index_regions_preflight_data_path <- file.path(preflight_data_path, "index_regions.rds") # computed options ------------------------------------------------------------- @@ -156,10 +163,16 @@ if (update_currencies) { currencies <- pacta.data.scraping::get_currency_exchange_rates( quarter = imf_quarter_timestamp ) + saveRDS(currencies, currencies_preflight_data_path) +} else { + logger::log_info("Using pre-existing currency data.") + # This requires the preflight path to be defined in the config + currencies <- readRDS(currencies_preflight_data_path) } logger::log_info("Scraping index regions.") index_regions <- pacta.data.scraping::get_index_regions() +saveRDS(index_regions, index_regions_preflight_data_path) logger::log_info("Fetching pre-flight data done.") From 35fd7200fac09024d96b93aa568b0b9316cf3af5 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 17:20:53 +0100 Subject: [PATCH 30/39] refactor(app): #147 Move code to avoid conflict --- run_pacta_data_preparation.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index fc6f12b..76c1d52 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -178,13 +178,13 @@ logger::log_info("Fetching pre-flight data.") if (update_currencies) { logger::log_info("Fetching currency data.") - currencies <- pacta.data.scraping::get_currency_exchange_rates( - quarter = imf_quarter_timestamp - ) input_filepaths <- c( input_filepaths, currencies_data_path = currencies_data_path ) + currencies 
<- pacta.data.scraping::get_currency_exchange_rates( + quarter = imf_quarter_timestamp + ) } logger::log_info("Scraping index regions.") From 15058c59f5f8d9bd1f85c30efa86acece7425101 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 17:24:44 +0100 Subject: [PATCH 31/39] feat(app): #147 INclude preflight paths in input_filepaths adjust code to account for changes made in #150 --- run_pacta_data_preparation.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index 11caca4..c537457 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -187,7 +187,7 @@ if (update_currencies) { logger::log_info("Fetching currency data.") input_filepaths <- c( input_filepaths, - currencies_data_path = currencies_data_path + currencies_preflight_data_path = currencies_preflight_data_path ) currencies <- pacta.data.scraping::get_currency_exchange_rates( quarter = imf_quarter_timestamp @@ -200,6 +200,10 @@ if (update_currencies) { } logger::log_info("Scraping index regions.") +input_filepaths <- c( + input_filepaths, + index_regions_preflight_data_path = index_regions_preflight_data_path +) index_regions <- pacta.data.scraping::get_index_regions() saveRDS(index_regions, index_regions_preflight_data_path) From b27ca1377ac76664c8ef5ae9195ddee911ba508f Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 18:02:22 +0100 Subject: [PATCH 32/39] feat(app): #151 Put outputs into unique directory create a unique directory for outputs, and warn if it already exists Closes: #151 --- run_pacta_data_preparation.R | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index 42406d8..fb904c3 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -33,7 +33,6 @@ config <- asset_impact_data_path <- config$asset_impact_data_path factset_data_path <- 
config$factset_data_path -data_prep_outputs_path <- config$data_prep_outputs_path masterdata_ownership_filename <- config$masterdata_ownership_filename masterdata_debt_filename <- config$masterdata_debt_filename ar_company_id__factset_entity_id_filename <- config$ar_company_id__factset_entity_id_filename @@ -64,6 +63,24 @@ scenario_geographies_list <- config$scenario_geographies_list global_aggregate_scenario_sources_list <- config$global_aggregate_scenario_sources_list global_aggregate_sector_list <- config$global_aggregate_sector_list +system_timestamp <- format( + Sys.time(), + format = "%Y%m%dT%H%M%SZ", + tz = "UTC" + ) + +data_prep_outputs_path <- file.path( + config$data_prep_outputs_path, + paste(pacta_financial_timestamp, system_timestamp, sep = "_") +) + +if (dir.exists(data_prep_outputs_path)) { + logger::log_warn("POTENTIAL DATA LOSS: Output directory already exists, and files may be overwritten ({data_prep_outputs_path}).") + warning("Output directory exists. Files may be overwritten.") +} else { + logger::log_trace("Creating output directory: \"{data_prep_outputs_path}\"") + dir.create(data_prep_outputs_path, recursive = TRUE) +} # input filepaths -------------------------------------------------------------- From 27be26f44b754fbbfdb546ca28c0a43f4b727dc0 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 18:15:45 +0100 Subject: [PATCH 33/39] refactor(deploy): Harmonize docker mount points Change from /outputs to /mnt/outputs, to match structure defined in #142 and #145 No associated issue. 
--- config.yml | 2 +- docker-compose.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config.yml b/config.yml index 0680490..02511f1 100644 --- a/config.yml +++ b/config.yml @@ -1,5 +1,5 @@ default: - data_prep_outputs_path: "/outputs" + data_prep_outputs_path: "/mnt/outputs" asset_impact_data_path: "/inputs" factset_data_path: "/inputs" masterdata_ownership_filename: "" diff --git a/docker-compose.yml b/docker-compose.yml index e8baf92..2c1e990 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -10,4 +10,4 @@ services: target: /inputs - type: bind source: ${HOST_OUTPUTS_PATH} - target: /outputs + target: /mnt/outputs From d415ea44f605188b3fab7f29c3036b9199646462 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 19:57:01 +0100 Subject: [PATCH 34/39] feat(app): #152 Export archives of outputs and inputs Export a zip archive of all files specified as inputs and the contents of the output directory Closes: #152 --- run_pacta_data_preparation.R | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index fb904c3..c559baf 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -900,6 +900,36 @@ for (pkg_name in pacta_packages) { ) } +# Create archive files +logger::log_info("Exporting input and output archives.") + +logger::log_debug("Creating inputs zip file.") +inputs_zip_file_path <- paste0(data_prep_outputs_path, "_inputs.zip") +logger::log_trace("Zip file path: \"{inputs_zip_file_path}\".") +zip( + zipfile = inputs_zip_file_path, + files = unlist(parameters[["input_filepaths"]]), + extras = c( + "--junk-paths", # do not preserve paths + "--no-dir-entries", # do not include directory entries + "--quiet" # do not print progress to stdout + ) +) +logger::log_debug("Inputs archive created.") + +logger::log_debug("Creating outputs zip file.") +outputs_zip_file_path <- paste0(data_prep_outputs_path, ".zip") 
+logger::log_trace("Zip file path: \"{outputs_zip_file_path}\".") +zip( + zipfile = outputs_zip_file_path, + files = list.files(data_prep_outputs_path, full.names = TRUE, recursive = TRUE), + extras = c( + "--junk-paths", # do not preserve paths + "--no-dir-entries", # do not include directory entries + "--quiet" # do not print progress to stdout + ) +) +logger::log_debug("Outputs archive created.") # ------------------------------------------------------------------------------ From 4239eaefd079197f0afd839365e3f8b0a4db052e Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 20:07:27 +0100 Subject: [PATCH 35/39] feat(app): #147 Use explicit filepaths for archives Use explicit filepath vectors for archive export process defined in #153 --- run_pacta_data_preparation.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index 1c7ca01..80ccae8 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -921,14 +921,15 @@ output_files <- normalizePath( ) ) -logger::log_trace("Writing manifest file.") +manifest_path <- file.path(data_prep_outputs_path, "manifest.json") +logger::log_trace("Writing manifest file: \"{manifest_path}\".") pacta.data.preparation::write_manifest( - path = file.path(data_prep_outputs_path, "manifest.json"), + path = manifest_path, parameters = parameters, input_files = input_filepaths, output_files = output_files ) - +output_files <- c(output_files, manifest_path = manifest_path) # copy in NEWs.md files from relevant PACTA packages --------------------------- @@ -950,7 +951,7 @@ inputs_zip_file_path <- paste0(data_prep_outputs_path, "_inputs.zip") logger::log_trace("Zip file path: \"{inputs_zip_file_path}\".") zip( zipfile = inputs_zip_file_path, - files = unlist(parameters[["input_filepaths"]]), + files = input_filepaths, extras = c( "--junk-paths", # do not preserve paths "--no-dir-entries", # do not include directory 
entries @@ -964,7 +965,7 @@ outputs_zip_file_path <- paste0(data_prep_outputs_path, ".zip") logger::log_trace("Zip file path: \"{outputs_zip_file_path}\".") zip( zipfile = outputs_zip_file_path, - files = list.files(data_prep_outputs_path, full.names = TRUE, recursive = TRUE), + files = output_files, extras = c( "--junk-paths", # do not preserve paths "--no-dir-entries", # do not include directory entries From ee97f2649fbb47188fc64509a788ee073f93653b Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 20:16:10 +0100 Subject: [PATCH 36/39] build(docker): #138 Allow .git in build context We may want to have a more elegant solution in the future (see discussion in #138), but this works for now to include git status in the manifest export. Closes: #138 --- .dockerignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.dockerignore b/.dockerignore index f5dd321..101d8fe 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,6 +2,7 @@ * # include certain files !.env +!.git !config.yml !DESCRIPTION !LICENSE From ef49c6e77a2a51acd8dd986eaf71d257054ec8ef Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 23:55:05 +0100 Subject: [PATCH 37/39] build(deploy): #144 Draft 2023Q4 Config Establish a working draft of a 2023Q4 config that builds successfully.
Closes: #144 --- config.yml | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/config.yml b/config.yml index b82d30e..c0fbd3f 100644 --- a/config.yml +++ b/config.yml @@ -123,18 +123,20 @@ default: 2023Q4: + asset_impact_data_path: "/mnt/asset-impact/2024-02-15_AI_RMI_2023Q4" + factset_data_path: "/factset-extracted/factset-pacta_timestamp-20231231T000000Z_pulled-20240217T135833Z" masterdata_ownership_filename: "2024-02-14_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2023Q4.csv" masterdata_debt_filename: "2024-02-14_AI_RMI Bespoke_Company Data Products_masterdata_debt_2023Q4.csv" ar_company_id__factset_entity_id_filename: "2024-02-14_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2023Q4.csv" - factset_financial_data_filename: "" - factset_entity_info_filename: "" - factset_entity_financing_data_filename: "" - factset_fund_data_filename: "" - factset_isin_to_fund_table_filename: "" - factset_iss_emissions_data_filename: "" - factset_issue_code_bridge_filename: "" - factset_industry_map_bridge_filename: "" - factset_manual_pacta_sector_override_filename: "" + factset_financial_data_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_financial_data.rds" + factset_entity_info_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_entity_info.rds" + factset_entity_financing_data_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_entity_financing_data.rds" + factset_fund_data_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_fund_data.rds" + factset_isin_to_fund_table_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_isin_to_fund_table.rds" + factset_iss_emissions_data_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_iss_emissions.rds" + factset_issue_code_bridge_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_issue_code_bridge.rds" + factset_industry_map_bridge_filename: 
"timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_industry_map_bridge.rds" + factset_manual_pacta_sector_override_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_manual_sector_override.rds" imf_quarter_timestamp: "2023-Q4" pacta_financial_timestamp: "2023Q4" market_share_target_reference_year: 2023 From a490e3c3a35f4f76431e7b6693413fcd1b8ab33c Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 19 Feb 2024 00:05:03 +0100 Subject: [PATCH 38/39] build(deploy): #144 Only change data sources from 2022Q4 Only change from 2022Q4 is data sources, in hope of getting a functional build, while https://github.com/RMI-PACTA/pacta.scenario.preparation/issues/83 is in process. --- config.yml | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/config.yml b/config.yml index c0fbd3f..514db0d 100644 --- a/config.yml +++ b/config.yml @@ -140,17 +140,9 @@ default: imf_quarter_timestamp: "2023-Q4" pacta_financial_timestamp: "2023Q4" market_share_target_reference_year: 2023 - scenario_sources_list: ["GECO2023", "ISF2023", "WEO2023"] - scenario_raw_data_to_include: ["geco_2023", "isf_2023", "weo_2023"] - global_aggregate_scenario_sources_list: ["WEO2023"] - sector_list: [] - other_sector_list: [] - zero_emission_factor_techs: [] - green_techs: [] - tech_exclude: [] - scenario_geographies_list: [] - global_aggregate_sector_list: [] - + scenario_sources_list: ["GECO2022", "ISF2021", "WEO2022"] + scenario_raw_data_to_include: ["geco_2022", "isf_2021", "weo_2022"] + global_aggregate_scenario_sources_list: ["WEO2022"] desktop: inherits: 2023Q4 From 93459a639c43748082776c1c876a83711d44a9e8 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 19 Feb 2024 00:09:56 +0100 Subject: [PATCH 39/39] fix(deploy): #144 fix bad factset path --- config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.yml b/config.yml index 514db0d..1215978 100644 --- a/config.yml +++ b/config.yml @@ -124,7 +124,7 @@ default: 
2023Q4: asset_impact_data_path: "/mnt/asset-impact/2024-02-15_AI_RMI_2023Q4" - factset_data_path: "/factset-extracted/factset-pacta_timestamp-20231231T000000Z_pulled-20240217T135833Z" + factset_data_path: "/mnt/factset-extracted/factset-pacta_timestamp-20231231T000000Z_pulled-20240217T135833Z" masterdata_ownership_filename: "2024-02-14_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2023Q4.csv" masterdata_debt_filename: "2024-02-14_AI_RMI Bespoke_Company Data Products_masterdata_debt_2023Q4.csv" ar_company_id__factset_entity_id_filename: "2024-02-14_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2023Q4.csv"