From 58d78fa148578394067bcd17f4208495eafdc053 Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Sat, 17 Feb 2024 19:39:06 +0100
Subject: [PATCH 01/13] docs(deploy): Define prerequisites

Define the prerequisite steps prior to running data prep
---
 README.md | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ac5d44a..561b699 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # workflow.data.preparation
 
-`workflow.data.preparation` orchestrates the PACTA data preparation process, combining production, financial, scenario, and currency data into a format suitable for use in a PACTA for investors analysis. Assuming that the computing resource being used has sufficient memory (which can be >16gb depending on the inputs), storage space, and access to the necessary inputs, this is intended to work on a desktop or laptop using RStudio or run using the included [Dockerfile](https://github.com/RMI-PACTA/workflow.data.preparation/blob/main/Dockerfile) and [docker-compose.yml](https://github.com/RMI-PACTA/workflow.data.preparation/blob/main/docker-compose.yml).
+`workflow.data.preparation` orchestrates the PACTA data preparation process, combining production, financial, scenario, and currency data into a format suitable for use in a PACTA for investors analysis. Assuming that the computing resource being used has sufficient memory (which can be >16GB depending on the inputs), storage space, and access to the necessary inputs, this is intended to work on a desktop or laptop using RStudio or run using the included [Dockerfile](https://github.com/RMI-PACTA/workflow.data.preparation/blob/main/Dockerfile) and [docker-compose.yml](https://github.com/RMI-PACTA/workflow.data.preparation/blob/main/docker-compose.yml).
 
 ## Running in RStudio
 
@@ -12,7 +12,7 @@ Running workflow.data.preparation has a number of R package dependencies that ar
 
 To make things easier, the recommended way to specify the desired config set when running locally in RStudio is by setting the active config set to `desktop` and modifying/adding only a few of the properties in the `desktop` config set. By doing so, you benefit from inheriting many of the appropriate configuration values without having to explicitly specify each one.
 
-You will need to set the `inherits` parameter, e.g. `inherits: 2022Q4`, to select which of the config sets specified in the config.yml file that is desired.
+You will need to set the `inherits` parameter, e.g. `inherits: 2022Q4`, to select which of the config sets specified in the config.yml file is desired.
 
 You will need to set `data_prep_outputs_path` to an *existing* directory where you want the outputs to be saved, e.g. `data_prep_outputs_path: "./outputs"` to point to an existing directory named `outputs` in the working directory of the R session you will be running data.prep in. This directory must exist before running data.prep (and ideally be empty). The script will throw an error early on if it does not exist.
 
@@ -57,6 +57,26 @@ Run `docker-compose up` from the root directory, and docker will build the image
 
 Use `docker-compose build --no-cache` to force a rebuild of the Docker image.
 
+## Running Data Preparation interactively on Azure VM
+
+*Instructions specific to the RMI-PACTA team's Azure instance are in Italics.*
+
+0. **Prerequisites:**
+   - Set up Storage Accounts containing the [required files](#required-input-files).
+     While all the files can exist on a single file share, in a single storage account, the workflow can access different storage accounts, to allow for read-only access to raw data, to prevent accidental manipulation of source data.
+     The recommended structure (used by RMI) is:
+     - Storage Account: `pactadatadev`: (read/write) *RMI QAs datasets prior to moving them to PROD with[ `workflow.pacta.data.qa`](https://github.com/RMI-PACTA/workflow.pacta.data.qa)*
+       - File Share `workflow-data-preparation-outputs`: Outputs from this workflow.
+     - Storage Account: `pactarawdata` (read-only)
+       - File Share `factset-extracted`: Outputs from [`workflow.factset`](https://github.com/RMI-PACTA/workflow.factset)
+       - File Share `AssetImpact`: Raw data files from [Asset Impact](https://asset-impact.gresb.com/)
+   - (Optional, but recommended) Create a User Assigned Managed Identity.
+     Alternately, after creating the VM with a system-managed identity, you can assign all appropriate permissions.
+     * **RMI:** The `workflow-data-preparation` Identity exists with all the appropriate permissions.*
+   - Grant Appropriate permissions to the Identity:
+     - `pactadatadev`: "Storage File Data SMB Share Contributor"
+     - `pactarawdata`: "Storage File Data SMB Share Reader"
+
 ## Required Input Files
 
 All required files must exist at `$HOST_INPUTS_PATH`, in a single directory (no subdirectories).

From 032459e05f91b1327793f15e3a3e9dab8c5d702f Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Sat, 17 Feb 2024 20:55:37 +0100
Subject: [PATCH 02/13] docs(deploy): Instructions up through connecting

Everything works up through creating and connecting to VM
---
 README.md | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 69 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 561b699..641e340 100644
--- a/README.md
+++ b/README.md
@@ -62,21 +62,86 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image.
 
 *Instructions specific to the RMI-PACTA team's Azure instance are in Italics.*
 
 0. **Prerequisites:**
+   *These steps have been completed on the RMI Azure instance.*
+   - Ensure a Virtual Network with a Gateway has been set up, permitting SSH (Port 22) access.
+     Details of setting this up are out of scope for these instructions.
+     Talk to your network coordinator for help.
   - Set up Storage Accounts containing the [required files](#required-input-files).
     While all the files can exist on a single file share, in a single storage account, the workflow can access different storage accounts, to allow for read-only access to raw data, to prevent accidental manipulation of source data.
-    The recommended structure (used by RMI) is:
-    - Storage Account: `pactadatadev`: (read/write) *RMI QAs datasets prior to moving them to PROD with[ `workflow.pacta.data.qa`](https://github.com/RMI-PACTA/workflow.pacta.data.qa)*
+    The recommended structure (*used by RMI*) is:
+    - Storage Account: `pactadatadev`: (read/write).
+      Naming note: *RMI QAs datasets prior to moving them to PROD with [`workflow.pacta.data.qa`](https://github.com/RMI-PACTA/workflow.pacta.data.qa)*.
       - File Share `workflow-data-preparation-outputs`: Outputs from this workflow.
     - Storage Account: `pactarawdata` (read-only)
       - File Share `factset-extracted`: Outputs from [`workflow.factset`](https://github.com/RMI-PACTA/workflow.factset)
       - File Share `AssetImpact`: Raw data files from [Asset Impact](https://asset-impact.gresb.com/)
   - (Optional, but recommended) Create a User Assigned Managed Identity.
-    Alternately, after creating the VM with a system-managed identity, you can assign all appropriate permissions.
-    * **RMI:** The `workflow-data-preparation` Identity exists with all the appropriate permissions.*
+    Alternately, after creating the VM with a system-managed identity, you can assign all appropriate permissions. ***RMI:** The `workflow-data-preparation` Identity exists with all the appropriate permissions.*
   - Grant Appropriate permissions to the Identity:
     - `pactadatadev`: "Storage File Data SMB Share Contributor"
     - `pactarawdata`: "Storage File Data SMB Share Reader"
+1. Start a VM.
+   While the machine can be deployed via the Portal (WebUI), for simplicity, the following code block is provided which ensures consistency:
+
+   ```sh
+   # The options here work with the RMI-PACTA team's Azure setup.
+   # Change values for your own instance as needed.
+
+   # Get Network details.
+   VNET_RESOURCE_GROUP="RMI-PROD-EU-VNET-RG"
+   VNET_NAME="RMI-PROD-EU-VNET"
+   SUBNET_NAME="RMI-SP-PACTA-DEV-VNET"
+   SUBNET_ID=$(az network vnet subnet show --resource-group $VNET_RESOURCE_GROUP --name $SUBNET_NAME --vnet-name $VNET_NAME --query id -o tsv)
+
+   # Use the identity previously setup (see Prerequisites)
+   MACHINEIDENTITY="/subscriptions/feef729b-4584-44af-a0f9-4827075512f9/resourceGroups/RMI-SP-PACTA-PROD/providers/Microsoft.ManagedIdentity/userAssignedIdentities/workflow-data-preparation"
+   # This size has 2 vCPU, and 32GiB memory, recommended settings.
+   MACHINE_SIZE="Standard_E4-2as_v4"
+   # Using epoch to give machine a (probably) unique name
+   MACHINE_NAME="dataprep-runner-$(date +%s)"
+   # NOTE: Change this to your own RG as needed.
+   VM_RESOURCE_GROUP="RMI-SP-PACTA-DEV"
+
+   # **NOTE: Check these options prior to running**
+   # Non-RMI users may choose to omit the --public-ip-address line for public SSH Access.
+
+   az vm create \
+     --admin-username azureuser \
+     --assign-identity "$MACHINEIDENTITY" \
+     --generate-ssh-keys \
+     --image Ubuntu2204 \
+     --name "$MACHINE_NAME" \
+     --nic-delete-option delete \
+     --os-disk-delete-option delete \
+     --public-ip-address "" \
+     --resource-group "$VM_RESOURCE_GROUP" \
+     --size "$MACHINE_SIZE" \
+     --subnet "$SUBNETID"
+
+   ```
+
+   If this command successfully runs, it will output a JSON block describing the resource (VM) created.
+
+2. **Connect to the Network.** (Optional)
+   ***RMI:** Connecting to the VPN will enable SSH access.*
+   Connect to the Virtual Network specified above, as the command above does not create a Public IP Address.
+   Details for this are out of scope for these instructions.
+   Contact your network coordinator for help.
+
+2. Connect to the newly created VM via SSH.
+
+   ```sh
+   This connects to the VM created above via SSH.
+
+   az ssh vm \
+     --local-user azureuser \
+     --name "$MACHINE_NAME" \
+     --prefer-private-ip \
+     --resource-group "$VM_RESOURCE_GROUP"
+
+   ```
+
 ## Required Input Files
 
 All required files must exist at `$HOST_INPUTS_PATH`, in a single directory (no subdirectories).

From 080ad61552d766a5b2d7b886a11c5b24d7a40a4f Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Sat, 17 Feb 2024 21:19:05 +0100
Subject: [PATCH 03/13] feat(deploy): Add mount_afs script

Add a helper script to mount Azure File Shares
---
 scripts/mount_afs.sh | 107 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100755 scripts/mount_afs.sh

diff --git a/scripts/mount_afs.sh b/scripts/mount_afs.sh
new file mode 100755
index 0000000..350388d
--- /dev/null
+++ b/scripts/mount_afs.sh
@@ -0,0 +1,107 @@
+#! /bin/sh
+
+# mount an Azure File Share at a given location.
+# Requires az cli to be installed and logged in.
+
+usage() {
+  echo "Usage: mount_afs.sh [-h] [-v] -r <resource_group> -a <storage_account_name> -f <file_share_name> -m <mount_point>"
+  echo "  -h: help (this message)"
+  echo "  -v: verbose"
+  echo "  -r: resource group (Required)"
+  echo "  -a: storage account name (Required)"
+  echo "  -f: file share name (Required)"
+  echo "  -m: mount point (Required)"
+  echo "  -?: help"
+  exit 1
+}
+
+while getopts "h?vr:a:f:m:" opt; do
+  case "$opt" in
+    h|\?)
+      usage
+      ;;
+    v) VERBOSE=1
+      ;;
+    r) RESOURCEGROUP=$OPTARG
+      ;;
+    a) STORAGEACCOUNTNAME=$OPTARG
+      ;;
+    f) FILESHARENAME=$OPTARG
+      ;;
+    m) MOUNTPOINT=$OPTARG
+      ;;
+    *)
+      usage
+      ;;
+  esac
+done
+
+missing_opts=0
+if [ -z "$RESOURCEGROUP" ]; then
+  echo "ERROR: Resource group is required"
+  missing_opts=1
+fi
+
+if [ -z "$STORAGEACCOUNTNAME" ]; then
+  echo "ERROR: Storage account name is required"
+  missing_opts=1
+fi
+
+if [ -z "$FILESHARENAME" ]; then
+  echo "ERROR: File share name is required"
+  missing_opts=1
+fi
+
+if [ -z "$MOUNTPOINT" ]; then
+  echo "ERROR: Mount point is required"
+  missing_opts=1
+fi
+
+if [ $missing_opts -eq 1 ]; then
+  usage
+fi
+
+if [ -n "$VERBOSE" ]; then
+  echo "RESOURCEGROUP: $RESOURCEGROUP"
+  echo "STORAGEACCOUNTNAME: $STORAGEACCOUNTNAME"
+  echo "FILESHARENAME: $FILESHARENAME"
+  echo "MOUNTPOINT: $MOUNTPOINT"
+fi
+
+# This command assumes you have logged in with az login
+
+if [ -n "$VERBOSE" ]; then
+  echo "Getting https endpoint for storage account $STORAGEACCOUNTNAME"
+fi
+
+httpEndpoint=$(az storage account show \
+  --resource-group "$RESOURCEGROUP" \
+  --name "$STORAGEACCOUNTNAME" \
+  --query "primaryEndpoints.file" --output tsv | tr -d '"')
+smbPath=$(echo "$httpEndpoint" | cut -c7-${#httpEndpoint})$FILESHARENAME
+fileHost=$(echo "$httpEndpoint" | cut -c7-${#httpEndpoint}| tr -d "/")
+nc -zvw3 "$fileHost" 445
+
+if [ -n "$VERBOSE" ]; then
+  echo "httpEndpoint: $httpEndpoint"
+  echo "smbPath: $smbPath"
+  echo "fileHost: $fileHost"
+fi
+
+if [ -n "$VERBOSE" ]; then
+  echo "Getting storage account key"
+fi
+storageAccountKey=$(az storage account keys list \
+  --resource-group "$RESOURCEGROUP" \
+  --account-name "$STORAGEACCOUNTNAME" \
+  --query "[0].value" --output tsv | tr -d '"')
+
+if [ -n "$VERBOSE" ]; then
+  echo "Creating mount path: $MOUNTPOINT"
+fi
+sudo mkdir -p "$MOUNTPOINT"
+
+if [ -n "$VERBOSE" ]; then
+  echo "Mounting $smbPath to $MOUNTPOINT"
+fi
+sudo mount -t cifs "$smbPath" "$MOUNTPOINT" -o username="$STORAGEACCOUNTNAME",password="$storageAccountKey",serverino,nosharesock,actimeo=30,file_mode=0777,nobrl,dir_mode=0777,vers=3.1.1

From 222375e6087a9608880258f662513351d01016e6 Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Sun, 18 Feb 2024 00:11:13 +0100
Subject: [PATCH 04/13] fix(deploy): Update script to default to read-only

Mounting an Azure File Share to a linux OS via SMB defaults to
read/write access. This change makes the mount script default to
read-only file permissions (`0555`).

Update Docs accordingly.
---
 README.md            | 36 +++++++++++++++++++++++++++++-----
 scripts/mount_afs.sh | 14 ++++++++++++--
 2 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 641e340..8859125 100644
--- a/README.md
+++ b/README.md
@@ -78,10 +78,12 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image.
   - (Optional, but recommended) Create a User Assigned Managed Identity.
     Alternately, after creating the VM with a system-managed identity, you can assign all appropriate permissions. ***RMI:** The `workflow-data-preparation` Identity exists with all the appropriate permissions.*
   - Grant Appropriate permissions to the Identity:
-    - `pactadatadev`: "Storage File Data SMB Share Contributor"
-    - `pactarawdata`: "Storage File Data SMB Share Reader"
+    - `pactadatadev`: "Reader and Data Access".
+    - `pactarawdata`: "Reader and Data Access"
+    Note that this gives read/write access to the Storage Account via the Storage Account Key.
+    To grant read-only access to the VM, use the `mount_afs` script without the `-w` flag, as shown below.
 
-1. Start a VM.
+1. **Start a VM**
    While the machine can be deployed via the Portal (WebUI), for simplicity, the following code block is provided which ensures consistency:
 
    ```sh
@@ -123,13 +125,13 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image.
 
    If this command successfully runs, it will output a JSON block describing the resource (VM) created.
 
-2. **Connect to the Network.** (Optional)
+2. **Connect to the Network.** (Optional)
    ***RMI:** Connecting to the VPN will enable SSH access.*
    Connect to the Virtual Network specified above, as the command above does not create a Public IP Address.
    Details for this are out of scope for these instructions.
    Contact your network coordinator for help.
 
-2. Connect to the newly created VM via SSH.
+3. **Connect to the newly created VM via SSH.**
 
    ```sh
    This connects to the VM created above via SSH.
 
    az ssh vm \
     --local-user azureuser \
     --name "$MACHINE_NAME" \
     --prefer-private-ip \
     --resource-group "$VM_RESOURCE_GROUP"
 
   ```
 
+4. **Connect the VM to required resources**
+   Clone this repo, install the `az` cli utility, and mount the appropriate Azure File Shares.
+
+   ```sh
+   # Clone this repo through https to avoid need for an SSH key
+   git clone https://github.com/RMI-PACTA/workflow.data.preparation.git ~/workflow.data.preparation
+
+   # Install az cli
+   sudo apt update
+   # See https://aka.ms/installcli for alternate instructions
+   curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+
+   # Login to azure with assigned identity
+   az login --identity
+
+   # Use script from this repo to connect to file shares
+   ~/workflow.data.preparation/scripts/mount_afs.sh -r "RMI-SP-PACTA-PROD" -a "pactarawdata" -f "factset-extracted" -m "/mnt/factset-extracted"
+   ~/workflow.data.preparation/scripts/mount_afs.sh -r "RMI-SP-PACTA-PROD" -a "pactarawdata" -f "asset-impact" -m "/mnt/asset-impact"
+
+   # Note the outputs directory has the -w flag, meaning write permissions are enabled.
+   ~/workflow.data.preparation/scripts/mount_afs.sh -r "RMI-SP-PACTA-DEV" -a "pactadatadev" -f "workflow-data-preparation-outputs" -m "/mnt/workflow-data-preparation-outputs" -w
+
+   ```
+
 ## Required Input Files
 
 All required files must exist at `$HOST_INPUTS_PATH`, in a single directory (no subdirectories).
diff --git a/scripts/mount_afs.sh b/scripts/mount_afs.sh
index 350388d..d0f58c9 100755
--- a/scripts/mount_afs.sh
+++ b/scripts/mount_afs.sh
@@ -7,6 +7,7 @@ usage() {
   echo "Usage: mount_afs.sh [-h] [-v] -r <resource_group> -a <storage_account_name> -f <file_share_name> -m <mount_point>"
   echo "  -h: help (this message)"
   echo "  -v: verbose"
+  echo "  -w: Allow write access to the file share (default is read-only)"
   echo "  -r: resource group (Required)"
   echo "  -a: storage account name (Required)"
   echo "  -f: file share name (Required)"
@@ -15,13 +16,15 @@ usage() {
   exit 1
 }
 
-while getopts "h?vr:a:f:m:" opt; do
+while getopts "h?vwr:a:f:m:" opt; do
   case "$opt" in
     h|\?)
       usage
       ;;
     v) VERBOSE=1
       ;;
+    w) ALLOW_WRITE=1
+      ;;
     r) RESOURCEGROUP=$OPTARG
       ;;
     a) STORAGEACCOUNTNAME=$OPTARG
@@ -104,4 +107,11 @@ sudo mkdir -p "$MOUNTPOINT"
 if [ -n "$VERBOSE" ]; then
   echo "Mounting $smbPath to $MOUNTPOINT"
 fi
-sudo mount -t cifs "$smbPath" "$MOUNTPOINT" -o username="$STORAGEACCOUNTNAME",password="$storageAccountKey",serverino,nosharesock,actimeo=30,file_mode=0777,nobrl,dir_mode=0777,vers=3.1.1
+
+if [ -n "$ALLOW_WRITE" ]; then
+  permissions="file_mode=0777,dir_mode=0777"
+else
+  permissions="file_mode=0555,dir_mode=0555"
+fi
+
+sudo mount -t cifs "$smbPath" "$MOUNTPOINT" -o username="$STORAGEACCOUNTNAME",password="$storageAccountKey",serverino,nosharesock,actimeo=30,nobrl,"$permissions",vers=3.1.1

From c2e798c93c4b3e69cad869f90f8b6a7ff17eb9f2 Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Sun, 18 Feb 2024 02:09:15 +0100
Subject: [PATCH 05/13] feat(deploy): Use new split inputs in docker-compose

---
 config.yml         | 2 ++
 docker-compose.yml | 7 +++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/config.yml b/config.yml
index 0680490..b342d56 100644
--- a/config.yml
+++ b/config.yml
@@ -102,6 +102,8 @@ default:
 
 2022Q4:
+  asset_impact_data_path: "/mnt/factset-extracted"
+  factset_data_path: "/mnt/asset-impact"
   masterdata_ownership_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv"
   masterdata_debt_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv"
   ar_company_id__factset_entity_id_filename: "2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv"

diff --git a/docker-compose.yml b/docker-compose.yml
index e8baf92..f0b3f7d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -6,8 +6,11 @@ services:
       context: .
     volumes:
       - type: bind
-        source: ${HOST_INPUTS_PATH}
-        target: /inputs
+        source: ${HOST_FACTSET_EXTRACTED_PATH}
+        target: /mnt/factset-extracted
+      - type: bind
+        source: ${HOST_ASSET_IMPACT_PATH}
+        target: /asset-impact
       - type: bind
         source: ${HOST_OUTPUTS_PATH}
         target: /outputs

From 4f42a33aae6e3b8afcc019f6db45bdb34d707670 Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Sun, 18 Feb 2024 04:17:25 +0100
Subject: [PATCH 06/13] feat(deploy): Change AI File paths

Reflect actual Azure Files structure
---
 config.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/config.yml b/config.yml
index b342d56..9af590a 100644
--- a/config.yml
+++ b/config.yml
@@ -102,11 +102,11 @@ default:
 
 2022Q4:
-  asset_impact_data_path: "/mnt/factset-extracted"
-  factset_data_path: "/mnt/asset-impact"
-  masterdata_ownership_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv"
-  masterdata_debt_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv"
-  ar_company_id__factset_entity_id_filename: "2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv"
+  asset_impact_data_path: "/mnt/asset-impact"
+  factset_data_path: "/mnt/factset-extracted"
+  masterdata_ownership_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv"
+  masterdata_debt_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv"
+  ar_company_id__factset_entity_id_filename: "2023-02-15/2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv"
   factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_financial_data.rds"
   factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_info.rds"
   factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_financing_data.rds"

From 13d2d3c39df951a63a6716451e2487832b67e9cb Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Sun, 18 Feb 2024 04:22:40 +0100
Subject: [PATCH 07/13] feat(deploy): Update Factset file paths for 2022Q4

---
 config.yml | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/config.yml b/config.yml
index 9af590a..00d3e01 100644
--- a/config.yml
+++ b/config.yml
@@ -103,19 +103,19 @@ default:
 
 2022Q4:
   asset_impact_data_path: "/mnt/asset-impact"
-  factset_data_path: "/mnt/factset-extracted"
+  factset_data_path: "/mnt/factset-extracted/factset-pacta_timestamp-20221231T000000Z_pulled-20240217T134528Z"
   masterdata_ownership_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv"
   masterdata_debt_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv"
   ar_company_id__factset_entity_id_filename: "2023-02-15/2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv"
-  factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_financial_data.rds"
-  factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_info.rds"
-  factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_financing_data.rds"
-  factset_fund_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_fund_data.rds"
-  factset_isin_to_fund_table_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_isin_to_fund_table.rds"
-  factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_iss_emissions.rds"
-  factset_issue_code_bridge_filename: "test-from-fds-test-20240207-03-postgres_factset_issue_code_bridge.rds"
-  factset_industry_map_bridge_filename: "timestamp-20230123T000000Z_pulled-20000101T000001_factset_industry_map_bridge.rds"
-  factset_manual_pacta_sector_override_filename: "timestamp-20230123T000000Z_pulled-20000101T000002_factset_manual_sector_override.rds"
+  factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_financial_data.rds"
+  factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_info.rds"
+  factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_financing_data.rds"
+  factset_fund_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_fund_data.rds"
+  factset_isin_to_fund_table_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_isin_to_fund_table.rds"
+  factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_iss_emissions.rds"
+  factset_issue_code_bridge_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_issue_code_bridge.rds"
+  factset_industry_map_bridge_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_industry_map_bridge.rds"
+  factset_manual_pacta_sector_override_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_manual_sector_override.rds"
   imf_quarter_timestamp: "2022-Q4"
   pacta_financial_timestamp: "2022Q4"
   market_share_target_reference_year: 2022

From dfc4825c4e8a66c11f265ca1d3b4aba43293e6f5 Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Sun, 18 Feb 2024 04:23:37 +0100
Subject: [PATCH 08/13] ci(deploy): Add verbose logging for remote environment

---
 docker-compose.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docker-compose.yml b/docker-compose.yml
index f0b3f7d..a25de22 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -4,6 +4,8 @@ services:
   data_prep:
     build:
       context: .
+    environment:
+      - LOG_LEVEL=TRACE
     volumes:
       - type: bind
         source: ${HOST_FACTSET_EXTRACTED_PATH}

From 12499023a5f408622af6a9d0d473b8b4c56c82ac Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Sun, 18 Feb 2024 04:25:14 +0100
Subject: [PATCH 09/13] docs(deploy): Update README instructions

---
 README.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 48 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 8859125..f666945 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,8 @@ Running the workflow requires a file `.env` to exist in the root directory, that
 
 ```sh
 # .env
-HOST_INPUTS_PATH=/PATH/TO/inputs
+HOST_FACTSET_EXTRACTED_PATH=/PATH/TO/factset-extracted
+HOST_ASSET_IMPACT_PATH=/PATH/TO/asset-impact
 HOST_OUTPUTS_PATH=/PATH/TO/YYYYQQ_pacta_analysis_inputs_YYYY-MM-DD/YYYYQQ
 GITHUB_PAT=ghp_XXXXxxXxXXXxXxxX
 R_CONFIG_ACTIVE=YYYYQQ
@@ -119,7 +120,7 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image.
      --public-ip-address "" \
      --resource-group "$VM_RESOURCE_GROUP" \
      --size "$MACHINE_SIZE" \
-     --subnet "$SUBNETID"
+     --subnet "$SUBNET_ID"
 
   ```
 
@@ -134,7 +135,8 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image.
 3. **Connect to the newly created VM via SSH.**
 
    ```sh
-   This connects to the VM created above via SSH.
+   # This connects to the VM created above via SSH.
+   # See above block for envvars referenced here.
 
    az ssh vm \
     --local-user azureuser \
 
   ```
 
+5. **Install Docker**
+
+   ```sh
+   # install docker
+   sudo apt -y install \
+     docker-compose \
+     docker.io
+
+   # Allow azureuser to run docker without sudo
+   sudo usermod -aG docker azureuser
+   ```
+
+   At this point, you need to log out of the shell to reevaluate group memberships (add the `docker` group to `azureuser`).
+   You can log back in with the `az ssh` command from step 3.
+   When you are back in the shell, you can run `docker run --rm hello-world` to confirm that docker is working correctly, and you are able to run as a non-root user.
+
+6. **Prepare `.env` file**
+   The `ubuntu2204` image used for the VM includes both `vim` and `nano`.
+   Create a `.env` file in the `workflow.data.preparation` directory, according to the instructions in the [running locally](#running-locally-with-docker-compose) section of this file.
+
+7. **Build Docker image**
+   The cloned git repo in the home directory, and mounted directories should still be in place after logging in again.
+   Additionally, `azureuser` should be part of the `docker` group.
+   you can confirm this with
+
+   ```sh
+   groups
+   ls ~
+   ls /mnt
+   ```
+
+   With that in place, you are ready to build the `workflow.data.preparation` docker image.
+
+   ```sh
+   # navigate to the workflow.data.preparation repo
+   cd ~/workflow.data.preparation
+
+   docker-compose build
+
+   docker-compose up
+
+   ```
+
 ## Required Input Files
 
 All required files must exist at `$HOST_INPUTS_PATH`, in a single directory (no subdirectories).
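Putting the VM steps above together: step 4 mounts the shares under `/mnt`, and step 6 asks for a `.env` file. A minimal sketch of what that `.env` could look like on the VM, assuming the mount points created by the `mount_afs.sh` calls above (the `GITHUB_PAT` value is the placeholder from the README template, not a real credential, and the outputs path may instead point at a release subdirectory of the share):

```shell
# Sketch of a VM .env, assuming the /mnt/* mount points created above.
# GITHUB_PAT is a placeholder, not a real token.
HOST_FACTSET_EXTRACTED_PATH=/mnt/factset-extracted
HOST_ASSET_IMPACT_PATH=/mnt/asset-impact
HOST_OUTPUTS_PATH=/mnt/workflow-data-preparation-outputs
GITHUB_PAT=ghp_XXXXxxXxXXXxXxxX
R_CONFIG_ACTIVE=2022Q4
```

`docker-compose` reads this file from the repo root and substitutes the variables into the bind-mount `source:` entries.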
From e730ab70870090fff7ea7e5e3981307ef65af0c8 Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Sun, 18 Feb 2024 04:41:45 +0100
Subject: [PATCH 10/13] fix(deploy): fix path in docker volume mount

---
 docker-compose.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index a25de22..b0fc14f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -12,7 +12,7 @@ services:
         target: /mnt/factset-extracted
       - type: bind
         source: ${HOST_ASSET_IMPACT_PATH}
-        target: /asset-impact
+        target: /mnt/asset-impact
       - type: bind
         source: ${HOST_OUTPUTS_PATH}
         target: /outputs

From 90b32936c1bcf07eea6cef091c270d7cf611e1b8 Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Sun, 18 Feb 2024 04:46:15 +0100
Subject: [PATCH 11/13] feat(deploy): make docker-compose mounts read-only

---
 docker-compose.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docker-compose.yml b/docker-compose.yml
index b0fc14f..4e5deb5 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -10,9 +10,11 @@ services:
       - type: bind
         source: ${HOST_FACTSET_EXTRACTED_PATH}
         target: /mnt/factset-extracted
+        read_only: true
      - type: bind
        source: ${HOST_ASSET_IMPACT_PATH}
        target: /mnt/asset-impact
+        read_only: true
      - type: bind
        source: ${HOST_OUTPUTS_PATH}
        target: /outputs

From fa02e0139e0796ff33f5246727abf45598484a38 Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Sun, 18 Feb 2024 04:47:29 +0100
Subject: [PATCH 12/13] docs(deploy): update Readme

---
 README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f666945..834e447 100644
--- a/README.md
+++ b/README.md
@@ -193,7 +193,7 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image.
 7. **Build Docker image**
    The cloned git repo in the home directory, and mounted directories should still be in place after logging in again.
    Additionally, `azureuser` should be part of the `docker` group.
-   you can confirm this with
+   you can confirm this with:
 
    ```sh
    groups
    ls ~
    ls /mnt
    ```
 
    With that in place, you are ready to build the `workflow.data.preparation` docker image.
+   **To ensure that a dropped network connection does not kill the process, you should run this in `tmux`.**
 
    ```sh
    # navigate to the workflow.data.preparation repo
    cd ~/workflow.data.preparation
 
+   tmux
+
    docker-compose build
 
    docker-compose up

From 665a67698c08700cca67637e2d087be641f9e403 Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Sun, 18 Feb 2024 11:37:34 +0100
Subject: [PATCH 13/13] return config to `main`

don't touch config in this PR.
---
 config.yml | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/config.yml b/config.yml
index 00d3e01..0680490 100644
--- a/config.yml
+++ b/config.yml
@@ -102,20 +102,18 @@ default:
 
 2022Q4:
-  asset_impact_data_path: "/mnt/asset-impact"
-  factset_data_path: "/mnt/factset-extracted/factset-pacta_timestamp-20221231T000000Z_pulled-20240217T134528Z"
-  masterdata_ownership_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv"
-  masterdata_debt_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv"
-  ar_company_id__factset_entity_id_filename: "2023-02-15/2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv"
-  factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_financial_data.rds"
-  factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_info.rds"
-  factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_financing_data.rds"
-  factset_fund_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_fund_data.rds"
-  factset_isin_to_fund_table_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_isin_to_fund_table.rds"
-  factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_iss_emissions.rds"
-  factset_issue_code_bridge_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_issue_code_bridge.rds"
-  factset_industry_map_bridge_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_industry_map_bridge.rds"
-  factset_manual_pacta_sector_override_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_manual_sector_override.rds"
+  masterdata_ownership_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv"
+  masterdata_debt_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv"
+  ar_company_id__factset_entity_id_filename: "2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv"
+  factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_financial_data.rds"
+  factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_info.rds"
+  factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_financing_data.rds"
+  factset_fund_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_fund_data.rds"
+  factset_isin_to_fund_table_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_isin_to_fund_table.rds"
+  factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_iss_emissions.rds"
+  factset_issue_code_bridge_filename: "test-from-fds-test-20240207-03-postgres_factset_issue_code_bridge.rds"
+  factset_industry_map_bridge_filename: "timestamp-20230123T000000Z_pulled-20000101T000001_factset_industry_map_bridge.rds"
+  factset_manual_pacta_sector_override_filename: "timestamp-20230123T000000Z_pulled-20000101T000002_factset_manual_sector_override.rds"
   imf_quarter_timestamp: "2022-Q4"
   pacta_financial_timestamp: "2022Q4"
   market_share_target_reference_year: 2022
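The read-only default introduced in [PATCH 04/13] comes down to a single branch on the `-w` flag before the `mount` call: `file_mode`/`dir_mode` of `0555` unless `ALLOW_WRITE` is set. The branch can be exercised standalone — `select_permissions` below is an illustrative wrapper for this sketch, not a function in the actual `mount_afs.sh`:

```shell
# Illustrative wrapper around the permission branch from mount_afs.sh:
# a non-empty first argument plays the role of ALLOW_WRITE (the -w flag).
select_permissions() {
  ALLOW_WRITE="$1"
  if [ -n "$ALLOW_WRITE" ]; then
    # -w given: mount the share read/write
    permissions="file_mode=0777,dir_mode=0777"
  else
    # default: mount the share read-only
    permissions="file_mode=0555,dir_mode=0555"
  fi
  echo "$permissions"
}

select_permissions ""   # default        -> file_mode=0555,dir_mode=0555
select_permissions 1    # -w was passed  -> file_mode=0777,dir_mode=0777
```

The resulting string is then spliced into the mount options as `...,actimeo=30,nobrl,"$permissions",vers=3.1.1`, exactly as the final hunk of that patch does.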