Commit 5335d0c

merge with develop, resolve conflicts

rxu17 committed Oct 31, 2024
2 parents 5d778f4 + 86afb66 commit 5335d0c
Showing 13 changed files with 588 additions and 101 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build-docker-images.yml
@@ -23,7 +23,7 @@ jobs:

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        uses: actions/checkout@v4
        with:
          fetch-depth: 2

27 changes: 27 additions & 0 deletions CONTRIBUTING.md
@@ -1,5 +1,28 @@
# genie-bpc-pipeline: Contributing Guidelines

## Getting started
1. [Clone the repository](https://help.github.com/articles/cloning-a-repository/) to your local machine so you can begin making changes.
2. On your local machine, make sure you have the latest version of the `develop` branch:

```
git checkout develop
git pull origin develop
```
3. Create a feature branch off the `develop` branch and work on it. The branch should be named the same as the JIRA issue you are working on, in **lowercase** (e.g., `gen-1234-{feature-here}`). Make the branch name as informative as possible.
```
git checkout develop
git checkout -b gen-1234-{feature-here}
```
4. Once you have made your additions or changes, write tests and run [comparison scripts](https://github.com/Sage-Bionetworks/Genie_processing/blob/ed806d163fa4063a84920483e8ada21ea4b6cf47/README.md#comparisons-between-two-synapse-entities) to confirm the changes behave as expected.
5. At this point, you have only created the branch locally; push it to your fork on GitHub:
```
git add <your_file>
git commit -m "your commit information"
git push --set-upstream origin gen-1234-{feature-here}
```
6. Create a pull request from the feature branch to the `develop` branch. When changes are made to `scripts/<module_folder_name>` (currently only `scripts/references` and `scripts/table_updates`), a GitHub Action is triggered to build a Docker image for the branch; you can check it [here](https://github.com/Sage-Bionetworks/genie-bpc-pipeline/pkgs/container/genie-bpc-pipeline).
## Nextflow Pipeline contribution
Here is how to contribute to the Nextflow workflow of the genie-bpc-pipeline.
@@ -30,3 +53,7 @@ Parameters should be initialized / defined with default values in the set parame
### Default process resource requirements
Defaults for process resource requirements (CPUs / memory / time) should be defined in `nextflow.config`.
### Testing
1. To test locally, you can't use `ghcr.io/sage-bionetworks/docker_image_name` directly; pull the Docker image to your local machine first and test with it.
2. To test on Seqera, go to the `test_bpc_pipeline` pipeline and edit it to point to your feature branch, then update. This will let you select parameters from the dropdown menu directly.
4 changes: 3 additions & 1 deletion main.nf
@@ -97,13 +97,15 @@ workflow BPC_PIPELINE {
        // validate_data.out.view()
    } else if (params.step == "merge_and_uncode_rca_uploads"){
        merge_and_uncode_rca_uploads("default", ch_cohort, ch_comment, params.production)
    } else if (params.step == "update_data_table") {
        update_data_table("default", ch_cohort, ch_comment, params.production)
    } else if (params.step == "genie_bpc_pipeline"){
        update_potential_phi_fields_table(ch_comment, params.production)
        run_quac_upload_report_error(update_potential_phi_fields_table.out, ch_cohort)
        run_quac_upload_report_warning(run_quac_upload_report_error.out, ch_cohort, params.production)
        merge_and_uncode_rca_uploads(run_quac_upload_report_warning.out, ch_cohort, ch_comment, params.production)
        // remove_patients_from_merged(merge_and_uncode_rca_uploads.out, ch_cohort, params.production)
        update_data_table(merge_and_uncode_rca_uploads.out, ch_comment, params.production)
        update_data_table(merge_and_uncode_rca_uploads.out, ch_cohort, ch_comment, params.production)
        update_date_tracking_table(update_data_table.out, ch_cohort, ch_comment, params.production)
        run_quac_table_report(update_date_tracking_table.out, ch_cohort, params.production)
        run_quac_comparison_report(run_quac_table_report.out, ch_cohort, params.production)
1 change: 1 addition & 0 deletions modules/run_quac_upload_report_error.nf
@@ -9,6 +9,7 @@ process run_quac_upload_report_error {
    debug true

    input:
    val previous
    val cohort

    output:
10 changes: 5 additions & 5 deletions modules/update_data_table.nf
@@ -2,13 +2,14 @@
Update Synapse tables with merged and uncoded data.
*/
process update_data_table {
    container "$params.table_updates_docker"

    container 'sagebionetworks/genie-bpc-pipeline-table-updates'
    secret 'SYNAPSE_AUTH_TOKEN'
    debug true

    input:
    val previous
    val cohort
    val comment
    val production

@@ -19,13 +20,12 @@ process update_data_table {
    if (production) {
        """
        cd /root/scripts/
        python update_data_table.py -p /root/scripts/config.json -m "$comment" primary
        python update_data_table.py -p /root/scripts/config.json -c $cohort -m "$comment" primary -pd
        """
    }
    else {
    } else {
        """
        cd /root/scripts/
        python update_data_table.py -p /root/scripts/config.json -m "$comment" primary -d
        python update_data_table.py -p /root/scripts/config.json -c $cohort -m "$comment" primary
        """
    }
}
1 change: 1 addition & 0 deletions nextflow.config
@@ -54,6 +54,7 @@ profiles {
            // docker image parameters, see nextflow_schema.json for details
            references_docker = "sagebionetworks/genie-bpc-pipeline-references"
            uploads_docker = "sagebionetworks/genie-bpc-pipeline-uploads"
            table_updates_docker = "sagebionetworks/genie-bpc-pipeline-table-updates"
        }
    }
}
5 changes: 5 additions & 0 deletions nextflow_schema.json
@@ -51,6 +51,7 @@
"enum": [
"update_potential_phi_fields_table",
"merge_and_uncode_rca_uploads",
"update_data_table",
"genie_bpc_pipeline"
]
},
@@ -62,6 +63,10 @@
"type": "string",
"description": "Name of docker to use in processes in scripts/uploads"
},
"table_updates_docker":{
"type": "string",
"description": "Name of docker to use in processes in scripts/table_updates"
},
"schema_ignore_params": {
"type": "string",
"description": "Put parameters to ignore for validation here separated by comma",
2 changes: 1 addition & 1 deletion scripts/table_updates/Dockerfile
@@ -1,4 +1,4 @@
FROM python:3.8
FROM python:3.11

WORKDIR /root/scripts

12 changes: 10 additions & 2 deletions scripts/table_updates/README.md
@@ -39,6 +39,14 @@ Usage

### Update the Synapse Tables with data
#### Primary Case Tables
    python update_data_table.py -m [version_comment] primary
#### IRR Case Tables
##### 1. dry-run (Save output to local)
    python update_data_table.py -c [cohort_name] -m [version_comment] primary -d

##### 2. production (Save output to production projects)
    python update_data_table.py -c [cohort_name] -m [version_comment] primary -pd

##### 3. staging (Save output to staging projects)
    python update_data_table.py -c [cohort_name] -m [version_comment] primary

#### IRR Case Tables (Deprecated)
    python update_data_table.py -m [version_comment] irr
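Read together, the three variants differ only in their output-target flags (`-d` for dry-run, `-pd` for production, neither for staging), matching the commands in `update_data_table.nf` above. Here is a hypothetical `argparse` sketch of the interface these commands imply; the long option names, help strings, and description are assumptions for illustration, not the script's actual definitions:

```python
import argparse

# Hypothetical reconstruction of the CLI implied by the commands above;
# long option names and help text are assumptions, not the real script.
parser = argparse.ArgumentParser(description="Update Synapse tables with merged, uncoded data")
parser.add_argument("table", choices=["primary", "irr"], help="which group of case tables to update")
parser.add_argument("-p", "--config", help="path to config.json")
parser.add_argument("-c", "--cohort", help="name of the cohort to process")
parser.add_argument("-m", "--message", help="version comment recorded with the table update")
parser.add_argument("-d", "--dry_run", action="store_true", help="dry run: save output locally")
parser.add_argument("-pd", "--production", action="store_true", help="write to production projects (staging is the default)")

args = parser.parse_args()
```

Under this reading, staging is simply the absence of both `-d` and `-pd`.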
2 changes: 1 addition & 1 deletion scripts/table_updates/requirements.txt
@@ -1 +1 @@
synapseclient[pandas] == 2.7.2
synapseclient[pandas] == 4.6.0
230 changes: 230 additions & 0 deletions scripts/table_updates/tests/test_utilities.py
@@ -0,0 +1,230 @@
from unittest.mock import MagicMock, create_autospec, patch

import numpy as np
import pandas as pd
import pytest
import synapseclient
from synapseclient import Schema, Table
from table_updates import utilities


@pytest.fixture(scope="session")
def syn():
    return create_autospec(synapseclient.Synapse)


@pytest.fixture(scope="session")
def table_schema():
    schema = synapseclient.table.Schema(
        name="test_table",
        parent="syn123",
        column_names=["col1", "col2"],
        column_types=["STRING", "INTEGER"],
    )
    return schema


@pytest.mark.parametrize(
    "query_return_df,select,query,expected_df",
    [
        (
            pd.DataFrame({"col1": ["value1", "value2"]}),
            "col1",
            "SELECT col1 from syn123456",
            pd.DataFrame({"col1": ["value1", "value2"]}),
        ),
        (
            pd.DataFrame({"col1": ["value1", "value2"], "col2": [1, 2]}),
            "col1,col2",
            "SELECT col1,col2 from syn123456",
            pd.DataFrame({"col1": ["value1", "value2"], "col2": [1, 2]}),
        ),
        (
            pd.DataFrame({"col1": ["NA", "value1", "None"], "col2": [1, 2, 3]}),
            "*",
            "SELECT * from syn123456",
            pd.DataFrame({"col1": [np.nan, "value1", "None"], "col2": [1, 2, 3]}),
        ),
        (
            pd.DataFrame(columns=["col1", "col2"]),
            "*",
            "SELECT * from syn123456",
            pd.DataFrame(columns=["col1", "col2"]),
        ),
    ],
    ids=[
        "selected_single_column",
        "selected_multiple_column",
        "pull_table_with_na_values_all_columns",
        "pull_empty_table_all_columns",
    ],
)
def test_download_synapse_table_default_condition(
    syn, table_schema, query_return_df, select, query, expected_df
):
    syn.tableQuery = MagicMock(return_value=Table(table_schema, query_return_df))
    result = utilities.download_synapse_table(syn, "syn123456", select)

    # validate
    syn.tableQuery.assert_called_once_with(query)
    pd.testing.assert_frame_equal(result, expected_df)


@pytest.mark.parametrize(
    "query_return_df,condition,query,expected_df",
    [
        (
            pd.DataFrame({"col1": ["value1"], "col2": [1]}),
            "col1 = 'value1'",
            "SELECT * from syn123456 WHERE col1 = 'value1'",
            pd.DataFrame({"col1": ["value1"], "col2": [1]}),
        ),
        (
            pd.DataFrame({"col1": ["NA", "value1", "None"], "col2": [1, 1, 1]}),
            "col2 = 1",
            "SELECT * from syn123456 WHERE col2 = 1",
            pd.DataFrame({"col1": [np.nan, "value1", "None"], "col2": [1, 1, 1]}),
        ),
    ],
    ids=["selected_row_all_columns", "pull_table_with_na_values_all_columns"],
)
def test_download_synapse_table_with_condition(
    syn, table_schema, query_return_df, condition, query, expected_df
):
    syn.tableQuery = MagicMock(return_value=Table(table_schema, query_return_df))
    result = utilities.download_synapse_table(syn, "syn123456", condition=condition)

    # validate
    syn.tableQuery.assert_called_once_with(query)
    pd.testing.assert_frame_equal(result, expected_df)


@pytest.mark.parametrize(
    "query_return_df,select,condition,query,expected_df",
    [
        (
            pd.DataFrame({"col1": ["value1"], "col2": [1]}),
            "col1",
            "col1 = 'value1'",
            "SELECT col1 from syn123456 WHERE col1 = 'value1'",
            pd.DataFrame({"col1": ["value1"], "col2": [1]}),
        ),
        (
            pd.DataFrame({"col1": ["value1"], "col2": [1]}),
            "col1,col2",
            "col1 = 'value1'",
            "SELECT col1,col2 from syn123456 WHERE col1 = 'value1'",
            pd.DataFrame({"col1": ["value1"], "col2": [1]}),
        ),
    ],
    ids=[
        "selected_one_columns_with_condition",
        "select_multiple_columns_with_condition",
    ],
)
def test_download_synapse_table_with_select_and_condition(
    syn, table_schema, query_return_df, select, condition, query, expected_df
):
    syn.tableQuery = MagicMock(return_value=Table(table_schema, query_return_df))
    result = utilities.download_synapse_table(
        syn, "syn123456", select=select, condition=condition
    )

    # validate
    syn.tableQuery.assert_called_once_with(query)
    pd.testing.assert_frame_equal(result, expected_df)


def test_download_empty_synapse_table_with_condition(
    syn,
    table_schema,
):
    syn.tableQuery = MagicMock(
        return_value=Table(table_schema, pd.DataFrame(columns=["col1", "col2"]))
    )
    result = utilities.download_synapse_table(syn, "syn123456", condition="col2 = 1")

    # validate
    syn.tableQuery.assert_called_once_with("SELECT * from syn123456 WHERE col2 = 1")
    pd.testing.assert_frame_equal(result, pd.DataFrame(columns=["col1", "col2"]))


@pytest.mark.parametrize(
    "input_df,cols,expected_df",
    [
        (
            pd.DataFrame({"col1": ["\\abc", "def"], "col2": ["abc", "def\\"]}),
            ["col1", "col2"],
            pd.DataFrame({"col1": ["abc", "def"], "col2": ["abc", "def"]}),
        ),
        (
            pd.DataFrame({"col1": ["abc", "def"], "col2": ["abc", "def\\"]}),
            ["col2"],
            pd.DataFrame({"col1": ["abc", "def"], "col2": ["abc", "def"]}),
        ),
        (
            pd.DataFrame({"col1": ["abc", "def"], "col2": ["abc", "def\\"]}),
            ["col1"],
            pd.DataFrame({"col1": ["abc", "def"], "col2": ["abc", "def\\"]}),
        ),
        (
            pd.DataFrame({"col1": ["abc", "def"], "col2": ["abc", "def"]}),
            ["col1", "col2"],
            pd.DataFrame({"col1": ["abc", "def"], "col2": ["abc", "def"]}),
        ),
        (
            pd.DataFrame(
                {
                    "col1": ["\\abc", "de\\f", "ghi\\"],
                    "col2": ["abc(\\hh)", "def,\\,hh", "ghi, ,\\hh"],
                }
            ),
            ["col1", "col2"],
            pd.DataFrame(
                {
                    "col1": ["abc", "def", "ghi"],
                    "col2": ["abc(hh)", "def,,hh", "ghi, ,hh"],
                }
            ),
        ),
        (
            pd.DataFrame(
                {
                    "col1": [1, "de\\f", "ghi\\", np.nan],
                    "col2": ["abc(\\hh)", "def,\\,hh", "ghi, ,\\hh", 2],
                }
            ),
            ["col1", "col2"],
            pd.DataFrame(
                {
                    "col1": [1, "def", "ghi", np.nan],
                    "col2": ["abc(hh)", "def,,hh", "ghi, ,hh", 2],
                }
            ),
        ),
    ],
    ids=[
        "multiple_columns_with_backslash",
        "one_column_with_backslash",
        "one_column_with_backslash_but_not_selected",
        "none_column_with_backslash",
        "backslashes_in_multiple_places",
        "various_column_types",
    ],
)
def test_remove_backslash(input_df, cols, expected_df):
    results = utilities.remove_backslash(input_df, cols)
    pd.testing.assert_frame_equal(results, expected_df)


@pytest.mark.parametrize(
    "input_df,cols",
    [
        (pd.DataFrame({"col1": ["\\abc", "def"], "col2": ["abc", "def\\"]}), ["col3"]),
    ],
)
def test_remove_backslash_fail(input_df, cols):
    with pytest.raises(
        ValueError, match="Invalid column list. Not all columns are in the dataframe."
    ):
        utilities.remove_backslash(input_df, cols)
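Taken together, these new tests pin down the contract of the two helpers in `table_updates/utilities.py`. For reference, here is a minimal Python sketch consistent with the assertions above; it is a reconstruction inferred from the tests (in particular, the exact NA normalization inside `download_synapse_table` is an assumption), not the module's actual code:

```python
import numpy as np
import pandas as pd


def download_synapse_table(syn, synapse_id, select="*", condition=""):
    """Query a Synapse table and return it as a pandas DataFrame.

    Builds "SELECT {select} from {synapse_id}", appending a WHERE clause when
    a condition is given; the literal string "NA" is treated as missing while
    other strings such as "None" are left intact.
    """
    query = f"SELECT {select} from {synapse_id}"
    if condition:
        query += f" WHERE {condition}"
    results = syn.tableQuery(query)
    # Assumption: only the "NA" string is normalized to NaN.
    return results.asDataFrame().replace("NA", np.nan)


def remove_backslash(df, cols):
    """Strip backslashes from string values in the given columns.

    Non-string values (numbers, NaN) pass through unchanged; column names
    missing from the dataframe raise the ValueError the tests expect.
    """
    if not set(cols).issubset(df.columns):
        raise ValueError("Invalid column list. Not all columns are in the dataframe.")
    out = df.copy()
    for col in cols:
        out[col] = out[col].map(
            lambda val: val.replace("\\", "") if isinstance(val, str) else val
        )
    return out
```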