Commit 5335d0c

merge with develop, resolve conflicts

rxu17 committed Oct 31, 2024
2 parents 5d778f4 + 86afb66 commit 5335d0c
Showing 13 changed files with 588 additions and 101 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build-docker-images.yml
@@ -23,7 +23,7 @@ jobs:

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        uses: actions/checkout@v4
        with:
          fetch-depth: 2

27 changes: 27 additions & 0 deletions CONTRIBUTING.md
@@ -1,5 +1,28 @@
# genie-bpc-pipeline: Contributing Guidelines

## Getting started
1. [Clone the repository](https://help.github.com/articles/cloning-a-repository/) to your local machine so you can begin making changes.
2. On your local machine, make sure you have the latest version of the `develop` branch:

```
git checkout develop
git pull origin develop
```
3. Create a feature branch off the `develop` branch and work on it. The branch should be named the same as the JIRA issue you are working on, in **lowercase** (e.g., `gen-1234-{feature-here}`). Make the branch name as informative as possible.
```
git checkout develop
git checkout -b gen-1234-{feature-here}
```
4. Once you have made your additions or changes, write tests and run [comparison scripts](https://github.com/Sage-Bionetworks/Genie_processing/blob/ed806d163fa4063a84920483e8ada21ea4b6cf47/README.md#comparisons-between-two-synapse-entities) to confirm the changes behave as expected.
5. At this point, you have only created the branch locally; push it to your fork on GitHub:
```
git add <your_file>
git commit -m "your commit information"
git push --set-upstream origin gen-1234-{feature-here}
```
6. Create a pull request from the feature branch to the `develop` branch. When changes are made to `scripts/<module_folder_name>` (currently only `scripts/references` and `scripts/table_updates`), a GitHub Action is triggered to build a Docker image for the branch; you can check it [here](https://github.com/Sage-Bionetworks/genie-bpc-pipeline/pkgs/container/genie-bpc-pipeline).
## Nextflow Pipeline contribution
Here is how to contribute to the Nextflow workflow of the genie-bpc-pipeline.
@@ -30,3 +53,7 @@ Parameters should be initialized / defined with default values in the set parame
### Default process resource requirements
Defaults for process resource requirements (CPUs / memory / time) should be defined in `nextflow.config`.
### Testing
1. To test locally, you can't use `ghcr.io/sage-bionetworks/docker_image_name` directly; pull the Docker image to your local machine first and test with it.
2. To test on Seqera, go to the `test_bpc_pipeline` pipeline and edit it to point to your feature branch, then update. This will let you select parameters from the dropdown menu directly.
4 changes: 3 additions & 1 deletion main.nf
@@ -97,13 +97,15 @@ workflow BPC_PIPELINE {
        // validate_data.out.view()
    } else if (params.step == "merge_and_uncode_rca_uploads"){
        merge_and_uncode_rca_uploads("default", ch_cohort, ch_comment, params.production)
    } else if (params.step == "update_data_table") {
        update_data_table("default", ch_cohort, ch_comment, params.production)
    } else if (params.step == "genie_bpc_pipeline"){
        update_potential_phi_fields_table(ch_comment, params.production)
        run_quac_upload_report_error(update_potential_phi_fields_table.out, ch_cohort)
        run_quac_upload_report_warning(run_quac_upload_report_error.out, ch_cohort, params.production)
        merge_and_uncode_rca_uploads(run_quac_upload_report_warning.out, ch_cohort, ch_comment, params.production)
        // remove_patients_from_merged(merge_and_uncode_rca_uploads.out, ch_cohort, params.production)
        update_data_table(merge_and_uncode_rca_uploads.out, ch_comment, params.production)
        update_data_table(merge_and_uncode_rca_uploads.out, ch_cohort, ch_comment, params.production)
        update_date_tracking_table(update_data_table.out, ch_cohort, ch_comment, params.production)
        run_quac_table_report(update_date_tracking_table.out, ch_cohort, params.production)
        run_quac_comparison_report(run_quac_table_report.out, ch_cohort, params.production)
1 change: 1 addition & 0 deletions modules/run_quac_upload_report_error.nf
@@ -9,6 +9,7 @@ process run_quac_upload_report_error {
    debug true

    input:
    val previous
    val cohort

    output:
10 changes: 5 additions & 5 deletions modules/update_data_table.nf
@@ -2,13 +2,14 @@
Update Synapse tables with merged and uncoded data.
*/
process update_data_table {
    container "$params.table_updates_docker"

    container 'sagebionetworks/genie-bpc-pipeline-table-updates'
    secret 'SYNAPSE_AUTH_TOKEN'
    debug true

    input:
    val previous
    val cohort
    val comment
    val production

@@ -19,13 +20,12 @@ process update_data_table {
    if (production) {
        """
        cd /root/scripts/
        python update_data_table.py -p /root/scripts/config.json -m "$comment" primary
        python update_data_table.py -p /root/scripts/config.json -c $cohort -m "$comment" primary -pd
        """
    }
    else {
    } else {
        """
        cd /root/scripts/
        python update_data_table.py -p /root/scripts/config.json -m "$comment" primary -d
        python update_data_table.py -p /root/scripts/config.json -c $cohort -m "$comment" primary
        """
    }
}
1 change: 1 addition & 0 deletions nextflow.config
@@ -54,6 +54,7 @@ profiles {
            // docker image parameters, see nextflow_schema.json for details
            references_docker = "sagebionetworks/genie-bpc-pipeline-references"
            uploads_docker = "sagebionetworks/genie-bpc-pipeline-uploads"
            table_updates_docker = "sagebionetworks/genie-bpc-pipeline-table-updates"
        }
    }
}
5 changes: 5 additions & 0 deletions nextflow_schema.json
@@ -51,6 +51,7 @@
"enum": [
"update_potential_phi_fields_table",
"merge_and_uncode_rca_uploads",
"update_data_table",
"genie_bpc_pipeline"
]
},
@@ -62,6 +63,10 @@
"type": "string",
"description": "Name of docker to use in processes in scripts/uploads"
},
"table_updates_docker":{
"type": "string",
"description": "Name of docker to use in processes in scripts/table_updates"
},
"schema_ignore_params": {
"type": "string",
"description": "Put parameters to ignore for validation here separated by comma",
2 changes: 1 addition & 1 deletion scripts/table_updates/Dockerfile
@@ -1,4 +1,4 @@
FROM python:3.8
FROM python:3.11

WORKDIR /root/scripts

12 changes: 10 additions & 2 deletions scripts/table_updates/README.md
@@ -39,6 +39,14 @@ Usage

### Update the Synapse Tables with data
#### Primary Case Tables
    python update_data_table.py -m [version_comment] primary
#### IRR Case Tables
##### 1. dry-run (Save output to local)
    python update_data_table.py -c [cohort_name] -m [version_comment] primary -d

##### 2. production (Save output to production projects)
    python update_data_table.py -c [cohort_name] -m [version_comment] primary -pd

##### 3. staging (Save output to staging projects)
    python update_data_table.py -c [cohort_name] -m [version_comment] primary

#### IRR Case Tables (Deprecated)
    python update_data_table.py -m [version_comment] irr
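Read together, the three variants differ only in their output-target flags (`-d` for dry-run, `-pd` for production, neither for staging), matching the commands in `update_data_table.nf` above. Here is a hypothetical `argparse` sketch of the interface these commands imply; the long option names, help strings, and description are assumptions for illustration, not the script's actual definitions:

```python
import argparse

# Hypothetical reconstruction of the CLI implied by the commands above;
# long option names and help text are assumptions, not the real script.
parser = argparse.ArgumentParser(description="Update Synapse tables with merged, uncoded data")
parser.add_argument("table", choices=["primary", "irr"], help="which group of case tables to update")
parser.add_argument("-p", "--config", help="path to config.json")
parser.add_argument("-c", "--cohort", help="name of the cohort to process")
parser.add_argument("-m", "--message", help="version comment recorded with the table update")
parser.add_argument("-d", "--dry_run", action="store_true", help="dry run: save output locally")
parser.add_argument("-pd", "--production", action="store_true", help="write to production projects (staging is the default)")

args = parser.parse_args()
```

Under this reading, staging is simply the absence of both `-d` and `-pd`.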
2 changes: 1 addition & 1 deletion scripts/table_updates/requirements.txt
@@ -1 +1 @@
synapseclient[pandas] == 2.7.2
synapseclient[pandas] == 4.6.0
230 changes: 230 additions & 0 deletions scripts/table_updates/tests/test_utilities.py
@@ -0,0 +1,230 @@
from unittest.mock import MagicMock, create_autospec, patch

import numpy as np
import pandas as pd
import pytest
import synapseclient
from synapseclient import Schema, Table
from table_updates import utilities


@pytest.fixture(scope="session")
def syn():
    return create_autospec(synapseclient.Synapse)


@pytest.fixture(scope="session")
def table_schema():
    schema = synapseclient.table.Schema(
        name="test_table",
        parent="syn123",
        column_names=["col1", "col2"],
        column_types=["STRING", "INTEGER"],
    )
    return schema


@pytest.mark.parametrize(
    "query_return_df,select,query,expected_df",
    [
        (
            pd.DataFrame({"col1": ["value1", "value2"]}),
            "col1",
            "SELECT col1 from syn123456",
            pd.DataFrame({"col1": ["value1", "value2"]}),
        ),
        (
            pd.DataFrame({"col1": ["value1", "value2"], "col2": [1, 2]}),
            "col1,col2",
            "SELECT col1,col2 from syn123456",
            pd.DataFrame({"col1": ["value1", "value2"], "col2": [1, 2]}),
        ),
        (
            pd.DataFrame({"col1": ["NA", "value1", "None"], "col2": [1, 2, 3]}),
            "*",
            "SELECT * from syn123456",
            pd.DataFrame({"col1": [np.nan, "value1", "None"], "col2": [1, 2, 3]}),
        ),
        (
            pd.DataFrame(columns=["col1", "col2"]),
            "*",
            "SELECT * from syn123456",
            pd.DataFrame(columns=["col1", "col2"]),
        ),
    ],
    ids=[
        "selected_single_column",
        "selected_multiple_column",
        "pull_table_with_na_values_all_columns",
        "pull_empty_table_all_columns",
    ],
)
def test_download_synapse_table_default_condition(
    syn, table_schema, query_return_df, select, query, expected_df
):
    syn.tableQuery = MagicMock(return_value=Table(table_schema, query_return_df))
    result = utilities.download_synapse_table(syn, "syn123456", select)

    # validate
    syn.tableQuery.assert_called_once_with(query)
    pd.testing.assert_frame_equal(result, expected_df)


@pytest.mark.parametrize(
    "query_return_df,condition,query,expected_df",
    [
        (
            pd.DataFrame({"col1": ["value1"], "col2": [1]}),
            "col1 = 'value1'",
            "SELECT * from syn123456 WHERE col1 = 'value1'",
            pd.DataFrame({"col1": ["value1"], "col2": [1]}),
        ),
        (
            pd.DataFrame({"col1": ["NA", "value1", "None"], "col2": [1, 1, 1]}),
            "col2 = 1",
            "SELECT * from syn123456 WHERE col2 = 1",
            pd.DataFrame({"col1": [np.nan, "value1", "None"], "col2": [1, 1, 1]}),
        ),
    ],
    ids=["selected_row_all_columns", "pull_table_with_na_values_all_columns"],
)
def test_download_synapse_table_with_condition(
    syn, table_schema, query_return_df, condition, query, expected_df
):
    syn.tableQuery = MagicMock(return_value=Table(table_schema, query_return_df))
    result = utilities.download_synapse_table(syn, "syn123456", condition=condition)

    # validate
    syn.tableQuery.assert_called_once_with(query)
    pd.testing.assert_frame_equal(result, expected_df)


@pytest.mark.parametrize(
    "query_return_df,select,condition,query,expected_df",
    [
        (
            pd.DataFrame({"col1": ["value1"], "col2": [1]}),
            "col1",
            "col1 = 'value1'",
            "SELECT col1 from syn123456 WHERE col1 = 'value1'",
            pd.DataFrame({"col1": ["value1"], "col2": [1]}),
        ),
        (
            pd.DataFrame({"col1": ["value1"], "col2": [1]}),
            "col1,col2",
            "col1 = 'value1'",
            "SELECT col1,col2 from syn123456 WHERE col1 = 'value1'",
            pd.DataFrame({"col1": ["value1"], "col2": [1]}),
        ),
    ],
    ids=[
        "selected_one_columns_with_condition",
        "select_multiple_columns_with_condition",
    ],
)
def test_download_synapse_table_with_select_and_condition(
    syn, table_schema, query_return_df, select, condition, query, expected_df
):
    syn.tableQuery = MagicMock(return_value=Table(table_schema, query_return_df))
    result = utilities.download_synapse_table(
        syn, "syn123456", select=select, condition=condition
    )

    # validate
    syn.tableQuery.assert_called_once_with(query)
    pd.testing.assert_frame_equal(result, expected_df)


def test_download_empty_synapse_table_with_condition(
    syn,
    table_schema,
):
    syn.tableQuery = MagicMock(
        return_value=Table(table_schema, pd.DataFrame(columns=["col1", "col2"]))
    )
    result = utilities.download_synapse_table(syn, "syn123456", condition="col2 = 1")

    # validate
    syn.tableQuery.assert_called_once_with("SELECT * from syn123456 WHERE col2 = 1")
    pd.testing.assert_frame_equal(result, pd.DataFrame(columns=["col1", "col2"]))


@pytest.mark.parametrize(
    "input_df,cols,expected_df",
    [
        (
            pd.DataFrame({"col1": ["\\abc", "def"], "col2": ["abc", "def\\"]}),
            ["col1", "col2"],
            pd.DataFrame({"col1": ["abc", "def"], "col2": ["abc", "def"]}),
        ),
        (
            pd.DataFrame({"col1": ["abc", "def"], "col2": ["abc", "def\\"]}),
            ["col2"],
            pd.DataFrame({"col1": ["abc", "def"], "col2": ["abc", "def"]}),
        ),
        (
            pd.DataFrame({"col1": ["abc", "def"], "col2": ["abc", "def\\"]}),
            ["col1"],
            pd.DataFrame({"col1": ["abc", "def"], "col2": ["abc", "def\\"]}),
        ),
        (
            pd.DataFrame({"col1": ["abc", "def"], "col2": ["abc", "def"]}),
            ["col1", "col2"],
            pd.DataFrame({"col1": ["abc", "def"], "col2": ["abc", "def"]}),
        ),
        (
            pd.DataFrame(
                {
                    "col1": ["\\abc", "de\\f", "ghi\\"],
                    "col2": ["abc(\\hh)", "def,\\,hh", "ghi, ,\\hh"],
                }
            ),
            ["col1", "col2"],
            pd.DataFrame(
                {
                    "col1": ["abc", "def", "ghi"],
                    "col2": ["abc(hh)", "def,,hh", "ghi, ,hh"],
                }
            ),
        ),
        (
            pd.DataFrame(
                {
                    "col1": [1, "de\\f", "ghi\\", np.nan],
                    "col2": ["abc(\\hh)", "def,\\,hh", "ghi, ,\\hh", 2],
                }
            ),
            ["col1", "col2"],
            pd.DataFrame(
                {
                    "col1": [1, "def", "ghi", np.nan],
                    "col2": ["abc(hh)", "def,,hh", "ghi, ,hh", 2],
                }
            ),
        ),
    ],
    ids=[
        "multiple_columns_with_backslash",
        "one_column_with_backslash",
        "one_column_with_backslash_but_not_selected",
        "none_column_with_backslash",
        "backslashes_in_multiple_places",
        "various_column_types",
    ],
)
def test_remove_backslash(input_df, cols, expected_df):
    results = utilities.remove_backslash(input_df, cols)
    pd.testing.assert_frame_equal(results, expected_df)


@pytest.mark.parametrize(
    "input_df,cols",
    [
        (pd.DataFrame({"col1": ["\\abc", "def"], "col2": ["abc", "def\\"]}), ["col3"]),
    ],
)
def test_remove_backslash_fail(input_df, cols):
    with pytest.raises(
        ValueError, match="Invalid column list. Not all columns are in the dataframe."
    ):
        utilities.remove_backslash(input_df, cols)
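Taken together, these new tests pin down the contract of the two helpers in `table_updates/utilities.py`. For reference, here is a minimal Python sketch consistent with the assertions above; it is a reconstruction inferred from the tests (in particular, the exact NA normalization inside `download_synapse_table` is an assumption), not the module's actual code:

```python
import numpy as np
import pandas as pd


def download_synapse_table(syn, synapse_id, select="*", condition=""):
    """Query a Synapse table and return it as a pandas DataFrame.

    Builds "SELECT {select} from {synapse_id}", appending a WHERE clause when
    a condition is given; the literal string "NA" is treated as missing while
    other strings such as "None" are left intact.
    """
    query = f"SELECT {select} from {synapse_id}"
    if condition:
        query += f" WHERE {condition}"
    results = syn.tableQuery(query)
    # Assumption: only the "NA" string is normalized to NaN.
    return results.asDataFrame().replace("NA", np.nan)


def remove_backslash(df, cols):
    """Strip backslashes from string values in the given columns.

    Non-string values (numbers, NaN) pass through unchanged; column names
    missing from the dataframe raise the ValueError the tests expect.
    """
    if not set(cols).issubset(df.columns):
        raise ValueError("Invalid column list. Not all columns are in the dataframe.")
    out = df.copy()
    for col in cols:
        out[col] = out[col].map(
            lambda val: val.replace("\\", "") if isinstance(val, str) else val
        )
    return out
```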