Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update to new test action #6

Open
wants to merge 35 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
8fa2efc
add cohort and notebook actions
wjchulme Dec 7, 2020
8cb0534
Update .gitignore
wjchulme Dec 7, 2020
981dd3d
Fix run_notebook child key indentation
ghickman Dec 8, 2020
fff3ffa
Merge pull request #2 from ghickman/patch-1
wjchulme Dec 8, 2020
61d8e67
remove workspace from directory path
wjchulme Dec 8, 2020
6ad5740
update directory
wjchulme Dec 8, 2020
a1fba79
correct directory path
wjchulme Dec 8, 2020
0d1ad28
add metadata folder
wjchulme Dec 8, 2020
541ed98
comment out code using folium, geopandas, and prettytable for now
wjchulme Dec 8, 2020
d65cb4b
BugFix: Element Mismatch
Dec 9, 2020
0b8903d
BugFix: Revert to original
Dec 10, 2020
cb8c7bf
BugFix: Fill Series with zeros if none counted
Dec 10, 2020
f684c42
Restructure for Debugging
Dec 13, 2020
b3c8611
Make keys unique in YAML
Dec 13, 2020
acbbfa9
type
Dec 13, 2020
2e05d2c
release outputs
wjchulme Dec 14, 2020
5c6aba7
Merge branch 'will-run' of https://github.com/opensafely/vaccine-elig…
wjchulme Dec 14, 2020
82092a4
Update BMI
Dec 14, 2020
8565314
remove where .isnull
Dec 14, 2020
907de92
release outputs
wjchulme Dec 15, 2020
b8f699a
Merge branch 'will-run' of https://github.com/opensafely/vaccine-elig…
wjchulme Dec 15, 2020
a716ca5
Create report.py from cohortextractor
Dec 16, 2020
da5ca5f
Merge branch 'will-run' of https://github.com/opensafely/vaccine-elig…
Dec 16, 2020
ed15b3c
Automate low number suppression
sebbacon Dec 16, 2020
0e5a970
Invoke python run-command correctly
sebbacon Dec 16, 2020
7e546d4
Include vaccinated
Dec 17, 2020
fa2aa7d
Merge branch 'will-run' of https://github.com/opensafely/vaccine-elig…
Dec 17, 2020
6ce94d7
Update COVID vaccination target
Dec 17, 2020
54619dd
Merge pull request #1 from opensafely/will-run
JRPearson500 Dec 17, 2020
1e3cf15
Revert back to docker.opensafely.org
JRPearson500 Dec 17, 2020
ac9fc49
17-dec run (#3)
sebbacon Dec 18, 2020
580fa47
Add vaccinated .csv
Dec 18, 2020
e3d25d5
Additional Covid target definition
Dec 18, 2020
78d2746
Remove SARS-CORONAVIRUS 2
Jan 4, 2021
7f335df
update to new study test action and remove old unneeded action
bloodearnest Jan 29, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Automate low number suppression
  • Loading branch information
sebbacon committed Dec 16, 2020
commit ed15b3c4374c1fbc3e63f93cc909db6a7e17e6c8
80 changes: 53 additions & 27 deletions analysis/report.py
Original file line number Diff line number Diff line change
@@ -9,6 +9,8 @@
from pandas.api.types import is_bool_dtype
from pandas.api.types import is_datetime64_dtype
from pandas.api.types import is_numeric_dtype
import pandas as pd


import seaborn as sns
import datetime
@@ -23,6 +25,7 @@ def make_chart(name, series, dtype):
img = BytesIO()
# Setting figure sizes in seaborn is a bit weird:
# https://stackoverflow.com/a/23973562/559140

if is_categorical_dtype(dtype):
sns.set_style("ticks")
sns.catplot(
@@ -52,8 +55,8 @@ def make_chart(name, series, dtype):
series.hist(bins=int(bins), ax=ax)
plt.xticks(rotation=45, ha="right")
elif is_numeric_dtype(dtype):
# Trim percentiles and negatives which are usually bad data
series = series.fillna(0)
# Trim percentiles and negatives which are usually bad data
series = series[
(series < np.percentile(series, 95))
& (series > np.percentile(series, 5))
@@ -72,37 +75,58 @@ def make_chart(name, series, dtype):
return base64.b64encode(img.read()).decode("UTF-8")


def suppress_numbers(series, dtype, suppress_numbers_under=6):
    """Apply low-number suppression to a single column.

    Args:
        series: The pandas Series holding the column's values.
        dtype: The dtype used to decide which rule applies (the caller
            passes the matching entry of ``df.dtypes``).
        suppress_numbers_under: Counts strictly below this threshold are
            treated as too small to report. Defaults to 6.

    Returns:
        ``series`` unchanged when it passes the check, otherwise an empty
        Series. Callers test the result with ``len(...)``.

    Rules:
        * categorical / boolean: every entry of ``value_counts()`` must
          reach the threshold.
        * datetime / numeric: the null group and the non-null group (as
          counted by ``isnull().value_counts()``) must each reach the
          threshold, and there must be at least ``suppress_numbers_under``
          non-null values.
        * any other dtype: always suppressed.
    """
    # isinstance check instead of the deprecated is_categorical_dtype();
    # the string spelling is still accepted for backward compatibility.
    is_categorical = isinstance(dtype, pd.CategoricalDtype) or (
        isinstance(dtype, str) and dtype == "category"
    )

    if is_categorical or is_bool_dtype(dtype):
        level_counts = series.value_counts()
        # `not` rather than `~`: bitwise invert on a plain Python bool is
        # always truthy (~True == -2), which would disable suppression.
        if not (level_counts < suppress_numbers_under).any():
            return series
    elif is_datetime64_dtype(dtype) or is_numeric_dtype(dtype):
        null_group_counts = series.isnull().value_counts()
        has_small_group = (null_group_counts < suppress_numbers_under).any()
        # count() ignores nulls; this also rejects an all-null column,
        # whose single "null" group could otherwise pass the check above.
        enough_values = series.count() >= suppress_numbers_under
        if not has_small_group and enough_values:
            return series

    # Explicit dtype: a bare pd.Series() warns on pandas 1.x and its default
    # dtype differs between pandas versions.
    return pd.Series(dtype=object)


def _make_cohort_report(input_dir, output_dir, study_name, suffix):
study = load_study_definition(study_name)

df = study.csv_to_df(f"{input_dir}/input{suffix}.csv")
descriptives = df.describe(include="all")

html = ""
contents = "<h2>Contents</h2><ul>"
for name, dtype in zip(df.columns, df.dtypes):
if name == "patient_id":
continue
main_chart = '<div><img src="data:image/png;base64,{}"/></div>'.format(
make_chart(name, df[name], dtype)
)
empty_values_chart = ""
if is_datetime64_dtype(dtype):
# also do a null / not null plot
empty_values_chart = (
'<div><img src="data:image/png;base64,{}"/></div>'.format(
make_chart(name, df[name].isnull(), bool)
)
contents += f"<li><a href='#{name}'>{name}</a></li>"
series = suppress_numbers(df[name], dtype)
if len(series):
descriptives = series.describe()
main_chart = '<div><img src="data:image/png;base64,{}"/></div>'.format(
make_chart(name, df[name], dtype)
)
elif is_numeric_dtype(dtype):
# also do a null / not null plot
empty_values_chart = (
'<div><img src="data:image/png;base64,{}"/></div>'.format(
make_chart(name, df[name] > 0, bool)
)
empty_values_chart = "n/a"
if np.any(pd.isnull(series)):
if is_datetime64_dtype(dtype):
# also do a null / not null plot
empty_values_chart = '<div><img src="data:image/png;base64,{}"/></div>'.format(
make_chart(name, df[name].isnull(), bool)
)
elif is_numeric_dtype(dtype):
# also do a null / not null plot
empty_values_chart = '<div><img src="data:image/png;base64,{}"/></div>'.format(
make_chart(name, df[name] > 0, bool)
)
descriptives.loc["values"] = main_chart
descriptives.loc["nulls"] = empty_values_chart
html += f"<a name='{name}'></a><h2>{name}</h2>"
html += descriptives.to_frame().to_html(
escape=False, na_rep="", justify="left", border=0
)
descriptives.loc["values", name] = main_chart
descriptives.loc["nulls", name] = empty_values_chart

else:
html += f"<a name='{name}'></a><h2>{name}</h2>"
html += f"outputs suppressed (low number suppression)"
contents += "</ul>"
with open(f"{output_dir}/descriptives{suffix}.html", "w") as f:

f.write(
@@ -132,15 +156,17 @@ def _make_cohort_report(input_dir, output_dir, study_name, suffix):
</head>
<body>"""
)

f.write(descriptives.to_html(escape=False, na_rep="", justify="left", border=0))
f.write(contents)
f.write(html)
f.write("</body></html>")
print(f"Created cohort report at {output_dir}/descriptives{suffix}.html")


def make_cohort_report(input_dir, output_dir):
    """Build one cohort report per study definition.

    Iterates over the ``(study_name, suffix)`` pairs yielded by
    ``list_study_definitions()`` and hands each one, together with the
    input and output directories, to ``_make_cohort_report``.
    """
    for definition in list_study_definitions():
        name, file_suffix = definition
        _make_cohort_report(input_dir, output_dir, name, file_suffix)

if __name__ == "__main__":
make_cohort_report(sys.argv[1],sys.argv[2])


if __name__ == "__main__":
make_cohort_report(sys.argv[1], sys.argv[2])