Skip to content

Commit

Permalink
Merge pull request #49 from smart-on-fhir/mikix/add-all
Browse files Browse the repository at this point in the history
feat: add cumulus__all rows to each summary resource collection
  • Loading branch information
mikix authored Jul 2, 2024
2 parents 2cb5b7d + 5c2bb2c commit 5cd0e5a
Show file tree
Hide file tree
Showing 17 changed files with 175 additions and 27 deletions.
8 changes: 8 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,11 @@ as well as set up an auto-formatter commit hook.
- Don't depend on `core__` tables.
- Allows folks to build this study even if they can't or haven't built `core`
- Allows `core` to smooth over data oddities we might be interested in

- For non-CUBE tables (like quality `q_` metric summaries or `c_resources_per_pt`),
always try to have a `cumulus__all` row for the second grouping column,
as well as the normal row -- which can hold the group value,
be NULL if no grouping exists, or maybe `cumulus__none` if the row
represents all resources without a group value.
This way, downstream visualizers can reliably roll-up detailed views into
more abstract views.
27 changes: 17 additions & 10 deletions cumulus_library_data_metrics/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,39 +19,46 @@ def __init__(self):
super().__init__()
self.display_text = f"Creating {self.name} tables…"
self.summary_entries = {}
# A "group" is a value in the second column - a secondary characteristic like "field"
# (group examples: "code", "valueCodeableConcept") or a stratifier like "profile"
# (group examples: "Lab", "Note"). These will "roll-up" to the resource level in the
# summary with a "cumulus__all" group entry row.
self.summary_groups = {}
self.queries = []
self.schemas = {}

# These date fields are mostly static, but may be modified when checking the schema if
# some of them are not present (notably DocRef.context.period.start).
self.date_fields = copy.deepcopy(resource_info.DATES)

def make_table_fragment(self, src: str, stratifier: str | None = None):
def make_table_fragment(self, src: str, group: str | None = None):
key = src.lower()
if stratifier:
key += f"_{stratifier.lower().replace(' ', '_')}"
if group:
key += f"_{group.lower().replace(' ', '_')}"
return key

def add_summary_entry(
self, src: str, stratifier: str | None = None, *, denominator: str | None = None
self, src: str, group: str | None = None, *, denominator: str | None = None
) -> None:
# These are all flags for the summary-table-builder jinja.
key = self.make_table_fragment(src, stratifier)
key = self.make_table_fragment(src, group)
self.summary_entries[key] = {
"src": src,
"stratifier": stratifier,
"group": group,
"denominator": denominator,
}
self.summary_groups.setdefault(src, set()).add(group)

def make_summary(self, stratifier_column: str | None = None) -> None:
def make_summary(self, group_column: str | None = None) -> None:
"""Makes a summary table, from all the individual metric tables"""
# Always define *something* even if we don't use it, so that consuming visualizations
# can assume a consistent two-column definition of resource + stratifier.
stratifier_column = stratifier_column or "stratifier"
# can assume a consistent two-column definition of resource + group.
group_column = group_column or "subgroup"
sql = self.render_sql(
"../base.summary",
entries=self.summary_entries,
stratifier_column=stratifier_column,
group_column=group_column,
group_values=self.summary_groups,
metric=self.name,
)
self.queries.append(sql)
Expand Down
61 changes: 53 additions & 8 deletions cumulus_library_data_metrics/base.summary.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,74 @@ CREATE TABLE data_metrics__{{ metric }}_summary AS (
WITH
{%- for entry_key, summary_info in entries.items() %}
{{ entry_key }}_numerator AS (
SELECT '{{ entry_key }}' AS entry_key, COUNT(id) AS numerator
SELECT COUNT(id) AS numerator
FROM data_metrics__{{ metric }}_{{ entry_key }}
),
{{ entry_key }}_denominator AS (
{% if summary_info["denominator"] and summary_info["denominator"].strip() %}
{% if summary_info["denominator"] %}
WITH denominator_slice AS {{ summary_info["denominator"] }}
{% else %}
WITH denominator_slice AS (SELECT id from {{ summary_info["src"] }})
{% endif %}
SELECT '{{ entry_key }}' AS entry_key, COUNT(id) AS denominator
SELECT COUNT(id) AS denominator
FROM denominator_slice
),
{%- endfor %}

-- Prepare roll-up "all resources" entries by unifying numerators and grabbing distinct IDs
{% for resource, groups in group_values.items() %}
{% set all_entry_key = resource.lower() + "_all" %}

{{ all_entry_key }}_numerator_ids AS (
{% for group_entry_key, summary_info in entries.items() %}
{% if summary_info["src"] == resource %}
SELECT id FROM data_metrics__{{ metric }}_{{ group_entry_key }}
UNION
{% endif %}
{% endfor %}
SELECT id FROM {{ resource }} WHERE 1=0 -- empty table to finish loop
),
{{ all_entry_key }}_numerator AS (
SELECT COUNT(DISTINCT id) AS numerator FROM {{ all_entry_key }}_numerator_ids
),

-- And now the denominators
{% set has_custom_denominator = false %}
{% for entry_key, summary_info in entries.items() %}
{% if summary_info["src"] == resource and summary_info["denominator"] %}
{% set has_custom_denominator = true %}
{% endif %}
{% endfor %}
{% if has_custom_denominator %}
{{ all_entry_key }}_denominator_ids AS (
{% for group_entry_key, summary_info in entries.items() %}
{% if summary_info["src"] == resource %}
SELECT id FROM {{ summary_info["denominator"] }}
UNION
{% endif %}
{% endfor %}
SELECT id FROM {{ resource }} WHERE 1=0 -- empty table to finish loop
),
{{ all_entry_key }}_denominator AS (
SELECT COUNT(DISTINCT id) AS denominator FROM {{ all_entry_key }}_denominator_ids
),
{% else %}
{{ all_entry_key }}_denominator AS (
SELECT COUNT(id) AS denominator FROM {{ resource }}
),
{% endif %}

{% set _ = entries.update({all_entry_key: {"src": resource, "group": "cumulus__all"} }) %}
{% endfor %}

union_table AS (
{%- for entry_key, summary_info in entries.items() %}
SELECT
'{{ summary_info["src"] }}' AS resource,
{% if summary_info["stratifier"] %}
'{{ summary_info["stratifier"] }}' AS {{ stratifier_column }},
{% if summary_info["group"] %}
'{{ summary_info["group"] }}' AS {{ group_column }},
{% else %}
CAST(NULL AS VARCHAR) AS {{ stratifier_column }},
CAST(NULL AS VARCHAR) AS {{ group_column }},
{% endif %}
numerator,
denominator,
Expand All @@ -35,8 +81,7 @@ CREATE TABLE data_metrics__{{ metric }}_summary AS (
END
AS DECIMAL(5, 2)) AS percentage
FROM {{ entry_key }}_numerator AS numerator_table
INNER JOIN {{ entry_key }}_denominator AS denominator_table
ON numerator_table.entry_key = denominator_table.entry_key
CROSS JOIN {{ entry_key }}_denominator AS denominator_table
{%- if not loop.last %}
UNION
{%- endif -%}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,19 @@ SELECT
MAX(num_resources) AS "max"
FROM summed_counts
UNION
-- duplicate the above for consistency with flagging
-- "no category available" situations via a NULL category
SELECT
'cumulus__all' AS resource,
CAST(NULL AS VARCHAR) AS category,
CAST(AVG(num_resources) AS DECIMAL(18, 2)) AS average,
CAST(STDDEV_POP(num_resources) AS DECIMAL(18, 2)) AS std_dev,
MAX(num_resources) AS "max"
FROM summed_counts
UNION
{%- for resource in patient_fields %}
-- If you change this cumulus_all block, also change its
-- duplicate block below where we use NULL as the category.
SELECT
'{{ resource }}' AS resource,
'cumulus__all' AS category,
Expand All @@ -97,6 +109,22 @@ AND counts.patient = patient_refs.ref
AND counts.category = cat_values.category
GROUP BY cat_values.category
HAVING MAX(num_resources) > 0
{% else %}
-- copy of cumulus__all above, but with a NULL category,
-- for consistency: there's always a roll-up and a detailed row,
-- and this is how we treat other "no categorization available"
-- metrics like rows in q_date_recent.
-- Note how this is different than "cumulus__none" here, because
-- there just isn't a categorization we even look for.
UNION
SELECT
'{{ resource }}' AS resource,
CAST(NULL AS VARCHAR) AS category,
CAST(AVG(num_resources) AS DECIMAL(18, 2)) AS average,
CAST(STDDEV_POP(num_resources) AS DECIMAL(18, 2)) AS std_dev,
MAX(num_resources) AS "max"
FROM summed_{{ resource|lower }}_counts
WHERE resource = '{{ resource }}'
{% endif %}
{%- if not loop.last %}
UNION
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@ def add_metric_queries(self) -> None:
self.make_table(src="MedicationRequest", dest="Patient", field="subject")
self.make_table(src="Observation", dest="Patient", field="subject")
self.make_table(src="Procedure", dest="Patient", field="subject")
self.make_summary(stratifier_column="target")
self.make_summary(group_column="target")
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,4 @@ def add_metric_queries(self) -> None:
self.make_table(src="Observation", dest="Encounter", field="encounter")
self.make_table(src="Procedure", dest="Patient", field="subject")
self.make_table(src="Procedure", dest="Encounter", field="encounter")
self.make_summary(stratifier_column="target")
self.make_summary(group_column="target")
2 changes: 1 addition & 1 deletion cumulus_library_data_metrics/q_system_use/q_system_use.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,4 +73,4 @@ def add_metric_queries(self) -> None:
"http://www.cms.gov/Medicare/Coding/ICD10",
],
)
self.make_summary(stratifier_column="field")
self.make_summary(group_column="field")
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ def make_table(self, **kwargs) -> None:

def add_metric_queries(self) -> None:
super().add_metric_queries()
self.make_summary(stratifier_column="profile")
self.make_summary(group_column="profile")
5 changes: 5 additions & 0 deletions tests/data/c_resources_per_pt/general/expected_summary.csv
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
resource,category,average,std_dev,max
cumulus__all,,3.00,0.82,4
cumulus__all,cumulus__all,3.00,0.82,4
ServiceRequest,,0.00,0.00,0
ServiceRequest,cumulus__all,0.00,0.00,0
Procedure,,0.00,0.00,0
Procedure,cumulus__all,0.00,0.00,0
Observation,cumulus__all,0.00,0.00,0
MedicationRequest,outpatient,1.33,0.47,2
MedicationRequest,inpatient,0.33,0.47,1
MedicationRequest,discharge; inpatient,0.33,0.47,1
MedicationRequest,cumulus__none,0.33,0.47,1
MedicationRequest,cumulus__all,2.33,1.25,4
Immunization,,0.00,0.00,0
Immunization,cumulus__all,0.00,0.00,0
Encounter,cumulus__all,0.00,0.00,0
DocumentReference,cumulus__all,0.00,0.00,0
DiagnosticReport,cumulus__all,0.00,0.00,0
Device,,0.00,0.00,0
Device,cumulus__all,0.00,0.00,0
Condition,cumulus__none,0.67,0.47,1
Condition,cumulus__all,0.67,0.47,1
Expand Down
11 changes: 10 additions & 1 deletion tests/data/q_date_recent/general/expected_summary.csv
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
resource,stratifier,numerator,denominator,percentage
resource,subgroup,numerator,denominator,percentage
Procedure,,2,5,40.00
Procedure,cumulus__all,2,5,40.00
Observation,,0,0,0.00
Observation,cumulus__all,0,0,0.00
MedicationRequest,,0,0,0.00
MedicationRequest,cumulus__all,0,0,0.00
Immunization,,0,0,0.00
Immunization,cumulus__all,0,0,0.00
Encounter,,1,4,25.00
Encounter,cumulus__all,1,4,25.00
DocumentReference,,0,0,0.00
DocumentReference,cumulus__all,0,0,0.00
DiagnosticReport,,0,0,0.00
DiagnosticReport,cumulus__all,0,0,0.00
Condition,,2,4,50.00
Condition,cumulus__all,2,4,50.00
AllergyIntolerance,,0,0,0.00
AllergyIntolerance,cumulus__all,0,0,0.00
10 changes: 10 additions & 0 deletions tests/data/q_ref_target_pop/general/expected_summary.csv
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
resource,target,numerator,denominator,percentage
# Various edge cases here in procedure
Procedure,cumulus__all,1,4,25.00
Procedure,Patient,1,4,25.00
# Rest are just short happy-path checks to confirm that we look at the right json field
Observation,cumulus__all,0,1,0.00
Observation,Patient,0,1,0.00
MedicationRequest,cumulus__all,0,1,0.00
MedicationRequest,Patient,0,1,0.00
Immunization,cumulus__all,0,1,0.00
Immunization,Patient,0,1,0.00
Encounter,cumulus__all,0,1,0.00
Encounter,Patient,0,1,0.00
DocumentReference,cumulus__all,0,1,0.00
DocumentReference,Patient,0,1,0.00
DiagnosticReport,cumulus__all,0,1,0.00
DiagnosticReport,Patient,0,1,0.00
Device,cumulus__all,0,1,0.00
Device,Patient,0,1,0.00
Condition,cumulus__all,0,1,0.00
Condition,Patient,0,1,0.00
AllergyIntolerance,cumulus__all,0,1,0.00
AllergyIntolerance,Patient,0,1,0.00
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
id,status,target
bad-encounter,,Encounter/Nope
bad-both,,Encounter/Nope
14 changes: 12 additions & 2 deletions tests/data/q_ref_target_valid/general/expected_summary.csv
Original file line number Diff line number Diff line change
@@ -1,22 +1,32 @@
resource,target,numerator,denominator,percentage
# Various edge cases here in procedure
Procedure,Patient,1,9,11.11
Procedure,Encounter,1,9,11.11
Procedure,cumulus__all,3,10,30.00
Procedure,Patient,2,10,20.00
Procedure,Encounter,2,10,20.00
# Rest are just short happy-path checks to confirm that we look at the right json field
Observation,cumulus__all,0,1,0.00
Observation,Patient,0,1,0.00
Observation,Encounter,0,1,0.00
MedicationRequest,cumulus__all,0,1,0.00
MedicationRequest,Patient,0,1,0.00
MedicationRequest,Encounter,0,1,0.00
Immunization,cumulus__all,0,1,0.00
Immunization,Patient,0,1,0.00
Immunization,Encounter,0,1,0.00
Encounter,cumulus__all,0,1,0.00
Encounter,Patient,0,1,0.00
# Except DocRefs also have some extra cases around encounter array support
DocumentReference,cumulus__all,3,6,50.00
DocumentReference,Patient,0,6,0.00
DocumentReference,Encounter,3,6,50.00
DiagnosticReport,cumulus__all,0,1,0.00
DiagnosticReport,Patient,0,1,0.00
DiagnosticReport,Encounter,0,1,0.00
Device,cumulus__all,0,1,0.00
Device,Patient,0,1,0.00
Condition,cumulus__all,0,1,0.00
Condition,Patient,0,1,0.00
Condition,Encounter,0,1,0.00
AllergyIntolerance,cumulus__all,0,1,0.00
AllergyIntolerance,Patient,0,1,0.00
AllergyIntolerance,Encounter,0,1,0.00
1 change: 1 addition & 0 deletions tests/data/q_ref_target_valid/general/procedure/0.ndjson
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{"resourceType": "Procedure", "id": "valid", "subject": {"reference": "Patient/A"}, "encounter": {"reference": "Encounter/A"}}
{"resourceType": "Procedure", "id": "bad-patient", "subject": {"reference": "Patient/Nope"}, "encounter": {"reference": "Encounter/A"}}
{"resourceType": "Procedure", "id": "bad-encounter", "subject": {"reference": "Patient/A"}, "encounter": {"reference": "Encounter/Nope"}}
{"resourceType": "Procedure", "id": "bad-both", "subject": {"reference": "Patient/Nope"}, "encounter": {"reference": "Encounter/Nope"}}
{"resourceType": "Procedure", "id": "non-patient", "subject": {"reference": "Device/A"}, "encounter": {"reference": "Encounter/A"}}
{"resourceType": "Procedure", "id": "non-encounter", "subject": {"reference": "Patient/A"}, "encounter": {"reference": "Device/A"}}
{"resourceType": "Procedure", "id": "missing-patient-ref", "subject": {"display": "Patient A"}, "encounter": {"reference": "Encounter/A"}}
Expand Down
10 changes: 10 additions & 0 deletions tests/data/q_system_use/general/expected_summary.csv
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@
resource,field,numerator,denominator,percentage
# Various edge cases here in procedure
Procedure,cumulus__all,4,12,33.33
Procedure,code,4,12,33.33
# Rest are often short happy-path checks to confirm that we look at the right json field
Observation,valueCodeableConcept,0,2,0.00
Observation,cumulus__all,0,2,0.00
Observation,code,0,2,0.00
MedicationRequest,medicationCodeableConcept,0,1,0.00
MedicationRequest,cumulus__all,0,1,0.00
Medication,cumulus__all,0,1,0.00
Medication,code,0,1,0.00
Immunization,vaccineCode,0,1,0.00
Immunization,cumulus__all,0,1,0.00
DocumentReference,type,1,3,33.33
DocumentReference,cumulus__all,1,3,33.33
DiagnosticReport,cumulus__all,0,1,0.00
DiagnosticReport,code,0,1,0.00
Device,type,0,1,0.00
Device,cumulus__all,0,1,0.00
Condition,cumulus__all,0,1,0.00
Condition,code,0,1,0.00
AllergyIntolerance,cumulus__all,0,1,0.00
AllergyIntolerance,code,0,1,0.00
15 changes: 13 additions & 2 deletions tests/data/q_valid_us_core_v4/general/expected_summary.csv
Original file line number Diff line number Diff line change
@@ -1,15 +1,26 @@
resource,profile,numerator,denominator,percentage
Procedure,,0,0,0.00
Procedure,cumulus__all,0,0,0.00
Patient,,0,0,0.00
Patient,cumulus__all,0,0,0.00
Observation,cumulus__all,3,3,100.00
Observation,Vital Signs,0,0,0.00
Observation,Smoking Status,0,0,0.00
Observation,Laboratory,0,0,0.00
Observation,Smoking Status,2,2,100.00
Observation,Laboratory,2,2,100.00
MedicationRequest,,0,0,0.00
MedicationRequest,cumulus__all,0,0,0.00
Medication,,0,0,0.00
Medication,cumulus__all,0,0,0.00
Immunization,,0,0,0.00
Immunization,cumulus__all,0,0,0.00
Encounter,,2,3,66.67
Encounter,cumulus__all,2,3,66.67
DocumentReference,,0,0,0.00
DocumentReference,cumulus__all,0,0,0.00
DiagnosticReport,cumulus__all,0,0,0.00
DiagnosticReport,Note,0,0,0.00
DiagnosticReport,Lab,0,0,0.00
Condition,,0,0,0.00
Condition,cumulus__all,0,0,0.00
AllergyIntolerance,,0,0,0.00
AllergyIntolerance,cumulus__all,0,0,0.00
3 changes: 3 additions & 0 deletions tests/data/q_valid_us_core_v4/general/observation/0.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"resourceType": "Observation", "id": "lab-nothing", "category": [{"coding": [{"code": "laboratory", "system": "http://terminology.hl7.org/CodeSystem/observation-category"}]}]}
{"resourceType": "Observation", "id": "smoking-nothing", "code": {"coding": [{"system": "http://loinc.org", "code": "72166-2"}]}}
{"resourceType": "Observation", "id": "both-nothing", "code": {"coding": [{"system": "http://loinc.org", "code": "72166-2"}]}, "category": [{"coding": [{"code": "laboratory", "system": "http://terminology.hl7.org/CodeSystem/observation-category"}]}]}

0 comments on commit 5cd0e5a

Please sign in to comment.