diff --git a/docker/vm_boot_images/config/tensorflow-requirements.txt b/docker/vm_boot_images/config/tensorflow-requirements.txt index 7adb3aef0..3fcfd6a80 100644 --- a/docker/vm_boot_images/config/tensorflow-requirements.txt +++ b/docker/vm_boot_images/config/tensorflow-requirements.txt @@ -16,8 +16,8 @@ bokeh Pillow==7.0.0 notebook pytest -tensorflow-addons -tensorflow_probability +tensorflow-addons==0.9.1 +tensorflow_probability==0.9.0 numcodecs beautifulsoup4 lxml diff --git a/ml4cvd/arguments.py b/ml4cvd/arguments.py index aac3c88d6..dd0d08d23 100644 --- a/ml4cvd/arguments.py +++ b/ml4cvd/arguments.py @@ -26,7 +26,7 @@ from ml4cvd.models import parent_sort, BottleneckType, check_no_bottleneck from ml4cvd.tensor_maps_by_hand import TMAPS from ml4cvd.defines import IMPUTATION_RANDOM, IMPUTATION_MEAN -from ml4cvd.tensor_maps_partners_ecg import build_partners_tensor_maps, build_cardiac_surgery_tensor_maps +from ml4cvd.tensor_maps_partners_ecg import build_partners_tensor_maps, build_cardiac_surgery_tensor_maps, build_partners_time_series_tensor_maps from ml4cvd.tensor_map_maker import generate_continuous_tensor_map_from_file @@ -329,15 +329,17 @@ def _get_tmap(name: str, needed_tensor_maps: List[str]) -> TensorMap: if name in TMAPS: return TMAPS[name] + TMAPS.update(build_partners_time_series_tensor_maps(needed_tensor_maps)) + if name in TMAPS: + return TMAPS[name] + from ml4cvd.tensor_maps_partners_ecg import TMAPS as partners_tmaps TMAPS.update(partners_tmaps) - if name in TMAPS: return TMAPS[name] from ml4cvd.tensor_maps_partners_ecg_labels import TMAPS as partners_label_tmaps TMAPS.update(partners_label_tmaps) - if name in TMAPS: return TMAPS[name] diff --git a/ml4cvd/explorations.py b/ml4cvd/explorations.py index 239a318da..a9e2b443c 100644 --- a/ml4cvd/explorations.py +++ b/ml4cvd/explorations.py @@ -674,7 +674,7 @@ def _hd5_to_disk(tmaps, path, gen_name, tot, output_folder, id): with count.get_lock(): i = count.value if i % 500 == 0: - logging.info(f"{gen_name} - Parsing {i}/{tot} ({i/tot*100:.1f}%) done") + logging.info(f"Parsing {i}/{tot} ({i/tot*100:.1f}%) done") count.value += 1 # each worker should write to it's own file @@ -750,7 +750,7 @@ def _tensors_to_df(args): tmaps = [tm for tm in args.tensor_maps_in] global count # TODO figure out how to not use global count = multiprocess.Value('l', 1) - paths = [(path, gen.name) for gen in generators for worker_paths in gen.path_iters for path in worker_paths.paths] + paths = [(path, gen.name.replace('_worker', '')) for gen in generators for worker_paths in gen.path_iters for path in worker_paths.paths] num_hd5 = len(paths) chunksize = num_hd5 // args.num_workers with multiprocess.Pool(processes=args.num_workers) as pool: @@ -781,9 +781,6 @@ def _tensors_to_df(args): logging.debug(f'Appended {fpath} to overall dataframe') temp_files.append(fpath) - # Remove "_worker" from "generator" values - df["generator"].replace("_worker", "", regex=True, inplace=True) - logging.info(f"Extracted {len(tmaps)} tmaps from {len(df)} tensors across {num_hd5} hd5 files into DataFrame") # remove temporary files @@ -958,7 +955,7 @@ def explore(args): df_stats = df_stats.round(2) df_stats.to_csv(fpath) logging.info(f"Saved summary stats of {Interpretation.LANGUAGE} tmaps to {fpath}") - + if args.plot_hist == "True": for tm in args.tensor_maps_in: if tm.interpretation == Interpretation.CONTINUOUS: diff --git a/ml4cvd/models.py b/ml4cvd/models.py index 3945f882a..2f293b3ba 100755 --- a/ml4cvd/models.py +++ b/ml4cvd/models.py @@ -75,14 +75,13 @@ def make_shallow_model( my_metrics = {} loss_weights = [] input_tensors = [Input(shape=tm.shape, name=tm.input_name()) for tm in tensor_maps_in] - if len(input_tensors) > 1: - logging.warning('multi input tensors not fully supported') - for it in input_tensors: - for ot in tensor_maps_out: - losses.append(ot.loss) - loss_weights.append(ot.loss_weight) - my_metrics[ot.output_name()] = ot.metrics - outputs.append(Dense(units=len(ot.channel_map), activation=ot.activation, name=ot.output_name())(it)) + + it = concatenate(input_tensors) if len(input_tensors) > 1 else input_tensors[0] + for ot in tensor_maps_out: + losses.append(ot.loss) + loss_weights.append(ot.loss_weight) + my_metrics[ot.output_name()] = ot.metrics + outputs.append(Dense(units=len(ot.channel_map), activation=ot.activation, name=ot.output_name())(it)) opt = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08) m = Model(inputs=input_tensors, outputs=outputs) diff --git a/ml4cvd/plots.py b/ml4cvd/plots.py index 469b2d13c..6c28ad035 100755 --- a/ml4cvd/plots.py +++ b/ml4cvd/plots.py @@ -704,8 +704,8 @@ def _partners_top_panel(data, ax0): ax0.text(0.55, 0.9, f"{data['sitename']}", weight='bold') ax0.text(0.0, 0.75, f"{dob} ({age} yr)", weight='bold') # TODO age units - gender = {value: key for key, value in data['gender'].items()} - ax0.text(0.0, 0.67, f"{gender[1]}".title(), weight='bold') + sex = {value: key for key, value in data['sex'].items()} + ax0.text(0.0, 0.67, f"{sex[1]}".title(), weight='bold') ax0.text(0.0, 0.51, f"Room: ", weight='bold') # TODO room? ax0.text(0.0, 0.43, f"Loc: {data['location']}", weight='bold') @@ -988,7 +988,7 @@ def _partners_clinical(data, args): def plot_partners_ecgs(args): plot_tensors = [ 'partners_ecg_patientid', 'partners_ecg_firstname', 'partners_ecg_lastname', - 'partners_ecg_gender', 'partners_ecg_dob', 'partners_ecg_age', + 'partners_ecg_sex', 'partners_ecg_dob', 'partners_ecg_age', 'partners_ecg_datetime', 'partners_ecg_sitename', 'partners_ecg_location', 'partners_ecg_read_md_raw', 'partners_ecg_taxis_md', 'partners_ecg_rate_md', 'partners_ecg_pr_md', 'partners_ecg_qrs_md', 'partners_ecg_qt_md', diff --git a/ml4cvd/recipes.py b/ml4cvd/recipes.py index 5b408dcef..9cce12dfd 100755 --- a/ml4cvd/recipes.py +++ b/ml4cvd/recipes.py @@ -90,8 +90,6 @@ def run(args): train_siamese_model(args) elif 'write_tensor_maps' == args.mode: write_tensor_maps(args) - elif 'sort_csv' == args.mode: - sort_csv(args.tensors, args.tensor_maps_in) elif 'append_continuous_csv' == args.mode: append_fields_from_csv(args.tensors, args.app_csv, 'continuous', ',') elif 'append_categorical_csv' == args.mode: diff --git a/ml4cvd/tensor_generators.py b/ml4cvd/tensor_generators.py index 1a38b1cc4..adaca18f6 100755 --- a/ml4cvd/tensor_generators.py +++ b/ml4cvd/tensor_generators.py @@ -552,15 +552,15 @@ def _sample_csv_to_set(sample_csv: Optional[str] = None) -> Union[None, Set[str] # If no matches, assume the first column is MRN if not matches: mrn_col_name = df.columns[0] + else: + # Get first string from set of matches to use as column name + mrn_col_name = next(iter(matches)) - elif len(matches) > 1: + if len(matches) > 1: logging.warning( f"{sample_csv} has more than one potential column for MRNs. Inferring most likely column name, but recommend explicitly setting MRN column name.", ) - # Get one string from the set of matches; this is the column name - mrn_col_name = next(iter(matches)) - # Isolate this column from the dataframe, and cast to strings sample_ids = df[mrn_col_name].apply(str) diff --git a/ml4cvd/tensor_maps_partners_ecg.py b/ml4cvd/tensor_maps_partners_ecg.py index 1a6802630..31e467a96 100644 --- a/ml4cvd/tensor_maps_partners_ecg.py +++ b/ml4cvd/tensor_maps_partners_ecg.py @@ -5,25 +5,36 @@ import logging import datetime import numpy as np +import pandas as pd +from itertools import product from collections import defaultdict from typing import Callable, Dict, List, Tuple, Union from ml4cvd.tensor_maps_by_hand import TMAPS -from ml4cvd.defines import ECG_REST_AMP_LEADS, PARTNERS_DATE_FORMAT, STOP_CHAR, PARTNERS_CHAR_2_IDX, PARTNERS_DATETIME_FORMAT, TENSOR_EXT, CARDIAC_SURGERY_DATE_FORMAT +from ml4cvd.defines import ECG_REST_AMP_LEADS, PARTNERS_DATE_FORMAT, STOP_CHAR, PARTNERS_CHAR_2_IDX, PARTNERS_DATETIME_FORMAT, CARDIAC_SURGERY_DATE_FORMAT from ml4cvd.TensorMap import TensorMap, str2date, Interpretation, make_range_validator, decompress_data, TimeSeriesOrder -from ml4cvd.normalizer import Standardize +from ml4cvd.normalizer import Standardize, ZeroMeanStd1 YEAR_DAYS = 365.26 INCIDENCE_CSV = '/media/erisone_snf13/lc_outcomes.csv' -CARDIAC_SURGERY_OUTCOMES_CSV = '/data/sts-data/mgh-all-features-labels.csv' +CARDIAC_SURGERY_OUTCOMES_CSV = '/data/sts-data/mgh-preop-ecg-outcome-labels.csv' PARTNERS_PREFIX = 'partners_ecg_rest' +def _hd5_filename_to_mrn_int(filename: str) -> int: + return int(os.path.basename(filename).split('.')[0]) + + def _get_ecg_dates(tm, hd5): + if not hasattr(_get_ecg_dates, 'mrn_lookup'): + _get_ecg_dates.mrn_lookup = dict() + mrn = _hd5_filename_to_mrn_int(hd5.filename) + if mrn in _get_ecg_dates.mrn_lookup: + return _get_ecg_dates.mrn_lookup[mrn] + dates = list(hd5[tm.path_prefix]) if tm.time_series_lookup is not None: - mrn = int(os.path.basename(hd5.filename).split(TENSOR_EXT)[0]) start, end = tm.time_series_lookup[mrn] dates = [date for date in dates if start <= date <= end] if tm.time_series_order == TimeSeriesOrder.NEWEST: @@ -37,9 +48,25 @@ def _get_ecg_dates(tm, hd5): start_idx = tm.time_series_limit if tm.time_series_limit is not None else 1 dates = dates[-start_idx:] # If num_tensors is 0, get all tensors dates.sort(reverse=True) + _get_ecg_dates.mrn_lookup[mrn] = dates return dates +def validator_no_empty(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): + if any(tensor == ''): + raise ValueError(f'TensorMap {tm.name} failed empty string check.') + + +def validator_no_negative(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): + if any(tensor < 0): + raise ValueError(f'TensorMap {tm.name} failed non-negative check') + + +def validator_not_all_zero(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): + if np.count_nonzero(tensor) == 0: + raise ValueError(f'TensorMap {tm.name} failed all-zero check') + + def _is_dynamic_shape(tm: TensorMap, num_ecgs: int) -> Tuple[bool, Tuple[int, ...]]: if tm.shape[0] is None: return True, (num_ecgs,) + tm.shape[1:] @@ -80,58 +107,84 @@ def _resample_voltage_with_rate(voltage, desired_samples, rate, desired_rate): raise ValueError(f'Voltage length {len(voltage)} is not desired {desired_samples} with desired rate {desired_rate} and rate {rate}.') -def make_voltage(population_normalize: float = None): +def make_voltage(exact_length = False): def get_voltage_from_file(tm, hd5, dependents={}): ecg_dates = _get_ecg_dates(tm, hd5) dynamic, shape = _is_dynamic_shape(tm, len(ecg_dates)) + voltage_length = shape[1] if dynamic else shape[0] tensor = np.zeros(shape, dtype=np.float32) for i, ecg_date in enumerate(ecg_dates): for cm in tm.channel_map: try: path = _make_hd5_path(tm, ecg_date, cm) voltage = decompress_data(data_compressed=hd5[path][()], dtype=hd5[path].attrs['dtype']) - voltage = _resample_voltage(voltage, shape[1] if dynamic else shape[0]) + if exact_length: + assert len(voltage) == voltage_length + voltage = _resample_voltage(voltage, voltage_length) slices = (i, ..., tm.channel_map[cm]) if dynamic else (..., tm.channel_map[cm]) tensor[slices] = voltage - except KeyError: - logging.warning(f'KeyError for channel {cm} in {tm.name}') - if population_normalize is not None: - tensor /= population_normalize + except (KeyError, AssertionError, ValueError): + logging.debug(f'Could not get voltage for lead {cm} with {voltage_length} samples in {hd5.filename}') return tensor return get_voltage_from_file +# Creates 12 TMaps: +# partners_ecg_2500 partners_ecg_2500_exact partners_ecg_5000 partners_ecg_5000_exact +# partners_ecg_2500_std partners_ecg_2500_std_exact partners_ecg_5000_std partners_ecg_5000_std_exact +# partners_ecg_2500_raw partners_ecg_2500_raw_exact partners_ecg_5000_raw partners_ecg_5000_raw_exact +# +# default normalizes with ZeroMeanStd1 and resamples +# _std normalizes with Standardize mean = 0, std = 2000 +# _raw does not normalize +# _exact does not resample +length_options = [2500, 5000] +exact_options = [True, False] +normalize_options = [ZeroMeanStd1(), Standardize(mean=0, std=2000), None] +for length, exact_length, normalization in product(length_options, exact_options, normalize_options): + norm = '' if isinstance(normalization, ZeroMeanStd1) else '_std' if isinstance(normalization, Standardize) else '_raw' + exact = '_exact' if exact_length else '' + name = f'partners_ecg_{length}{norm}{exact}' + TMAPS[name] = TensorMap( + name, + shape=(None, length, 12), + path_prefix=PARTNERS_PREFIX, + tensor_from_file=make_voltage(exact_length), + normalization=normalization, + channel_map=ECG_REST_AMP_LEADS, + time_series_limit=0, + validator=validator_not_all_zero, + ) + -TMAPS['partners_ecg_voltage'] = TensorMap( - 'partners_ecg_voltage', - shape=(None, 2500, 12), - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_voltage(population_normalize=2000.0), - channel_map=ECG_REST_AMP_LEADS, - time_series_limit=0, -) +def voltage_stat(tm, hd5, dependents={}): + ecg_dates = _get_ecg_dates(tm, hd5) + dynamic, shape = _is_dynamic_shape(tm, len(ecg_dates)) + tensor = np.zeros(shape, dtype=np.float32) + for i, ecg_date in enumerate(ecg_dates): + try: + slices = lambda stat: (i, tm.channel_map[stat]) if dynamic else (tm.channel_map[stat],) + path = lambda lead: _make_hd5_path(tm, ecg_date, lead) + voltages = np.array([decompress_data(data_compressed=hd5[path(lead)][()], dtype='int16') for lead in ECG_REST_AMP_LEADS]) + tensor[slices('mean')] = np.mean(voltages) + tensor[slices('std')] = np.std(voltages) + tensor[slices('min')] = np.min(voltages) + tensor[slices('max')] = np.max(voltages) + tensor[slices('median')] = np.median(voltages) + except KeyError: + logging.warning(f'Could not get voltage stats for ECG at {hd5.filename}') + return tensor -TMAPS['partners_ecg_voltage_newest'] = TensorMap( - 'partners_ecg_voltage_newest', - shape=(2500, 12), - interpretation=Interpretation.CONTINUOUS, +TMAPS['partners_ecg_voltage_stats'] = TensorMap( + 'partners_ecg_voltage_stats', + shape=(None, 5), path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_voltage(population_normalize=2000.0), - channel_map=ECG_REST_AMP_LEADS, + tensor_from_file=voltage_stat, + channel_map={'mean': 0, 'std': 1, 'min': 2, 'max': 3, 'median': 4}, + time_series_limit=0, ) -TMAPS['partners_ecg_2500'] = TensorMap('ecg_rest_2500', shape=(None, 2500, 12), path_prefix=PARTNERS_PREFIX, tensor_from_file=make_voltage(), normalization={'zero_mean_std1': True}, channel_map=ECG_REST_AMP_LEADS, time_series_limit=0) -TMAPS['partners_ecg_5000'] = TensorMap('ecg_rest_5000', shape=(None, 5000, 12), path_prefix=PARTNERS_PREFIX, tensor_from_file=make_voltage(), normalization={'zero_mean_std1': True}, channel_map=ECG_REST_AMP_LEADS, time_series_limit=0) -TMAPS['partners_ecg_2500_raw'] = TensorMap('ecg_rest_2500_raw', shape=(None, 2500, 12), path_prefix=PARTNERS_PREFIX, tensor_from_file=make_voltage(population_normalize=2000.0), channel_map=ECG_REST_AMP_LEADS, time_series_limit=0) -TMAPS['partners_ecg_5000_raw'] = TensorMap('ecg_rest_5000_raw', shape=(None, 5000, 12), path_prefix=PARTNERS_PREFIX, tensor_from_file=make_voltage(population_normalize=2000.0), channel_map=ECG_REST_AMP_LEADS, time_series_limit=0) -TMAPS['partners_ecg_2500_newest'] = TensorMap('ecg_rest_2500_newest', shape=(2500, 12), path_prefix=PARTNERS_PREFIX, tensor_from_file=make_voltage(), normalization={'zero_mean_std1': True}, channel_map=ECG_REST_AMP_LEADS) -TMAPS['partners_ecg_5000_newest'] = TensorMap('ecg_rest_5000_newest', shape=(5000, 12), path_prefix=PARTNERS_PREFIX, tensor_from_file=make_voltage(), normalization={'zero_mean_std1': True}, channel_map=ECG_REST_AMP_LEADS) -TMAPS['partners_ecg_2500_raw_newest'] = TensorMap('ecg_rest_2500_raw_newest', shape=(2500, 12), path_prefix=PARTNERS_PREFIX, tensor_from_file=make_voltage(population_normalize=2000.0), channel_map=ECG_REST_AMP_LEADS) -TMAPS['partners_ecg_5000_raw_newest'] = TensorMap('ecg_rest_5000_raw_newest', shape=(5000, 12), path_prefix=PARTNERS_PREFIX, tensor_from_file=make_voltage(population_normalize=2000.0), channel_map=ECG_REST_AMP_LEADS) - - def make_voltage_attr(volt_attr: str = ""): def get_voltage_attr_from_file(tm, hd5, dependents={}): ecg_dates = _get_ecg_dates(tm, hd5) @@ -160,22 +213,6 @@ def get_voltage_attr_from_file(tm, hd5, dependents={}): ) -TMAPS["voltage_len_newest"] = TensorMap( - "voltage_len_newest", - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_voltage_attr(volt_attr="len"), - shape=(12,), - channel_map=ECG_REST_AMP_LEADS, -) - - -TMAPS["len_i"] = TensorMap("len_i", shape=(None, 1), path_prefix=PARTNERS_PREFIX, tensor_from_file=make_voltage_attr(volt_attr="len"), channel_map={'I': 0}, time_series_limit=0) -TMAPS["len_v6"] = TensorMap("len_v6", shape=(None, 1), path_prefix=PARTNERS_PREFIX, tensor_from_file=make_voltage_attr(volt_attr="len"), channel_map={'V6': 0}, time_series_limit=0) -TMAPS["len_i_newest"] = TensorMap("len_i_newest", shape=(1,), path_prefix=PARTNERS_PREFIX, tensor_from_file=make_voltage_attr(volt_attr="len"), channel_map={'I': 0}) -TMAPS["len_v6_newest"] = TensorMap("len_v6_newest", shape=(1,), path_prefix=PARTNERS_PREFIX, tensor_from_file=make_voltage_attr(volt_attr="len"), channel_map={'V6': 0}) - - def make_partners_ecg_label(keys: Union[str, List[str]] = "read_md_clean", dict_of_list: Dict = dict(), not_found_key: str = "unspecified"): if type(keys) == str: keys = [keys] @@ -212,21 +249,6 @@ def get_partners_ecg_label(tm, hd5, dependents={}): return get_partners_ecg_label -def validator_no_empty(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): - if any(tensor == ''): - raise ValueError(f'TensorMap {tm.name} failed empty string check.') - - -def validator_no_negative(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): - if any(tensor < 0): - raise ValueError(f'TensorMap {tm.name} failed non-negative check') - - -def validator_not_all_zero(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): - if not any(tensor != 0): - raise ValueError(f'TensorMap {tm.name} failed all-zero check') - - def partners_ecg_datetime(tm, hd5, dependents={}): ecg_dates = _get_ecg_dates(tm, hd5) dynamic, shape = _is_dynamic_shape(tm, len(ecg_dates)) @@ -248,18 +270,7 @@ def partners_ecg_datetime(tm, hd5, dependents={}): ) -task = "partners_ecg_datetime_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.LANGUAGE, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=partners_ecg_datetime, - shape=(1,), - validator=validator_no_empty, -) - - -def make_voltage_len_categorical_tmap(lead, cm_prefix = '_', cm_unknown = 'other'): +def make_voltage_len_categorical_tmap(lead, channel_prefix = '_', channel_unknown = 'other'): def _tensor_from_file(tm, hd5, dependents = {}): ecg_dates = _get_ecg_dates(tm, hd5) dynamic, shape = _is_dynamic_shape(tm, len(ecg_dates)) @@ -268,7 +279,7 @@ def _tensor_from_file(tm, hd5, dependents = {}): path = _make_hd5_path(tm, ecg_date, lead) try: lead_len = hd5[path].attrs['len'] - lead_len = f'{cm_prefix}{lead_len}' + lead_len = f'{channel_prefix}{lead_len}' matched = False for cm in tm.channel_map: if lead_len.lower() == cm.lower(): @@ -277,7 +288,7 @@ def _tensor_from_file(tm, hd5, dependents = {}): matched = True break if not matched: - slices = (i, tm.channel_map[cm_unknown]) if dynamic else (tm.channel_map[cm_unknown],) + slices = (i, tm.channel_map[channel_unknown]) if dynamic else (tm.channel_map[channel_unknown],) tensor[slices] = 1.0 except KeyError: logging.debug(f'Could not get voltage length for lead {lead} from ECG on {ecg_date} in {hd5.filename}') @@ -297,18 +308,8 @@ def _tensor_from_file(tm, hd5, dependents = {}): validator=validator_not_all_zero, ) - tmap_name = f'lead_{lead}_len_newest' - TMAPS[tmap_name] = TensorMap( - tmap_name, - interpretation=Interpretation.CATEGORICAL, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_voltage_len_categorical_tmap(lead=lead), - channel_map={'_2500': 0, '_5000': 1, 'other': 2}, - validator=validator_not_all_zero, - ) - -def make_partners_ecg_tensor(key: str, fill: float = 0, cm_prefix: str = '', cm_unknown: str = 'other'): +def make_partners_ecg_tensor(key: str, fill: float = 0, channel_prefix: str = '', channel_unknown: str = 'other'): def get_partners_ecg_tensor(tm, hd5, dependents={}): ecg_dates = _get_ecg_dates(tm, hd5) dynamic, shape = _is_dynamic_shape(tm, len(ecg_dates)) @@ -327,7 +328,7 @@ def get_partners_ecg_tensor(tm, hd5, dependents={}): data = decompress_data(data_compressed=hd5[path][()], dtype='str') if tm.interpretation == Interpretation.CATEGORICAL: matched = False - data = f'{cm_prefix}{data}' + data = f'{channel_prefix}{data}' for cm in tm.channel_map: if data.lower() == cm.lower(): slices = (i, tm.channel_map[cm]) if dynamic else (tm.channel_map[cm],) @@ -335,7 +336,7 @@ def get_partners_ecg_tensor(tm, hd5, dependents={}): matched = True break if not matched: - slices = (i, tm.channel_map[cm_unknown]) if dynamic else (tm.channel_map[cm_unknown],) + slices = (i, tm.channel_map[channel_unknown]) if dynamic else (tm.channel_map[channel_unknown],) tensor[slices] = 1.0 else: tensor[i] = data @@ -378,17 +379,6 @@ def language_tensor(tm, hd5, dependents={}): ) -task = "partners_ecg_read_md_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.LANGUAGE, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="read_md_clean"), - shape=(1,), - validator=validator_no_empty, -) - - task = "partners_ecg_read_pc" TMAPS[task] = TensorMap( task, @@ -405,49 +395,6 @@ def language_tensor(tm, hd5, dependents={}): ) -task = "partners_ecg_read_pc_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.LANGUAGE, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="read_pc_clean"), - shape=(1,), - validator=validator_no_empty, -) - - -# TODO do we still need the cross reference tmaps? -def validator_cross_reference(tm: TensorMap, tensor: np.ndarray): - if int(tensor) not in tm.cross_reference: - raise ValueError(f"Skipping TensorMap {tm.name} not found in Apollo.") - - -def create_cross_reference_dict(fpath="/data/apollo/demographics.csv"): - try: - with open(fpath, mode="r") as f: - reader = csv.reader(f) - next(reader) - cross_reference_dict = {int(rows[0]):None for rows in reader} - return cross_reference_dict - except FileNotFoundError: - return {} - - -task = "partners_ecg_patientid_cross_reference_apollo" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.LANGUAGE, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="patientid"), - shape=(None, 1), - time_series_limit=0, - validator=validator_cross_reference, -) - - -TMAPS[task].cross_reference = create_cross_reference_dict() - - task = "partners_ecg_patientid" TMAPS[task] = TensorMap( task, @@ -460,17 +407,6 @@ def create_cross_reference_dict(fpath="/data/apollo/demographics.csv"): ) -task = "partners_ecg_patientid_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.LANGUAGE, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="patientid"), - shape=(1,), - validator=validator_no_empty, -) - - def validator_clean_mrn(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): int(tensor) @@ -487,17 +423,6 @@ def validator_clean_mrn(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): ) -task = "partners_ecg_patientid_clean_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.LANGUAGE, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="patientid_clean"), - shape=(1,), - validator=validator_clean_mrn, -) - - task = "partners_ecg_firstname" TMAPS[task] = TensorMap( task, @@ -513,17 +438,6 @@ def validator_clean_mrn(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): ) -task = "partners_ecg_firstname_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.LANGUAGE, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="patientfirstname"), - shape=(1,), - validator=validator_no_empty, -) - - task = "partners_ecg_lastname" TMAPS[task] = TensorMap( task, @@ -539,26 +453,15 @@ def validator_clean_mrn(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): ) -task = "partners_ecg_lastname_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.LANGUAGE, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="patientlastname"), - shape=(1,), - validator=validator_no_empty, -) - - -task = "partners_ecg_gender" +task = "partners_ecg_sex" TMAPS[task] = TensorMap( task, - interpretation=Interpretation.LANGUAGE, + interpretation=Interpretation.CATEGORICAL, path_prefix=PARTNERS_PREFIX, tensor_from_file=make_partners_ecg_tensor(key="gender"), - shape=(None, 1), + channel_map={'female': 0, 'male': 1}, time_series_limit=0, - validator=validator_no_empty, + validator=validator_not_all_zero, ) task = "partners_ecg_date" @@ -573,17 +476,6 @@ def validator_clean_mrn(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): ) -task = "partners_ecg_date_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.LANGUAGE, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="acquisitiondate"), - shape=(1,), - validator=validator_no_empty, -) - - task = "partners_ecg_time" TMAPS[task] = TensorMap( task, @@ -596,17 +488,6 @@ def validator_clean_mrn(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): ) -task = "partners_ecg_time_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.LANGUAGE, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="acquisitiontime"), - shape=(1,), - validator=validator_no_empty, -) - - task = "partners_ecg_sitename" TMAPS[task] = TensorMap( task, @@ -619,17 +500,6 @@ def validator_clean_mrn(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): ) -task = "partners_ecg_sitename_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.LANGUAGE, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="sitename"), - shape=(1,), - validator=validator_no_empty, -) - - task = "partners_ecg_location" TMAPS[task] = TensorMap( task, @@ -642,17 +512,6 @@ def validator_clean_mrn(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): ) -task = "partners_ecg_location_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.LANGUAGE, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="location"), - shape=(1,), - validator=validator_no_empty, -) - - task = "partners_ecg_dob" TMAPS[task] = TensorMap( task, @@ -665,18 +524,7 @@ def validator_clean_mrn(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): ) -task = "partners_ecg_dob_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.LANGUAGE, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="dateofbirth"), - shape=(1,), - validator=validator_no_empty, -) - - -def make_sampling_frequency_from_file(lead: str = "I", duration: int = 10, cm_prefix: str = "_", cm_unknown: str = "other", fill: int = -1): +def make_sampling_frequency_from_file(lead: str = "I", duration: int = 10, channel_prefix: str = "_", channel_unknown: str = "other", fill: int = -1): def sampling_frequency_from_file(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}): ecg_dates = _get_ecg_dates(tm, hd5) dynamic, shape = _is_dynamic_shape(tm, len(ecg_dates)) @@ -691,7 +539,7 @@ def sampling_frequency_from_file(tm: TensorMap, hd5: h5py.File, dependents: Dict try: if tm.interpretation == Interpretation.CATEGORICAL: matched = False - sampling_frequency = f'{cm_prefix}{sampling_frequency}' + sampling_frequency = f'{channel_prefix}{sampling_frequency}' for cm in tm.channel_map: if sampling_frequency.lower() == cm.lower(): slices = (i, tm.channel_map[cm]) if dynamic else (tm.channel_map[cm],) @@ -699,7 +547,7 @@ def sampling_frequency_from_file(tm: TensorMap, hd5: h5py.File, dependents: Dict matched = True break if not matched: - slices = (i, tm.channel_map[cm_unknown]) if dynamic else (tm.channel_map[cm_unknown],) + slices = (i, tm.channel_map[channel_unknown]) if dynamic else (tm.channel_map[channel_unknown],) tensor[slices] = 1.0 else: tensor[i] = sampling_frequency @@ -721,86 +569,42 @@ def sampling_frequency_from_file(tm: TensorMap, hd5: h5py.File, dependents: Dict ) -task = "partners_ecg_sampling_frequency_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CATEGORICAL, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_sampling_frequency_from_file(), - channel_map={'_250': 0, '_500': 1, 'other': 2}, - validator=validator_not_all_zero, -) - - task = "partners_ecg_sampling_frequency_pc" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CATEGORICAL, path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="ecgsamplebase_pc", cm_prefix='_'), + tensor_from_file=make_partners_ecg_tensor(key="ecgsamplebase_pc", channel_prefix='_'), channel_map={'_0': 0, '_250': 1, '_500': 2, 'other': 3}, time_series_limit=0, validator=validator_not_all_zero, ) -task = "partners_ecg_sampling_frequency_pc_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CATEGORICAL, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="ecgsamplebase_pc", cm_prefix='_'), - channel_map={'_0': 0, '_250': 1, '_500': 2, 'other': 3}, - validator=validator_not_all_zero, -) - - task = "partners_ecg_sampling_frequency_md" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CATEGORICAL, path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="ecgsamplebase_md", cm_prefix='_'), + tensor_from_file=make_partners_ecg_tensor(key="ecgsamplebase_md", channel_prefix='_'), channel_map={'_0': 0, '_250': 1, '_500': 2, 'other': 3}, time_series_limit=0, validator=validator_not_all_zero, ) -task = "partners_ecg_sampling_frequency_md_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CATEGORICAL, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="ecgsamplebase_md", cm_prefix='_'), - channel_map={'_0': 0, '_250': 1, '_500': 2, 'other': 3}, - validator=validator_not_all_zero, -) - - task = "partners_ecg_sampling_frequency_wv" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CATEGORICAL, path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="waveform_samplebase", cm_prefix='_'), + tensor_from_file=make_partners_ecg_tensor(key="waveform_samplebase", channel_prefix='_'), channel_map={'_0': 0, '_240': 1, '_250': 2, '_500': 3, 'other': 4}, time_series_limit=0, validator=validator_not_all_zero, ) -task = "partners_ecg_sampling_frequency_wv_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CATEGORICAL, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="waveform_samplebase", cm_prefix='_'), - channel_map={'_0': 0, '_240': 1, '_250': 2, '_500': 3, 'other': 4}, - validator=validator_not_all_zero, -) - - task = "partners_ecg_sampling_frequency_continuous" TMAPS[task] = TensorMap( task, @@ -813,644 +617,319 @@ def sampling_frequency_from_file(tm: TensorMap, hd5: h5py.File, dependents: Dict ) -task = "partners_ecg_sampling_frequency_continuous_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_sampling_frequency_from_file(), - shape=(1,), - validator=validator_no_negative, -) - - task = "partners_ecg_sampling_frequency_pc_continuous" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="ecgsamplebase_pc", fill=-1), - time_series_limit=0, - shape=(None, 1), - validator=validator_no_negative, -) - - -task = "partners_ecg_sampling_frequency_pc_continuous_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="ecgsamplebase_pc", fill=-1), - shape=(1,), - validator=validator_no_negative, -) - - -task = "partners_ecg_sampling_frequency_md_continuous" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="ecgsamplebase_md", fill=-1), - time_series_limit=0, - shape=(None, 1), - validator=validator_no_negative, -) - - -task = "partners_ecg_sampling_frequency_md_continuous_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="ecgsamplebase_md", fill=-1), - shape=(1,), - validator=validator_no_negative, -) - - -task = "partners_ecg_sampling_frequency_wv_continuous" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="waveform_samplebase", fill=-1), - time_series_limit=0, - shape=(None, 1), - validator=validator_no_negative, -) - - -task = "partners_ecg_sampling_frequency_wv_continuous_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="waveform_samplebase", fill=-1), - shape=(1,), - validator=validator_no_negative, -) - - -task = "partners_ecg_time_resolution" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CATEGORICAL, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="intervalmeasurementtimeresolution", cm_prefix='_'), - channel_map={'_25': 0, '_50': 1, '_100': 2, 'other': 3}, - time_series_limit=0, - validator=validator_not_all_zero, -) - - -task = "partners_ecg_time_resolution_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CATEGORICAL, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="intervalmeasurementtimeresolution", cm_prefix='_'), - channel_map={'_25': 0, '_50': 1, '_100': 2, 'other': 3}, - validator=validator_not_all_zero, -) - - -task = "partners_ecg_amplitude_resolution" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CATEGORICAL, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="intervalmeasurementamplituderesolution", cm_prefix='_'), - channel_map={'_10': 0, '_20': 1, '_40': 2, 'other': 3}, - time_series_limit=0, - validator=validator_not_all_zero, -) - - -task = "partners_ecg_amplitude_resolution_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CATEGORICAL, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="intervalmeasurementamplituderesolution", cm_prefix='_'), - channel_map={'_10': 0, '_20': 1, '_40': 2, 'other': 3}, - validator=validator_not_all_zero, -) - - -task = "partners_ecg_measurement_filter" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CATEGORICAL, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="intervalmeasurementfilter", cm_prefix='_'), - time_series_limit=0, - channel_map={'_None': 0, '_40': 1, '_80': 2, 'other': 3}, - validator=validator_not_all_zero, -) - - -task = "partners_ecg_measurement_filter_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CATEGORICAL, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="intervalmeasurementfilter", cm_prefix='_'), - channel_map={'_None': 0, '_40': 1, '_80': 2, 'other': 3}, - validator=validator_not_all_zero, -) - - -task = "partners_ecg_high_pass_filter" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="waveform_highpassfilter", fill=-1), - time_series_limit=0, - shape=(None, 1), - validator=validator_no_negative, -) - - -task = "partners_ecg_high_pass_filter_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="waveform_highpassfilter", fill=-1), - shape=(1,), - validator=validator_no_negative, -) - - -task = "partners_ecg_low_pass_filter" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="waveform_lowpassfilter", fill=-1), - time_series_limit=0, - shape=(None, 1), - validator=validator_no_negative, -) - - -task = "partners_ecg_low_pass_filter_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="waveform_lowpassfilter", fill=-1), - shape=(1,), - validator=validator_no_negative, -) - - -task = "partners_ecg_ac_filter" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CATEGORICAL, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="waveform_acfilter", cm_prefix='_'), - time_series_limit=0, - channel_map={'_None': 0, '_50': 1, '_60': 2, 'other': 3}, - validator=validator_not_all_zero, -) - - -task = "partners_ecg_ac_filter_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CATEGORICAL, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=make_partners_ecg_tensor(key="waveform_acfilter", cm_prefix='_'), - channel_map={'_None': 0, '_50': 1, '_60': 2, 'other': 3}, - validator=validator_not_all_zero, -) - - -task = "partners_ecg_rate" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="ventricularrate_pc"), - shape=(None, 1), - time_series_limit=0, - validator=make_range_validator(10, 200), -) - - -task = "partners_ecg_rate_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="ventricularrate_pc"), - shape=(1,), - validator=make_range_validator(10, 200), -) - - -task = "partners_ecg_rate_md" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="ventricularrate_md"), - shape=(None, 1), - time_series_limit=0, - validator=make_range_validator(10, 200), -) - - -task = "partners_ecg_rate_md_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="ventricularrate_md"), - shape=(1,), - validator=make_range_validator(10, 200), -) - - -TMAPS['partners_ventricular_rate'] = TensorMap( - 'VentricularRate', - path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="ventricularrate_pc"), - shape=(None, 1), - time_series_limit=0, - validator=make_range_validator(10, 200), - normalization={'mean': 59.3, 'std': 10.6}, -) - - -TMAPS['partners_ventricular_rate_newest'] = TensorMap( - 'VentricularRate_newest', - path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="ventricularrate_pc"), - shape=(1,), - validator=make_range_validator(10, 200), - normalization={'mean': 59.3, 'std': 10.6}, -) - - -task = "partners_ecg_qrs" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="qrsduration_pc"), - shape=(None, 1), - time_series_limit=0, - validator=make_range_validator(20, 400), -) - - -task = "partners_ecg_qrs_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - loss='logcosh', - metrics=['mse'], - tensor_from_file=make_partners_ecg_tensor(key="qrsduration_pc"), - shape=(1,), - validator=make_range_validator(20, 400), -) - - -task = "partners_ecg_qrs_md" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="qrsduration_md"), - shape=(None, 1), - time_series_limit=0, - validator=make_range_validator(20, 400), -) - - -task = "partners_ecg_qrs_md_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="qrsduration_md"), - shape=(1,), - validator=make_range_validator(20, 400), + tensor_from_file=make_partners_ecg_tensor(key="ecgsamplebase_pc", fill=-1), + time_series_limit=0, + shape=(None, 1), + validator=validator_no_negative, ) -task = "partners_ecg_pr" +task = "partners_ecg_sampling_frequency_md_continuous" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="printerval_pc"), - shape=(None, 1), + tensor_from_file=make_partners_ecg_tensor(key="ecgsamplebase_md", fill=-1), time_series_limit=0, - validator=make_range_validator(50, 500), + shape=(None, 1), + validator=validator_no_negative, ) -task = "partners_ecg_pr_newest" +task = "partners_ecg_sampling_frequency_wv_continuous" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="printerval_pc"), - shape=(1,), - validator=make_range_validator(50, 500), + tensor_from_file=make_partners_ecg_tensor(key="waveform_samplebase", fill=-1), + time_series_limit=0, + shape=(None, 1), + validator=validator_no_negative, ) -task = "partners_ecg_pr_md" +task = "partners_ecg_time_resolution" TMAPS[task] = TensorMap( task, - interpretation=Interpretation.CONTINUOUS, + interpretation=Interpretation.CATEGORICAL, path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="printerval_md"), - shape=(None, 1), + tensor_from_file=make_partners_ecg_tensor(key="intervalmeasurementtimeresolution", channel_prefix='_'), + channel_map={'_25': 0, '_50': 1, '_100': 2, 'other': 3}, time_series_limit=0, - validator=make_range_validator(50, 500), + validator=validator_not_all_zero, ) -task = "partners_ecg_pr_md_newest" +task = "partners_ecg_amplitude_resolution" TMAPS[task] = TensorMap( task, - interpretation=Interpretation.CONTINUOUS, + interpretation=Interpretation.CATEGORICAL, path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="printerval_md"), - shape=(1,), - validator=make_range_validator(50, 500), + tensor_from_file=make_partners_ecg_tensor(key="intervalmeasurementamplituderesolution", channel_prefix='_'), + channel_map={'_10': 0, '_20': 1, '_40': 2, 'other': 3}, + time_series_limit=0, + validator=validator_not_all_zero, ) -task = "partners_ecg_qt" +task = "partners_ecg_measurement_filter" TMAPS[task] = TensorMap( task, - interpretation=Interpretation.CONTINUOUS, + interpretation=Interpretation.CATEGORICAL, path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="qtinterval_pc"), - shape=(None, 1), + tensor_from_file=make_partners_ecg_tensor(key="intervalmeasurementfilter", channel_prefix='_'), time_series_limit=0, - validator=make_range_validator(100, 800), + channel_map={'_None': 0, '_40': 1, '_80': 2, 'other': 3}, + validator=validator_not_all_zero, ) -task = "partners_ecg_qt_newest" +task = "partners_ecg_high_pass_filter" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="qtinterval_pc"), - shape=(1,), - validator=make_range_validator(100, 800), + tensor_from_file=make_partners_ecg_tensor(key="waveform_highpassfilter", fill=-1), + time_series_limit=0, + shape=(None, 1), + validator=validator_no_negative, ) -task = "partners_ecg_qt_md" +task = "partners_ecg_low_pass_filter" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="qtinterval_md"), - shape=(None, 1), + tensor_from_file=make_partners_ecg_tensor(key="waveform_lowpassfilter", fill=-1), time_series_limit=0, - validator=make_range_validator(100, 800), + shape=(None, 1), + validator=validator_no_negative, ) -task = "partners_ecg_qt_md_newest" +task = "partners_ecg_ac_filter" TMAPS[task] = TensorMap( task, - interpretation=Interpretation.CONTINUOUS, + interpretation=Interpretation.CATEGORICAL, path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="qtinterval_md"), - shape=(1,), - validator=make_range_validator(100, 800), + tensor_from_file=make_partners_ecg_tensor(key="waveform_acfilter", channel_prefix='_'), + time_series_limit=0, + channel_map={'_None': 0, '_50': 1, '_60': 2, 'other': 3}, + validator=validator_not_all_zero, ) -task = "partners_ecg_qtc" +task = "partners_ecg_rate_pc" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="qtcorrected_pc"), + tensor_from_file=make_partners_ecg_tensor(key="ventricularrate_pc"), shape=(None, 1), + normalization=Standardize(mean=59.3, std=10.6), time_series_limit=0, - validator=make_range_validator(100, 800), + validator=make_range_validator(10, 200), ) -task = "partners_ecg_qtc_newest" +task = "partners_ecg_rate_md" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="qtcorrected_pc"), - shape=(1,), - validator=make_range_validator(100, 800), + tensor_from_file=make_partners_ecg_tensor(key="ventricularrate_md"), + shape=(None, 1), + time_series_limit=0, + validator=make_range_validator(10, 200), ) -task = "partners_ecg_qtc_md" +task = "partners_ecg_qrs_pc" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="qtcorrected_md"), + tensor_from_file=make_partners_ecg_tensor(key="qrsduration_pc"), shape=(None, 1), time_series_limit=0, - validator=make_range_validator(100, 800), + validator=make_range_validator(20, 400), ) -task = "partners_ecg_qtc_md_newest" +task = "partners_ecg_qrs_md" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="qtcorrected_md"), - shape=(1,), - validator=make_range_validator(100, 800), + tensor_from_file=make_partners_ecg_tensor(key="qrsduration_md"), + shape=(None, 1), + time_series_limit=0, + validator=make_range_validator(20, 400), ) -task = "partners_ecg_paxis" +task = "partners_ecg_pr_pc" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="paxis_pc", fill=999), + tensor_from_file=make_partners_ecg_tensor(key="printerval_pc"), shape=(None, 1), time_series_limit=0, - validator=make_range_validator(-180, 180), + validator=make_range_validator(50, 500), ) -task = "partners_ecg_paxis_newest" +task = "partners_ecg_pr_md" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="paxis_pc", fill=999), - shape=(1,), - validator=make_range_validator(-180, 180), + tensor_from_file=make_partners_ecg_tensor(key="printerval_md"), + shape=(None, 1), + time_series_limit=0, + validator=make_range_validator(50, 500), ) -task = "partners_ecg_paxis_md" +task = "partners_ecg_qt_pc" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="paxis_md", fill=999), + tensor_from_file=make_partners_ecg_tensor(key="qtinterval_pc"), shape=(None, 1), time_series_limit=0, - validator=make_range_validator(-180, 180), + validator=make_range_validator(100, 800), ) -task = "partners_ecg_paxis_md_newest" +task = "partners_ecg_qt_md" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="paxis_md", fill=999), - shape=(1,), - validator=make_range_validator(-180, 180), + tensor_from_file=make_partners_ecg_tensor(key="qtinterval_md"), + shape=(None, 1), + time_series_limit=0, + validator=make_range_validator(100, 800), ) -task = "partners_ecg_raxis" +task = "partners_ecg_qtc_pc" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="raxis_pc", fill=999), + tensor_from_file=make_partners_ecg_tensor(key="qtcorrected_pc"), shape=(None, 1), time_series_limit=0, - validator=make_range_validator(-180, 180), + validator=make_range_validator(100, 800), ) -task = "partners_ecg_raxis_newest" +task = "partners_ecg_qtc_md" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="raxis_pc", fill=999), - shape=(1,), - validator=make_range_validator(-180, 180), + tensor_from_file=make_partners_ecg_tensor(key="qtcorrected_md"), + shape=(None, 1), + time_series_limit=0, + validator=make_range_validator(100, 800), ) -task = "partners_ecg_raxis_md" +task = "partners_ecg_paxis_pc" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="raxis_md", fill=999), + tensor_from_file=make_partners_ecg_tensor(key="paxis_pc", fill=999), shape=(None, 1), time_series_limit=0, validator=make_range_validator(-180, 180), ) -task = "partners_ecg_raxis_md_newest" +task = "partners_ecg_paxis_md" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="raxis_md", fill=999), - shape=(1,), + tensor_from_file=make_partners_ecg_tensor(key="paxis_md", fill=999), + shape=(None, 1), + time_series_limit=0, validator=make_range_validator(-180, 180), ) -task = "partners_ecg_taxis" +task = "partners_ecg_raxis_pc" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="taxis_pc", fill=999), + tensor_from_file=make_partners_ecg_tensor(key="raxis_pc", fill=999), shape=(None, 1), time_series_limit=0, validator=make_range_validator(-180, 180), ) -task = "partners_ecg_taxis_newest" +task = "partners_ecg_raxis_md" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="taxis_pc", fill=999), - shape=(1,), + tensor_from_file=make_partners_ecg_tensor(key="raxis_md", fill=999), + shape=(None, 1), + time_series_limit=0, validator=make_range_validator(-180, 180), ) -task = "partners_ecg_taxis_md" +task = "partners_ecg_taxis_pc" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="taxis_md", fill=999), + tensor_from_file=make_partners_ecg_tensor(key="taxis_pc", fill=999), shape=(None, 1), time_series_limit=0, validator=make_range_validator(-180, 180), ) -task = "partners_ecg_taxis_md_newest" +task = "partners_ecg_taxis_md" TMAPS[task] = TensorMap( task, interpretation=Interpretation.CONTINUOUS, path_prefix=PARTNERS_PREFIX, loss='logcosh', tensor_from_file=make_partners_ecg_tensor(key="taxis_md", fill=999), - shape=(1,), + shape=(None, 1), + time_series_limit=0, validator=make_range_validator(-180, 180), ) @@ -1468,18 +947,6 @@ def sampling_frequency_from_file(tm: TensorMap, hd5: h5py.File, dependents: Dict ) -task = "partners_ecg_weight_lbs_newest" -TMAPS[task] = TensorMap( - task, - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - loss='logcosh', - tensor_from_file=make_partners_ecg_tensor(key="weightlbs"), - shape=(1,), - validator=make_range_validator(100, 800), -) - - def partners_ecg_age(tm, hd5, dependents={}): ecg_dates = _get_ecg_dates(tm, hd5) dynamic, shape = _is_dynamic_shape(tm, len(ecg_dates)) @@ -1503,7 +970,6 @@ def partners_ecg_age(tm, hd5, dependents={}): TMAPS['partners_ecg_age'] = TensorMap('partners_ecg_age', path_prefix=PARTNERS_PREFIX, loss='logcosh', tensor_from_file=partners_ecg_age, shape=(None, 1), time_series_limit=0) -TMAPS['partners_ecg_age_newest'] = TensorMap('partners_ecg_age_newest', path_prefix=PARTNERS_PREFIX, loss='logcosh', tensor_from_file=partners_ecg_age, shape=(1,)) def partners_ecg_acquisition_year(tm, hd5, dependents={}): @@ -1521,7 +987,6 @@ def partners_ecg_acquisition_year(tm, hd5, dependents={}): TMAPS['partners_ecg_acquisition_year'] = TensorMap('partners_ecg_acquisition_year', path_prefix=PARTNERS_PREFIX, loss='logcosh', tensor_from_file=partners_ecg_acquisition_year, shape=(None, 1), time_series_limit=0) -TMAPS['partners_ecg_acquisition_year_newest'] = TensorMap('partners_ecg_acquisition_year_newest', path_prefix=PARTNERS_PREFIX, loss='logcosh', tensor_from_file=partners_ecg_acquisition_year, shape=(1,)) def partners_bmi(tm, hd5, dependents={}): @@ -1544,7 +1009,6 @@ def partners_bmi(tm, hd5, dependents={}): TMAPS['partners_ecg_bmi'] = TensorMap('partners_ecg_bmi', path_prefix=PARTNERS_PREFIX, channel_map={'bmi': 0}, tensor_from_file=partners_bmi, time_series_limit=0) -TMAPS['partners_ecg_bmi_newest'] = TensorMap('partners_ecg_bmi_newest', path_prefix=PARTNERS_PREFIX, channel_map={'bmi': 0}, tensor_from_file=partners_bmi) def partners_channel_string(hd5_key, race_synonyms={}, unspecified_key=None): @@ -1589,12 +1053,6 @@ def tensor_from_string(tm, hd5, dependents={}): ) -TMAPS['partners_ecg_race_newest'] = TensorMap( - 'partners_ecg_race_newest', interpretation=Interpretation.CATEGORICAL, path_prefix=PARTNERS_PREFIX, channel_map={'asian': 0, 'black': 1, 'hispanic': 2, 'white': 3, 'unknown': 4}, - tensor_from_file=partners_channel_string('race', race_synonyms), -) - - def _partners_adult(hd5_key, minimum_age=18): def tensor_from_string(tm, hd5, dependents={}): ecg_dates = _get_ecg_dates(tm, hd5) @@ -1623,18 +1081,12 @@ def tensor_from_string(tm, hd5, dependents={}): return tensor_from_string -TMAPS['partners_adult_gender'] = TensorMap( - 'adult_gender', interpretation=Interpretation.CATEGORICAL, path_prefix=PARTNERS_PREFIX, channel_map={'female': 0, 'male': 1}, +TMAPS['partners_adult_sex'] = TensorMap( + 'adult_sex', interpretation=Interpretation.CATEGORICAL, path_prefix=PARTNERS_PREFIX, channel_map={'female': 0, 'male': 1}, tensor_from_file=_partners_adult('gender'), time_series_limit=0, ) -TMAPS['partners_adult_gender_newest'] = TensorMap( - 'adult_gender_newest', interpretation=Interpretation.CATEGORICAL, path_prefix=PARTNERS_PREFIX, channel_map={'female': 0, 'male': 1}, - tensor_from_file=_partners_adult('gender'), -) - - def voltage_zeros(tm, hd5, dependents={}): ecg_dates = _get_ecg_dates(tm, hd5) dynamic, shape = _is_dynamic_shape(tm, len(ecg_dates)) @@ -1658,20 +1110,6 @@ def voltage_zeros(tm, hd5, dependents={}): time_series_limit=0, ) -TMAPS["voltage_zeros_newest"] = TensorMap( - "voltage_zeros_newest", - interpretation=Interpretation.CONTINUOUS, - path_prefix=PARTNERS_PREFIX, - tensor_from_file=voltage_zeros, - shape=(12,), - channel_map=ECG_REST_AMP_LEADS, -) - -TMAPS["lead_i_zeros"] = TensorMap("lead_i_zeros", shape=(None, 1), path_prefix=PARTNERS_PREFIX, tensor_from_file=voltage_zeros, channel_map={'I': 0}, time_series_limit=0) -TMAPS["lead_v6_zeros"] = TensorMap("lead_v6_zeros", shape=(None, 1), path_prefix=PARTNERS_PREFIX, tensor_from_file=voltage_zeros, channel_map={'V6': 0}, time_series_limit=0) -TMAPS["lead_i_zeros_newest"] = TensorMap("lead_i_zeros_newest", shape=(1,), path_prefix=PARTNERS_PREFIX, tensor_from_file=voltage_zeros, channel_map={'I': 0}) -TMAPS["lead_v6_zeros_newest"] = TensorMap("lead_v6_zeros_newest", shape=(1,), path_prefix=PARTNERS_PREFIX, tensor_from_file=voltage_zeros, channel_map={'V6': 0}) - def v6_zeros_validator(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): voltage = decompress_data(data_compressed=hd5['V6'][()], dtype=hd5['V6'].attrs['dtype']) @@ -1679,8 +1117,41 @@ def v6_zeros_validator(tm: TensorMap, tensor: np.ndarray, hd5: h5py.File): raise ValueError(f'TensorMap {tm.name} has too many zeros in V6.') +def build_partners_time_series_tensor_maps( + needed_tensor_maps: List[str], + time_series_limit: int = 1, +) -> Dict[str, TensorMap]: + name2tensormap: Dict[str:TensorMap] = {} + + for needed_name in needed_tensor_maps: + if needed_name.endswith('_newest'): + base_split = '_newest' + time_series_order = TimeSeriesOrder.NEWEST + elif needed_name.endswith('_oldest'): + base_split = '_oldest' + time_series_order = TimeSeriesOrder.OLDEST + elif needed_name.endswith('_random'): + base_split = '_random' + time_series_order = TimeSeriesOrder.RANDOM + else: + continue + + base_name = needed_name.split(base_split)[0] + if base_name not in TMAPS: + continue + + time_tmap = copy.deepcopy(TMAPS[base_name]) + time_tmap.name = needed_name + time_tmap.shape = time_tmap.shape[1:] + time_tmap.time_series_limit = time_series_limit + time_tmap.time_series_order = time_series_order + + name2tensormap[needed_name] = time_tmap + return name2tensormap + + # Date formatting -def _partners_str2date(d) -> datetime.datetime: +def _partners_str2date(d) -> datetime.date: return datetime.datetime.strptime(d, PARTNERS_DATE_FORMAT).date() @@ -1692,10 +1163,6 @@ def _cardiac_surgery_str2date(input_date: str, date_format: str = CARDIAC_SURGER return datetime.datetime.strptime(input_date, date_format) -def _hd5_filename_to_mrn_int(filename: str) -> int: - return int(os.path.basename(filename).split('.')[0]) - - def build_incidence_tensor_from_file( file_name: str, patient_column: str = 'Mrn', birth_column: str = 'birth_date', diagnosis_column: str = 'first_stroke', start_column: str = 'start_fu', @@ -2021,92 +1488,58 @@ def build_partners_tensor_maps(needed_tensor_maps: List[str]) -> Dict[str, Tenso return name2tensormap -def _dates_with_voltage_len(ecg_dates, voltage_len, tm, hd5, voltage_key = list(ECG_REST_AMP_LEADS.keys())[0]): - path = lambda ecg_date: _make_hd5_path(tm, ecg_date, voltage_key) - return [ecg_date for ecg_date in ecg_dates if hd5[path(ecg_date)].attrs['len'] == voltage_len] +def build_cardiac_surgery_dict( + filename: str = CARDIAC_SURGERY_OUTCOMES_CSV, + patient_column: str = 'medrecn', + date_column: str = 'surgdt', + additional_columns: List[str] = [], +) -> Dict[int, Dict[str, Union[int, str]]]: + keys = [date_column] + additional_columns + df = pd.read_csv( + filename, + low_memory=False, + usecols=[patient_column]+keys, + ).sort_values(by=[patient_column, date_column]) + # sort dataframe such that newest surgery per patient appears later and is used in lookup table + cardiac_surgery_dict = {} + for row in df.itertuples(): + patient_key = getattr(row, patient_column) + cardiac_surgery_dict[patient_key] = {key: getattr(row, key) for key in keys} + return cardiac_surgery_dict -def _date_in_window_from_dates(ecg_dates, surgery_date, day_window): - ecg_dates.sort(reverse=True) - for ecg_date in ecg_dates: - ecg_datetime = datetime.datetime.strptime(ecg_date, PARTNERS_DATETIME_FORMAT) - if datetime.timedelta(days=0) <= surgery_date - ecg_datetime <= datetime.timedelta(days=day_window): - return ecg_date - raise ValueError(f'No ECG in time window') +def build_date_interval_lookup( + cardiac_surgery_dict: Dict[int, Dict[str, Union[int, str]]], + start_column: str = 'surgdt', + start_offset: int = -30, + end_column: str = 'surgdt', + end_offset: int = 0, +) -> Dict[int, Tuple[str, str]]: + date_interval_lookup = {} + for mrn in cardiac_surgery_dict: + start_date = (_cardiac_surgery_str2date(cardiac_surgery_dict[mrn][start_column], PARTNERS_DATETIME_FORMAT.replace('T', ' ')) + datetime.timedelta(days=start_offset)).strftime(PARTNERS_DATETIME_FORMAT) + end_date = (_cardiac_surgery_str2date(cardiac_surgery_dict[mrn][end_column], PARTNERS_DATETIME_FORMAT.replace('T', ' ')) + datetime.timedelta(days=end_offset)).strftime(PARTNERS_DATETIME_FORMAT) + date_interval_lookup[mrn] = (start_date, end_date) + return date_interval_lookup -def build_cardiac_surgery_outcome_tensor_from_file( - file_name: str, - outcome2column: Dict[str, str], - patient_column: str = "medrecn", - start_column: str = "surgdt", - delimiter: str = ",", - day_window: int = 30, - require_exact_length: bool = False, +def make_cardiac_surgery_outcome_tensor_from_file( + cardiac_surgery_dict: Dict[int, Dict[str, Union[int, str]]], + outcome_column: str, ) -> Callable: - """Build a tensor_from_file function for outcomes given CSV of patients. - - The tensor_from_file function returned here should be used - with CATEGORICAL TensorMaps to classify patients by disease state. - - :param file_name: CSV or TSV file with header of patient IDs (MRNs) dates of enrollment and dates of diagnosis - :param patient_column: The header name of the column of patient ids - :param outcome2column: Dictionary mapping outcome names to the header name of the column with outcome status - :param start_column: The header name of the column of surgery dates - :param delimiter: The delimiter separating columns of the TSV or CSV - :return: The tensor_from_file function to provide to TensorMap constructors - """ - error = None - try: - with open(file_name, "r", encoding="utf-8") as f: - reader = csv.reader(f, delimiter=delimiter) - header = next(reader) - patient_index = header.index(patient_column) - date_index = header.index(start_column) - surgery_date_table = {} - outcome_table = defaultdict(dict) - for row in reader: - try: - patient_key = int(row[patient_index]) - surgery_date_table[patient_key] = _cardiac_surgery_str2date(row[date_index]) - for outcome in outcome2column: - outcome_table[outcome][patient_key] = int(row[header.index(outcome2column[outcome])]) - if len(outcome_table) % 1000 == 0: - logging.debug(f"Processed: {len(outcome_table)} outcome rows.") - except ValueError as e: - logging.debug(f'Value error {e}') - - logging.info(f"Processed outcomes:{list(outcome_table.keys())}. Got {len(surgery_date_table)} patients.") - - except FileNotFoundError as e: - error = e - - def tensor_from_file(tm: TensorMap, hd5: h5py.File, dependents=None): - if error: - raise error - - mrn_int = _hd5_filename_to_mrn_int(hd5.filename) - if mrn_int not in surgery_date_table: - raise KeyError(f"MRN not in STS outcomes CSV") - - ecg_dates = list(hd5[tm.path_prefix]) - if require_exact_length: - ecg_dates = _dates_with_voltage_len(ecg_dates, tm.shape[0], tm, hd5) + def tensor_from_file(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}): + mrn = _hd5_filename_to_mrn_int(hd5.filename) tensor = np.zeros(tm.shape, dtype=np.float32) - for dtm in tm.dependent_map: - dependents[tm.dependent_map[dtm]] = np.zeros(tm.dependent_map[dtm].shape, dtype=np.float32) - ecg_date = _date_in_window_from_dates(ecg_dates, surgery_date_table[mrn_int], day_window) - for cm in tm.channel_map: - path = _make_hd5_path(tm, ecg_date, cm) - voltage = decompress_data(data_compressed=hd5[path][()], dtype=hd5[path].attrs['dtype']) - if require_exact_length and len(voltage) != tm.shape[0]: - raise ValueError(f'lead {cm} voltage length {len(voltage)} did not match required length {tm.shape[0]}') - voltage = _resample_voltage(voltage, tm.shape[0]) - tensor[..., tm.channel_map[cm]] = voltage + outcome = cardiac_surgery_dict[mrn][outcome_column] + + if type(outcome) is float and not outcome.is_integer(): + raise ValueError(f'Cardiac Surgery categorical outcome {tm.name} ({outcome_column}) got non-discrete value: {outcome}') - for dtm in tm.dependent_map: - dependents[tm.dependent_map[dtm]][outcome_table[dtm][mrn_int]] = 1.0 + # ensure binary outcome + if outcome != 0 and outcome != 1: + raise ValueError(f'Cardiac Surgery categorical outcome {tm.name} ({outcome_column}) got non-binary value: {outcome}') + tensor[outcome] = 1 return tensor return tensor_from_file @@ -2119,136 +1552,39 @@ def build_cardiac_surgery_tensor_maps( "sts_death": "mtopd", "sts_stroke": "cnstrokp", "sts_renal_failure": "crenfail", - "sts_prolonged_ventilation": "crenfail", + "sts_prolonged_ventilation": "cpvntlng", "sts_dsw_infection": "deepsterninf", "sts_reoperation": "reop", "sts_any_morbidity": "anymorbidity", "sts_long_stay": "llos", } - dependent_maps = {} - for outcome in outcome2column: - channel_map = _outcome_channels(outcome) - dependent_maps[outcome] = TensorMap(outcome, Interpretation.CATEGORICAL, path_prefix=PARTNERS_PREFIX, channel_map=channel_map) - - name = 'ecg_2500_sts' - if name in needed_tensor_maps: - tensor_from_file_fxn = build_cardiac_surgery_outcome_tensor_from_file( - file_name=CARDIAC_SURGERY_OUTCOMES_CSV, - outcome2column=outcome2column, - day_window=30, - ) - name2tensormap[name] = TensorMap( - name, - shape=(2500, 12), - path_prefix=PARTNERS_PREFIX, - dependent_map=dependent_maps, - channel_map=ECG_REST_AMP_LEADS, - tensor_from_file=tensor_from_file_fxn, - normalization=Standardize(mean=0, std=2000), - ) - name = 'ecg_5000_sts' - if name in needed_tensor_maps: - tensor_from_file_fxn = build_cardiac_surgery_outcome_tensor_from_file( - file_name=CARDIAC_SURGERY_OUTCOMES_CSV, - outcome2column=outcome2column, - day_window=30, - ) - name2tensormap[name] = TensorMap( - name, - shape=(5000, 12), - path_prefix=PARTNERS_PREFIX, - dependent_map=dependent_maps, - channel_map=ECG_REST_AMP_LEADS, - tensor_from_file=tensor_from_file_fxn, - normalization=Standardize(mean=0, std=2000), - ) - name = 'ecg_2500_sts_exact' - if name in needed_tensor_maps: - tensor_from_file_fxn = build_cardiac_surgery_outcome_tensor_from_file( - file_name=CARDIAC_SURGERY_OUTCOMES_CSV, - outcome2column=outcome2column, - day_window=30, - require_exact_length=True, - ) - name2tensormap[name] = TensorMap( - name, - shape=(2500, 12), - path_prefix=PARTNERS_PREFIX, - dependent_map=dependent_maps, - channel_map=ECG_REST_AMP_LEADS, - tensor_from_file=tensor_from_file_fxn, - normalization=Standardize(mean=0, std=2000), - ) - name = 'ecg_5000_sts_exact' - if name in needed_tensor_maps: - tensor_from_file_fxn = build_cardiac_surgery_outcome_tensor_from_file( - file_name=CARDIAC_SURGERY_OUTCOMES_CSV, - outcome2column=outcome2column, - day_window=30, - require_exact_length=True, - ) - name2tensormap[name] = TensorMap( - name, - shape=(5000, 12), - path_prefix=PARTNERS_PREFIX, - dependent_map=dependent_maps, - channel_map=ECG_REST_AMP_LEADS, - tensor_from_file=tensor_from_file_fxn, - normalization=Standardize(mean=0, std=2000), - ) - for outcome in outcome2column: - if outcome in needed_tensor_maps: - name2tensormap[outcome] = dependent_maps[outcome] - - name2tensormap.update(_build_cardiac_surgery_basic_tensor_maps(needed_tensor_maps)) - return name2tensormap - - -def build_date_interval_lookup( - file_name: str = CARDIAC_SURGERY_OUTCOMES_CSV, - delimiter: str = ',', - patient_column: str = 'medrecn', - start_column: str = 'surgdt', - start_offset: int = -30, - end_column: str = 'surgdt', - end_offset: int = 0, -) -> Dict[int, Tuple[str, str]]: - with open(file_name, 'r', encoding='utf-8') as f: - reader = csv.reader(f, delimiter=delimiter) - header = next(reader) - patient_index = header.index(patient_column) - start_index = header.index(start_column) - end_index = header.index(end_column) - date_interval_lookup = {} - for row in reader: - try: - patient_key = int(row[patient_index]) - start_date = (_cardiac_surgery_str2date(row[start_index]) + datetime.timedelta(days=start_offset)).strftime(PARTNERS_DATETIME_FORMAT) - end_date = (_cardiac_surgery_str2date(row[end_index]) + datetime.timedelta(days=end_offset)).strftime(PARTNERS_DATETIME_FORMAT) - date_interval_lookup[patient_key] = (start_date, end_date) - except ValueError as e: - logging.debug(f'Value error {e}') - return date_interval_lookup - - -def _build_cardiac_surgery_basic_tensor_maps( - needed_tensor_maps: List[str], -) -> Dict[str, TensorMap]: - name2tensormap: Dict[str:TensorMap] = {} - - date_interval_lookup = build_date_interval_lookup() + cardiac_surgery_dict = build_cardiac_surgery_dict(additional_columns=[column for outcome, column in outcome2column.items() if outcome in needed_tensor_maps]) + date_interval_lookup = build_date_interval_lookup(cardiac_surgery_dict) for needed_name in needed_tensor_maps: - if not needed_name.endswith('_sts'): - continue + if needed_name in outcome2column: + channel_map = _outcome_channels(needed_name) + sts_tmap = TensorMap( + needed_name, + Interpretation.CATEGORICAL, + path_prefix=PARTNERS_PREFIX, + tensor_from_file=make_cardiac_surgery_outcome_tensor_from_file(cardiac_surgery_dict, outcome2column[needed_name]), + channel_map=channel_map, + validator=validator_not_all_zero, + ) + else: + if not needed_name.endswith('_sts'): + continue - base_name = needed_name.split('_sts')[0] - if base_name not in TMAPS: - continue + base_name = needed_name.split('_sts')[0] + if base_name not in TMAPS: + TMAPS.update(build_partners_time_series_tensor_maps([base_name])) + if base_name not in TMAPS: + continue - sts_tmap = copy.deepcopy(TMAPS[base_name]) - sts_tmap.name = needed_name - sts_tmap.time_series_lookup = date_interval_lookup + sts_tmap = copy.deepcopy(TMAPS[base_name]) + sts_tmap.name = needed_name + sts_tmap.time_series_lookup = date_interval_lookup name2tensormap[needed_name] = sts_tmap