Skip to content

Commit

Permalink
use __concat__ to merge the self.dfs tables
Browse files (browse the repository at this point in the history)
  • Loading branch information
Maria Gorodetski committed May 12, 2024
1 parent 9ccadcc commit 4906be4
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 16 deletions.
18 changes: 10 additions & 8 deletions nbs/05_pheno_loader.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,9 @@
" continue\n",
" \n",
" if table_name == 'age_sex':\n",
" keep_undefined = True\n",
" # The 'age_sex' table does not contain 'undefined', so the merge will not cause a Cartesian product\n",
" keep_undefined = True \n",
" # Left join to keep only rows with real data points\n",
" how = 'left'\n",
" else: \n",
" keep_undefined = keep_undefined_research_stage\n",
Expand Down Expand Up @@ -581,13 +583,13 @@
" return False\n",
" \n",
" @staticmethod\n",
" def join_and_filter_undefined_research_stage(df1, df2, how='outer'):\n",
" def join_and_filter_undefined_research_stage(df1, df2, how='outer', lsuffix='', rsuffix=''):\n",
" df1_defined = df1[df1.index.get_level_values('research_stage') != 'undefined']\n",
" df2_defined = df2[df2.index.get_level_values('research_stage') != 'undefined']\n",
"\n",
" return df1_defined.join(df2_defined, how=how)\n",
" return df1_defined.join(df2_defined, how=how, lsuffix=lsuffix, rsuffix=rsuffix)\n",
"\n",
" def __concat__(self, df1, df2, keep_undefined_research_stage=False, how='outer'):\n",
" def __concat__(self, df1, df2, keep_undefined_research_stage=False, how='outer', lsuffix='', rsuffix=''):\n",
"\n",
" if df1.empty:\n",
" return df2\n",
Expand All @@ -598,10 +600,10 @@
" self.is_value_in_index(df2, 'undefined', 'research_stage') and not keep_undefined_research_stage:\n",
" \n",
" warnings.warn('filtering \"undefined\" research_stage..')\n",
" df = self.join_and_filter_undefined_research_stage(df1, df2, how)\n",
" df = self.join_and_filter_undefined_research_stage(df1, df2, how, lsuffix='', rsuffix='')\n",
" return df\n",
" \n",
" return df1.join(df2, how=how)\n",
" return df1.join(df2, how=how, lsuffix=lsuffix, rsuffix=rsuffix)\n",
" \n",
" def merge_all_tables(self) -> pd.DataFrame:\n",
" # merge all tables in self.dfs dictionary\n",
Expand All @@ -610,8 +612,8 @@
" if align_df is None:\n",
" align_df = df\n",
" else:\n",
" align_df = pd.merge(align_df, df, left_index=True, right_index=True, how='outer', suffixes=('', name))\n",
" \n",
" # Join the table with an 'undefined' research_stage to keep the maximum number of data points\n",
" align_df = self.__concat__(align_df, df, keep_undefined_research_stage=True, how='outer', lsuffix='', rsuffix= name) \n",
" return align_df\n",
"\n",
" def __load_age_sex__(self) -> None:\n",
Expand Down
18 changes: 10 additions & 8 deletions pheno_utils/pheno_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,9 @@ def get(self, fields: Union[str,List[str]], flexible: bool=None, not_bulk_field=
continue

if table_name == 'age_sex':
keep_undefined = True
# The 'age_sex' table does not contain 'undefined', so the merge will not cause a Cartesian product
keep_undefined = True
# Left join to keep only rows with real data points
how = 'left'
else:
keep_undefined = keep_undefined_research_stage
Expand Down Expand Up @@ -530,13 +532,13 @@ def is_value_in_index(self, df, value, index_name):
return False

@staticmethod
def join_and_filter_undefined_research_stage(df1, df2, how='outer'):
def join_and_filter_undefined_research_stage(df1, df2, how='outer', lsuffix='', rsuffix=''):
df1_defined = df1[df1.index.get_level_values('research_stage') != 'undefined']
df2_defined = df2[df2.index.get_level_values('research_stage') != 'undefined']

return df1_defined.join(df2_defined, how=how)
return df1_defined.join(df2_defined, how=how, lsuffix=lsuffix, rsuffix=rsuffix)

def __concat__(self, df1, df2, keep_undefined_research_stage=False, how='outer'):
def __concat__(self, df1, df2, keep_undefined_research_stage=False, how='outer', lsuffix='', rsuffix=''):

if df1.empty:
return df2
Expand All @@ -547,10 +549,10 @@ def __concat__(self, df1, df2, keep_undefined_research_stage=False, how='outer')
self.is_value_in_index(df2, 'undefined', 'research_stage') and not keep_undefined_research_stage:

warnings.warn('filtering "undefined" research_stage..')
df = self.join_and_filter_undefined_research_stage(df1, df2, how)
df = self.join_and_filter_undefined_research_stage(df1, df2, how, lsuffix='', rsuffix='')
return df

return df1.join(df2, how=how)
return df1.join(df2, how=how, lsuffix=lsuffix, rsuffix=rsuffix)

def merge_all_tables(self) -> pd.DataFrame:
# merge all tables in self.dfs dictionary
Expand All @@ -559,8 +561,8 @@ def merge_all_tables(self) -> pd.DataFrame:
if align_df is None:
align_df = df
else:
align_df = pd.merge(align_df, df, left_index=True, right_index=True, how='outer', suffixes=('', name))

# Join the table with an 'undefined' research_stage to keep the maximum number of data points
align_df = self.__concat__(align_df, df, keep_undefined_research_stage=True, how='outer', lsuffix='', rsuffix= name)
return align_df

def __load_age_sex__(self) -> None:
Expand Down

0 comments on commit 4906be4

Please sign in to comment.