From 6db73a947e144716d5658f05962b176fcff4de09 Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Thu, 19 Oct 2023 17:47:55 -0400 Subject: [PATCH 01/13] update README file for the renamed output file --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index abf16f2..1b4b8f0 100644 --- a/README.md +++ b/README.md @@ -179,8 +179,8 @@ The most important parameter of this script is `--ani_thresh`: this is average n | File (names starting with prefix) | Content | | ------------------------------------- | ------------------------------------------------------------ | | _config.json | A JSON file stores the required information needed to run the next YACHT algorithm | -| _manifest.tsv | A TSV file contains organisms and their relevant info after removing the similar ones. -| _rep_to_corr_orgas_mapping.tsv | A TSV file contains representative organisms and their similar organisms that have been removed | +| _manifest.tsv | A TSV file containing organisms and their relevant info after similar ones have been removed | +| _removed_orgs_to_corr_orgas_mapping.tsv | A TSV file with two columns: removed organism names ('removed_org') and their similar genomes ('corr_orgs') |
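[Editorial note, not part of the patch series] The mapping file documented above is a plain two-column TSV written with pandas, so it can be inspected directly. Below is a minimal sketch of loading it and expanding the comma-separated `corr_orgs` column; the prefix `demo_ani_thresh_0.95` is only the example prefix used elsewhere in the README, and the column handling assumes the format described in the table above.

```python
import pandas as pd

# Load the removed-organism mapping written by make_training_data_from_sketches.py.
# Substitute your own --prefix value for the demo prefix used here.
mapping = pd.read_csv("demo_ani_thresh_0.95_removed_orgs_to_corr_orgas_mapping.tsv", sep="\t")

# 'corr_orgs' holds a comma-separated list of the similar genomes that caused the removal;
# split it into one row per (removed_org, similar genome) pair for easier filtering.
mapping["corr_orgs"] = mapping["corr_orgs"].str.split(",")
exploded = mapping.explode("corr_orgs").rename(columns={"corr_orgs": "similar_org"})
print(exploded.head())
```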
From 75a7d938836c429311fa6123669fccd4fcc9257d Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Thu, 19 Oct 2023 17:49:01 -0400 Subject: [PATCH 02/13] update the functions for removing the similar organisms --- make_training_data_from_sketches.py | 21 +++---- srcs/utils.py | 96 +++++++++++------------------ 2 files changed, 44 insertions(+), 73 deletions(-) diff --git a/make_training_data_from_sketches.py b/make_training_data_from_sketches.py index 190b021..85e9388 100644 --- a/make_training_data_from_sketches.py +++ b/make_training_data_from_sketches.py @@ -65,12 +65,12 @@ # Find the close related genomes with ANI > ani_thresh from the reference database logger.info("Find the close related genomes with ANI > ani_thresh from the reference database") - sig_same_genoms_dict = utils.run_multisearch(num_threads, ani_thresh, ksize, scale, path_to_temp_dir) + multisearch_result = utils.run_multisearch(num_threads, ani_thresh, ksize, scale, path_to_temp_dir) # remove the close related organisms: any organisms with ANI > ani_thresh # pick only the one with largest number of unique kmers from all the close related organisms logger.info("Removing the close related organisms with ANI > ani_thresh") - rep_remove_dict, manifest_df = utils.remove_corr_organisms_from_ref(sig_info_dict, sig_same_genoms_dict) + remove_corr_df, manifest_df = utils.remove_corr_organisms_from_ref(sig_info_dict, multisearch_result, ani_thresh, ksize) # write out the manifest file logger.info("Writing out the manifest file") @@ -79,22 +79,19 @@ # write out a mapping dataframe from representative organism to the close related organisms logger.info("Writing out a mapping dataframe from representative organism to the close related organisms") - if len(rep_remove_dict) == 0: - logger.warning("No close related organisms found. 
No mapping dataframe is written.") - rep_remove_df = pd.DataFrame(columns=['rep_org', 'corr_orgs']) - rep_remove_df_path = os.path.join(outdir, f'{prefix}_rep_to_corr_orgas_mapping.tsv') - rep_remove_df.to_csv(rep_remove_df_path, sep='\t', index=None) + if len(remove_corr_df) == 0: + logger.warning("No close related organisms found.") + remove_corr_df_indicator = "" else: - rep_remove_df = pd.DataFrame([(rep_org, ','.join(corr_org_list)) for rep_org, corr_org_list in rep_remove_dict.items()]) - rep_remove_df.columns = ['rep_org', 'corr_orgs'] - rep_remove_df_path = os.path.join(outdir, f'{prefix}_rep_to_corr_orgas_mapping.tsv') - rep_remove_df.to_csv(rep_remove_df_path, sep='\t', index=None) + remove_corr_df_path = os.path.join(outdir, f'{prefix}_removed_orgs_to_corr_orgas_mapping.tsv') + remove_corr_df.to_csv(remove_corr_df_path, sep='\t', index=None) + remove_corr_df_indicator = remove_corr_df_path # save the config file logger.info("Saving the config file") json_file_path = os.path.join(outdir, f'{prefix}_config.json') json.dump({'manifest_file_path': manifest_file_path, - 'rep_remove_df_path': rep_remove_df_path, + 'remove_cor_df_path': remove_corr_df_indicator, 'intermediate_files_dir': path_to_temp_dir, 'scale': scale, 'ksize': ksize, diff --git a/srcs/utils.py b/srcs/utils.py index 376b949..5f128a5 100644 --- a/srcs/utils.py +++ b/srcs/utils.py @@ -85,7 +85,7 @@ def run_multisearch(num_threads: int, ani_thresh: float, ksize: int, scale: int, :param ksize: int (size of kmer) :param scale: int (scale factor) :param path_to_temp_dir: string (path to the folder to store the intermediate files) - :return: a dictionary mapping signature name to a list of its close related genomes (ANI > ani_thresh) + :return: a dataframe with symmetric pairwise multisearch result (query_name, match_name) """ results = {} @@ -96,7 +96,7 @@ def run_multisearch(num_threads: int, ani_thresh: float, ksize: int, scale: int, sig_files.to_csv(sig_files_path, header=False, index=False) # convert ani threshold to containment threshold - containment_thresh = 0.9*(ani_thresh ** ksize) + containment_thresh = (ani_thresh ** ksize) cmd = f"sourmash scripts multisearch {sig_files_path} {sig_files_path} -k {ksize} -s {scale} -c {num_threads} -t {containment_thresh} -o {os.path.join(path_to_temp_dir, 'training_multisearch_result.csv')}" logger.info(f"Running sourmash multisearch with command: {cmd}") exit_code = os.system(cmd) @@ -105,82 +105,56 @@ def run_multisearch(num_threads: int, ani_thresh: float, ksize: int, scale: int, # read the multisearch result multisearch_result = pd.read_csv(os.path.join(path_to_temp_dir, 'training_multisearch_result.csv'), sep=',', header=0) - multisearch_result = multisearch_result.drop_duplicates().reset_index(drop=True) multisearch_result = multisearch_result.query('query_name != match_name').reset_index(drop=True) + + # because the multisearch result is not symmetric, that is + # we have: A B score but not B A score + # we need to make it symmetric + A_TO_B = multisearch_result[['query_name','match_name']].drop_duplicates().reset_index(drop=True) + B_TO_A = A_TO_B[['match_name','query_name']].rename(columns={'match_name':'query_name','query_name':'match_name'}) + multisearch_result = pd.concat([A_TO_B, B_TO_A]).drop_duplicates().reset_index(drop=True) - for query_name, match_name in tqdm(multisearch_result[['query_name', 'match_name']].to_numpy()): - if str(query_name) not in results: - results[str(query_name)] = [str(match_name)] - else: - 
results[str(query_name)].append(str(match_name)) - - return results + return multisearch_result -def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, int, int]], sig_same_genoms_dict: Dict[str, List[str]]) -> Tuple[Dict[str, List[str]], pd.DataFrame]: +def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, int, int]], multisearch_result: pd.DataFrame, ani_thresh: float, ksize: int) -> Tuple[Dict[str, List[str]], pd.DataFrame]: """ Helper function that removes the close related organisms from the reference matrix. :param sig_info_dict: a dictionary mapping all signature name from reference data to a tuple (md5sum, minhash mean abundance, minhash hashes length, minhash scaled) - :param sig_same_genoms_dict: a dictionary mapping signature name to a list of its close related genomes (ANI > ani_thresh) + :param multisearch_result: a dataframe with symmetric pairwise multisearch result (query_name, match_name) :return - rep_remove_dict: a dictionary with key as representative signature name and value as a list of signatures to be removed + remove_corr_df: a dataframe with two columns: removed organism name and its close related organisms manifest_df: a dataframe containing the processed reference signature information """ - # for each genome with close related genomes, pick the one with largest number of unique kmers - rep_remove_dict = {} + # extract organisms that have close related organisms and their number of unique kmers + corr_organisms = [query_name for query_name in multisearch_result['query_name'].unique()] + sizes = np.array([sig_info_dict[organism][2] for organism in corr_organisms]) + # sort organisms by size in ascending order, so we keep the largest organism, discard the smallest + bysize = np.argsort(sizes) + corr_organisms_bysize = np.array(corr_organisms)[bysize].tolist() + + # remove the sorted organisms until all left genomes are distinct (e.g., ANI <= ani_thresh) temp_remove_set = set() - manifest_df = [] - for genome, same_genomes in tqdm(sig_same_genoms_dict.items()): - # skip if the genome has been removed - if genome in temp_remove_set: - continue - # keep same genome if it is not in the remove set - same_genomes = list(set(same_genomes).difference(temp_remove_set)) - # get the number of unique kmers for each genome - unique_kmers = np.array([sig_info_dict[genome][2]] + [sig_info_dict[same_genome][2] for same_genome in same_genomes]) - # get the index of the genome with largest number of unique kmers - rep_idx = np.argmax(unique_kmers) - # get the representative genome - rep_genome = genome if rep_idx == 0 else same_genomes[rep_idx-1] - # get the list of genomes to be removed - remove_genomes = same_genomes if rep_idx == 0 else [genome] + same_genomes[:rep_idx-1] + same_genomes[rep_idx:] - # update remove set - temp_remove_set.update(remove_genomes) - if len(remove_genomes) > 0: - rep_remove_dict[rep_genome] = remove_genomes + # loop through the organisms size in ascending order + for organism in tqdm(corr_organisms_bysize, desc='Removing close related organisms'): + ## for a given organism check its close related organisms, see if there are any organisms left after removing those in the remove set + ## if so, put this organism in the remove set + left_corr_orgs = set(multisearch_result.query(f'query_name == "{organism}"')['match_name']).difference(temp_remove_set) + if len(left_corr_orgs) > 0: + temp_remove_set.add(organism) + + # generate a dataframe with two columns: removed organism name and its close related organisms 
+ logger.info(f'Generating a dataframe with two columns: removed organism name and its close related organisms.') + remove_corr_list = [(organism, ','.join(list(set(multisearch_result.query(f'query_name == "{organism}"')['match_name'])))) for organism in tqdm(temp_remove_set)] + remove_corr_df = pd.DataFrame(remove_corr_list, columns=['removed_org', 'corr_orgs']) # remove the close related organisms from the reference genome list + manifest_df = [] for sig_name, (md5sum, minhash_mean_abundance, minhash_hashes_len, minhash_scaled) in tqdm(sig_info_dict.items()): if sig_name not in temp_remove_set: manifest_df.append((sig_name, md5sum, minhash_hashes_len, get_num_kmers(minhash_mean_abundance, minhash_hashes_len, minhash_scaled, False), minhash_scaled)) manifest_df = pd.DataFrame(manifest_df, columns=['organism_name', 'md5sum', 'num_unique_kmers_in_genome_sketch', 'num_total_kmers_in_genome_sketch', 'genome_scale_factor']) - return rep_remove_dict, manifest_df - -# def compute_sample_vector(sample_hashes, hash_to_idx): -# """ -# Helper function that computes the sample vector for a given sample signature. -# :param sample_hashes: hashes in the sample signature -# :param hash_to_idx: dictionary mapping hashes to indices in the training dictionary -# :return: numpy array (sample vector) -# """ -# # total number of hashes in the training dictionary -# hash_to_idx_keys = set(hash_to_idx.keys()) - -# # total number of hashes in the sample -# sample_hashes_keys = set(sample_hashes.keys()) - -# # initialize the sample vector -# sample_vector = np.zeros(len(hash_to_idx_keys)) - -# # get the hashes that are in both the sample and the training dictionary -# sample_intersect_training_hashes = hash_to_idx_keys.intersection(sample_hashes_keys) - -# # fill in the sample vector -# for sh in tqdm(sample_intersect_training_hashes): -# sample_vector[hash_to_idx[sh]] = sample_hashes[sh] - -# return sample_vector - + return remove_corr_df, manifest_df class Prediction: """ From a2aaf4bea102d3c7e08ecd703a80da8d412509a6 Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Thu, 19 Oct 2023 19:59:46 -0400 Subject: [PATCH 03/13] sort similar genome names in order to better check the removed genomes --- srcs/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/srcs/utils.py b/srcs/utils.py index 5f128a5..0f34575 100644 --- a/srcs/utils.py +++ b/srcs/utils.py @@ -126,7 +126,8 @@ def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, in manifest_df: a dataframe containing the processed reference signature information """ # extract organisms that have close related organisms and their number of unique kmers - corr_organisms = [query_name for query_name in multisearch_result['query_name'].unique()] + # sort name in order to better check the removed organisms + corr_organisms = sorted([query_name for query_name in multisearch_result['query_name'].unique()]) sizes = np.array([sig_info_dict[organism][2] for organism in corr_organisms]) # sort organisms by size in ascending order, so we keep the largest organism, discard the smallest bysize = np.argsort(sizes) From f005c21756c223c7cad5ca907c3cf298bbf404c2 Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Thu, 19 Oct 2023 20:28:43 -0400 Subject: [PATCH 04/13] update test_workflow.py for the new change --- tests/test_workflow.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/test_workflow.py b/tests/test_workflow.py index ffd4241..57635f9 100644 --- a/tests/test_workflow.py +++ 
b/tests/test_workflow.py @@ -27,7 +27,7 @@ def test_full_workflow(): # In testdata/ # 20_genomes_trained_config.json # 20_genomes_trained_processed_manifest.tsv - # 20_genomes_trained_rep_to_corr_orgas_mapping.tsv + # 20_genomes_trained_removed_orgs_to_corr_orgas_mapping.tsv # /testdata/20_genomes_trained_intermediate_files$ tree . # . # ├── signatures @@ -62,7 +62,7 @@ def test_full_workflow(): shutil.rmtree(os.path.join(data_dir, intermediate_dir), ignore_errors=True) # python ../make_training_data_from_sketches.py --ref_file testdata/20_genomes_sketches.zip --ksize 31 --prefix 20_genomes_trained --outdir testdata/ cmd = f"python {os.path.join(script_dir, 'make_training_data_from_sketches.py')} --ref_file {reference_sketches}" \ - f" --prefix {full_out_prefix} --ksize 31 --outdir {data_dir}" + f" --prefix {out_prefix} --ksize 31 --outdir {data_dir}" res = subprocess.run(cmd, shell=True, check=True) # check that no errors were raised assert res.returncode == 0 @@ -71,7 +71,7 @@ def test_full_workflow(): assert exists(f) # check that the files are big enough for f in expected_files: - assert os.stat(f).st_size > 400 + assert os.stat(f).st_size > 300 # then do the presence/absence estimation if exists(abundance_file): os.remove(abundance_file) @@ -80,9 +80,6 @@ def test_full_workflow(): print(cmd) # ~/pycharm/YACHT/tests/testdata$ tree 20_genomes_trained_intermediate_files/ # 20_genomes_trained_intermediate_files/ - # ├── organism_sig_file.txt # <-- new - # ├── sample_multisearch_result.csv - # ├── sample_sig_file.txt # ├── signatures # │   ├── 04212e93c2172d4df49dc5d8c2973d8b.sig.gz # │   ├── 04f2b0e94f2d1f1f5b8355114b70274e.sig.gz @@ -107,6 +104,14 @@ def test_full_workflow(): # ├── SOURMASH-MANIFEST.csv # ├── training_multisearch_result.csv # └── training_sig_files.txt + # ~/pycharm/YACHT/tests/testdata$ tree sample_sample_intermediate_files + # sample_sample_intermediate_files + # ├── organism_sig_file.txt + # ├── sample_multisearch_result.csv + # ├── sample_sig_file.txt + # ├── signatures + # │ └── 2b6488794c648f540068adacd5aef77e.sig.gz + # └── SOURMASH-MANIFEST.csv # print(cmd) res = subprocess.run(cmd, shell=True, check=True) # check that no errors were raised From 0c72b3ba6650dfebeaaf44a6717b139f206944b5 Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Thu, 19 Oct 2023 21:29:44 -0400 Subject: [PATCH 05/13] add parameter "--outdir" in run_YACHT.py --- run_YACHT.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/run_YACHT.py b/run_YACHT.py index 28efc3a..df764aa 100644 --- a/run_YACHT.py +++ b/run_YACHT.py @@ -10,6 +10,7 @@ import json import warnings import zipfile +from pathlib import Path warnings.filterwarnings("ignore") from tqdm import tqdm from loguru import logger @@ -32,7 +33,8 @@ 'Each value should be between 0 and 1, with 0 being the most sensitive (and least ' 'precise) and 1 being the most precise (and least sensitive).', required=False, default=[1, 0.5, 0.1, 0.05, 0.01]) - parser.add_argument('--out_filename', help='Full path of output filename', required=False, default='result.xlsx') + parser.add_argument('--out_filename', help='output excel filename', required=False, default='result.xlsx') + parser.add_argument('--outdir', type=str, help='path to output directory', required=False, default=os.getcwd()) # parse the arguments args = parser.parse_args() @@ -44,6 +46,7 @@ show_all = args.show_all # Show all organisms (no matter if present) in output file. 
min_coverage_list = args.min_coverage_list # a list of percentages of unique k-mers covered by reads in the sample. out_filename = args.out_filename # output filename + outdir = str(Path(args.outdir).absolute()) # path to output directory # check if the json file exists utils.check_file_existence(json_file_path, f'Config file {json_file_path} does not exist. ' @@ -57,8 +60,8 @@ ani_thresh = config['ani_thresh'] # Make sure the output can be written to - if not os.access(os.path.abspath(os.path.dirname(out_filename)), os.W_OK): - raise FileNotFoundError(f"Cannot write to the location: {os.path.abspath(os.path.dirname(out_filename))}.") + if not os.access(outdir, os.W_OK): + raise FileNotFoundError(f"Cannot write to the location: {outdir}.") # check if min_coverage is between 0 and 1 for x in min_coverage_list: @@ -126,9 +129,9 @@ manifest_list = temp_manifest_list # save the results into Excel file - logger.info(f'Saving results to {os.path.dirname(out_filename)}.') + logger.info(f'Saving results to {outdir}.') # save the results with different min_coverage - with pd.ExcelWriter(out_filename, engine='openpyxl', mode='w') as writer: + with pd.ExcelWriter(os.path.join(outdir, out_filename), engine='openpyxl', mode='w') as writer: # save the raw results (i.e., min_coverage=1.0) if keep_raw: temp_mainifest = manifest_list[0].copy() From e98f3e5bf854af8a7ee18ddb597037d6187d80eb Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Thu, 19 Oct 2023 21:30:01 -0400 Subject: [PATCH 06/13] update test_workflow.py for the new change --- tests/test_workflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_workflow.py b/tests/test_workflow.py index 5ae7609..0ac2852 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -61,7 +61,7 @@ def test_full_workflow(): # Remove the intermediate folder shutil.rmtree(os.path.join(data_dir, intermediate_dir), ignore_errors=True) # python ../make_training_data_from_sketches.py --ref_file testdata/20_genomes_sketches.zip --ksize 31 --prefix 20_genomes_trained --outdir testdata/ - cmd = f"python {os.path.join(script_dir, 'make_training_data_from_sketches.py')} --ref_file {reference_sketches}" \ + cmd = f"python {os.path.join(script_dir, 'make_training_data_from_sketches.py')} --force --ref_file {reference_sketches}" \ f" --prefix {out_prefix} --ksize 31 --outdir {data_dir}" res = subprocess.run(cmd, shell=True, check=True) # check that no errors were raised @@ -76,7 +76,7 @@ def test_full_workflow(): if exists(abundance_file): os.remove(abundance_file) # python ../run_YACHT.py --json testdata/20_genomes_trained_config.json --sample_file testdata/sample.sig.zip --out_file result.xlsx --outdir testdata/ - cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --outdir {data_dir} --out_file {abundance_file} --show_all" + cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --outdir {data_dir} --out_filename {abundance_file} --show_all" print(cmd) # ~/pycharm/YACHT/tests/testdata$ tree 20_genomes_trained_intermediate_files/ # 20_genomes_trained_intermediate_files/ From 22840cdf4787b18457f6cff3da7145babbda8a74 Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Thu, 19 Oct 2023 23:29:17 -0400 Subject: [PATCH 07/13] remove unnecessary comments --- 
tests/test_workflow.py | 68 +----------------------------------------- 1 file changed, 1 insertion(+), 67 deletions(-) diff --git a/tests/test_workflow.py b/tests/test_workflow.py index 0ac2852..ae59c8f 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -24,36 +24,6 @@ def test_full_workflow(): "training_sig_files.txt"]))) # one of the signature files expected_files.extend(list(map(lambda x: os.path.join(data_dir, intermediate_dir, "signatures", x), ["04212e93c2172d4df49dc5d8c2973d8b.sig.gz"]))) - # In testdata/ - # 20_genomes_trained_config.json - # 20_genomes_trained_processed_manifest.tsv - # 20_genomes_trained_removed_orgs_to_corr_orgas_mapping.tsv - # /testdata/20_genomes_trained_intermediate_files$ tree . - # . - # ├── signatures - # │   ├── 04212e93c2172d4df49dc5d8c2973d8b.sig.gz - # │   ├── 04f2b0e94f2d1f1f5b8355114b70274e.sig.gz - # │   ├── 0661ecab88c3d65d0f10e599a5ba1654.sig.gz - # │   ├── 06ebe48d527882bfa9505aba8e31ae23.sig.gz - # │   ├── 11fe9a00287c7ad086ebbc463724cf10.sig.gz - # │   ├── 16c6c1d37259d83088ab3a4f5b691631.sig.gz - # │   ├── 188d55801a78d4773cf6c0b46bca96ba.sig.gz - # │   ├── 1a121dca600c6504e88252e81004f0cf.sig.gz - # │   ├── 39ea7fd48ee7003587c9c763946d5d6e.sig.gz - # │   ├── 45f2675c1dca4ef1a24a05f5b268adbb.sig.gz - # │   ├── 7b312fffa3fb35440ba40203ba826c05.sig.gz - # │   ├── 8fb9b1a69838a58cc4f31c1e42a5f189.sig.gz - # │   ├── 92fb1b3e4baa6c474aff3efb84957687.sig.gz - # │   ├── 96cb85214535b0f9723a6abc17097821.sig.gz - # │   ├── a136145bee08846ed94c0406df3da2d4.sig.gz - # │   ├── b691deddf179ead0a006527330d86dde.sig.gz - # │   ├── b7f087146f5cc3121477c29ff003e3d0.sig.gz - # │   ├── c39c52d2d088348c950c2afe503b405b.sig.gz - # │   ├── c9eb6a9d058df8036ad93bc45d5bf260.sig.gz - # │   └── ce54d962851b0fdeefc624300036a133.sig.gz - # ├── SOURMASH-MANIFEST.csv - # ├── training_multisearch_result.csv - # └── training_sig_files.txt # remove the files if they exist for f in expected_files: if exists(f): @@ -75,45 +45,9 @@ def test_full_workflow(): # then do the presence/absence estimation if exists(abundance_file): os.remove(abundance_file) - # python ../run_YACHT.py --json testdata/20_genomes_trained_config.json --sample_file testdata/sample.sig.zip --out_file result.xlsx --outdir testdata/ cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --outdir {data_dir} --out_filename {abundance_file} --show_all" print(cmd) - # ~/pycharm/YACHT/tests/testdata$ tree 20_genomes_trained_intermediate_files/ - # 20_genomes_trained_intermediate_files/ - # ├── signatures - # │   ├── 04212e93c2172d4df49dc5d8c2973d8b.sig.gz - # │   ├── 04f2b0e94f2d1f1f5b8355114b70274e.sig.gz - # │   ├── 0661ecab88c3d65d0f10e599a5ba1654.sig.gz - # │   ├── 06ebe48d527882bfa9505aba8e31ae23.sig.gz - # │   ├── 11fe9a00287c7ad086ebbc463724cf10.sig.gz - # │   ├── 16c6c1d37259d83088ab3a4f5b691631.sig.gz - # │   ├── 188d55801a78d4773cf6c0b46bca96ba.sig.gz - # │   ├── 1a121dca600c6504e88252e81004f0cf.sig.gz - # │   ├── 39ea7fd48ee7003587c9c763946d5d6e.sig.gz - # │   ├── 45f2675c1dca4ef1a24a05f5b268adbb.sig.gz - # │   ├── 7b312fffa3fb35440ba40203ba826c05.sig.gz - # │   ├── 8fb9b1a69838a58cc4f31c1e42a5f189.sig.gz - # │   ├── 92fb1b3e4baa6c474aff3efb84957687.sig.gz - # │   ├── 96cb85214535b0f9723a6abc17097821.sig.gz - # │   ├── a136145bee08846ed94c0406df3da2d4.sig.gz - # │   ├── b691deddf179ead0a006527330d86dde.sig.gz - # │   ├── 
b7f087146f5cc3121477c29ff003e3d0.sig.gz - # │   ├── c39c52d2d088348c950c2afe503b405b.sig.gz - # │   ├── c9eb6a9d058df8036ad93bc45d5bf260.sig.gz - # │   └── ce54d962851b0fdeefc624300036a133.sig.gz - # ├── SOURMASH-MANIFEST.csv - # ├── training_multisearch_result.csv - # └── training_sig_files.txt - # ~/pycharm/YACHT/tests/testdata$ tree sample_sample_intermediate_files - # sample_sample_intermediate_files - # ├── organism_sig_file.txt - # ├── sample_multisearch_result.csv - # ├── sample_sig_file.txt - # ├── signatures - # │ └── 2b6488794c648f540068adacd5aef77e.sig.gz - # └── SOURMASH-MANIFEST.csv - # print(cmd) - # python ../run_YACHT.py --json testdata/20_genomes_trained_config.json --sample_file testdata/sample.sig.zip --out_file result.xlsx + cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --out_file {os.path.join(data_dir,abundance_file)} --show_all" res = subprocess.run(cmd, shell=True, check=True) # check that no errors were raised From 7733c0571e98664e6a82ed6db37a4bb163b94433 Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Fri, 20 Oct 2023 07:29:26 -0400 Subject: [PATCH 08/13] fix a small bug --- srcs/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/srcs/utils.py b/srcs/utils.py index 0f34575..77a3434 100644 --- a/srcs/utils.py +++ b/srcs/utils.py @@ -127,7 +127,7 @@ def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, in """ # extract organisms that have close related organisms and their number of unique kmers # sort name in order to better check the removed organisms - corr_organisms = sorted([query_name for query_name in multisearch_result['query_name'].unique()]) + corr_organisms = sorted([str(query_name) for query_name in multisearch_result['query_name'].unique()]) sizes = np.array([sig_info_dict[organism][2] for organism in corr_organisms]) # sort organisms by size in ascending order, so we keep the largest organism, discard the smallest bysize = np.argsort(sizes) From a746799e78aef2fc8884b7d353eb6169a2a590df Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Fri, 20 Oct 2023 10:09:58 -0400 Subject: [PATCH 09/13] if use "--force", remove temporary directory --- make_training_data_from_sketches.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/make_training_data_from_sketches.py b/make_training_data_from_sketches.py index c8599a2..1dbd8c8 100644 --- a/make_training_data_from_sketches.py +++ b/make_training_data_from_sketches.py @@ -8,6 +8,7 @@ import srcs.utils as utils from loguru import logger import json +import shutil logger.remove() logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO") @@ -48,6 +49,11 @@ path_to_temp_dir = os.path.join(outdir, prefix+'_intermediate_files') if os.path.exists(path_to_temp_dir) and not force: raise ValueError(f"Temporary directory {path_to_temp_dir} already exists. Please remove it or given a new prefix name using parameter '--prefix'.") + else: + # remove the temporary directory if it exists + if os.path.exists(path_to_temp_dir): + logger.warning(f"Temporary directory {path_to_temp_dir} already exists. 
Removing it.") + shutil.rmtree(path_to_temp_dir) os.makedirs(path_to_temp_dir, exist_ok=True) # unzip the sourmash signature file to the temporary directory From 214d2cf997dc1725d5f40bbf74f9e5d81bace48c Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Fri, 20 Oct 2023 10:53:17 -0400 Subject: [PATCH 10/13] use the "--out" parameter only in run_YACHT.py to specify the result --- run_YACHT.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/run_YACHT.py b/run_YACHT.py index df764aa..fe7d99a 100644 --- a/run_YACHT.py +++ b/run_YACHT.py @@ -33,8 +33,7 @@ 'Each value should be between 0 and 1, with 0 being the most sensitive (and least ' 'precise) and 1 being the most precise (and least sensitive).', required=False, default=[1, 0.5, 0.1, 0.05, 0.01]) - parser.add_argument('--out_filename', help='output excel filename', required=False, default='result.xlsx') - parser.add_argument('--outdir', type=str, help='path to output directory', required=False, default=os.getcwd()) + parser.add_argument('--out', type=str, help='path to output excel file', required=False, default=os.path.join(os.getcwd(), 'result.xlsx')) # parse the arguments args = parser.parse_args() @@ -45,8 +44,13 @@ keep_raw = args.keep_raw # Keep raw results in output file. show_all = args.show_all # Show all organisms (no matter if present) in output file. min_coverage_list = args.min_coverage_list # a list of percentages of unique k-mers covered by reads in the sample. - out_filename = args.out_filename # output filename - outdir = str(Path(args.outdir).absolute()) # path to output directory + out = str(Path(args.out).absolute()) # full path to output excel file + outdir = os.path.dirname(out) # path to output directory + out_filename = os.path.basename(out) # output filename + + # check if the output filename is valid + if os.path.splitext(out_filename)[1] != '.xlsx': + raise ValueError(f'Output filename {out} is not a valid excel file. Please use .xlsx as the extension.') # check if the json file exists utils.check_file_existence(json_file_path, f'Config file {json_file_path} does not exist. ' @@ -101,10 +105,6 @@ # check that the sample scale factor is the same as the genome scale factor for all organisms if scale != sample_sig_info[4]: raise ValueError(f'Sample scale factor does not equal genome scale factor. 
Please check your input.') - - # check if the output filename is valid - if not isinstance(out_filename, str) and out_filename != '': - out_filename = 'result.xlsx' # compute hypothesis recovery logger.info('Computing hypothesis recovery.') @@ -131,7 +131,7 @@ # save the results into Excel file logger.info(f'Saving results to {outdir}.') # save the results with different min_coverage - with pd.ExcelWriter(os.path.join(outdir, out_filename), engine='openpyxl', mode='w') as writer: + with pd.ExcelWriter(out, engine='openpyxl', mode='w') as writer: # save the raw results (i.e., min_coverage=1.0) if keep_raw: temp_mainifest = manifest_list[0].copy() From be0d1a7151105f16ef4f3390d8223b0d5a03e72f Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Fri, 20 Oct 2023 10:54:08 -0400 Subject: [PATCH 11/13] make make_training_data_from_sketches.py run faster --- make_training_data_from_sketches.py | 2 +- srcs/utils.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/make_training_data_from_sketches.py b/make_training_data_from_sketches.py index 1dbd8c8..038d091 100644 --- a/make_training_data_from_sketches.py +++ b/make_training_data_from_sketches.py @@ -78,7 +78,7 @@ # remove the close related organisms: any organisms with ANI > ani_thresh # pick only the one with largest number of unique kmers from all the close related organisms logger.info("Removing the close related organisms with ANI > ani_thresh") - remove_corr_df, manifest_df = utils.remove_corr_organisms_from_ref(sig_info_dict, multisearch_result, ani_thresh, ksize) + remove_corr_df, manifest_df = utils.remove_corr_organisms_from_ref(sig_info_dict, multisearch_result) # write out the manifest file logger.info("Writing out the manifest file") diff --git a/srcs/utils.py b/srcs/utils.py index 77a3434..15f4f46 100644 --- a/srcs/utils.py +++ b/srcs/utils.py @@ -116,7 +116,7 @@ def run_multisearch(num_threads: int, ani_thresh: float, ksize: int, scale: int, return multisearch_result -def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, int, int]], multisearch_result: pd.DataFrame, ani_thresh: float, ksize: int) -> Tuple[Dict[str, List[str]], pd.DataFrame]: +def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, int, int]], multisearch_result: pd.DataFrame) -> Tuple[Dict[str, List[str]], pd.DataFrame]: """ Helper function that removes the close related organisms from the reference matrix. 
:param sig_info_dict: a dictionary mapping all signature name from reference data to a tuple (md5sum, minhash mean abundance, minhash hashes length, minhash scaled) @@ -133,19 +133,24 @@ def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, in bysize = np.argsort(sizes) corr_organisms_bysize = np.array(corr_organisms)[bysize].tolist() + # use dictionary to store the removed organisms and their close related organisms + # key: removed organism name + # value: a set of close related organisms + mapping = multisearch_result.groupby('query_name')['match_name'].agg(set).to_dict() + # remove the sorted organisms until all left genomes are distinct (e.g., ANI <= ani_thresh) temp_remove_set = set() # loop through the organisms size in ascending order for organism in tqdm(corr_organisms_bysize, desc='Removing close related organisms'): ## for a given organism check its close related organisms, see if there are any organisms left after removing those in the remove set ## if so, put this organism in the remove set - left_corr_orgs = set(multisearch_result.query(f'query_name == "{organism}"')['match_name']).difference(temp_remove_set) + left_corr_orgs = mapping[organism].difference(temp_remove_set) if len(left_corr_orgs) > 0: temp_remove_set.add(organism) # generate a dataframe with two columns: removed organism name and its close related organisms logger.info(f'Generating a dataframe with two columns: removed organism name and its close related organisms.') - remove_corr_list = [(organism, ','.join(list(set(multisearch_result.query(f'query_name == "{organism}"')['match_name'])))) for organism in tqdm(temp_remove_set)] + remove_corr_list = [(organism, ','.join(list(mapping[organism]))) for organism in tqdm(temp_remove_set)] remove_corr_df = pd.DataFrame(remove_corr_list, columns=['removed_org', 'corr_orgs']) # remove the close related organisms from the reference genome list From e52a79ec4ef4cf4bd453c80b14fe3f41bc050e5a Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Fri, 20 Oct 2023 10:54:44 -0400 Subject: [PATCH 12/13] update README and test_workflow.py for the new change --- README.md | 7 +++---- tests/test_workflow.py | 7 ++----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a668d3e..190eba6 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig. 
python ../make_training_data_from_sketches.py --ref_file ref.sig.zip --ksize 31 --num_threads ${NUM_THREADS} --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./ --force # run YACHT algorithm to check the presence of reference genomes in the query sample (inference step) -python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads ${NUM_THREADS} --min_coverage_list 1 0.6 0.2 0.1 --out_filename result.xlsx +python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads ${NUM_THREADS} --min_coverage_list 1 0.6 0.2 0.1 --out ./result.xlsx # convert result to CAMI profile format (Optional) python ../srcs/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./ @@ -190,7 +190,7 @@ The most important parameter of this script is `--ani_thresh`: this is average n After this, you are ready to perform the hypothesis test for each organism in your reference database. This can be accomplished with something like: ```bash -python run_YACHT.py --json 'gtdb_ani_thresh_0.95_config.json' --sample_file 'sample.sig.zip' --num_threads 32 --keep_raw --significance 0.99 --min_coverage_list 1 0.5 0.1 0.05 0.01 --outdir ./ +python run_YACHT.py --json 'gtdb_ani_thresh_0.95_config.json' --sample_file 'sample.sig.zip' --num_threads 32 --keep_raw --significance 0.99 --min_coverage_list 1 0.5 0.1 0.05 0.01 --out ./result.xlsx ``` #### Parameter @@ -207,8 +207,7 @@ The `--min_coverage_list` parameter dictates a list of `min_coverage` which indi | --keep_raw | keep the raw result (i.e. `min_coverage=1`) no matter if the user specifies it | | --show_all | Show all organisms (no matter if present) | | --min_coverage_list | a list of `min_coverage` values, see more detailed description above (default: 1, 0.5, 0.1, 0.05, 0.01) | -| --out_filename | filename of output excel result (default: 'result.xlsx') | -| --outdir | the path to output directory where the results and intermediate files will be genreated | +| --out | path to output excel result (default: './result.xlsx') | #### Output diff --git a/tests/test_workflow.py b/tests/test_workflow.py index ae59c8f..f8cef9f 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -45,10 +45,7 @@ def test_full_workflow(): # then do the presence/absence estimation if exists(abundance_file): os.remove(abundance_file) - cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --outdir {data_dir} --out_filename {abundance_file} --show_all" - print(cmd) - - cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --out_file {os.path.join(data_dir,abundance_file)} --show_all" + cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --out {os.path.join(data_dir,abundance_file)} --show_all" res = subprocess.run(cmd, shell=True, check=True) # check that no errors were raised assert res.returncode == 0 @@ -83,7 +80,7 @@ def test_demo_workflow(): _ = subprocess.run(cmd, shell=True, 
check=True) cmd = f"cd {demo_dir}; python ../make_training_data_from_sketches.py --force --ref_file ref.sig.zip --ksize 31 --num_threads 1 --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./" _ = subprocess.run(cmd, shell=True, check=True) - cmd = f"cd {demo_dir}; python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads 1 --min_coverage_list 1 0.6 0.2 0.1 --out_filename result.xlsx" + cmd = f"cd {demo_dir}; python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads 1 --min_coverage_list 1 0.6 0.2 0.1 --out ./result.xlsx" _ = subprocess.run(cmd, shell=True, check=True) cmd = f"cd {demo_dir}; python ../srcs/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./" _ = subprocess.run(cmd, shell=True, check=True) From 9aeb904315090133be302995bdf71c83af54b130 Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Fri, 20 Oct 2023 10:59:39 -0400 Subject: [PATCH 13/13] fixed some small issues suggested by code smells --- srcs/utils.py | 3 +-- tests/test_workflow.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/srcs/utils.py b/srcs/utils.py index 15f4f46..4b7d974 100644 --- a/srcs/utils.py +++ b/srcs/utils.py @@ -87,7 +87,6 @@ def run_multisearch(num_threads: int, ani_thresh: float, ksize: int, scale: int, :param path_to_temp_dir: string (path to the folder to store the intermediate files) :return: a dataframe with symmetric pairwise multisearch result (query_name, match_name) """ - results = {} # run the sourmash multisearch # save signature files to a text file @@ -149,7 +148,7 @@ def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, in temp_remove_set.add(organism) # generate a dataframe with two columns: removed organism name and its close related organisms - logger.info(f'Generating a dataframe with two columns: removed organism name and its close related organisms.') + logger.info('Generating a dataframe with two columns: removed organism name and its close related organisms.') remove_corr_list = [(organism, ','.join(list(mapping[organism]))) for organism in tqdm(temp_remove_set)] remove_corr_df = pd.DataFrame(remove_corr_list, columns=['removed_org', 'corr_orgs']) diff --git a/tests/test_workflow.py b/tests/test_workflow.py index f8cef9f..fb7406f 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -13,7 +13,6 @@ def test_full_workflow(): test_dir = os.path.join(script_dir, 'tests') data_dir = os.path.join(test_dir, 'testdata') out_prefix = "20_genomes_trained" - full_out_prefix = os.path.join(data_dir, out_prefix) abundance_file = os.path.join(data_dir, "result.xlsx") reference_sketches = os.path.join(data_dir, "20_genomes_sketches.zip") sample_sketches = os.path.join(data_dir, "sample.sig.zip")
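[Editorial note, not part of the patch series] For readers following the series as a whole, the sketch below restates the deduplication strategy these patches converge on: the ANI threshold is converted to a containment threshold, the symmetric multisearch pairs are grouped per genome, genomes are visited from the smallest sketch to the largest, and a genome is dropped only while it still has a not-yet-removed close relative, so the largest member of each related group survives. The genome names and k-mer counts below are invented for illustration; in the real pipeline they come from sourmash multisearch and the reference sketches.

```python
import pandas as pd

# ANI threshold -> containment threshold, as in run_multisearch
ani_thresh, ksize = 0.95, 31
containment_thresh = ani_thresh ** ksize
print(f"containment threshold for ANI {ani_thresh} at k={ksize}: {containment_thresh:.3f}")

# toy symmetric multisearch result: pairs of closely related genomes
pairs = pd.DataFrame({
    "query_name": ["A", "B", "B", "C"],
    "match_name": ["B", "A", "C", "B"],
})
# toy number of unique k-mers per genome sketch
unique_kmers = {"A": 5000, "B": 7000, "C": 3000}

# group close relatives per genome, then visit genomes from smallest to largest
mapping = pairs.groupby("query_name")["match_name"].agg(set).to_dict()
by_size = sorted(mapping, key=lambda org: unique_kmers[org])

removed = set()
for org in by_size:
    # drop this genome if it still has a close relative that has not been removed
    if mapping[org] - removed:
        removed.add(org)

print(sorted(removed))  # ['A', 'C'] -> only 'B', the largest of the related group, is kept
```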