From 6db73a947e144716d5658f05962b176fcff4de09 Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Thu, 19 Oct 2023 17:47:55 -0400 Subject: [PATCH 01/13] update README file for the renamed output file --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index abf16f2..1b4b8f0 100644 --- a/README.md +++ b/README.md @@ -179,8 +179,8 @@ The most important parameter of this script is `--ani_thresh`: this is average n | File (names starting with prefix) | Content | | ------------------------------------- | ------------------------------------------------------------ | | _config.json | A JSON file stores the required information needed to run the next YACHT algorithm | -| _manifest.tsv | A TSV file contains organisms and their relevant info after removing the similar ones. -| _rep_to_corr_orgas_mapping.tsv | A TSV file contains representative organisms and their similar organisms that have been removed | +| _manifest.tsv | A TSV file containing organisms and their relevant info after similar ones have been removed | +| _removed_orgs_to_corr_orgas_mapping.tsv | A TSV file with two columns: removed organism names ('removed_org') and their similar genomes ('corr_orgs') |
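[Editorial note, not part of the patch series] The mapping file documented above is a plain two-column TSV written with pandas, so it can be inspected directly. Below is a minimal sketch of loading it and expanding the comma-separated `corr_orgs` column; the prefix `demo_ani_thresh_0.95` is only the example prefix used elsewhere in the README, and the column handling assumes the format described in the table above.

```python
import pandas as pd

# Load the removed-organism mapping written by make_training_data_from_sketches.py.
# Substitute your own --prefix value for the demo prefix used here.
mapping = pd.read_csv("demo_ani_thresh_0.95_removed_orgs_to_corr_orgas_mapping.tsv", sep="\t")

# 'corr_orgs' holds a comma-separated list of the similar genomes that caused the removal;
# split it into one row per (removed_org, similar genome) pair for easier filtering.
mapping["corr_orgs"] = mapping["corr_orgs"].str.split(",")
exploded = mapping.explode("corr_orgs").rename(columns={"corr_orgs": "similar_org"})
print(exploded.head())
```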
From 75a7d938836c429311fa6123669fccd4fcc9257d Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Thu, 19 Oct 2023 17:49:01 -0400 Subject: [PATCH 02/13] update the functions for removing the similar organisms --- make_training_data_from_sketches.py | 21 +++---- srcs/utils.py | 96 +++++++++++------------------ 2 files changed, 44 insertions(+), 73 deletions(-) diff --git a/make_training_data_from_sketches.py b/make_training_data_from_sketches.py index 190b021..85e9388 100644 --- a/make_training_data_from_sketches.py +++ b/make_training_data_from_sketches.py @@ -65,12 +65,12 @@ # Find the close related genomes with ANI > ani_thresh from the reference database logger.info("Find the close related genomes with ANI > ani_thresh from the reference database") - sig_same_genoms_dict = utils.run_multisearch(num_threads, ani_thresh, ksize, scale, path_to_temp_dir) + multisearch_result = utils.run_multisearch(num_threads, ani_thresh, ksize, scale, path_to_temp_dir) # remove the close related organisms: any organisms with ANI > ani_thresh # pick only the one with largest number of unique kmers from all the close related organisms logger.info("Removing the close related organisms with ANI > ani_thresh") - rep_remove_dict, manifest_df = utils.remove_corr_organisms_from_ref(sig_info_dict, sig_same_genoms_dict) + remove_corr_df, manifest_df = utils.remove_corr_organisms_from_ref(sig_info_dict, multisearch_result, ani_thresh, ksize) # write out the manifest file logger.info("Writing out the manifest file") @@ -79,22 +79,19 @@ # write out a mapping dataframe from representative organism to the close related organisms logger.info("Writing out a mapping dataframe from representative organism to the close related organisms") - if len(rep_remove_dict) == 0: - logger.warning("No close related organisms found. 
No mapping dataframe is written.") - rep_remove_df = pd.DataFrame(columns=['rep_org', 'corr_orgs']) - rep_remove_df_path = os.path.join(outdir, f'{prefix}_rep_to_corr_orgas_mapping.tsv') - rep_remove_df.to_csv(rep_remove_df_path, sep='\t', index=None) + if len(remove_corr_df) == 0: + logger.warning("No close related organisms found.") + remove_corr_df_indicator = "" else: - rep_remove_df = pd.DataFrame([(rep_org, ','.join(corr_org_list)) for rep_org, corr_org_list in rep_remove_dict.items()]) - rep_remove_df.columns = ['rep_org', 'corr_orgs'] - rep_remove_df_path = os.path.join(outdir, f'{prefix}_rep_to_corr_orgas_mapping.tsv') - rep_remove_df.to_csv(rep_remove_df_path, sep='\t', index=None) + remove_corr_df_path = os.path.join(outdir, f'{prefix}_removed_orgs_to_corr_orgas_mapping.tsv') + remove_corr_df.to_csv(remove_corr_df_path, sep='\t', index=None) + remove_corr_df_indicator = remove_corr_df_path # save the config file logger.info("Saving the config file") json_file_path = os.path.join(outdir, f'{prefix}_config.json') json.dump({'manifest_file_path': manifest_file_path, - 'rep_remove_df_path': rep_remove_df_path, + 'remove_cor_df_path': remove_corr_df_indicator, 'intermediate_files_dir': path_to_temp_dir, 'scale': scale, 'ksize': ksize, diff --git a/srcs/utils.py b/srcs/utils.py index 376b949..5f128a5 100644 --- a/srcs/utils.py +++ b/srcs/utils.py @@ -85,7 +85,7 @@ def run_multisearch(num_threads: int, ani_thresh: float, ksize: int, scale: int, :param ksize: int (size of kmer) :param scale: int (scale factor) :param path_to_temp_dir: string (path to the folder to store the intermediate files) - :return: a dictionary mapping signature name to a list of its close related genomes (ANI > ani_thresh) + :return: a dataframe with symmetric pairwise multisearch result (query_name, match_name) """ results = {} @@ -96,7 +96,7 @@ def run_multisearch(num_threads: int, ani_thresh: float, ksize: int, scale: int, sig_files.to_csv(sig_files_path, header=False, index=False) # convert ani threshold to containment threshold - containment_thresh = 0.9*(ani_thresh ** ksize) + containment_thresh = (ani_thresh ** ksize) cmd = f"sourmash scripts multisearch {sig_files_path} {sig_files_path} -k {ksize} -s {scale} -c {num_threads} -t {containment_thresh} -o {os.path.join(path_to_temp_dir, 'training_multisearch_result.csv')}" logger.info(f"Running sourmash multisearch with command: {cmd}") exit_code = os.system(cmd) @@ -105,82 +105,56 @@ def run_multisearch(num_threads: int, ani_thresh: float, ksize: int, scale: int, # read the multisearch result multisearch_result = pd.read_csv(os.path.join(path_to_temp_dir, 'training_multisearch_result.csv'), sep=',', header=0) - multisearch_result = multisearch_result.drop_duplicates().reset_index(drop=True) multisearch_result = multisearch_result.query('query_name != match_name').reset_index(drop=True) + + # because the multisearch result is not symmetric, that is + # we have: A B score but not B A score + # we need to make it symmetric + A_TO_B = multisearch_result[['query_name','match_name']].drop_duplicates().reset_index(drop=True) + B_TO_A = A_TO_B[['match_name','query_name']].rename(columns={'match_name':'query_name','query_name':'match_name'}) + multisearch_result = pd.concat([A_TO_B, B_TO_A]).drop_duplicates().reset_index(drop=True) - for query_name, match_name in tqdm(multisearch_result[['query_name', 'match_name']].to_numpy()): - if str(query_name) not in results: - results[str(query_name)] = [str(match_name)] - else: - 
results[str(query_name)].append(str(match_name)) - - return results + return multisearch_result -def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, int, int]], sig_same_genoms_dict: Dict[str, List[str]]) -> Tuple[Dict[str, List[str]], pd.DataFrame]: +def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, int, int]], multisearch_result: pd.DataFrame, ani_thresh: float, ksize: int) -> Tuple[Dict[str, List[str]], pd.DataFrame]: """ Helper function that removes the close related organisms from the reference matrix. :param sig_info_dict: a dictionary mapping all signature name from reference data to a tuple (md5sum, minhash mean abundance, minhash hashes length, minhash scaled) - :param sig_same_genoms_dict: a dictionary mapping signature name to a list of its close related genomes (ANI > ani_thresh) + :param multisearch_result: a dataframe with symmetric pairwise multisearch result (query_name, match_name) :return - rep_remove_dict: a dictionary with key as representative signature name and value as a list of signatures to be removed + remove_corr_df: a dataframe with two columns: removed organism name and its close related organisms manifest_df: a dataframe containing the processed reference signature information """ - # for each genome with close related genomes, pick the one with largest number of unique kmers - rep_remove_dict = {} + # extract organisms that have close related organisms and their number of unique kmers + corr_organisms = [query_name for query_name in multisearch_result['query_name'].unique()] + sizes = np.array([sig_info_dict[organism][2] for organism in corr_organisms]) + # sort organisms by size in ascending order, so we keep the largest organism, discard the smallest + bysize = np.argsort(sizes) + corr_organisms_bysize = np.array(corr_organisms)[bysize].tolist() + + # remove the sorted organisms until all left genomes are distinct (e.g., ANI <= ani_thresh) temp_remove_set = set() - manifest_df = [] - for genome, same_genomes in tqdm(sig_same_genoms_dict.items()): - # skip if the genome has been removed - if genome in temp_remove_set: - continue - # keep same genome if it is not in the remove set - same_genomes = list(set(same_genomes).difference(temp_remove_set)) - # get the number of unique kmers for each genome - unique_kmers = np.array([sig_info_dict[genome][2]] + [sig_info_dict[same_genome][2] for same_genome in same_genomes]) - # get the index of the genome with largest number of unique kmers - rep_idx = np.argmax(unique_kmers) - # get the representative genome - rep_genome = genome if rep_idx == 0 else same_genomes[rep_idx-1] - # get the list of genomes to be removed - remove_genomes = same_genomes if rep_idx == 0 else [genome] + same_genomes[:rep_idx-1] + same_genomes[rep_idx:] - # update remove set - temp_remove_set.update(remove_genomes) - if len(remove_genomes) > 0: - rep_remove_dict[rep_genome] = remove_genomes + # loop through the organisms size in ascending order + for organism in tqdm(corr_organisms_bysize, desc='Removing close related organisms'): + ## for a given organism check its close related organisms, see if there are any organisms left after removing those in the remove set + ## if so, put this organism in the remove set + left_corr_orgs = set(multisearch_result.query(f'query_name == "{organism}"')['match_name']).difference(temp_remove_set) + if len(left_corr_orgs) > 0: + temp_remove_set.add(organism) + + # generate a dataframe with two columns: removed organism name and its close related organisms 
+ logger.info(f'Generating a dataframe with two columns: removed organism name and its close related organisms.') + remove_corr_list = [(organism, ','.join(list(set(multisearch_result.query(f'query_name == "{organism}"')['match_name'])))) for organism in tqdm(temp_remove_set)] + remove_corr_df = pd.DataFrame(remove_corr_list, columns=['removed_org', 'corr_orgs']) # remove the close related organisms from the reference genome list + manifest_df = [] for sig_name, (md5sum, minhash_mean_abundance, minhash_hashes_len, minhash_scaled) in tqdm(sig_info_dict.items()): if sig_name not in temp_remove_set: manifest_df.append((sig_name, md5sum, minhash_hashes_len, get_num_kmers(minhash_mean_abundance, minhash_hashes_len, minhash_scaled, False), minhash_scaled)) manifest_df = pd.DataFrame(manifest_df, columns=['organism_name', 'md5sum', 'num_unique_kmers_in_genome_sketch', 'num_total_kmers_in_genome_sketch', 'genome_scale_factor']) - return rep_remove_dict, manifest_df - -# def compute_sample_vector(sample_hashes, hash_to_idx): -# """ -# Helper function that computes the sample vector for a given sample signature. -# :param sample_hashes: hashes in the sample signature -# :param hash_to_idx: dictionary mapping hashes to indices in the training dictionary -# :return: numpy array (sample vector) -# """ -# # total number of hashes in the training dictionary -# hash_to_idx_keys = set(hash_to_idx.keys()) - -# # total number of hashes in the sample -# sample_hashes_keys = set(sample_hashes.keys()) - -# # initialize the sample vector -# sample_vector = np.zeros(len(hash_to_idx_keys)) - -# # get the hashes that are in both the sample and the training dictionary -# sample_intersect_training_hashes = hash_to_idx_keys.intersection(sample_hashes_keys) - -# # fill in the sample vector -# for sh in tqdm(sample_intersect_training_hashes): -# sample_vector[hash_to_idx[sh]] = sample_hashes[sh] - -# return sample_vector - + return remove_corr_df, manifest_df class Prediction: """ From a2aaf4bea102d3c7e08ecd703a80da8d412509a6 Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Thu, 19 Oct 2023 19:59:46 -0400 Subject: [PATCH 03/13] sort similar genome names in order to better check the removed genomes --- srcs/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/srcs/utils.py b/srcs/utils.py index 5f128a5..0f34575 100644 --- a/srcs/utils.py +++ b/srcs/utils.py @@ -126,7 +126,8 @@ def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, in manifest_df: a dataframe containing the processed reference signature information """ # extract organisms that have close related organisms and their number of unique kmers - corr_organisms = [query_name for query_name in multisearch_result['query_name'].unique()] + # sort name in order to better check the removed organisms + corr_organisms = sorted([query_name for query_name in multisearch_result['query_name'].unique()]) sizes = np.array([sig_info_dict[organism][2] for organism in corr_organisms]) # sort organisms by size in ascending order, so we keep the largest organism, discard the smallest bysize = np.argsort(sizes) From f005c21756c223c7cad5ca907c3cf298bbf404c2 Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Thu, 19 Oct 2023 20:28:43 -0400 Subject: [PATCH 04/13] update test_workflow.py for the new change --- tests/test_workflow.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/test_workflow.py b/tests/test_workflow.py index ffd4241..57635f9 100644 --- a/tests/test_workflow.py +++ 
b/tests/test_workflow.py @@ -27,7 +27,7 @@ def test_full_workflow(): # In testdata/ # 20_genomes_trained_config.json # 20_genomes_trained_processed_manifest.tsv - # 20_genomes_trained_rep_to_corr_orgas_mapping.tsv + # 20_genomes_trained_removed_orgs_to_corr_orgas_mapping.tsv # /testdata/20_genomes_trained_intermediate_files$ tree . # . # ├── signatures @@ -62,7 +62,7 @@ def test_full_workflow(): shutil.rmtree(os.path.join(data_dir, intermediate_dir), ignore_errors=True) # python ../make_training_data_from_sketches.py --ref_file testdata/20_genomes_sketches.zip --ksize 31 --prefix 20_genomes_trained --outdir testdata/ cmd = f"python {os.path.join(script_dir, 'make_training_data_from_sketches.py')} --ref_file {reference_sketches}" \ - f" --prefix {full_out_prefix} --ksize 31 --outdir {data_dir}" + f" --prefix {out_prefix} --ksize 31 --outdir {data_dir}" res = subprocess.run(cmd, shell=True, check=True) # check that no errors were raised assert res.returncode == 0 @@ -71,7 +71,7 @@ def test_full_workflow(): assert exists(f) # check that the files are big enough for f in expected_files: - assert os.stat(f).st_size > 400 + assert os.stat(f).st_size > 300 # then do the presence/absence estimation if exists(abundance_file): os.remove(abundance_file) @@ -80,9 +80,6 @@ def test_full_workflow(): print(cmd) # ~/pycharm/YACHT/tests/testdata$ tree 20_genomes_trained_intermediate_files/ # 20_genomes_trained_intermediate_files/ - # ├── organism_sig_file.txt # <-- new - # ├── sample_multisearch_result.csv - # ├── sample_sig_file.txt # ├── signatures # │   ├── 04212e93c2172d4df49dc5d8c2973d8b.sig.gz # │   ├── 04f2b0e94f2d1f1f5b8355114b70274e.sig.gz @@ -107,6 +104,14 @@ def test_full_workflow(): # ├── SOURMASH-MANIFEST.csv # ├── training_multisearch_result.csv # └── training_sig_files.txt + # ~/pycharm/YACHT/tests/testdata$ tree sample_sample_intermediate_files + # sample_sample_intermediate_files + # ├── organism_sig_file.txt + # ├── sample_multisearch_result.csv + # ├── sample_sig_file.txt + # ├── signatures + # │ └── 2b6488794c648f540068adacd5aef77e.sig.gz + # └── SOURMASH-MANIFEST.csv # print(cmd) res = subprocess.run(cmd, shell=True, check=True) # check that no errors were raised From 0c72b3ba6650dfebeaaf44a6717b139f206944b5 Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Thu, 19 Oct 2023 21:29:44 -0400 Subject: [PATCH 05/13] add parameter "--outdir" in run_YACHT.py --- run_YACHT.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/run_YACHT.py b/run_YACHT.py index 28efc3a..df764aa 100644 --- a/run_YACHT.py +++ b/run_YACHT.py @@ -10,6 +10,7 @@ import json import warnings import zipfile +from pathlib import Path warnings.filterwarnings("ignore") from tqdm import tqdm from loguru import logger @@ -32,7 +33,8 @@ 'Each value should be between 0 and 1, with 0 being the most sensitive (and least ' 'precise) and 1 being the most precise (and least sensitive).', required=False, default=[1, 0.5, 0.1, 0.05, 0.01]) - parser.add_argument('--out_filename', help='Full path of output filename', required=False, default='result.xlsx') + parser.add_argument('--out_filename', help='output excel filename', required=False, default='result.xlsx') + parser.add_argument('--outdir', type=str, help='path to output directory', required=False, default=os.getcwd()) # parse the arguments args = parser.parse_args() @@ -44,6 +46,7 @@ show_all = args.show_all # Show all organisms (no matter if present) in output file. 
min_coverage_list = args.min_coverage_list # a list of percentages of unique k-mers covered by reads in the sample. out_filename = args.out_filename # output filename + outdir = str(Path(args.outdir).absolute()) # path to output directory # check if the json file exists utils.check_file_existence(json_file_path, f'Config file {json_file_path} does not exist. ' @@ -57,8 +60,8 @@ ani_thresh = config['ani_thresh'] # Make sure the output can be written to - if not os.access(os.path.abspath(os.path.dirname(out_filename)), os.W_OK): - raise FileNotFoundError(f"Cannot write to the location: {os.path.abspath(os.path.dirname(out_filename))}.") + if not os.access(outdir, os.W_OK): + raise FileNotFoundError(f"Cannot write to the location: {outdir}.") # check if min_coverage is between 0 and 1 for x in min_coverage_list: @@ -126,9 +129,9 @@ manifest_list = temp_manifest_list # save the results into Excel file - logger.info(f'Saving results to {os.path.dirname(out_filename)}.') + logger.info(f'Saving results to {outdir}.') # save the results with different min_coverage - with pd.ExcelWriter(out_filename, engine='openpyxl', mode='w') as writer: + with pd.ExcelWriter(os.path.join(outdir, out_filename), engine='openpyxl', mode='w') as writer: # save the raw results (i.e., min_coverage=1.0) if keep_raw: temp_mainifest = manifest_list[0].copy() From e98f3e5bf854af8a7ee18ddb597037d6187d80eb Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Thu, 19 Oct 2023 21:30:01 -0400 Subject: [PATCH 06/13] update test_workflow.py for the new change --- tests/test_workflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_workflow.py b/tests/test_workflow.py index 5ae7609..0ac2852 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -61,7 +61,7 @@ def test_full_workflow(): # Remove the intermediate folder shutil.rmtree(os.path.join(data_dir, intermediate_dir), ignore_errors=True) # python ../make_training_data_from_sketches.py --ref_file testdata/20_genomes_sketches.zip --ksize 31 --prefix 20_genomes_trained --outdir testdata/ - cmd = f"python {os.path.join(script_dir, 'make_training_data_from_sketches.py')} --ref_file {reference_sketches}" \ + cmd = f"python {os.path.join(script_dir, 'make_training_data_from_sketches.py')} --force --ref_file {reference_sketches}" \ f" --prefix {out_prefix} --ksize 31 --outdir {data_dir}" res = subprocess.run(cmd, shell=True, check=True) # check that no errors were raised @@ -76,7 +76,7 @@ def test_full_workflow(): if exists(abundance_file): os.remove(abundance_file) # python ../run_YACHT.py --json testdata/20_genomes_trained_config.json --sample_file testdata/sample.sig.zip --out_file result.xlsx --outdir testdata/ - cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --outdir {data_dir} --out_file {abundance_file} --show_all" + cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --outdir {data_dir} --out_filename {abundance_file} --show_all" print(cmd) # ~/pycharm/YACHT/tests/testdata$ tree 20_genomes_trained_intermediate_files/ # 20_genomes_trained_intermediate_files/ From 22840cdf4787b18457f6cff3da7145babbda8a74 Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Thu, 19 Oct 2023 23:29:17 -0400 Subject: [PATCH 07/13] remove unnecessary comments --- 
tests/test_workflow.py | 68 +----------------------------------------- 1 file changed, 1 insertion(+), 67 deletions(-) diff --git a/tests/test_workflow.py b/tests/test_workflow.py index 0ac2852..ae59c8f 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -24,36 +24,6 @@ def test_full_workflow(): "training_sig_files.txt"]))) # one of the signature files expected_files.extend(list(map(lambda x: os.path.join(data_dir, intermediate_dir, "signatures", x), ["04212e93c2172d4df49dc5d8c2973d8b.sig.gz"]))) - # In testdata/ - # 20_genomes_trained_config.json - # 20_genomes_trained_processed_manifest.tsv - # 20_genomes_trained_removed_orgs_to_corr_orgas_mapping.tsv - # /testdata/20_genomes_trained_intermediate_files$ tree . - # . - # ├── signatures - # │   ├── 04212e93c2172d4df49dc5d8c2973d8b.sig.gz - # │   ├── 04f2b0e94f2d1f1f5b8355114b70274e.sig.gz - # │   ├── 0661ecab88c3d65d0f10e599a5ba1654.sig.gz - # │   ├── 06ebe48d527882bfa9505aba8e31ae23.sig.gz - # │   ├── 11fe9a00287c7ad086ebbc463724cf10.sig.gz - # │   ├── 16c6c1d37259d83088ab3a4f5b691631.sig.gz - # │   ├── 188d55801a78d4773cf6c0b46bca96ba.sig.gz - # │   ├── 1a121dca600c6504e88252e81004f0cf.sig.gz - # │   ├── 39ea7fd48ee7003587c9c763946d5d6e.sig.gz - # │   ├── 45f2675c1dca4ef1a24a05f5b268adbb.sig.gz - # │   ├── 7b312fffa3fb35440ba40203ba826c05.sig.gz - # │   ├── 8fb9b1a69838a58cc4f31c1e42a5f189.sig.gz - # │   ├── 92fb1b3e4baa6c474aff3efb84957687.sig.gz - # │   ├── 96cb85214535b0f9723a6abc17097821.sig.gz - # │   ├── a136145bee08846ed94c0406df3da2d4.sig.gz - # │   ├── b691deddf179ead0a006527330d86dde.sig.gz - # │   ├── b7f087146f5cc3121477c29ff003e3d0.sig.gz - # │   ├── c39c52d2d088348c950c2afe503b405b.sig.gz - # │   ├── c9eb6a9d058df8036ad93bc45d5bf260.sig.gz - # │   └── ce54d962851b0fdeefc624300036a133.sig.gz - # ├── SOURMASH-MANIFEST.csv - # ├── training_multisearch_result.csv - # └── training_sig_files.txt # remove the files if they exist for f in expected_files: if exists(f): @@ -75,45 +45,9 @@ def test_full_workflow(): # then do the presence/absence estimation if exists(abundance_file): os.remove(abundance_file) - # python ../run_YACHT.py --json testdata/20_genomes_trained_config.json --sample_file testdata/sample.sig.zip --out_file result.xlsx --outdir testdata/ cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --outdir {data_dir} --out_filename {abundance_file} --show_all" print(cmd) - # ~/pycharm/YACHT/tests/testdata$ tree 20_genomes_trained_intermediate_files/ - # 20_genomes_trained_intermediate_files/ - # ├── signatures - # │   ├── 04212e93c2172d4df49dc5d8c2973d8b.sig.gz - # │   ├── 04f2b0e94f2d1f1f5b8355114b70274e.sig.gz - # │   ├── 0661ecab88c3d65d0f10e599a5ba1654.sig.gz - # │   ├── 06ebe48d527882bfa9505aba8e31ae23.sig.gz - # │   ├── 11fe9a00287c7ad086ebbc463724cf10.sig.gz - # │   ├── 16c6c1d37259d83088ab3a4f5b691631.sig.gz - # │   ├── 188d55801a78d4773cf6c0b46bca96ba.sig.gz - # │   ├── 1a121dca600c6504e88252e81004f0cf.sig.gz - # │   ├── 39ea7fd48ee7003587c9c763946d5d6e.sig.gz - # │   ├── 45f2675c1dca4ef1a24a05f5b268adbb.sig.gz - # │   ├── 7b312fffa3fb35440ba40203ba826c05.sig.gz - # │   ├── 8fb9b1a69838a58cc4f31c1e42a5f189.sig.gz - # │   ├── 92fb1b3e4baa6c474aff3efb84957687.sig.gz - # │   ├── 96cb85214535b0f9723a6abc17097821.sig.gz - # │   ├── a136145bee08846ed94c0406df3da2d4.sig.gz - # │   ├── b691deddf179ead0a006527330d86dde.sig.gz - # │   ├── 
b7f087146f5cc3121477c29ff003e3d0.sig.gz - # │   ├── c39c52d2d088348c950c2afe503b405b.sig.gz - # │   ├── c9eb6a9d058df8036ad93bc45d5bf260.sig.gz - # │   └── ce54d962851b0fdeefc624300036a133.sig.gz - # ├── SOURMASH-MANIFEST.csv - # ├── training_multisearch_result.csv - # └── training_sig_files.txt - # ~/pycharm/YACHT/tests/testdata$ tree sample_sample_intermediate_files - # sample_sample_intermediate_files - # ├── organism_sig_file.txt - # ├── sample_multisearch_result.csv - # ├── sample_sig_file.txt - # ├── signatures - # │ └── 2b6488794c648f540068adacd5aef77e.sig.gz - # └── SOURMASH-MANIFEST.csv - # print(cmd) - # python ../run_YACHT.py --json testdata/20_genomes_trained_config.json --sample_file testdata/sample.sig.zip --out_file result.xlsx + cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --out_file {os.path.join(data_dir,abundance_file)} --show_all" res = subprocess.run(cmd, shell=True, check=True) # check that no errors were raised From 7733c0571e98664e6a82ed6db37a4bb163b94433 Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Fri, 20 Oct 2023 07:29:26 -0400 Subject: [PATCH 08/13] fix a small bug --- srcs/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/srcs/utils.py b/srcs/utils.py index 0f34575..77a3434 100644 --- a/srcs/utils.py +++ b/srcs/utils.py @@ -127,7 +127,7 @@ def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, in """ # extract organisms that have close related organisms and their number of unique kmers # sort name in order to better check the removed organisms - corr_organisms = sorted([query_name for query_name in multisearch_result['query_name'].unique()]) + corr_organisms = sorted([str(query_name) for query_name in multisearch_result['query_name'].unique()]) sizes = np.array([sig_info_dict[organism][2] for organism in corr_organisms]) # sort organisms by size in ascending order, so we keep the largest organism, discard the smallest bysize = np.argsort(sizes) From a746799e78aef2fc8884b7d353eb6169a2a590df Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Fri, 20 Oct 2023 10:09:58 -0400 Subject: [PATCH 09/13] if use "--force", remove temporary directory --- make_training_data_from_sketches.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/make_training_data_from_sketches.py b/make_training_data_from_sketches.py index c8599a2..1dbd8c8 100644 --- a/make_training_data_from_sketches.py +++ b/make_training_data_from_sketches.py @@ -8,6 +8,7 @@ import srcs.utils as utils from loguru import logger import json +import shutil logger.remove() logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO") @@ -48,6 +49,11 @@ path_to_temp_dir = os.path.join(outdir, prefix+'_intermediate_files') if os.path.exists(path_to_temp_dir) and not force: raise ValueError(f"Temporary directory {path_to_temp_dir} already exists. Please remove it or given a new prefix name using parameter '--prefix'.") + else: + # remove the temporary directory if it exists + if os.path.exists(path_to_temp_dir): + logger.warning(f"Temporary directory {path_to_temp_dir} already exists. 
Removing it.") + shutil.rmtree(path_to_temp_dir) os.makedirs(path_to_temp_dir, exist_ok=True) # unzip the sourmash signature file to the temporary directory From 214d2cf997dc1725d5f40bbf74f9e5d81bace48c Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Fri, 20 Oct 2023 10:53:17 -0400 Subject: [PATCH 10/13] use the "--out" parameter only in run_YACHT.py to specify the result --- run_YACHT.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/run_YACHT.py b/run_YACHT.py index df764aa..fe7d99a 100644 --- a/run_YACHT.py +++ b/run_YACHT.py @@ -33,8 +33,7 @@ 'Each value should be between 0 and 1, with 0 being the most sensitive (and least ' 'precise) and 1 being the most precise (and least sensitive).', required=False, default=[1, 0.5, 0.1, 0.05, 0.01]) - parser.add_argument('--out_filename', help='output excel filename', required=False, default='result.xlsx') - parser.add_argument('--outdir', type=str, help='path to output directory', required=False, default=os.getcwd()) + parser.add_argument('--out', type=str, help='path to output excel file', required=False, default=os.path.join(os.getcwd(), 'result.xlsx')) # parse the arguments args = parser.parse_args() @@ -45,8 +44,13 @@ keep_raw = args.keep_raw # Keep raw results in output file. show_all = args.show_all # Show all organisms (no matter if present) in output file. min_coverage_list = args.min_coverage_list # a list of percentages of unique k-mers covered by reads in the sample. - out_filename = args.out_filename # output filename - outdir = str(Path(args.outdir).absolute()) # path to output directory + out = str(Path(args.out).absolute()) # full path to output excel file + outdir = os.path.dirname(out) # path to output directory + out_filename = os.path.basename(out) # output filename + + # check if the output filename is valid + if os.path.splitext(out_filename)[1] != '.xlsx': + raise ValueError(f'Output filename {out} is not a valid excel file. Please use .xlsx as the extension.') # check if the json file exists utils.check_file_existence(json_file_path, f'Config file {json_file_path} does not exist. ' @@ -101,10 +105,6 @@ # check that the sample scale factor is the same as the genome scale factor for all organisms if scale != sample_sig_info[4]: raise ValueError(f'Sample scale factor does not equal genome scale factor. 
Please check your input.') - - # check if the output filename is valid - if not isinstance(out_filename, str) and out_filename != '': - out_filename = 'result.xlsx' # compute hypothesis recovery logger.info('Computing hypothesis recovery.') @@ -131,7 +131,7 @@ # save the results into Excel file logger.info(f'Saving results to {outdir}.') # save the results with different min_coverage - with pd.ExcelWriter(os.path.join(outdir, out_filename), engine='openpyxl', mode='w') as writer: + with pd.ExcelWriter(out, engine='openpyxl', mode='w') as writer: # save the raw results (i.e., min_coverage=1.0) if keep_raw: temp_mainifest = manifest_list[0].copy() From be0d1a7151105f16ef4f3390d8223b0d5a03e72f Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Fri, 20 Oct 2023 10:54:08 -0400 Subject: [PATCH 11/13] make make_training_data_from_sketches.py run faster --- make_training_data_from_sketches.py | 2 +- srcs/utils.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/make_training_data_from_sketches.py b/make_training_data_from_sketches.py index 1dbd8c8..038d091 100644 --- a/make_training_data_from_sketches.py +++ b/make_training_data_from_sketches.py @@ -78,7 +78,7 @@ # remove the close related organisms: any organisms with ANI > ani_thresh # pick only the one with largest number of unique kmers from all the close related organisms logger.info("Removing the close related organisms with ANI > ani_thresh") - remove_corr_df, manifest_df = utils.remove_corr_organisms_from_ref(sig_info_dict, multisearch_result, ani_thresh, ksize) + remove_corr_df, manifest_df = utils.remove_corr_organisms_from_ref(sig_info_dict, multisearch_result) # write out the manifest file logger.info("Writing out the manifest file") diff --git a/srcs/utils.py b/srcs/utils.py index 77a3434..15f4f46 100644 --- a/srcs/utils.py +++ b/srcs/utils.py @@ -116,7 +116,7 @@ def run_multisearch(num_threads: int, ani_thresh: float, ksize: int, scale: int, return multisearch_result -def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, int, int]], multisearch_result: pd.DataFrame, ani_thresh: float, ksize: int) -> Tuple[Dict[str, List[str]], pd.DataFrame]: +def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, int, int]], multisearch_result: pd.DataFrame) -> Tuple[Dict[str, List[str]], pd.DataFrame]: """ Helper function that removes the close related organisms from the reference matrix. 
:param sig_info_dict: a dictionary mapping all signature name from reference data to a tuple (md5sum, minhash mean abundance, minhash hashes length, minhash scaled) @@ -133,19 +133,24 @@ def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, in bysize = np.argsort(sizes) corr_organisms_bysize = np.array(corr_organisms)[bysize].tolist() + # use dictionary to store the removed organisms and their close related organisms + # key: removed organism name + # value: a set of close related organisms + mapping = multisearch_result.groupby('query_name')['match_name'].agg(set).to_dict() + # remove the sorted organisms until all left genomes are distinct (e.g., ANI <= ani_thresh) temp_remove_set = set() # loop through the organisms size in ascending order for organism in tqdm(corr_organisms_bysize, desc='Removing close related organisms'): ## for a given organism check its close related organisms, see if there are any organisms left after removing those in the remove set ## if so, put this organism in the remove set - left_corr_orgs = set(multisearch_result.query(f'query_name == "{organism}"')['match_name']).difference(temp_remove_set) + left_corr_orgs = mapping[organism].difference(temp_remove_set) if len(left_corr_orgs) > 0: temp_remove_set.add(organism) # generate a dataframe with two columns: removed organism name and its close related organisms logger.info(f'Generating a dataframe with two columns: removed organism name and its close related organisms.') - remove_corr_list = [(organism, ','.join(list(set(multisearch_result.query(f'query_name == "{organism}"')['match_name'])))) for organism in tqdm(temp_remove_set)] + remove_corr_list = [(organism, ','.join(list(mapping[organism]))) for organism in tqdm(temp_remove_set)] remove_corr_df = pd.DataFrame(remove_corr_list, columns=['removed_org', 'corr_orgs']) # remove the close related organisms from the reference genome list From e52a79ec4ef4cf4bd453c80b14fe3f41bc050e5a Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Fri, 20 Oct 2023 10:54:44 -0400 Subject: [PATCH 12/13] update README and test_workflow.py for the new change --- README.md | 7 +++---- tests/test_workflow.py | 7 ++----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a668d3e..190eba6 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig. 
python ../make_training_data_from_sketches.py --ref_file ref.sig.zip --ksize 31 --num_threads ${NUM_THREADS} --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./ --force # run YACHT algorithm to check the presence of reference genomes in the query sample (inference step) -python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads ${NUM_THREADS} --min_coverage_list 1 0.6 0.2 0.1 --out_filename result.xlsx +python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads ${NUM_THREADS} --min_coverage_list 1 0.6 0.2 0.1 --out ./result.xlsx # convert result to CAMI profile format (Optional) python ../srcs/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./ @@ -190,7 +190,7 @@ The most important parameter of this script is `--ani_thresh`: this is average n After this, you are ready to perform the hypothesis test for each organism in your reference database. This can be accomplished with something like: ```bash -python run_YACHT.py --json 'gtdb_ani_thresh_0.95_config.json' --sample_file 'sample.sig.zip' --num_threads 32 --keep_raw --significance 0.99 --min_coverage_list 1 0.5 0.1 0.05 0.01 --outdir ./ +python run_YACHT.py --json 'gtdb_ani_thresh_0.95_config.json' --sample_file 'sample.sig.zip' --num_threads 32 --keep_raw --significance 0.99 --min_coverage_list 1 0.5 0.1 0.05 0.01 --out ./result.xlsx ``` #### Parameter @@ -207,8 +207,7 @@ The `--min_coverage_list` parameter dictates a list of `min_coverage` which indi | --keep_raw | keep the raw result (i.e. `min_coverage=1`) no matter if the user specifies it | | --show_all | Show all organisms (no matter if present) | | --min_coverage_list | a list of `min_coverage` values, see more detailed description above (default: 1, 0.5, 0.1, 0.05, 0.01) | -| --out_filename | filename of output excel result (default: 'result.xlsx') | -| --outdir | the path to output directory where the results and intermediate files will be genreated | +| --out | path to output excel result (default: './result.xlsx') | #### Output diff --git a/tests/test_workflow.py b/tests/test_workflow.py index ae59c8f..f8cef9f 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -45,10 +45,7 @@ def test_full_workflow(): # then do the presence/absence estimation if exists(abundance_file): os.remove(abundance_file) - cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --outdir {data_dir} --out_filename {abundance_file} --show_all" - print(cmd) - - cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --out_file {os.path.join(data_dir,abundance_file)} --show_all" + cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --out {os.path.join(data_dir,abundance_file)} --show_all" res = subprocess.run(cmd, shell=True, check=True) # check that no errors were raised assert res.returncode == 0 @@ -83,7 +80,7 @@ def test_demo_workflow(): _ = subprocess.run(cmd, shell=True, 
check=True) cmd = f"cd {demo_dir}; python ../make_training_data_from_sketches.py --force --ref_file ref.sig.zip --ksize 31 --num_threads 1 --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./" _ = subprocess.run(cmd, shell=True, check=True) - cmd = f"cd {demo_dir}; python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads 1 --min_coverage_list 1 0.6 0.2 0.1 --out_filename result.xlsx" + cmd = f"cd {demo_dir}; python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads 1 --min_coverage_list 1 0.6 0.2 0.1 --out ./result.xlsx" _ = subprocess.run(cmd, shell=True, check=True) cmd = f"cd {demo_dir}; python ../srcs/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./" _ = subprocess.run(cmd, shell=True, check=True) From 9aeb904315090133be302995bdf71c83af54b130 Mon Sep 17 00:00:00 2001 From: Chunyu Ma Date: Fri, 20 Oct 2023 10:59:39 -0400 Subject: [PATCH 13/13] fixed some small issues suggested by code smells --- srcs/utils.py | 3 +-- tests/test_workflow.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/srcs/utils.py b/srcs/utils.py index 15f4f46..4b7d974 100644 --- a/srcs/utils.py +++ b/srcs/utils.py @@ -87,7 +87,6 @@ def run_multisearch(num_threads: int, ani_thresh: float, ksize: int, scale: int, :param path_to_temp_dir: string (path to the folder to store the intermediate files) :return: a dataframe with symmetric pairwise multisearch result (query_name, match_name) """ - results = {} # run the sourmash multisearch # save signature files to a text file @@ -149,7 +148,7 @@ def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, in temp_remove_set.add(organism) # generate a dataframe with two columns: removed organism name and its close related organisms - logger.info(f'Generating a dataframe with two columns: removed organism name and its close related organisms.') + logger.info('Generating a dataframe with two columns: removed organism name and its close related organisms.') remove_corr_list = [(organism, ','.join(list(mapping[organism]))) for organism in tqdm(temp_remove_set)] remove_corr_df = pd.DataFrame(remove_corr_list, columns=['removed_org', 'corr_orgs']) diff --git a/tests/test_workflow.py b/tests/test_workflow.py index f8cef9f..fb7406f 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -13,7 +13,6 @@ def test_full_workflow(): test_dir = os.path.join(script_dir, 'tests') data_dir = os.path.join(test_dir, 'testdata') out_prefix = "20_genomes_trained" - full_out_prefix = os.path.join(data_dir, out_prefix) abundance_file = os.path.join(data_dir, "result.xlsx") reference_sketches = os.path.join(data_dir, "20_genomes_sketches.zip") sample_sketches = os.path.join(data_dir, "sample.sig.zip")
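[Editorial note, not part of the patch series] For readers following the series as a whole, the sketch below restates the deduplication strategy these patches converge on: the ANI threshold is converted to a containment threshold, the symmetric multisearch pairs are grouped per genome, genomes are visited from the smallest sketch to the largest, and a genome is dropped only while it still has a not-yet-removed close relative, so the largest member of each related group survives. The genome names and k-mer counts below are invented for illustration; in the real pipeline they come from sourmash multisearch and the reference sketches.

```python
import pandas as pd

# ANI threshold -> containment threshold, as in run_multisearch
ani_thresh, ksize = 0.95, 31
containment_thresh = ani_thresh ** ksize
print(f"containment threshold for ANI {ani_thresh} at k={ksize}: {containment_thresh:.3f}")

# toy symmetric multisearch result: pairs of closely related genomes
pairs = pd.DataFrame({
    "query_name": ["A", "B", "B", "C"],
    "match_name": ["B", "A", "C", "B"],
})
# toy number of unique k-mers per genome sketch
unique_kmers = {"A": 5000, "B": 7000, "C": 3000}

# group close relatives per genome, then visit genomes from smallest to largest
mapping = pairs.groupby("query_name")["match_name"].agg(set).to_dict()
by_size = sorted(mapping, key=lambda org: unique_kmers[org])

removed = set()
for org in by_size:
    # drop this genome if it still has a close relative that has not been removed
    if mapping[org] - removed:
        removed.add(org)

print(sorted(removed))  # ['A', 'C'] -> only 'B', the largest of the related group, is kept
```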