-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy paththreshold_candidates.py
executable file
·293 lines (268 loc) · 13.9 KB
/
threshold_candidates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
#!/usr/bin/env python
# David Prihoda
# Convert a Domain CSV file with predictions into a BGC candidate CSV file using a given prediction threshold
# Generated by averaging domain predictions by protein and then merging consecutive proteins with prediction satisfying given threshold
# Candidates can be postprocessed by merging and filtering
import argparse
import pandas as pd
import json
import os
import numpy as np
from multiprocessing import Pool
import hashlib
from average_protein_prediction import average_protein_prediction
def num_bio_pfams(pfam_ids):
    """
    Count how many distinct biosynthetic pfams appear in the given list of pfams
    :param pfam_ids: List of pfam_ids
    :return: number of unique biosynthetic pfams in given list of pfams
    """
    # Imported lazily so the (project-local) pfam table is only loaded when needed
    from biosynthetic_pfams import AS_BIO_PFAM_IDS as BIO_PFAM_IDS
    unique_bio = {pfam_id for pfam_id in pfam_ids if pfam_id in BIO_PFAM_IDS}
    return len(unique_bio)
def parse_cand_pfam_ids(pfam_string):
    """
    Get flat list of pfam_ids from candidate csv field 'pfam_ids'
    :param pfam_string: pfam_ids string separated by '|' (between each protein) and ';' (between each pfam_id)
    :return: flat list of pfam_ids
    """
    flat = []
    for protein_part in pfam_string.split('|'):
        flat.extend(protein_part.split(';'))
    return flat
def filter_cands_min_bio_pfams(cands, min_pfams, verbose=1):
    """
    Keep only candidates that contain at least the given number of biosynthetic pfams
    :param cands: Candidate DataFrame to filter
    :param min_pfams: Minimum number of biosynthetic pfams
    :param verbose: Verbosity
    :return: candidates filtered by minimum number of biosynthetic pfams
    """
    # An empty frame has no 'num_bio_domains' column, so return it untouched
    if cands.empty:
        return cands
    mask = cands['num_bio_domains'] >= min_pfams
    kept = cands[mask]
    if verbose:
        print('Filtered candidates by minimum of {} biosynthetic pfams, {}/{} remained.'.format(min_pfams, len(kept), len(cands)))
    return kept
def filter_cands_min_nucleotides(cands, min_nucleotides, verbose=1):
    """
    Keep only candidates spanning at least the given number of nucleotides
    :param cands: Candidate DataFrame to filter
    :param min_nucleotides: Minimum number of nucleotides
    :param verbose: Verbosity
    :return: candidates filtered by minimum number of nucleotides
    """
    # An empty frame has no 'nucl_length' column, so return it untouched
    if cands.empty:
        return cands
    mask = cands['nucl_length'] >= min_nucleotides
    kept = cands[mask]
    if verbose:
        print('Filtered candidates by minimum of {} nucleotides, {}/{} remained.'.format(min_nucleotides, len(kept), len(cands)))
    return kept
def filter_cands_min_proteins(cands, min_proteins, verbose=1):
    """
    Keep only candidates that contain at least the given number of proteins
    :param cands: Candidate DataFrame to filter
    :param min_proteins: Minimum number of proteins
    :param verbose: Verbosity
    :return: candidates filtered by minimum number of proteins
    """
    # An empty frame has no 'num_proteins' column, so return it untouched
    if cands.empty:
        return cands
    mask = cands['num_proteins'] >= min_proteins
    kept = cands[mask]
    if verbose:
        print('Filtered candidates by minimum of {} proteins, {}/{} remained.'.format(min_proteins, len(kept), len(cands)))
    return kept
def get_candidate(start, end, pfam_ids, protein_ids, protein_predictions):
    """
    Build a single BGC candidate dictionary from its raw components
    :param start: nucleotide coordinate start
    :param end: nucleotide coordinate end
    :param pfam_ids: list of pfam ids in candidate
    :param protein_ids: list of protein ids in candidate
    :param protein_predictions: list of protein model prediction outputs
    :return: BGC candidate dictionary
    """
    return dict(
        nucl_start=start,
        nucl_end=end,
        num_proteins=len(protein_ids),
        num_bio_domains=num_bio_pfams(pfam_ids),
        num_all_domains=len(pfam_ids),
        protein_ids=';'.join(protein_ids),
        pfam_ids=';'.join(pfam_ids),
        avg_prediction=np.mean(protein_predictions),
    )
def threshold_contig_candidates(domain_predictions, threshold, max_protein_gap=0, max_nucl_gap=0):
    """
    Get a BGC candidate DataFrame for domain predictions in a single contig.
    Generated by averaging domain predictions by protein and then merging consecutive proteins with prediction satisfying given threshold
    :param domain_predictions: DataFrame of domains and their 'prediction' column
    :param threshold: Averaged protein prediction threshold (inclusive) used to include or discard BGC proteins
    :param max_protein_gap: Merge candidates with given (or smaller) number of non-BGC proteins between them
    :param max_nucl_gap: Merge candidates with given (or smaller) number of nucleotides between them
    :return: DataFrame of BGC candidates
    """
    candidates = []
    protein_predictions = average_protein_prediction(domain_predictions, y=domain_predictions['prediction'], concat_domains=True)
    # State of the currently open candidate region (None start = no open region)
    candidate_start = None
    candidate_end = None
    candidate_domains = []
    candidate_proteins = []
    candidate_predictions = []
    # State of the gap (run of inactive proteins) since the last candidate was closed
    gap_domains = []
    gap_proteins = []
    gap_predictions = []
    for i, protein in protein_predictions.iterrows():
        prediction = protein['prediction']
        # Inactive protein, add to gap
        if prediction < threshold:
            gap_proteins.append(protein['protein_id'])
            gap_domains += protein['pfam_ids'].split(';')
            gap_predictions.append(prediction)
            # We just changed from active to inactive, add previous region as candidate
            if candidate_start is not None:
                candidates.append((candidate_start, candidate_end, candidate_domains, candidate_proteins, candidate_predictions))
                candidate_start = None
                candidate_end = None
                candidate_domains = []
                candidate_proteins = []
                candidate_predictions = []
        # Active protein
        else:
            # BUGFIX: must compare with None explicitly. The previous truthiness test
            # ('if not candidate_start') treated a legitimate gene_start of 0 as "no open
            # candidate", re-entering this branch on every subsequent active protein and
            # wrongly re-triggering the merge logic (gap lists are empty mid-candidate,
            # so 'len(gap_proteins) <= max_protein_gap' was trivially satisfied).
            if candidate_start is None:
                candidate_start = protein['gene_start']
                if candidates:
                    # Check if we should merge with the previous candidate
                    prev_start, prev_end, prev_domains, prev_proteins, prev_predictions = candidates[-1]
                    if len(gap_proteins) <= max_protein_gap or (candidate_start - prev_end) <= max_nucl_gap:
                        # Remove previous candidate and continue where it started,
                        # absorbing the gap proteins/domains/predictions into the merged region
                        candidates = candidates[:-1]
                        candidate_start = prev_start
                        candidate_domains = prev_domains + gap_domains
                        candidate_proteins = prev_proteins + gap_proteins
                        candidate_predictions = prev_predictions + gap_predictions
            candidate_end = protein['gene_end']
            candidate_proteins.append(protein['protein_id'])
            candidate_domains += protein['pfam_ids'].split(';')
            candidate_predictions.append(prediction)
            gap_domains = []
            gap_proteins = []
            gap_predictions = []
    # Last protein was active, add previous region as candidate
    if candidate_start is not None:
        candidates.append((candidate_start, candidate_end, candidate_domains, candidate_proteins, candidate_predictions))
    candidates = [get_candidate(*args) for args in candidates]
    return pd.DataFrame(candidates)
def contig_id_from_filename(path):
    """
    Create a basic contig_id from a file name without extension
    :param path: Path of file
    :return: file name without extension that can be used as contig_id
    """
    basename = os.path.basename(path)
    contig_id, _ext = os.path.splitext(basename)
    return contig_id
def threshold_candidates(predictions, threshold, max_protein_gap=0, max_nucl_gap=0, min_bio_domains=0,
                         min_proteins=0, min_nucleotides=0, i=0, verbose=1):
    """
    Get a BGC candidate DataFrame for domain predictions in multiple contigs.
    Generated by averaging domain predictions by protein and then merging consecutive proteins with prediction satisfying given threshold
    Candidates can be postprocessed by merging and filtering
    :param predictions: DataFrame of domains and their 'prediction' column
    :param threshold: Averaged protein prediction threshold (inclusive) used to include or discard BGC proteins
    :param max_protein_gap: Merge candidates with given (or smaller) number of non-BGC proteins between them
    :param max_nucl_gap: Merge candidates with given (or smaller) number of nucleotides between them
    :param min_bio_domains: Discard candidates with less than min_bio_domains biosynthetic domains
    :param min_proteins: Discard candidates with less than min_proteins proteins
    :param min_nucleotides: Discard candidates with less than min_nucleotides nucleotides
    :param i: Work index for logging purposes
    :param verbose: Verbosity
    :return: DataFrame of BGC candidates
    """
    per_contig = []
    # One file can contain multiple contigs, get candidates separately for each contig_id group
    for contig_id, contig_predictions in predictions.groupby('contig_id'):
        if verbose:
            print('Thresholding {} predictions in {} (#{})'.format(len(contig_predictions), contig_id, i))
        cands = threshold_contig_candidates(contig_predictions, threshold, max_protein_gap=max_protein_gap, max_nucl_gap=max_nucl_gap)
        if cands.empty:
            continue
        # Annotate candidates with contig, coordinates, length, a content hash and a readable id
        cands['contig_id'] = contig_id
        cands['nucl_start'] = cands['nucl_start'].astype('int64')
        cands['nucl_end'] = cands['nucl_end'].astype('int64')
        cands['nucl_length'] = cands['nucl_end'] - cands['nucl_start'] + 1
        cands['candidate_hash'] = cands['pfam_ids'].apply(
            lambda ids: hashlib.md5(ids.encode('utf-8')).hexdigest())
        cands['candidate_id'] = cands.apply(
            lambda row: '{}({}-{})'.format(row['contig_id'], row['nucl_start'], row['nucl_end']), axis=1)
        column_order = ['contig_id', 'candidate_id', 'candidate_hash', 'avg_prediction', 'nucl_length', 'nucl_start', 'nucl_end',
                        'num_bio_domains', 'num_all_domains', 'num_proteins', 'protein_ids', 'pfam_ids']
        cands = cands[column_order]
        # Optional post-filters, applied only when a nonzero minimum was requested
        if min_bio_domains:
            cands = filter_cands_min_bio_pfams(cands, min_pfams=min_bio_domains, verbose=verbose)
        if min_proteins:
            cands = filter_cands_min_proteins(cands, min_proteins, verbose=verbose)
        if min_nucleotides:
            cands = filter_cands_min_nucleotides(cands, min_nucleotides, verbose=verbose)
        per_contig.append(cands)
    if not per_contig:
        return pd.DataFrame()
    return pd.concat(per_contig)
def task(args):
    """
    Single parallel task wrapper for the threshold_candidates function
    Processes a single Domain CSV file
    :param args: Task arguments
    :return: DataFrame of BGC candidates, result of threshold_candidates
    """
    i, path, threshold, options = args
    predictions = pd.read_csv(path)
    # Files without an explicit contig_id column get one derived from the file name
    if 'contig_id' not in predictions.columns:
        predictions['contig_id'] = contig_id_from_filename(path)
    return threshold_candidates(
        predictions=predictions,
        threshold=threshold,
        i=i,
        max_protein_gap=options.genegap,
        max_nucl_gap=options.nuclgap,
        min_bio_domains=options.min_bio_domains,
        min_proteins=options.min_proteins,
        min_nucleotides=options.min_nucleotides
    )
if __name__ == "__main__":
    # Parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--threshold", dest="threshold", required=False, type=float,
                        help="Prediction threshold to select proteins.", metavar="INT")
    parser.add_argument("-gg", "--gene-gap", dest="genegap", required=False, default=0, type=int,
                        help="Merge candidates with gene-gap or less genes in between.", metavar="INT")
    parser.add_argument("-ng", "--nucl-gap", dest="nuclgap", required=False, default=0, type=int,
                        help="Merge candidates with nucl-gap or less nucleotides in between.", metavar="INT")
    parser.add_argument("-md", "--min-bio-domains", dest="min_bio_domains", required=False, default=0, type=int,
                        help="Include only candidate with at least given number of known biosynthetic protein domains.", metavar="INT")
    parser.add_argument("-mp", "--min-proteins", dest="min_proteins", required=False, default=0, type=int,
                        help="Include only candidate with at least given number of proteins.", metavar="INT")
    parser.add_argument("-mn", "--min-nucleotides", dest="min_nucleotides", required=False, default=0, type=int,
                        help="Include only candidate with at least given number of nucleotides.", metavar="INT")
    parser.add_argument("-c", "--confusion", dest="confusion", required=False,
                        help="Confusion matrix JSON file to get threshold from.", metavar="FILE")
    parser.add_argument("-o", "--output", dest="output", required=True,
                        help="Output csv file path.", metavar="FILE")
    parser.add_argument(dest='predictions', nargs='+',
                        help="Paths to domain sequence CSVs with predictions.", metavar="FILE")
    options = parser.parse_args()

    # BUGFIX: compare with None — a truthiness test rejected an explicit '--threshold 0.0'
    # (0.0 is falsy), silently falling through to the confusion-matrix branch.
    if options.threshold is not None:
        threshold = options.threshold
    elif options.confusion:
        # Read the threshold from a validation confusion matrix JSON file
        with open(options.confusion) as f:
            rates = json.load(f)
        threshold = rates['threshold']
        print('Threshold {:.5f} with {}'.format(threshold, rates))
    else:
        raise AttributeError('Specify either threshold or path to validation file')

    # Expand any directory arguments into the list of files they contain
    paths = []
    for path in options.predictions:
        if os.path.isdir(path):
            files = [os.path.join(path, name) for name in os.listdir(path)]
            print('Adding {} files from directory: {}'.format(len(files), path))
            paths += files
        else:
            paths.append(path)

    # Process each Domain CSV file in parallel; the context manager ensures
    # the worker pool is properly terminated (the original leaked the Pool).
    with Pool() as pool:
        candidates = pool.map(task, [(i, path, threshold, options) for i, path in enumerate(paths)])
    candidates: pd.DataFrame = pd.concat(candidates)
    candidates.to_csv(options.output, index=False)
    print('Saved {} candidates to {}'.format(len(candidates), options.output))