diff --git a/data/workflow/CMakeLists.txt b/data/workflow/CMakeLists.txt index f002a27db..3f006a286 100644 --- a/data/workflow/CMakeLists.txt +++ b/data/workflow/CMakeLists.txt @@ -3,6 +3,7 @@ set(GENERATED_WORKFLOWS workflow/easycluster.sh workflow/easytaxonomy.sh workflow/easyrbh.sh + workflow/easyproteomecluster.sh workflow/blastp.sh workflow/blastpgp.sh workflow/map.sh diff --git a/data/workflow/easyproteomecluster.sh b/data/workflow/easyproteomecluster.sh new file mode 100644 index 000000000..3a8bc85bd --- /dev/null +++ b/data/workflow/easyproteomecluster.sh @@ -0,0 +1,70 @@ +#!/bin/sh -e +fail() { + echo "Error: $1" + exit 1 +} + +notExists() { + [ ! -f "$1" ] +} + + +if notExists "${TMP_PATH}/input.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" createdb "$@" "${TMP_PATH}/input" ${CREATEDB_PAR} \ + || fail "query createdb died" +fi + +if notExists "${TMP_PATH}/clu.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" linclust "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/clu_tmp" ${CLUSTER_PAR} \ + || fail "Search died" +fi + +if notExists "${RESULTS}_protein_cluster.tsv"; then + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/clu" "${RESULTS}_protein_cluster.tsv" ${THREADS_PAR} \ + || fail "createtsv protein cluster died" +fi + +if notExists "${TMP_PATH}/aln.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" "${TMP_PATH}/cluster_count" ${PROTEOMECLUSTER_PAR} \ + || fail "Convert Alignments died" +fi + +if notExists "${RESULTS}_protein_align.tsv"; then + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_protein" "${RESULTS}_protein_align.tsv" ${THREADS_PAR} \ + || fail "createtsv protein align died" +fi + +if notExists "${RESULTS}_proteome_cluster.tsv"; then + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome" "${RESULTS}_proteome_cluster.tsv" ${THREADS_PAR} \ + || fail "createtsv proteome cluster died" +fi + +if notExists "${RESULTS}_cluster_count.tsv"; then + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/cluster_count" "${RESULTS}_cluster_count.tsv" ${THREADS_PAR} \ + || fail "createtsv proteome cluster count report died" +fi + + +if [ -n "${REMOVE_TMP}" ]; then + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/clu" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/aln_protein" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/aln_proteome" ${VERBOSITY_PAR} + rm -rf "${TMP_PATH}/clu_tmp" + rm -f "${TMP_PATH}/easyproteomecluster.sh" +fi diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h index 3c4abb4e0..35939c4e9 100644 --- a/src/CommandDeclarations.h +++ b/src/CommandDeclarations.h @@ -4,6 +4,7 @@ extern int align(int argc, const char **argv, const Command& command); extern int alignall(int argc, const char **argv, const Command& command); +extern int proteomecluster(int argc, const char **argv, const Command& command); extern int alignbykmer(int argc, const char **argv, const Command& command); extern int appenddbtoindex(int argc, const char **argv, const Command& command); extern int apply(int argc, const char **argv, const Command& command); @@ -43,6 +44,7 @@ extern int diffseqdbs(int argc, const char **argv, const Command& command); extern int easycluster(int argc, const char **argv, const Command& command); extern int easyrbh(int argc, const char **argv, const Command& command); extern int easylinclust(int argc, const char **argv, const Command& command); +extern int easyproteomecluster(int argc, const char **argv, const Command& command); extern int easysearch(int argc, const char **argv, const Command& command); extern int easylinsearch(int argc, const char **argv, const Command& command); extern int tsv2exprofiledb(int argc, const char **argv, const Command& command); diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 8325f0d4a..84bf73726 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -11,6 +11,10 @@ std::vector baseCommands = { "Sensitive homology search", "# Search multiple FASTA against FASTA (like BLASTP, TBLASTN, BLASTX, BLASTN --search-type 3, TBLASTX --search-type 2)\n" "mmseqs easy-search examples/QUERY.fasta examples/QUERY.fasta examples/DB.fasta result.m8 tmp\n\n" + "# Search multiple query fasta files against target fasta files using a tsv file containing filepaths\n" + "echo -e \"dir1/QUERY1.fasta\\ndir2/QUERY2.fasta\" > examples/queries.tsv\n" + "echo -e \"dir3/TARGET1.fasta\\ndir4/TARGET2.fasta\" > examples/targets.tsv\n" + "mmseqs easy-search examples/queries.tsv examples/targets.tsv result.m8 tmp\n\n" "# Iterative profile search from stdin (like PSI-BLAST)\n" "cat examples/QUERY.fasta | mmseqs easy-search stdin examples/DB.fasta result.m8 tmp --num-iterations 2\n\n" "# Profile search against small databases (e.g. PFAM, eggNOG)\n" @@ -79,6 +83,19 @@ std::vector baseCommands = { CITATION_MMSEQS2|CITATION_LINCLUST, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin }, {"clusterPrefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }, {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, + {"easy-proteomecluster", easyproteomecluster, &par.easyproteomeclusterworkflow, COMMAND_EASY, + "Proteome clustering and alignment", + "mmseqs easy-proteomecluster examples/DB.fasta result tmp\n\n" + "# ProteomeCluster output\n" + "# - protein_cluster.tsv: Protein linclust result\n" + "# - proteome_cluster.tsv: Proteome redundancy clustering result\n" + "# - protein_align.tsv: Protein alignment list\n" + "mmseqs easy-proteomecluster examples/DB.fasta result tmp --proteome-similarity 0.9 --proteome-relative-similarity 0.9\n", + "Martin Steinegger & Gyuri Kim ", + " ... ", + CITATION_MMSEQS2, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin }, + {"outputReports", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }, + {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, {"easy-taxonomy", easytaxonomy, &par.easytaxonomy, COMMAND_EASY, "Taxonomic classification", "# Assign taxonomic labels to FASTA sequences\n" @@ -125,7 +142,10 @@ std::vector baseCommands = { "# Create a seqDB from stdin\n" "cat seq.fasta | mmseqs createdb stdin sequenceDB\n\n" "# Create a seqDB by indexing existing FASTA/Q (for single line fasta entries only)\n" - "mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n", + "mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n\n" + "# Create a seqDB from a tsv file containing filepaths of multiple FASTA files in each line\n" + "echo -e \"dir1/bacteria.fasta\\ndir2/archea.fasta.gz\" > filepaths.tsv\n" + "mmseqs createdb filepaths.tsv sequenceDB\n", "Martin Steinegger ", " ... | ", CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric }, @@ -632,6 +652,16 @@ std::vector baseCommands = { {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}}, + {"proteomecluster", proteomecluster, &par.proteomecluster, COMMAND_ALIGNMENT, + "Proteome clustering and alignment", + NULL, + "Martin Steinegger & Gyuri Kim ", + " baseCommands = { CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}}, + {"transitivealign", transitivealign, &par.align, COMMAND_ALIGNMENT, "Transfer alignments via transitivity", //"Infer the alignment A->C via B, B being the center sequence and A,C each pairwise aligned against B", @@ -1298,5 +1329,6 @@ std::vector baseCommands = { NULL, "", "", - CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}} + CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}}, + }; diff --git a/src/alignment/Matcher.h b/src/alignment/Matcher.h index a9c28beb4..90185de63 100644 --- a/src/alignment/Matcher.h +++ b/src/alignment/Matcher.h @@ -140,6 +140,13 @@ class Matcher{ } } } + + float getSeqId(){ + return seqId; + } + unsigned int getAlnLength(){ + return alnLength; + } }; Matcher(int querySeqType, int targetSeqType, int maxSeqLen, BaseMatrix *m, @@ -217,7 +224,6 @@ class Matcher{ static size_t resultToBuffer(char * buffer, const result_t &result, bool addBacktrace, bool compress = true, bool addOrfPosition = false); - static int computeAlnLength(int anEnd, int start, int dbEnd, int dbStart); static void updateResultByRescoringBacktrace(const char *querySeq, const char *targetSeq, const char **subMat, EvalueComputation &evaluer, diff --git a/src/commons/DBReader.cpp b/src/commons/DBReader.cpp index f52d2369c..1eb80fd06 100644 --- a/src/commons/DBReader.cpp +++ b/src/commons/DBReader.cpp @@ -112,6 +112,7 @@ template bool DBReader::open(int accessType){ } totalDataSize = 0; dataFileCnt = dataFileNames.size(); + dataSizeOffset = new size_t[dataFileNames.size() + 1]; dataFiles = new char*[dataFileNames.size()]; for(size_t fileIdx = 0; fileIdx < dataFileNames.size(); fileIdx++){ @@ -155,6 +156,22 @@ template bool DBReader::open(int accessType){ } lookupData.close(); } + if (dataMode & USE_SOURCE){ + std::string sourceFilename = (std::string(dataFileName) + ".source"); + MemoryMapped sourceData(sourceFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); + if (sourceData.isValid() == false) { + Debug(Debug::ERROR) << "Cannot open source file " << sourceFilename << "!\n"; + EXIT(EXIT_FAILURE); + } + char* sourceDataChar = (char *) sourceData.getData(); + size_t sourceDataSize = sourceData.size(); + sourceSize = Util::ompCountLines(sourceDataChar, sourceDataSize, threads); + source = new(std::nothrow) SourceEntry[this->sourceSize]; + incrementMemory(sizeof(SourceEntry) * this->sourceSize); + readSource(sourceDataChar, sourceDataSize, source); + + } + bool isSortedById = false; if (externalData == false) { MemoryMapped indexData(indexFileName, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); @@ -707,6 +724,24 @@ template unsigned int DBReader::getLookupFileNumber(size_t id){ return lookup[id].fileNumber; } +template std::string DBReader::getSourceFileName (size_t id){ + if (id >= sourceSize){ + Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n"; + Debug(Debug::ERROR) << "getSourceFileName: local id (" << id << ") >= db size (" << sourceSize << ")\n"; + EXIT(EXIT_FAILURE); + } + return source[id].fileName; +} + +template T DBReader::getSourceKey(size_t id){ + if (id >= sourceSize){ + Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n"; + Debug(Debug::ERROR) << "getSource id: local id (" << id << ") >= db size (" << sourceSize << ")\n"; + EXIT(EXIT_FAILURE); + } + return source[id].id; +} + template<> void DBReader::lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry) { buffer.append(SSTR(entry.id)); @@ -1048,6 +1083,36 @@ void DBReader::readLookup(char *data, size_t dataSize, DBReader::LookupEntry currPos = lookupData - (char *) data; i++; + + + } +} + +template +void DBReader::readSource(char *data, size_t dataSize, DBReader::SourceEntry *source) { + size_t i=0; + size_t currPos = 0; + char* sourceData = (char *) data; + const char * cols[3]; + while (currPos < dataSize){ + if (i >= this->sourceSize) { + Debug(Debug::ERROR) << "Corrupt memory, too many entries!\n"; + EXIT(EXIT_FAILURE); + } + Util::getFieldsOfLine(sourceData, cols, 3); + source[i].id = Util::fast_atoi(cols[0]); + std::string fileName = std::string(cols[1], (cols[2] - cols[1])); + size_t lastDotPosition = fileName.rfind('.'); + + if (lastDotPosition != std::string::npos) { + fileName = fileName.substr(0, lastDotPosition); + } + source[i].fileName = fileName; + sourceData = Util::skipLine(sourceData); + + currPos = sourceData - (char *) data; + + i++; } } diff --git a/src/commons/DBReader.h b/src/commons/DBReader.h index 57589b1f6..2c61664e1 100644 --- a/src/commons/DBReader.h +++ b/src/commons/DBReader.h @@ -98,7 +98,10 @@ class DBReader : public MemoryTracker { return false; } }; - + struct SourceEntry{ + T id; + std::string fileName; + }; struct LookupEntry { T id; std::string entryName; @@ -248,6 +251,8 @@ class DBReader : public MemoryTracker { void lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry); LookupEntry* getLookup() { return lookup; }; + std::string getSourceFileName(size_t id); + T getSourceKey(size_t id); static const int NOSORT = 0; static const int SORT_BY_LENGTH = 1; static const int LINEAR_ACCCESS = 2; @@ -266,6 +271,7 @@ class DBReader : public MemoryTracker { static const unsigned int USE_FREAD = 4; static const unsigned int USE_LOOKUP = 8; static const unsigned int USE_LOOKUP_REV = 16; + static const unsigned int USE_SOURCE = 32; // compressed @@ -309,6 +315,8 @@ class DBReader : public MemoryTracker { void readLookup(char *data, size_t dataSize, LookupEntry *lookup); + void readSource(char *data, size_t dataSize, SourceEntry *source); + void readIndexId(T* id, char * line, const char** cols); unsigned int indexIdToNum(T* id); @@ -485,7 +493,9 @@ class DBReader : public MemoryTracker { Index * index; size_t lookupSize; + size_t sourceSize; LookupEntry * lookup; + SourceEntry * source; bool sortedByOffset; unsigned int * id2local; diff --git a/src/commons/DBWriter.h b/src/commons/DBWriter.h index 190015cb3..21e5377bb 100644 --- a/src/commons/DBWriter.h +++ b/src/commons/DBWriter.h @@ -66,6 +66,7 @@ class DBWriter : public MemoryTracker { } static void sortIndex(const char *inFileNameIndex, const char *outFileNameIndex, const bool lexicographicOrder); + private: size_t addToThreadBuffer(const void *data, size_t itmesize, size_t nitems, int threadIdx); void writeThreadBuffer(unsigned int idx, size_t dataSize); diff --git a/src/commons/IndexReader.h b/src/commons/IndexReader.h index 17d637f12..298c2bb8c 100644 --- a/src/commons/IndexReader.h +++ b/src/commons/IndexReader.h @@ -95,6 +95,8 @@ class IndexReader { }else{ failSuffix = "_aln"; } + } else if (databaseType & SOURCE) { + failSuffix = ""; } } sequenceReader = new DBReader( @@ -115,6 +117,7 @@ class IndexReader { static const unsigned int SRC_HEADERS = 4; static const unsigned int SRC_SEQUENCES = 8; static const unsigned int ALIGNMENTS = 16; + static const unsigned int SOURCE = 32; static const unsigned int USER_SELECT = 1 << 31; static unsigned int makeUserDatabaseType(unsigned int baseKey) { diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 34e429811..fe35ab4c2 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -86,6 +86,8 @@ Parameters::Parameters(): PARAM_ALT_ALIGNMENT(PARAM_ALT_ALIGNMENT_ID, "--alt-ali", "Alternative alignments", "Show up to this many alternative alignments", typeid(int), (void *) &altAlignment, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN), PARAM_GAP_OPEN(PARAM_GAP_OPEN_ID, "--gap-open", "Gap open cost", "Gap open cost", typeid(MultiParam>), (void *) &gapOpen, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT), PARAM_GAP_EXTEND(PARAM_GAP_EXTEND_ID, "--gap-extend", "Gap extension cost", "Gap extension cost", typeid(MultiParam>), (void *) &gapExtend, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT), + PARAM_PROTEOME_SIMILARITY(PARAM_PROTEOME_SIMILARITY_ID, "--proteome-similarity", "Proteome similarity", "Proteome similarity threshold", typeid(float), (void *) &proteomeSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN), + PARAM_PROTEOME_RELATIVE_SIMILARITY(PARAM_PROTEOME_RELATIVE_SIMILARITY_ID, "--proteome-relative-similarity", "Proteome relative similarity", "Proteome relative similarity threshold normalized by proteome size", typeid(float), (void *) &proteomeRelativeSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN), #ifdef GAP_POS_SCORING PARAM_GAP_PSEUDOCOUNT(PARAM_GAP_PSEUDOCOUNT_ID, "--gap-pc", "Gap pseudo count", "Pseudo count for calculating position-specific gap opening penalties", typeid(int), &gapPseudoCount, "^[0-9]+$", MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_EXPERT), #endif @@ -338,6 +340,7 @@ Parameters::Parameters(): alignall.push_back(&PARAM_SUB_MAT); alignall.push_back(&PARAM_ADD_BACKTRACE); alignall.push_back(&PARAM_ALIGNMENT_MODE); + alignall.push_back(&PARAM_REALIGN_SCORE_BIAS); // alignall.push_back(&PARAM_WRAPPED_SCORING); alignall.push_back(&PARAM_E); alignall.push_back(&PARAM_MIN_SEQ_ID); @@ -365,6 +368,41 @@ Parameters::Parameters(): alignall.push_back(&PARAM_COMPRESSED); alignall.push_back(&PARAM_V); + //proteomecluster + proteomecluster.push_back(&PARAM_SUB_MAT); + proteomecluster.push_back(&PARAM_ADD_BACKTRACE); + proteomecluster.push_back(&PARAM_ALIGNMENT_MODE); + proteomecluster.push_back(&PARAM_REALIGN_SCORE_BIAS); +// proteomecluster.push_back(&PARAM_WRAPPED_SCORING); + proteomecluster.push_back(&PARAM_E); + proteomecluster.push_back(&PARAM_MIN_SEQ_ID); + proteomecluster.push_back(&PARAM_MIN_ALN_LEN); + proteomecluster.push_back(&PARAM_SEQ_ID_MODE); +// proteomecluster.push_back(&PARAM_ALT_ALIGNMENT); + proteomecluster.push_back(&PARAM_C); + proteomecluster.push_back(&PARAM_COV_MODE); + proteomecluster.push_back(&PARAM_MAX_SEQ_LEN); + proteomecluster.push_back(&PARAM_NO_COMP_BIAS_CORR); + proteomecluster.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE); + proteomecluster.push_back(&PARAM_PROTEOME_SIMILARITY); + proteomecluster.push_back(&PARAM_PROTEOME_RELATIVE_SIMILARITY); + + proteomecluster.push_back(&PARAM_REALIGN); +// proteomecluster.push_back(&PARAM_MAX_REJECTED); +// proteomecluster.push_back(&PARAM_MAX_ACCEPT); + proteomecluster.push_back(&PARAM_INCLUDE_IDENTITY); + proteomecluster.push_back(&PARAM_PRELOAD_MODE); + proteomecluster.push_back(&PARAM_PCA); + proteomecluster.push_back(&PARAM_PCB); + proteomecluster.push_back(&PARAM_SCORE_BIAS); + proteomecluster.push_back(&PARAM_GAP_OPEN); + proteomecluster.push_back(&PARAM_GAP_EXTEND); + proteomecluster.push_back(&PARAM_ZDROP); + proteomecluster.push_back(&PARAM_THREADS); + proteomecluster.push_back(&PARAM_COMPRESSED); + proteomecluster.push_back(&PARAM_V); + + // alignment align.push_back(&PARAM_SUB_MAT); align.push_back(&PARAM_ADD_BACKTRACE); @@ -1311,6 +1349,9 @@ Parameters::Parameters(): // easylinclustworkflow easylinclustworkflow = combineList(linclustworkflow, createdb); + // easyproteomeclusterworkflow + easyproteomeclusterworkflow = combineList(easylinclustworkflow, proteomecluster); + // clustering workflow clusterworkflow = combineList(prefilter, align); clusterworkflow = combineList(clusterworkflow, rescorediagonal); @@ -2315,6 +2356,8 @@ void Parameters::setDefaults() { maxRejected = INT_MAX; maxAccept = INT_MAX; seqIdThr = 0.0; + proteomeSimThr = 0.9; + proteomeRelativeSimThr = 0.9; alnLenThr = 0; altAlignment = 0; gapOpen = MultiParam>(NuclAA(11, 5)); diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 745d2f809..0d4b3eee3 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -90,6 +90,7 @@ class Parameters { static const unsigned int DBTYPE_EXTENDED_COMPRESSED = 1; static const unsigned int DBTYPE_EXTENDED_INDEX_NEED_SRC = 2; static const unsigned int DBTYPE_EXTENDED_CONTEXT_PSEUDO_COUNTS = 4; + static const unsigned int DBTYPE_EXTENDED_SRC_IDENTIFIER = 8; // don't forget to add new database types to DBReader::getDbTypeName and Parameters::PARAM_OUTPUT_DBTYPE @@ -434,6 +435,8 @@ class Parameters { int maxAccept; // after n accepted sequences stop int altAlignment; // show up to this many alternative alignments float seqIdThr; // sequence identity threshold for acceptance + float proteomeSimThr; // proteome similarity threshold for acceptance + float proteomeRelativeSimThr; // proteome similarity threshold normalized by proteome size for acceptance int alnLenThr; // min. alignment length bool addBacktrace; // store backtrace string (M=Match, D=deletion, I=insertion) bool realign; // realign hit with more conservative score @@ -787,6 +790,8 @@ class Parameters { PARAMETER(PARAM_ALT_ALIGNMENT) PARAMETER(PARAM_GAP_OPEN) PARAMETER(PARAM_GAP_EXTEND) + PARAMETER(PARAM_PROTEOME_SIMILARITY) + PARAMETER(PARAM_PROTEOME_RELATIVE_SIMILARITY) #ifdef GAP_POS_SCORING PARAMETER(PARAM_GAP_PSEUDOCOUNT) #endif @@ -1078,6 +1083,7 @@ class Parameters { std::vector threadsandcompression; std::vector alignall; + std::vector proteomecluster; std::vector align; std::vector rescorediagonal; std::vector alignbykmer; @@ -1115,6 +1121,7 @@ class Parameters { std::vector kmersearch; std::vector countkmer; std::vector easylinclustworkflow; + std::vector easyproteomeclusterworkflow; std::vector linclustworkflow; std::vector easysearchworkflow; std::vector searchworkflow; diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index c43c3356d..6a5ee5a36 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -80,5 +80,6 @@ set(util_source_files util/proteinaln2nucl.cpp util/versionstring.cpp util/diskspaceavail.cpp + util/proteomecluster.cpp PARENT_SCOPE ) diff --git a/src/util/alignall.cpp b/src/util/alignall.cpp index 3a07596f6..39f2e199c 100644 --- a/src/util/alignall.cpp +++ b/src/util/alignall.cpp @@ -27,7 +27,8 @@ int alignall(int argc, const char **argv, const Command &command) { } unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr); - DBReader tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX); + DBReader tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX|DBReader::USE_LOOKUP); + tdbr.open(DBReader::NOSORT); if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) { tdbr.readMmapedDataInMemory(); @@ -80,7 +81,6 @@ int alignall(int argc, const char **argv, const Command &command) { #pragma omp for schedule(dynamic, 1) for (size_t id = start; id < (start + bucketSize); id++) { progress.updateProgress(); - const unsigned int key = dbr_res.getDbKey(id); char *data = dbr_res.getData(id, thread_idx); diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp index 18d2fc7d0..e1d4ead3d 100644 --- a/src/util/createdb.cpp +++ b/src/util/createdb.cpp @@ -19,6 +19,28 @@ int createdb(int argc, const char **argv, const Command& command) { std::vector filenames(par.filenames); std::string dataFile = filenames.back(); filenames.pop_back(); + if (Util::endsWith(".tsv", filenames[0])) { + if (filenames.size() > 1) { + Debug(Debug::ERROR) << "Only one tsv file can be given\n"; + EXIT(EXIT_FAILURE); + } + std::string tsv = filenames.back(); + filenames.pop_back(); + + FILE* file = FileUtil::openFileOrDie(tsv.c_str(), "r", true); + char* line = NULL; + size_t len = 0; + ssize_t read; + while ((read = getline(&line, &len, file)) != -1) { + if (line[read - 1] == '\n') { + line[read - 1] = '\0'; + read--; + } + filenames.push_back(line); + } + free(line); + fclose(file); + } for (size_t i = 0; i < filenames.size(); i++) { if (FileUtil::directoryExists(filenames[i].c_str()) == true) { diff --git a/src/util/createtsv.cpp b/src/util/createtsv.cpp index 7b75c0f9a..e8e87089c 100644 --- a/src/util/createtsv.cpp +++ b/src/util/createtsv.cpp @@ -18,18 +18,34 @@ int createtsv(int argc, const char **argv, const Command &command) { Parameters &par = Parameters::getInstance(); par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); + const bool hasTargetDB = par.filenames.size() > 3; + DBReader *reader; + if (hasTargetDB) { + reader = new DBReader(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + } else { + reader = new DBReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + } + reader->open(DBReader::LINEAR_ACCCESS); + const bool useSourceIdentifier = DBReader::getExtendedDbtype(reader->getDbtype()) & Parameters::DBTYPE_EXTENDED_SRC_IDENTIFIER; bool queryNucs = Parameters::isEqualDbtype(FileUtil::parseDbType(par.db1.c_str()), Parameters::DBTYPE_NUCLEOTIDES); bool targetNucs = Parameters::isEqualDbtype(FileUtil::parseDbType(par.db2.c_str()), Parameters::DBTYPE_NUCLEOTIDES); const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); - int queryHeaderType = (queryNucs) ? IndexReader::SRC_HEADERS : IndexReader::HEADERS; + int queryHeaderType = (useSourceIdentifier) ? IndexReader::SOURCE : + (queryNucs) ? IndexReader::SRC_HEADERS : IndexReader::HEADERS; queryHeaderType = (par.idxSeqSrc == 0) ? queryHeaderType : (par.idxSeqSrc == 1) ? IndexReader::HEADERS : IndexReader::SRC_HEADERS; - IndexReader qDbrHeader(par.db1, par.threads, queryHeaderType, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0); + unsigned int preloadMode = (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0; + unsigned int dataMode = (useSourceIdentifier) + ? (DBReader::USE_INDEX | DBReader::USE_DATA | DBReader::USE_SOURCE) + : (DBReader::USE_INDEX | DBReader::USE_DATA); + IndexReader qDbrHeader(par.db1, par.threads, queryHeaderType, preloadMode, dataMode); + + // IndexReader qDbrHeader(par.db1, par.threads, queryHeaderType, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0); IndexReader * tDbrHeader=NULL; DBReader * queryDB = qDbrHeader.sequenceReader; DBReader * targetDB = NULL; bool sameDB = (par.db2.compare(par.db1) == 0); - const bool hasTargetDB = par.filenames.size() > 3; + DBReader::Index * qHeaderIndex = qDbrHeader.sequenceReader->getIndex(); DBReader::Index * tHeaderIndex = NULL; @@ -49,23 +65,13 @@ int createtsv(int argc, const char **argv, const Command &command) { } } - DBReader *reader; - if (hasTargetDB) { - - reader = new DBReader(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); - } else { - - reader = new DBReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); - } - reader->open(DBReader::LINEAR_ACCCESS); - const std::string& dataFile = hasTargetDB ? par.db4 : par.db3; const std::string& indexFile = hasTargetDB ? par.db4Index : par.db3Index; const bool shouldCompress = par.dbOut == true && par.compressed == true; const int dbType = par.dbOut == true ? Parameters::DBTYPE_GENERIC_DB : Parameters::DBTYPE_OMIT_FILE; DBWriter writer(dataFile.c_str(), indexFile.c_str(), par.threads, shouldCompress, dbType); writer.open(); - + const size_t targetColumn = (par.targetTsvColumn == 0) ? SIZE_T_MAX : par.targetTsvColumn - 1; #pragma omp parallel { @@ -84,6 +90,10 @@ int createtsv(int argc, const char **argv, const Command &command) { for (size_t i = 0; i < reader->getSize(); ++i) { unsigned int queryKey = reader->getDbKey(i); size_t queryIndex = queryDB->getId(queryKey); + size_t querySourceIndex = SIZE_T_MAX; + if (useSourceIdentifier){ + querySourceIndex = queryDB->getSourceKey(queryKey); + } char *headerData = queryDB->getData(queryIndex, thread_idx); if (headerData == NULL) { @@ -92,7 +102,10 @@ int createtsv(int argc, const char **argv, const Command &command) { } std::string queryHeader; - if (par.fullHeader) { + if (useSourceIdentifier){ + queryHeader = queryDB->getSourceFileName(querySourceIndex); + } + else if (par.fullHeader) { queryHeader = "\""; queryHeader.append(headerData, qHeaderIndex[queryIndex].length - 2); queryHeader.append("\""); @@ -118,12 +131,19 @@ int createtsv(int argc, const char **argv, const Command &command) { } else if (hasTargetDB) { unsigned int targetKey = (unsigned int) strtoul(dbKey, NULL, 10); size_t targetIndex = targetDB->getId(targetKey); + size_t targetSourceIdex = SIZE_T_MAX; + if (useSourceIdentifier){ + targetSourceIdex = targetDB->getSourceKey(targetKey); + } char *targetData = targetDB->getData(targetIndex, thread_idx); if (targetData == NULL) { Debug(Debug::WARNING) << "Invalid header entry in query " << queryKey << " and target " << targetKey << "!\n"; continue; } - if (par.fullHeader) { + if (useSourceIdentifier){ + targetAccession = targetDB->getSourceFileName(targetSourceIdex); + } + else if (par.fullHeader) { targetAccession = "\""; targetAccession.append(targetData, tHeaderIndex[targetIndex].length - 2); targetAccession.append("\""); diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp new file mode 100644 index 000000000..40fb104ec --- /dev/null +++ b/src/util/proteomecluster.cpp @@ -0,0 +1,557 @@ +#include "Util.h" +#include "Parameters.h" +#include "Matcher.h" +#include "Debug.h" +#include "DBReader.h" +#include "DBWriter.h" +#include "NucleotideMatrix.h" +#include "SubstitutionMatrix.h" +#include "Alignment.h" +#include "itoa.h" +#include "Timer.h" + +#ifdef OPENMP +#include +#endif + +struct ProteomeEntry{ + int proteomeKey; + unsigned int proteomeAALen; + int repProtKey; + float protSimScore; + unsigned int clusterCount; + float totalSeqId; + unsigned int AAMatchCount; + float relativeSimScore; + + ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int clusterCount = 0, float seqId = 0.0f, unsigned int matchCount = 0, float relSimScore = 0.0f) + : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), clusterCount(clusterCount), totalSeqId(seqId), AAMatchCount(matchCount), relativeSimScore(relSimScore) {} + + int getRepProtKey() { + return repProtKey; + } + int getProteomeKey() { + return proteomeKey; + } + bool isCovered(){ + if (repProtKey == -1) { + return false; + } + return true; + } + void computeRedundancy(unsigned int repProteomeSize) { + protSimScore = totalSeqId / proteomeAALen; + relativeSimScore = static_cast (AAMatchCount * 2) / static_cast (repProteomeSize + proteomeAALen); + if (relativeSimScore >= 1.0) { + relativeSimScore = 1.0; + } + } + void addSeqId(float seqId) { + totalSeqId += seqId; + } + void addSeqLen(unsigned int seqLength) { + proteomeAALen += seqLength; + } + void resetProteomeInfo(){ + clusterCount = 0; + totalSeqId = 0.0f; + relativeSimScore = 0.0f; + protSimScore = 0.0f; + AAMatchCount = 0; + } + float getProtSimScore() { + return protSimScore; + } + float getrelativeSimScore() { + return relativeSimScore; + } + static bool compareByKey(const ProteomeEntry& a, const ProteomeEntry& b) { + if (a.repProtKey < b.repProtKey){ + return true; + } + if (a.repProtKey > b.repProtKey){ + return false; + } + if (a.proteomeKey == a.repProtKey && b.proteomeKey != b.repProtKey){ + return true; + } + if (a.proteomeKey != a.repProtKey && b.proteomeKey == b.repProtKey){ + return false; + } + if (a.proteomeKey != a.repProtKey && b.proteomeKey != b.repProtKey) { + return a.protSimScore > b.protSimScore; + } + return false; + } +}; + +struct __attribute__((__packed__)) MemberProteinEntry{ + unsigned int proteomeKey; + unsigned int proteinKey; +}; + +struct ClusterEntry { + bool isAvailable; + std::vector memberProteins; + + ClusterEntry() : isAvailable(false) {} + + ClusterEntry(size_t memberProteinSize) : isAvailable(true) { + memberProteins.reserve(memberProteinSize); + } + + void resetClusterInfo(){ + memberProteins.clear(); + isAvailable = false; + } +}; + +void calculateProteomeLength(std::vector& proteomeList, DBReader::LookupEntry* const& lookup, size_t lookupSize, DBReader& tProteinDB) { + for (size_t i = 0; i < lookupSize; i++) { + const unsigned int ProteomeId = lookup[i].fileNumber; + const unsigned int ProteinId = lookup[i].id; + proteomeList[ProteomeId].addSeqLen(tProteinDB.getSeqLen(ProteinId)); + if (proteomeList[ProteomeId].proteomeKey == -1) { + proteomeList[ProteomeId].proteomeKey = ProteomeId; + } + } +} + +void initLocalClusterReps(size_t& id, std::vector& localClusterReps, std::vector& localProteomeCount, DBReader& tProteinDB, DBReader& linResDB, unsigned int thread_idx) { + std::vector memberKeys; + memberKeys.reserve(50); // store key for every protein in a cluster + + std::vector isProteomeInCluster(localProteomeCount.size(), 0); + char buffer[1024 + 32768*4]; + char *clustData = linResDB.getData(id, thread_idx); + + + while (*clustData != '\0') { + Util::parseKey(clustData, buffer); + const unsigned int key = (unsigned int) strtoul(buffer, NULL, 10); + memberKeys.push_back(key); + clustData = Util::skipLine(clustData); + } + + if (memberKeys.size() <= 1) { + return; //singleton protein member cluster (1 member only) + } + + ClusterEntry eachClusterRep(memberKeys.size()); + int uniqueProteomeCount = 0; + //init MemberProteinEntry and add it to memberProteins vector + for (auto& eachMemberKey : memberKeys) { + const unsigned int proteinId = tProteinDB.getId(eachMemberKey); + const unsigned int proteomeKey = tProteinDB.getLookupFileNumber(proteinId); + MemberProteinEntry mem; + mem.proteomeKey = proteomeKey; + mem.proteinKey = proteinId; + eachClusterRep.memberProteins.push_back(mem); + if (isProteomeInCluster[proteomeKey] == 0) { //If gene is from new proteome(source) + isProteomeInCluster[proteomeKey] = 1; + uniqueProteomeCount++; + } + } + + if (uniqueProteomeCount <= 1) { + return; //singleton proteome cluster (geneDuplication_1 proteome only with multiple members, exclude cloud genome) + } + + localClusterReps.push_back(eachClusterRep); + for (size_t i = 0; i < localProteomeCount.size(); i++) { + if (isProteomeInCluster[i]) { + localProteomeCount[i]++; + } + } +} + +bool FindNextRepProteome(std::vector& proteomeList, unsigned int& RepProteomeId) { + bool isAllProteomeRedundancyCovered = true; + + unsigned int maxMemberCount = 0; + unsigned int remainingProteomeCount = 0; + unsigned int lastProteomeKey = -1; + + for (size_t idx = 0; idx < proteomeList.size(); idx++) { + if (proteomeList[idx].isCovered()) { + continue; + }else{ + isAllProteomeRedundancyCovered = false; + remainingProteomeCount++; + lastProteomeKey = idx; + if (proteomeList[idx].clusterCount > maxMemberCount) { + maxMemberCount = proteomeList[idx].clusterCount; + RepProteomeId = idx; + } + } + } + + if (isAllProteomeRedundancyCovered){ + return false; + }else if (remainingProteomeCount == 1) { + //Only proteome is left. It isn't redundant with any other proteome + proteomeList[lastProteomeKey].repProtKey = proteomeList[lastProteomeKey].proteomeKey; + proteomeList[lastProteomeKey].protSimScore = 1.0; + proteomeList[lastProteomeKey].relativeSimScore = 1.0; + return false; + }else{ + return true; + } +} + +void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector& proteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector& localSeqIds, std::vector& localMatchCount, DBWriter& proteinClustWriter) { + char buffer[1024]; + bool isRepFound = false; + unsigned int lastqLen = 0; + unsigned int qproteinKey = 0; + unsigned int qproteomeKey =0; + //find representative query protein which has the longest sequence length + for (auto& eachMember : clusterRep.memberProteins){ + if (eachMember.proteomeKey == RepProteomeId) { + isRepFound = true; + const unsigned int queryId = eachMember.proteinKey; + if (lastqLen < tProteinDB.getSeqLen(queryId)) { + lastqLen = tProteinDB.getSeqLen(queryId); + qproteinKey = eachMember.proteinKey; + qproteomeKey = eachMember.proteomeKey; + } + } + } + if (isRepFound){ + proteinClustWriter.writeStart(thread_idx); + const unsigned int queryId = qproteinKey; + const unsigned int queryKey = tProteinDB.getDbKey(queryId); + char* querySeq = tProteinDB.getData(queryId, thread_idx); + query.mapSequence(queryId, queryKey, querySeq, tProteinDB.getSeqLen(queryId)); + matcher.initQuery(&query); + const unsigned int queryProteomeLength = proteomeList[qproteomeKey].proteomeAALen; + + // Alignment by representative protein itself : same query and target (for createtsv) + Matcher::result_t rep_result = matcher.getSWResult(&query, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, true); + size_t rep_len = Matcher::resultToBuffer(buffer, rep_result, par.addBacktrace); + proteinClustWriter.writeAdd(buffer, rep_len, thread_idx); + localSeqIds[qproteomeKey] += rep_result.getSeqId()*query.L; + localMatchCount[qproteomeKey] += static_cast (rep_result.getSeqId() * static_cast (rep_result.getAlnLength()) + 0.5); + + + //Alignment by representative protein and other proteins in the cluster + for (auto& eachTargetMember : clusterRep.memberProteins){ + if (eachTargetMember.proteomeKey == RepProteomeId) { + continue; + } + // if query Proteome's length < target Proteome's length * proteomeSimThr, skip + const unsigned int targetProteomeLength = proteomeList[eachTargetMember.proteomeKey].proteomeAALen; + if (queryProteomeLength < targetProteomeLength * par.proteomeSimThr) { + continue; + } + const unsigned int targetId = eachTargetMember.proteinKey; + const unsigned int targetKey = tProteinDB.getDbKey(targetId); + unsigned int tproteomeKey = eachTargetMember.proteomeKey; + + char* targetSeq = tProteinDB.getData(targetId, thread_idx); + target.mapSequence(targetId, targetKey, targetSeq, tProteinDB.getSeqLen(targetId)); + + if (Util::canBeCovered(par.covThr, par.covMode, query.L, target.L) == false) { + continue; + } + + const bool isIdentity = (queryId == targetId && par.includeIdentity) ? true : false; + Matcher::result_t result = matcher.getSWResult(&target, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, isIdentity); + + if (Alignment::checkCriteria(result, isIdentity, par.evalThr, par.seqIdThr, par.alnLenThr, par.covMode, par.covThr)) { + if (query.L >= target.L*0.9) { // -s2 lenght difference parameter in cd-hit-2d + size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace); + proteinClustWriter.writeAdd(buffer, len, thread_idx); + localSeqIds[tproteomeKey] += result.getSeqId() * target.L; + unsigned int matchCount = static_cast ((result.getSeqId() * static_cast (result.getAlnLength())) + 0.5); + localMatchCount[tproteomeKey] += matchCount; + } + } + } + proteinClustWriter.writeEnd(queryKey, thread_idx); + } +} + +void updateProteomeRedundancyInfo(std::vector& proteomeList, const unsigned int& RepProteomeId, Parameters& par) { + const unsigned int& repProteomeAASize = proteomeList[RepProteomeId].proteomeAALen; + for (size_t i = 0; i < proteomeList.size(); i++) { + if (proteomeList[i].isCovered() == false) { + if (i == RepProteomeId){ + proteomeList[i].repProtKey = RepProteomeId; + proteomeList[i].protSimScore = 1.0; + proteomeList[i].relativeSimScore = 1.0; + }else{ + proteomeList[i].computeRedundancy(repProteomeAASize); + if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr && proteomeList[i].getrelativeSimScore() >= par.proteomeRelativeSimThr){ + proteomeList[i].repProtKey = RepProteomeId; + }else{ + proteomeList[i].resetProteomeInfo(); + } + } + } + } + +} + +void updateClusterReps(ClusterEntry& clusterRep, std::vector& proteomeList, std::vector& localProteomeCount){ + std::vector isProteomeInCluster(localProteomeCount.size(), 0); + if (clusterRep.isAvailable) { + unsigned int notCoveredMemberCount = 0; + unsigned int uniqueProteomeCount = 0; + //update cluster member info + for (auto& eachMember : clusterRep.memberProteins) { + if (proteomeList[eachMember.proteomeKey].isCovered() == false) { + notCoveredMemberCount++; + if (isProteomeInCluster[eachMember.proteomeKey] == 0) { + isProteomeInCluster[eachMember.proteomeKey] = 1; + uniqueProteomeCount++; + } + } + } + if (notCoveredMemberCount <= 1 || uniqueProteomeCount <= 1 ) { + //notCoveredMemberCount : All members(genes)' proteome are covere(0) or only one member(gene) is not covered(1) + //uniqueProteomeCount : Remaining members(genes) are from same(one) proteome + clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false; + return; + } + + } + + //update localProteomeCount -> update clusterCount for next repProteome finding + if (clusterRep.isAvailable) { + for (size_t i=0; i < localProteomeCount.size(); i++) { + if (isProteomeInCluster[i]) { + localProteomeCount[i]++; + } + } + } + +} + + +void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector &proteomeList) { + std::vector proteomeClusterIdx; + char proteomeBuffer[1024]; + int repProtIdCluster = proteomeList[0].getRepProtKey(); + + for (size_t idx = 0; idx < proteomeList.size(); idx++) { + int repProtId = proteomeList[idx].getRepProtKey(); + if (repProtIdCluster != repProtId) { + proteomeClustWriter.writeStart(); + for (auto &eachIdx : proteomeClusterIdx) { + char *basePos = proteomeBuffer; + char *tmpProteomeBuffer = Itoa::i32toa_sse2(proteomeList[eachIdx].getProteomeKey(), proteomeBuffer); + *(tmpProteomeBuffer - 1) = '\t'; + tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer); + *(tmpProteomeBuffer - 1) = '\t'; + tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getrelativeSimScore(), tmpProteomeBuffer); + *(tmpProteomeBuffer - 1) = '\n'; + proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos); + } + proteomeClustWriter.writeEnd(repProtIdCluster); + // Reset + repProtIdCluster = repProtId; + proteomeClusterIdx.clear(); + proteomeClusterIdx.push_back(idx); + } else { + proteomeClusterIdx.push_back(idx); + } + + if (idx == proteomeList.size() - 1) { + proteomeClustWriter.writeStart(); + for (auto &eachIdx : proteomeClusterIdx) { + char *basePos = proteomeBuffer; + char *tmpProteomeBuffer = Itoa::i32toa_sse2(proteomeList[eachIdx].getProteomeKey(), proteomeBuffer); + *(tmpProteomeBuffer - 1) = '\t'; + tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer); + *(tmpProteomeBuffer - 1) = '\t'; + tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getrelativeSimScore(), tmpProteomeBuffer); + *(tmpProteomeBuffer - 1) = '\n'; + proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos); + } + proteomeClustWriter.writeEnd(repProtIdCluster); + proteomeClusterIdx.clear(); + } + } +} + +static char* fastfloatToBuffer(float value, char* buffer) { + value *= 100; + int integerPart = (int)value; + buffer = Itoa::i32toa_sse2(integerPart, buffer); + *(buffer-1) = '.'; + float fractionalPart = (value - integerPart) * 100; + int fractionalInt = (int)fractionalPart; + if (fractionalInt < 10) { + *(buffer) = '0'; + buffer++; + } + buffer = Itoa::i32toa_sse2(fractionalInt, buffer); + + *(buffer-1) = '%'; + buffer++; + return buffer; +} + +int proteomecluster(int argc, const char **argv, const Command &command){ + //Initialize parameters + Parameters &par = Parameters::getInstance(); + par.overrideParameterDescription(par.PARAM_ALIGNMENT_MODE, "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id", NULL, 0); + par.parseParameters(argc, argv, command, true, 0, 0); + + if (par.alignmentMode == Parameters::ALIGNMENT_MODE_UNGAPPED) { + Debug(Debug::ERROR) << "Use rescorediagonal for ungapped alignment mode.\n"; + EXIT(EXIT_FAILURE); + } + if (par.addBacktrace == true) { + par.alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; + } + + unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr); + + //Open the target protein database + DBReader tProteinDB(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX|DBReader::USE_LOOKUP|DBReader::USE_SOURCE); + tProteinDB.open(DBReader::NOSORT); + const int tProteinSeqType = tProteinDB.getDbtype(); + + if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) { + tProteinDB.readMmapedDataInMemory(); + } + //Open lookup table to get the source of the protein and the protein length + DBReader::LookupEntry* tLookup = tProteinDB.getLookup(); + const size_t tLookupSize = tProteinDB.getLookupSize(); + unsigned int totalProteomeNumber = tLookup[tLookupSize - 1].fileNumber; + std::vector proteomeList(totalProteomeNumber + 1); + calculateProteomeLength(proteomeList, tLookup, tLookupSize, tProteinDB); + + //Open the linclust result + DBReader linResDB(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX); + linResDB.open(DBReader::LINEAR_ACCCESS); + + //Open the DBWriter + DBWriter proteinClustWriter(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_GENERIC_DB); + proteinClustWriter.open(); + int proteomeDBType = DBReader::setExtendedDbtype(Parameters::DBTYPE_GENERIC_DB, Parameters::DBTYPE_EXTENDED_SRC_IDENTIFIER); + DBWriter proteomeClustWriter(par.db4.c_str(), par.db4Index.c_str(), 1, par.compressed, proteomeDBType); + proteomeClustWriter.open(); + DBWriter clusterCountWriter(par.db5.c_str(), par.db5Index.c_str(), 1, par.compressed, proteomeDBType); + clusterCountWriter.open(); + + std::vector clusterReps; + + int gapOpen, gapExtend; + // BaseMatrix *subMat; + SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias); + gapOpen = par.gapOpen.values.aminoacid(); + gapExtend = par.gapExtend.values.aminoacid(); + EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), &subMat, gapOpen, gapExtend); + Debug(Debug::INFO) << "Initialization "; + Timer timer; + + #pragma omp parallel + { + unsigned int thread_idx = 0; + #ifdef OPENMP + thread_idx = (unsigned int) omp_get_thread_num(); + #endif + std::vector localClusterReps; + std::vector localProteomeCount(proteomeList.size(), 0); + #pragma omp for schedule(dynamic, 1) + for (size_t id = 0; id < linResDB.getSize(); id++) { + initLocalClusterReps(id, localClusterReps, localProteomeCount, tProteinDB, linResDB, thread_idx); + } + + for (size_t idx = 0; idx < localProteomeCount.size(); idx++) { + __sync_fetch_and_add(&proteomeList[idx].clusterCount, localProteomeCount[idx]); + } + + #pragma omp critical + { + clusterReps.insert(clusterReps.end(), + std::make_move_iterator(localClusterReps.begin()), + std::make_move_iterator(localClusterReps.end())); + } + + } + Debug(Debug::INFO) << timer.lap() << "\n"; + + unsigned int totalClusterCount = clusterReps.size(); + Debug(Debug::INFO) << "Total Cluster Count : " << totalClusterCount << "\n"; + for (size_t idx=0; idx < proteomeList.size(); idx++){ + float clusterCountRatio = static_cast (proteomeList[idx].clusterCount) / static_cast (totalClusterCount); + + char proteomeBuffer[1024]; + clusterCountWriter.writeStart(); + char *basePos = proteomeBuffer; + char *tmpProteomeBuffer = Itoa::i32toa_sse2(proteomeList[idx].clusterCount, proteomeBuffer); + *(tmpProteomeBuffer - 1) = '\t'; + tmpProteomeBuffer = fastfloatToBuffer(clusterCountRatio, tmpProteomeBuffer); + *(tmpProteomeBuffer - 1) = '\n'; + clusterCountWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos); + clusterCountWriter.writeEnd(proteomeList[idx].proteomeKey); + } + + unsigned int RepProteomeId = -1; + Debug(Debug::INFO) << "Proteome Clustering "; + timer.reset(); + while (FindNextRepProteome(proteomeList, RepProteomeId)) { + #pragma omp parallel + { + unsigned int thread_idx = 0; + #ifdef OPENMP + thread_idx = (unsigned int) omp_get_thread_num(); + #endif + Matcher matcher(tProteinSeqType, tProteinSeqType, par.maxSeqLen, &subMat, &evaluer, par.compBiasCorrection, par.compBiasCorrectionScale, gapOpen, gapExtend, 0.0, par.zdrop); + Sequence query(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection); + Sequence target(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection); + std::vector localSeqIds(proteomeList.size(), 0.0f); + std::vector localMatchCount(proteomeList.size(), 0); + #pragma omp for schedule(dynamic, 1) + for (size_t i = 0; i < clusterReps.size(); i++) { + if (clusterReps[i].isAvailable) { + runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, proteomeList, par, thread_idx, swMode, localSeqIds, localMatchCount, proteinClustWriter); + } + + } + #pragma omp critical + { + for (size_t idx = 0; idx < proteomeList.size(); idx++) { + if (proteomeList[idx].isCovered() == false) { + proteomeList[idx].addSeqId(localSeqIds[idx]); + proteomeList[idx].AAMatchCount += localMatchCount[idx]; + } + } + } + } + + // Debug(Debug::INFO) << "2. Update ProteomeDB. Calculate the similarity score and check redundancy | Rep Proteome id : " << RepProteomeId << "\n"; + updateProteomeRedundancyInfo(proteomeList, RepProteomeId, par); + + // Debug(Debug::INFO) << "3. Re-Setup Proteome and ClusterReps\n"; + #pragma omp parallel + { + std::vector localProteomeCount(proteomeList.size(), 0); + #pragma omp for schedule(dynamic, 1) + for (size_t i = 0; i < clusterReps.size(); i++) { + updateClusterReps(clusterReps[i], proteomeList, localProteomeCount); + } + + for (size_t i = 0; i < proteomeList.size(); i++) { + __sync_fetch_and_add(&proteomeList[i].clusterCount, localProteomeCount[i]); + } + } + } + Debug(Debug::INFO) << timer.lap() << "\n"; + + //sort proteomeList by repProtKey + SORT_PARALLEL(proteomeList.begin(), proteomeList.end(), ProteomeEntry::compareByKey); + //write _proteome_cluster + writeProteomeClusters(proteomeClustWriter, proteomeList); + + proteomeClustWriter.close(); + proteinClustWriter.close(); + clusterCountWriter.close(); + tProteinDB.close(); + linResDB.close(); + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/src/workflow/CMakeLists.txt b/src/workflow/CMakeLists.txt index ce266e9c7..5aa4e0bd9 100644 --- a/src/workflow/CMakeLists.txt +++ b/src/workflow/CMakeLists.txt @@ -14,6 +14,7 @@ set(workflow_source_files workflow/Search.cpp workflow/Taxonomy.cpp workflow/EasyTaxonomy.cpp + workflow/EasyProteomecluster.cpp workflow/CreateIndex.cpp PARENT_SCOPE ) diff --git a/src/workflow/EasyLinclust.cpp b/src/workflow/EasyLinclust.cpp index 375ff1610..181968a8b 100644 --- a/src/workflow/EasyLinclust.cpp +++ b/src/workflow/EasyLinclust.cpp @@ -28,7 +28,6 @@ void setEasyLinclustMustPassAlong(Parameters *p) { p->PARAM_REMOVE_TMP_FILES.wasSet = true; p->PARAM_C.wasSet = true; p->PARAM_E.wasSet = true; - //p->PARAM_ALIGNMENT_MODE.wasSet = true; p->PARAM_ORF_START_MODE.wasSet = true; p->PARAM_ORF_MIN_LENGTH.wasSet = true; p->PARAM_ORF_MAX_LENGTH.wasSet = true; diff --git a/src/workflow/EasyProteomecluster.cpp b/src/workflow/EasyProteomecluster.cpp new file mode 100644 index 000000000..c8e60e18e --- /dev/null +++ b/src/workflow/EasyProteomecluster.cpp @@ -0,0 +1,55 @@ +#include + +#include "FileUtil.h" +#include "CommandCaller.h" +#include "Util.h" +#include "Debug.h" +#include "Parameters.h" + +#include "easyproteomecluster.sh.h" + +void setEasyproteomeclusterDefaults(Parameters *p) { + p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; + p->proteomeSimThr = 0.9; + p->proteomeRelativeSimThr = 0.0; +} + +void setEasyproteomeclusterMustPassAlong(Parameters *p){ + p->PARAM_REMOVE_TMP_FILES.wasSet = true; + p->PARAM_ALIGNMENT_MODE.wasSet = true; + +} + +int easyproteomecluster(int argc, const char **argv, const Command &command) { + Parameters &par = Parameters::getInstance(); + setEasyproteomeclusterDefaults(&par); + par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); + setEasyproteomeclusterMustPassAlong(&par); + std::string tmpDir = par.filenames.back(); + std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); + if (par.reuseLatest) { + hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); + } + tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); + par.filenames.pop_back(); + + CommandCaller cmd; + cmd.addVariable("TMP_PATH", tmpDir.c_str()); + cmd.addVariable("RESULTS", par.filenames.back().c_str()); + par.filenames.pop_back(); + cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); + + cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.createdb).c_str()); + cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.clusterworkflow, true).c_str()); + cmd.addVariable("PROTEOMECLUSTER_PAR", par.createParameterString(par.proteomecluster).c_str()); + cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); + cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); + + std::string program = tmpDir + "/easyproteomecluster.sh"; + FileUtil::writeFile(program, easyproteomecluster_sh, easyproteomecluster_sh_len); + cmd.execProgram(program.c_str(), par.filenames); + + // Should never get here + assert(false); + return 0; +}