From 1cb20742c4a4554cc9d55b578a54d25d40c06528 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Fri, 16 Aug 2024 21:34:03 +0900 Subject: [PATCH 01/31] alignproteome added --- src/CommandDeclarations.h | 1 + src/MMseqsBase.cpp | 10 + src/alignment/Matcher.h | 6 +- src/commons/CMakeLists.txt | 2 + src/commons/ClusterSpecies.cpp | 32 +++ src/commons/ClusterSpecies.h | 42 ++++ src/commons/DBReader.cpp | 73 ++++++ src/commons/DBReader.h | 15 +- src/commons/DBWriter.cpp | 1 + src/commons/Parameters.cpp | 41 ++- src/commons/Parameters.h | 3 + src/util/CMakeLists.txt | 1 + src/util/alignall.cpp | 6 +- src/util/alignproteome.cpp | 443 +++++++++++++++++++++++++++++++++ 14 files changed, 671 insertions(+), 5 deletions(-) create mode 100644 src/commons/ClusterSpecies.cpp create mode 100644 src/commons/ClusterSpecies.h create mode 100644 src/util/alignproteome.cpp diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h index 3c4abb4e0..369034025 100644 --- a/src/CommandDeclarations.h +++ b/src/CommandDeclarations.h @@ -4,6 +4,7 @@ extern int align(int argc, const char **argv, const Command& command); extern int alignall(int argc, const char **argv, const Command& command); +extern int alignproteome(int argc, const char **argv, const Command& command); extern int alignbykmer(int argc, const char **argv, const Command& command); extern int appenddbtoindex(int argc, const char **argv, const Command& command); extern int apply(int argc, const char **argv, const Command& command); diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 720aeaccb..e91d6d0a6 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -632,6 +632,15 @@ std::vector baseCommands = { {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}}, + {"alignproteome", alignproteome, &par.alignproteome, COMMAND_ALIGNMENT, + "Within-result all-vs-all gapped local alignment", + NULL, + "Martin Steinegger ", + " ", + CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, + {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, + {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }, + {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}}, {"alignall", alignall, &par.alignall, COMMAND_ALIGNMENT, "Within-result all-vs-all gapped local alignment", NULL, @@ -640,6 +649,7 @@ std::vector baseCommands = { CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}}, + {"transitivealign", transitivealign, &par.align, COMMAND_ALIGNMENT, "Transfer alignments via transitivity", //"Infer the alignment A->C via B, B being the center sequence and A,C each pairwise aligned against B", diff --git a/src/alignment/Matcher.h b/src/alignment/Matcher.h index a9c28beb4..a89ae9c5a 100644 --- a/src/alignment/Matcher.h +++ b/src/alignment/Matcher.h @@ -140,6 +140,10 @@ class Matcher{ } } } + + float getSeqId(){ + return seqId; + } }; Matcher(int querySeqType, int targetSeqType, int maxSeqLen, BaseMatrix *m, @@ -217,7 +221,7 @@ class Matcher{ static size_t resultToBuffer(char * buffer, const result_t &result, bool addBacktrace, bool compress = true, bool addOrfPosition = false); - + static size_t resultToBuffer_str(char * buffer, const result_t &result, bool addBacktrace, bool compress, bool addOrfPosition=false); static int computeAlnLength(int anEnd, int start, int dbEnd, int dbStart); static void updateResultByRescoringBacktrace(const char *querySeq, const char *targetSeq, const char **subMat, EvalueComputation &evaluer, diff --git a/src/commons/CMakeLists.txt b/src/commons/CMakeLists.txt index dc7ab3e74..a58b5322a 100644 --- a/src/commons/CMakeLists.txt +++ b/src/commons/CMakeLists.txt @@ -8,6 +8,7 @@ set(commons_header_files commons/Command.h commons/CommandCaller.h commons/Concat.h + # commons/ClusterSpecies.h commons/DBConcat.h commons/DBReader.h commons/DBWriter.h @@ -52,6 +53,7 @@ set(commons_source_files commons/BaseMatrix.cpp commons/Command.cpp commons/CommandCaller.cpp + # commons/ClusterSpecies.cpp commons/DBConcat.cpp commons/DBReader.cpp commons/DBWriter.cpp diff --git a/src/commons/ClusterSpecies.cpp b/src/commons/ClusterSpecies.cpp new file mode 100644 index 000000000..3216cc210 --- /dev/null +++ b/src/commons/ClusterSpecies.cpp @@ -0,0 +1,32 @@ +#include "ClusterSpecies.h" +void ClusterSpecies::addProtein(unsigned int speciesId, unsigned int clusterRepId, unsigned int proteinId) { + clusterEntriesPerSpecies[speciesId][clusterRepId].push_back(proteinId); +// speciesClusterMemCounts[speciesId]++; +} + + +void ClusterSpecies::removeSpecies(unsigned int speciesId) { + clusterEntriesPerSpecies.erase(speciesId); + // speciesClusterMemCounts.erase(speciesId); +} + +int ClusterSpecies::findRepSpecies(){ + //find max entry in speciesClusterMemCounts + unsigned int maxSpeciesId = 0; + size_t maxSpeciesSize = 0; + for (auto const& species : clusterEntriesPerSpecies){ + size_t speiciesSize = species.second.size(); + if (speiciesSize > maxSpeciesSize){ + maxSpeciesId = species.first; + } + } + + return maxSpeciesId; + // for (auto const& x : speciesClusterMemCounts) + // { + // if (x.second > maxSpeciesCount){ + // maxSpeciesCount = x.second; + // maxSpeciesId = x.first; + // } + // } +} \ No newline at end of file diff --git a/src/commons/ClusterSpecies.h b/src/commons/ClusterSpecies.h new file mode 100644 index 000000000..96b1e8688 --- /dev/null +++ b/src/commons/ClusterSpecies.h @@ -0,0 +1,42 @@ +#ifndef CLUSTER_SPECIES_H +#define CLUSTER_SPECIES_H + +#include +#include + + +class ClusterSpecies { +public: + + // std::unordered_map>> clusterEntriesPerSpecies; + // // std::unordered_map speciesClusterMemCounts; + // void addProtein(unsigned int speciesId, unsigned int clusterRepId, unsigned int proteinId); + // void removeSpecies(unsigned int speciesId); + // int findRepSpecies(); + + // template + // struct ProteomeLength{ + // T id; + // unsigned int length; + // }; + + // std::vector> proteomeAAToatalLength; + + // void addProteomeLength(unsigned int id, unsigned int length){ + // ProteomeLength pl; + // pl.id = id; + // pl.length = length; + // proteomeAAToatalLength.push_back(pl); + // } + + // unsigned int getProteomeLength(unsigned int id){ + // for (auto const& pl : proteomeAAToatalLength){ + // if (pl.id == id){ + // return pl.length; + // } + // } + // return 0; + // } +}; + +#endif //CLUSTER_SPECIES_H \ No newline at end of file diff --git a/src/commons/DBReader.cpp b/src/commons/DBReader.cpp index f52d2369c..70216cedd 100644 --- a/src/commons/DBReader.cpp +++ b/src/commons/DBReader.cpp @@ -112,6 +112,7 @@ template bool DBReader::open(int accessType){ } totalDataSize = 0; dataFileCnt = dataFileNames.size(); + dataSizeOffset = new size_t[dataFileNames.size() + 1]; dataFiles = new char*[dataFileNames.size()]; for(size_t fileIdx = 0; fileIdx < dataFileNames.size(); fileIdx++){ @@ -136,6 +137,8 @@ template bool DBReader::open(int accessType){ } } if (dataMode & USE_LOOKUP || dataMode & USE_LOOKUP_REV) { + Debug(Debug::INFO) << "ReadLookup file: " << dataFileName << "\n"; //gyuri + std::string lookupFilename = (std::string(dataFileName) + ".lookup"); MemoryMapped lookupData(lookupFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); if (lookupData.isValid() == false) { @@ -144,7 +147,9 @@ template bool DBReader::open(int accessType){ } char* lookupDataChar = (char *) lookupData.getData(); size_t lookupDataSize = lookupData.size(); + Debug(Debug::INFO) << "Lookup Data size is " << lookupDataSize << "\n"; //gyuri lookupSize = Util::ompCountLines(lookupDataChar, lookupDataSize, threads); + Debug(Debug::INFO) << "Lookup size is " << lookupSize << "\n"; //gyuri lookup = new(std::nothrow) LookupEntry[this->lookupSize]; incrementMemory(sizeof(LookupEntry) * this->lookupSize); readLookup(lookupDataChar, lookupDataSize, lookup); @@ -155,6 +160,22 @@ template bool DBReader::open(int accessType){ } lookupData.close(); } + if (dataMode & USE_SOURCE){ + std::string sourceFilename = (std::string(dataFileName) + ".source"); + MemoryMapped sourceData(sourceFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); + if (sourceData.isValid() == false) { + Debug(Debug::ERROR) << "Cannot open source file " << sourceFilename << "!\n"; + EXIT(EXIT_FAILURE); + } + char* sourceDataChar = (char *) sourceData.getData(); + size_t sourceDataSize = sourceData.size(); + sourceSize = Util::ompCountLines(sourceDataChar, sourceDataSize, threads); + source = new(std::nothrow) SourceEntry[this->sourceSize]; + incrementMemory(sizeof(SourceEntry) * this->sourceSize); + readSource(sourceDataChar, sourceDataSize, source); + + } + bool isSortedById = false; if (externalData == false) { MemoryMapped indexData(indexFileName, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); @@ -707,6 +728,15 @@ template unsigned int DBReader::getLookupFileNumber(size_t id){ return lookup[id].fileNumber; } +template std::string DBReader::getSourceFileName (size_t id){ + if (id >= sourceSize){ + Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n"; + Debug(Debug::ERROR) << "getSourceFileName: local id (" << id << ") >= db size (" << sourceSize << ")\n"; + EXIT(EXIT_FAILURE); + } + return source[id].fileName; +} + template<> void DBReader::lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry) { buffer.append(SSTR(entry.id)); @@ -995,6 +1025,19 @@ size_t DBReader::getOffset(size_t id) { return index[id].offset; } +template +size_t DBReader::getIndexLen(size_t id) { + if (id >= size){ + Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << indexFileName << "\n"; + Debug(Debug::ERROR) << "getOffset: local id (" << id << ") >= db size (" << size << ")\n"; + EXIT(EXIT_FAILURE); + } + if (local2id != NULL) { + id = local2id[id]; + } + return index[id].length; +} + template size_t DBReader::findNextOffsetid(size_t id) { size_t idOffset = getOffset(id); @@ -1048,6 +1091,36 @@ void DBReader::readLookup(char *data, size_t dataSize, DBReader::LookupEntry currPos = lookupData - (char *) data; i++; + + + } +} + +template +void DBReader::readSource(char *data, size_t dataSize, DBReader::SourceEntry *source) { + size_t i=0; + size_t currPos = 0; + char* sourceData = (char *) data; + const char * cols[3]; + while (currPos < dataSize){ + if (i >= this->sourceSize) { + Debug(Debug::ERROR) << "Corrupt memory, too many entries!\n"; + EXIT(EXIT_FAILURE); + } + Util::getFieldsOfLine(sourceData, cols, 3); + source[i].id = Util::fast_atoi(cols[0]); + std::string fileName = std::string(cols[1], (cols[2] - cols[1])); + size_t lastDotPosition = fileName.rfind('.'); + + if (lastDotPosition != std::string::npos) { + fileName = fileName.substr(0, lastDotPosition); + } + source[i].fileName = fileName; + sourceData = Util::skipLine(sourceData); + + currPos = sourceData - (char *) data; + + i++; } } diff --git a/src/commons/DBReader.h b/src/commons/DBReader.h index 57589b1f6..198fa9ba5 100644 --- a/src/commons/DBReader.h +++ b/src/commons/DBReader.h @@ -98,7 +98,10 @@ class DBReader : public MemoryTracker { return false; } }; - + struct SourceEntry{ + T id; + std::string fileName; + }; struct LookupEntry { T id; std::string entryName; @@ -184,6 +187,8 @@ class DBReader : public MemoryTracker { size_t getSize() const; + unsigned int getProteomeTotalLen(size_t id); //gyuri + unsigned int getMaxSeqLen(){ return (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_HMM_PROFILE ) ) ? (std::max(maxSeqLen, 1u)) / Sequence::PROFILE_READIN_SIZE : @@ -248,6 +253,7 @@ class DBReader : public MemoryTracker { void lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry); LookupEntry* getLookup() { return lookup; }; + std::string getSourceFileName(size_t id); static const int NOSORT = 0; static const int SORT_BY_LENGTH = 1; static const int LINEAR_ACCCESS = 2; @@ -266,6 +272,7 @@ class DBReader : public MemoryTracker { static const unsigned int USE_FREAD = 4; static const unsigned int USE_LOOKUP = 8; static const unsigned int USE_LOOKUP_REV = 16; + static const unsigned int USE_SOURCE = 32; // compressed @@ -309,6 +316,8 @@ class DBReader : public MemoryTracker { void readLookup(char *data, size_t dataSize, LookupEntry *lookup); + void readSource(char *data, size_t dataSize, SourceEntry *source); + void readIndexId(T* id, char * line, const char** cols); unsigned int indexIdToNum(T* id); @@ -437,6 +446,8 @@ class DBReader : public MemoryTracker { size_t findNextOffsetid(size_t id); + size_t getIndexLen(size_t id); + int isCompressed(){ return isCompressed(dbtype); } @@ -485,7 +496,9 @@ class DBReader : public MemoryTracker { Index * index; size_t lookupSize; + size_t sourceSize; LookupEntry * lookup; + SourceEntry * source; bool sortedByOffset; unsigned int * id2local; diff --git a/src/commons/DBWriter.cpp b/src/commons/DBWriter.cpp index 9ae84bae1..36aac0eaf 100644 --- a/src/commons/DBWriter.cpp +++ b/src/commons/DBWriter.cpp @@ -283,6 +283,7 @@ size_t DBWriter::writeAdd(const char* data, size_t dataSize, unsigned int thrIdx } size_t totalWriten = 0; if(isCompressedDB && (state[thrIdx] == INIT_STATE || state[thrIdx] == COMPRESSED) ) { + Debug(Debug::INFO) << "Compressing data in thread " << thrIdx << "\n"; //gyuri state[thrIdx] = COMPRESSED; // zstd seems to have a hard time with elements < 60 ZSTD_inBuffer input = { data, dataSize, 0 }; diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index afc56ba96..ab8ef3ad8 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -86,6 +86,8 @@ Parameters::Parameters(): PARAM_ALT_ALIGNMENT(PARAM_ALT_ALIGNMENT_ID, "--alt-ali", "Alternative alignments", "Show up to this many alternative alignments", typeid(int), (void *) &altAlignment, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN), PARAM_GAP_OPEN(PARAM_GAP_OPEN_ID, "--gap-open", "Gap open cost", "Gap open cost", typeid(MultiParam>), (void *) &gapOpen, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT), PARAM_GAP_EXTEND(PARAM_GAP_EXTEND_ID, "--gap-extend", "Gap extension cost", "Gap extension cost", typeid(MultiParam>), (void *) &gapExtend, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT), + PARAM_PROTEOME_SIMILARITY(PARAM_PROTEOME_SIMILARITY_ID, "--proteome-similarity", "Proteome similarity", "Proteome similarity threshold", typeid(float), (void *) &proteomeSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN), + #ifdef GAP_POS_SCORING PARAM_GAP_PSEUDOCOUNT(PARAM_GAP_PSEUDOCOUNT_ID, "--gap-pc", "Gap pseudo count", "Pseudo count for calculating position-specific gap opening penalties", typeid(int), &gapPseudoCount, "^[0-9]+$", MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_EXPERT), #endif @@ -338,6 +340,7 @@ Parameters::Parameters(): alignall.push_back(&PARAM_SUB_MAT); alignall.push_back(&PARAM_ADD_BACKTRACE); alignall.push_back(&PARAM_ALIGNMENT_MODE); + alignall.push_back(&PARAM_REALIGN_SCORE_BIAS); //gyuri // alignall.push_back(&PARAM_WRAPPED_SCORING); alignall.push_back(&PARAM_E); alignall.push_back(&PARAM_MIN_SEQ_ID); @@ -350,7 +353,7 @@ Parameters::Parameters(): alignall.push_back(&PARAM_NO_COMP_BIAS_CORR); alignall.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE); -// alignall.push_back(&PARAM_REALIGN); + alignall.push_back(&PARAM_REALIGN); // alignall.push_back(&PARAM_MAX_REJECTED); // alignall.push_back(&PARAM_MAX_ACCEPT); alignall.push_back(&PARAM_INCLUDE_IDENTITY); @@ -365,6 +368,41 @@ Parameters::Parameters(): alignall.push_back(&PARAM_COMPRESSED); alignall.push_back(&PARAM_V); + //alignproteome + // alignproteome + alignproteome.push_back(&PARAM_SUB_MAT); + alignproteome.push_back(&PARAM_ADD_BACKTRACE); + alignproteome.push_back(&PARAM_ALIGNMENT_MODE); + alignproteome.push_back(&PARAM_REALIGN_SCORE_BIAS); //gyuri +// alignproteome.push_back(&PARAM_WRAPPED_SCORING); + alignproteome.push_back(&PARAM_E); + alignproteome.push_back(&PARAM_MIN_SEQ_ID); + alignproteome.push_back(&PARAM_MIN_ALN_LEN); + alignproteome.push_back(&PARAM_SEQ_ID_MODE); +// alignproteome.push_back(&PARAM_ALT_ALIGNMENT); + alignproteome.push_back(&PARAM_C); + alignproteome.push_back(&PARAM_COV_MODE); + alignproteome.push_back(&PARAM_MAX_SEQ_LEN); + alignproteome.push_back(&PARAM_NO_COMP_BIAS_CORR); + alignproteome.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE); + alignproteome.push_back(&PARAM_PROTEOME_SIMILARITY); + + alignproteome.push_back(&PARAM_REALIGN); +// alignproteome.push_back(&PARAM_MAX_REJECTED); +// alignproteome.push_back(&PARAM_MAX_ACCEPT); + alignproteome.push_back(&PARAM_INCLUDE_IDENTITY); + alignproteome.push_back(&PARAM_PRELOAD_MODE); + alignproteome.push_back(&PARAM_PCA); + alignproteome.push_back(&PARAM_PCB); + alignproteome.push_back(&PARAM_SCORE_BIAS); + alignproteome.push_back(&PARAM_GAP_OPEN); + alignproteome.push_back(&PARAM_GAP_EXTEND); + alignproteome.push_back(&PARAM_ZDROP); + alignproteome.push_back(&PARAM_THREADS); + alignproteome.push_back(&PARAM_COMPRESSED); + alignproteome.push_back(&PARAM_V); + + // alignment align.push_back(&PARAM_SUB_MAT); align.push_back(&PARAM_ADD_BACKTRACE); @@ -2315,6 +2353,7 @@ void Parameters::setDefaults() { maxRejected = INT_MAX; maxAccept = INT_MAX; seqIdThr = 0.0; + proteomeSimThr = 0.0; alnLenThr = 0; altAlignment = 0; gapOpen = MultiParam>(NuclAA(11, 5)); diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 745d2f809..838e0ed69 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -434,6 +434,7 @@ class Parameters { int maxAccept; // after n accepted sequences stop int altAlignment; // show up to this many alternative alignments float seqIdThr; // sequence identity threshold for acceptance + float proteomeSimThr; int alnLenThr; // min. alignment length bool addBacktrace; // store backtrace string (M=Match, D=deletion, I=insertion) bool realign; // realign hit with more conservative score @@ -787,6 +788,7 @@ class Parameters { PARAMETER(PARAM_ALT_ALIGNMENT) PARAMETER(PARAM_GAP_OPEN) PARAMETER(PARAM_GAP_EXTEND) + PARAMETER(PARAM_PROTEOME_SIMILARITY) #ifdef GAP_POS_SCORING PARAMETER(PARAM_GAP_PSEUDOCOUNT) #endif @@ -1078,6 +1080,7 @@ class Parameters { std::vector threadsandcompression; std::vector alignall; + std::vector alignproteome; std::vector align; std::vector rescorediagonal; std::vector alignbykmer; diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index c43c3356d..35d72b32f 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -80,5 +80,6 @@ set(util_source_files util/proteinaln2nucl.cpp util/versionstring.cpp util/diskspaceavail.cpp + util/alignproteome.cpp PARENT_SCOPE ) diff --git a/src/util/alignall.cpp b/src/util/alignall.cpp index 3a07596f6..3832ef9ee 100644 --- a/src/util/alignall.cpp +++ b/src/util/alignall.cpp @@ -27,7 +27,9 @@ int alignall(int argc, const char **argv, const Command &command) { } unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr); - DBReader tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX); + // DBReader tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX); + DBReader tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX|DBReader::USE_LOOKUP); + tdbr.open(DBReader::NOSORT); if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) { tdbr.readMmapedDataInMemory(); @@ -56,6 +58,7 @@ int alignall(int argc, const char **argv, const Command &command) { EvalueComputation evaluer(tdbr.getAminoAcidDBSize(), subMat, gapOpen, gapExtend); const size_t flushSize = 100000000; size_t iterations = static_cast(ceil(static_cast(dbr_res.getSize()) / static_cast(flushSize))); + Debug(Debug::INFO) << "Number of iterations: " << iterations << " Size of linclust dbr_res : " << dbr_res.getSize() << '\n'; for (size_t i = 0; i < iterations; i++) { size_t start = (i * flushSize); @@ -80,7 +83,6 @@ int alignall(int argc, const char **argv, const Command &command) { #pragma omp for schedule(dynamic, 1) for (size_t id = start; id < (start + bucketSize); id++) { progress.updateProgress(); - const unsigned int key = dbr_res.getDbKey(id); char *data = dbr_res.getData(id, thread_idx); diff --git a/src/util/alignproteome.cpp b/src/util/alignproteome.cpp new file mode 100644 index 000000000..197955da1 --- /dev/null +++ b/src/util/alignproteome.cpp @@ -0,0 +1,443 @@ +#include "Util.h" +#include "Parameters.h" +#include "Matcher.h" +#include "Debug.h" +#include "DBReader.h" +#include "DBWriter.h" +#include "NucleotideMatrix.h" +#include "SubstitutionMatrix.h" +#include "Alignment.h" +#include "itoa.h" + +#ifdef OPENMP +#include +#endif + +struct __attribute__((__packed__)) ProteomeEntry{ + unsigned int proteomeAALen; + int repProtKey; + float protSimScore; + unsigned int memCount; + float totalSeqId; + + ProteomeEntry(unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f) + : proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) {} + + void incrementMemCount(int count) { + memCount += count; + } + void decrementmemCount() { + memCount--; + } + + int getRepProtKey() { + return repProtKey; + } + + bool isCovered(){ + if (repProtKey == -1) { + return false; + } + return true; + } + + void computeRedundancy() { + protSimScore = totalSeqId / proteomeAALen; + + } + + void addSeqId(float seqId) { + totalSeqId += seqId; + } + + void addSeqLength(unsigned int seqLength) { + proteomeAALen += seqLength; + } + + void resetProteomeInfo(){ + memCount = 0; + totalSeqId = 0.0f; + } + + float getprotSimScore() { + return protSimScore; + } +}; + +struct __attribute__((__packed__)) memberProteinEntry{ + unsigned int proteomeKey; + unsigned int proteinKey; +}; + +struct ClusterEntry { + bool isAvailable; + std::vector memberProteins; + + ClusterEntry() : isAvailable(false) {} + + ClusterEntry(size_t memberProteinSize) : isAvailable(true) { + memberProteins.reserve(memberProteinSize); + } + + void resetClusterInfo(){ + memberProteins.clear(); + isAvailable = false; + } +}; + +void calculateProteomeLength(std::vector& ProteomeList, DBReader::LookupEntry* const& lookup, size_t lookupSize, DBReader& tProteinDB) { + for (size_t i = 0; i < lookupSize; i++) { + const unsigned int ProteomeId = lookup[i].fileNumber; + const unsigned int ProteinId = lookup[i].id; + ProteomeList[ProteomeId].addSeqLength(tProteinDB.getIndexLen(ProteinId) - 2); + } +} + +void initLocalClusterReps(size_t& id, std::vector& localClusterReps, std::vector& localMemCount, DBReader& tProteinDB, DBReader& linResDB, unsigned int thread_idx){ + std::vector memberKeys; + char buffer[1024 + 32768*4]; ///why? + memberKeys.reserve(50); // store key for every protein in a cluster + char *clustData = linResDB.getData(id, thread_idx); + while (*clustData != '\0') { + Util::parseKey(clustData, buffer); + const unsigned int key = (unsigned int) strtoul(buffer, NULL, 10); + memberKeys.push_back(key); + clustData = Util::skipLine(clustData); + } + if (memberKeys.size() > 1) { //IF Not a singleton cluster + ClusterEntry eachClusterRep(memberKeys.size()); + + for (auto& eachMemberKey : memberKeys){ + const unsigned int proteinId = tProteinDB.getId(eachMemberKey); + const unsigned int proteomeKey = tProteinDB.getLookupFileNumber(proteinId); + memberProteinEntry mem; + mem.proteomeKey = proteomeKey; + mem.proteinKey = proteinId; + eachClusterRep.memberProteins.push_back(mem); + localMemCount[proteomeKey]++; + } + localClusterReps.push_back(eachClusterRep); + } +} + +bool FindNextRepProteome(std::vector& ProteomeList, unsigned int& RepProteomeId) { + bool isAllCovered = true; + + unsigned int maxMemberCount = 0; + unsigned int notCoveredProtCount = 0; + + for (size_t idx = 0; idx < ProteomeList.size(); idx++){ + if (ProteomeList[idx].isCovered()) { + continue; + }else{ + isAllCovered = false; + notCoveredProtCount++; + if (ProteomeList[idx].memCount > maxMemberCount) { + maxMemberCount = ProteomeList[idx].memCount; + RepProteomeId = idx; + } + } + } + + if (isAllCovered){ + return false; + }else if (notCoveredProtCount == 1) { + //last one and it is singleton + ProteomeList[RepProteomeId].repProtKey = RepProteomeId; //todo + ProteomeList[RepProteomeId].protSimScore = 1.0; + return false; + }else{ + return true; + } +} + +void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector& ProteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector& localSeqIds, DBWriter& proteinAlignWriter) { + char buffer[1024]; //todo - How to calculate buffer size? + bool isRepFound = false; + unsigned int lastqLen = 0; + unsigned int qproteinKey = 0; + unsigned int qproteomeKey =0; + //find Rep + for (auto& eachMember : clusterRep.memberProteins){ + if (eachMember.proteomeKey == RepProteomeId) { + isRepFound = true; + const unsigned int queryId = eachMember.proteinKey; + if( lastqLen < tProteinDB.getSeqLen(queryId)){ + lastqLen = tProteinDB.getSeqLen(queryId); + qproteinKey = eachMember.proteinKey; + qproteomeKey = eachMember.proteomeKey; + } + } + } + if (isRepFound){ + const unsigned int queryId = qproteinKey; + const unsigned int queryKey = tProteinDB.getDbKey(queryId); + char* querySeq = tProteinDB.getData(queryId, thread_idx); //todo need openmp + query.mapSequence(queryId, queryKey, querySeq, tProteinDB.getSeqLen(queryId)); + matcher.initQuery(&query); + const unsigned int queryProteomeLength = ProteomeList[qproteomeKey].proteomeAALen; + char * tmpBuff = Itoa::u32toa_sse2((uint32_t) queryKey, buffer); + *(tmpBuff-1) = '\t'; + const unsigned int queryIdLen = tmpBuff - buffer; + + for (auto& eachTargetMember : clusterRep.memberProteins){ + // if query Proteome's length < target Proteome's length * 0.9, skip + const unsigned int targetProteomeLength = ProteomeList[eachTargetMember.proteomeKey].proteomeAALen; + if (queryProteomeLength < targetProteomeLength * 0.9) { + continue; + } + const unsigned int targetId = eachTargetMember.proteinKey; + unsigned int targetProteomeId = eachTargetMember.proteomeKey; + const unsigned int targetKey = tProteinDB.getDbKey(targetId); + char* targetSeq = tProteinDB.getData(targetId, thread_idx); + target.mapSequence(targetId, targetKey, targetSeq, tProteinDB.getSeqLen(targetId)); + + if (Util::canBeCovered(par.covThr, par.covMode, query.L, target.L) == false) { + continue; + }; + + const bool isIdentity = (queryId == targetId && par.includeIdentity) ? true : false; + Matcher::result_t result = matcher.getSWResult(&target, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, isIdentity); + + if (Alignment::checkCriteria(result, isIdentity, par.evalThr, par.seqIdThr, par.alnLenThr, par.covMode, par.covThr)) { + // Placeholder for writing results, uncomment and implement as needed + // size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace); + // protRedunWriter.writeAdd(buffer, len, thread_idx); + // ProteomeList[targetProteomeId].addSeqId(result.getSeqId()); + size_t len = Matcher::resultToBuffer(tmpBuff, result, par.addBacktrace); + proteinAlignWriter.writeAdd(buffer, queryIdLen+len, thread_idx); + localSeqIds[targetProteomeId] += result.getSeqId()*target.L; + } + } + + } +} + +bool updateProteomeList(std::vector& ProteomeList, const unsigned int& RepProteomeId){ + bool isRepSingleton = true; + + #pragma omp for schedule(dynamic, 1) + for (size_t i = 0; i < ProteomeList.size(); i++) { + if (ProteomeList[i].isCovered() == false) { + if (i == RepProteomeId){ + ProteomeList[i].repProtKey = RepProteomeId; + ProteomeList[i].protSimScore = 1.0; + }else{ + ProteomeList[i].computeRedundancy(); + if (ProteomeList[i].getprotSimScore() >= 0.9) { + ProteomeList[i].repProtKey = RepProteomeId; + isRepSingleton = false; + }else{ + ProteomeList[i].resetProteomeInfo(); + } + } + } + } + return isRepSingleton; +} + +void updateClusterReps(std::vector& clusterReps, std::vector& ProteomeList, std::vector& localMemCount){ + #pragma omp parallel for schedule(dynamic, 1) + for (size_t i = 0; i < clusterReps.size(); i++) { + if (clusterReps[i].isAvailable) { + bool allMemberCovered = true; + unsigned int notCoveredCount = 0; + unsigned int lastProteomeKey = 0; + for (auto& eachMember : clusterReps[i].memberProteins) { + if (ProteomeList[eachMember.proteomeKey].isCovered() == false) { + notCoveredCount++; + lastProteomeKey = eachMember.proteomeKey; + allMemberCovered = false; + localMemCount[eachMember.proteomeKey]++; + } + } + if (allMemberCovered) { + clusterReps[i].resetClusterInfo(); + } + + if (notCoveredCount == 1) { //singleton Cluster + localMemCount[lastProteomeKey]--; + clusterReps[i].resetClusterInfo(); + } + } + } + +} + +size_t resultToBuffer(const std::string& proteomeName, const std::string& repProtName, float similarityScore, char* buffer){ + char* basePos = buffer; + std::strncpy(buffer, proteomeName.c_str(), proteomeName.length()); + buffer += proteomeName.length()+1; + *(buffer-1) = '\t'; // 탭 구분자 추가 + + // repProtName을 버퍼에 복사 + std::strncpy(buffer, repProtName.c_str(), repProtName.length()); + buffer += repProtName.length()+1; + *(buffer-1) = '\t'; // 탭 구분자 추가 + + // similarityScore를 버퍼에 추가 (Util::fastSeqIdToBuffer 사용) + buffer = Util::fastSeqIdToBuffer(similarityScore, buffer); + *(buffer-1) = '\n'; // 줄바꿈 추가 + // *(buffer) = '\0'; // 문자열 끝 추가 + return buffer - basePos; +} + +int alignproteome(int argc, const char **argv, const Command &command){ + //Initialize parameters + Parameters &par = Parameters::getInstance(); + par.overrideParameterDescription(par.PARAM_ALIGNMENT_MODE, "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id", NULL, 0); + par.parseParameters(argc, argv, command, true, 0, 0); + + if (par.alignmentMode == Parameters::ALIGNMENT_MODE_UNGAPPED) { + Debug(Debug::ERROR) << "Use rescorediagonal for ungapped alignment mode.\n"; + EXIT(EXIT_FAILURE); + } + if (par.addBacktrace == true) { + par.alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; + } + + unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr); + + //Open the target protein database + DBReader tProteinDB(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX|DBReader::USE_LOOKUP|DBReader::USE_SOURCE); + tProteinDB.open(DBReader::NOSORT); + const int tProteinSeqType = tProteinDB.getDbtype(); + + if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) { + tProteinDB.readMmapedDataInMemory(); + } + //Open lookup table to get the source of the protein and the protein length + DBReader::LookupEntry* tLookup = tProteinDB.getLookup(); + const size_t tLookupSize = tProteinDB.getLookupSize(); + unsigned int totalProteomeNumber = tLookup[tLookupSize - 1].fileNumber; + std::vector ProteomeList(totalProteomeNumber + 1); + calculateProteomeLength(ProteomeList, tLookup, tLookupSize, tProteinDB); + + //For Debug + // for (size_t i=0; i < ProteomeList.size(); i++) { + // Debug(Debug::INFO) << "Proteome " << i << " has " << ProteomeList[i].proteomeAALen << " amino acids.\n"; + // } + + //Open the linclust result + DBReader linResDB(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX); + linResDB.open(DBReader::LINEAR_ACCCESS); + + //Open the DBWriter + DBWriter protRedunWriter(par.db3.c_str(), par.db3Index.c_str(), 1, par.compressed, Parameters::DBTYPE_GENERIC_DB); + protRedunWriter.open(); + DBWriter proteinAlignWriter(par.db4.c_str(), par.db4Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_GENERIC_DB); + proteinAlignWriter.open(); + + std::vector clusterReps; + + int gapOpen, gapExtend; + BaseMatrix *subMat; + subMat = new SubstitutionMatrix(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias); + gapOpen = par.gapOpen.values.aminoacid(); + gapExtend = par.gapExtend.values.aminoacid(); + EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), subMat, gapOpen, gapExtend); + + Debug(Debug::INFO) << "Start Initialization\n"; + #pragma omp parallel + { + unsigned int thread_idx = 0; + #ifdef OPENMP + thread_idx = (unsigned int) omp_get_thread_num(); + #endif + std::vector localClusterReps; + std::vector localMemCount(ProteomeList.size(), 0); + + #pragma omp for schedule(dynamic, 1) + for (size_t id = 0; id < linResDB.getSize(); id++) { + initLocalClusterReps(id, localClusterReps, localMemCount, tProteinDB, linResDB, thread_idx); + } + + #pragma omp critical + { + for (size_t idx=0; idx < localMemCount.size(); idx++){ + ProteomeList[idx].incrementMemCount(localMemCount[idx]); + } + clusterReps.insert(clusterReps.end(), + std::make_move_iterator(localClusterReps.begin()), + std::make_move_iterator(localClusterReps.end())); + } + + } + + unsigned int RepProteomeId = -1; + Debug(Debug::INFO) << "Run Alignment\n"; + while (FindNextRepProteome(ProteomeList, RepProteomeId)) { + // if (FindNextRepProteome(ProteomeList, RepProteomeId) == false) { + // Debug(Debug::INFO) << "All Proteome is covered. Done.\n"; + // break; + // } + // Debug(Debug::INFO) << "New Rep Found : " << RepProteomeId << "\n"; + // Debug(Debug::INFO) << "1. Run alignment for each cluster\n"; + #pragma omp parallel + { + unsigned int thread_idx = 0; + #ifdef OPENMP + thread_idx = (unsigned int) omp_get_thread_num(); + #endif + Matcher matcher(tProteinSeqType, tProteinSeqType, par.maxSeqLen, subMat, &evaluer, par.compBiasCorrection, par.compBiasCorrectionScale, gapOpen, gapExtend, 0.0, par.zdrop); + Sequence query(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection); + Sequence target(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection); + std::vector localSeqIds(clusterReps.size(), 0.0f); + proteinAlignWriter.writeStart(thread_idx); + #pragma omp for schedule(dynamic, 1) + for (size_t i = 0; i < clusterReps.size(); i++) { + if (clusterReps[i].isAvailable) { + runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, ProteomeList, par, thread_idx, swMode, localSeqIds, proteinAlignWriter); + } + + } + proteinAlignWriter.writeEnd(RepProteomeId, thread_idx); //gg + #pragma omp critical + { + for (size_t idx = 0; idx < ProteomeList.size(); idx++) { + ProteomeList[idx].addSeqId(localSeqIds[idx]); + } + } + } + + // Debug(Debug::INFO) << "2. Update ProteomeDB. Calculate the similarity score and check redundancy | Rep Proteome id : " << RepProteomeId << "\n"; + + bool isRepSingleton = updateProteomeList(ProteomeList, RepProteomeId); + + if (isRepSingleton) { + ProteomeList[RepProteomeId].repProtKey = RepProteomeId; + ProteomeList[RepProteomeId].protSimScore = 1.0; + } + + // Debug(Debug::INFO) << "3. Re-Setup Proteome and ClusterReps\n"; + std::vector localMemCount(ProteomeList.size(), 0); + + updateClusterReps(clusterReps, ProteomeList, localMemCount); + + #pragma omp critical + { + for (size_t i = 0; i < ProteomeList.size(); i++) { + ProteomeList[i].memCount += localMemCount[i]; + } + } + } + // Debug(Debug::INFO) << "4. Write ProteomeList to file\n"; + char protRedunBuffer[1024]; //todo - How to calculate buffer size? + protRedunWriter.writeStart(); + for (size_t idx=0; idx < ProteomeList.size(); idx++){ + std::string proteomeName = tProteinDB.getSourceFileName(idx); + std::string repProtName = tProteinDB.getSourceFileName(ProteomeList[idx].getRepProtKey()); + float similarityScore = ProteomeList[idx].getprotSimScore(); + // Debug(Debug::INFO) << "Proteome " << idx << " : " << proteomeName << " has similarity score : " << similarityScore << " with " << repProtName << "\n"; + size_t result = resultToBuffer(proteomeName, repProtName, similarityScore,protRedunBuffer); + protRedunWriter.writeAdd(protRedunBuffer, result); + } + protRedunWriter.writeEnd(1); //Don't need index file (todo) + + protRedunWriter.close(); + proteinAlignWriter.close(); + tProteinDB.close(); + delete subMat; + linResDB.close(); + return EXIT_SUCCESS; +} \ No newline at end of file From 12c0c81bd66b3ab9cc077fef07728d5a2ba3cde5 Mon Sep 17 00:00:00 2001 From: Gyuuul2 Date: Mon, 19 Aug 2024 17:51:44 +0900 Subject: [PATCH 02/31] add workflow, revise based on feedback --- data/workflow/CMakeLists.txt | 1 + data/workflow/easyalignproteome.sh | 53 ++++++++++++++++++++++++++++++ src/CommandDeclarations.h | 1 + src/MMseqsBase.cpp | 10 ++++-- src/alignment/Matcher.h | 1 - src/commons/CMakeLists.txt | 2 -- src/commons/ClusterSpecies.cpp | 32 ------------------ src/commons/ClusterSpecies.h | 42 ----------------------- src/commons/DBReader.cpp | 17 ---------- src/commons/DBReader.h | 4 +-- src/commons/DBWriter.cpp | 1 - src/commons/Parameters.cpp | 6 +++- src/commons/Parameters.h | 1 + src/util/alignall.cpp | 2 -- src/util/alignproteome.cpp | 10 +++--- src/workflow/CMakeLists.txt | 1 + src/workflow/EasyAlignproteome.cpp | 41 +++++++++++++++++++++++ 17 files changed, 117 insertions(+), 108 deletions(-) create mode 100644 data/workflow/easyalignproteome.sh delete mode 100644 src/commons/ClusterSpecies.cpp delete mode 100644 src/commons/ClusterSpecies.h create mode 100644 src/workflow/EasyAlignproteome.cpp diff --git a/data/workflow/CMakeLists.txt b/data/workflow/CMakeLists.txt index f002a27db..3e94e1817 100644 --- a/data/workflow/CMakeLists.txt +++ b/data/workflow/CMakeLists.txt @@ -3,6 +3,7 @@ set(GENERATED_WORKFLOWS workflow/easycluster.sh workflow/easytaxonomy.sh workflow/easyrbh.sh + workflow/easyalignproteome.sh workflow/blastp.sh workflow/blastpgp.sh workflow/map.sh diff --git a/data/workflow/easyalignproteome.sh b/data/workflow/easyalignproteome.sh new file mode 100644 index 000000000..b27b8187a --- /dev/null +++ b/data/workflow/easyalignproteome.sh @@ -0,0 +1,53 @@ +#!/bin/sh -e +fail() { + echo "Error: $1" + exit 1 +} + +notExists() { + [ ! -f "$1" ] +} + + +if notExists "${TMP_PATH}/input.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" createdb "$@" "${TMP_PATH}/input" ${CREATEDB_PAR} \ + || fail "query createdb died" +fi + +if notExists "${TMP_PATH}/clu.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" linclust "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/clu_tmp" ${CLUSTER_PAR} \ + || fail "Search died" +fi + +if notExists "${TMP_PATH}/aln.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" alignproteome "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" ${ALIGNPROTEOME_PAR} \ + || fail "Convert Alignments died" +fi + +if notExists "${RESULTS}_protein_cluster.tsv"; then + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/clu" "${RESULTS}_protein_cluster.tsv" ${THREADS_PAR} \ + || fail "createtsv protein died" +fi + +if notExists "${RESULTS}_proteome_cluster.tsv"; then + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome" "${RESULTS}_proteome_cluster.tsv" ${THREADS_PAR} \ + || fail "createtsv proteome died" +fi + +if [ -n "${REMOVE_TMP}" ]; then + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/clu" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY_PAR} + rm -rf "${TMP_PATH}/clu_tmp" + rm -f "${TMP_PATH}/easyproteomecluster.sh" +fi diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h index 369034025..04fbd521d 100644 --- a/src/CommandDeclarations.h +++ b/src/CommandDeclarations.h @@ -44,6 +44,7 @@ extern int diffseqdbs(int argc, const char **argv, const Command& command); extern int easycluster(int argc, const char **argv, const Command& command); extern int easyrbh(int argc, const char **argv, const Command& command); extern int easylinclust(int argc, const char **argv, const Command& command); +extern int easyalignproteome(int argc, const char **argv, const Command& command); extern int easysearch(int argc, const char **argv, const Command& command); extern int easylinsearch(int argc, const char **argv, const Command& command); extern int tsv2exprofiledb(int argc, const char **argv, const Command& command); diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index e91d6d0a6..3b64eb556 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -79,6 +79,12 @@ std::vector baseCommands = { CITATION_MMSEQS2|CITATION_LINCLUST, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin }, {"clusterPrefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }, {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, + {"easy-alignproteome", easyalignproteome, &par.easyalignproteome, COMMAND_EASY, + "Proteome clustering and alignemnt", + " ... " + // CITATION_MMSEQS2, + + }, {"easy-taxonomy", easytaxonomy, &par.easytaxonomy, COMMAND_EASY, "Taxonomic classification", "# Assign taxonomic labels to FASTA sequences\n" @@ -633,9 +639,9 @@ std::vector baseCommands = { {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}}, {"alignproteome", alignproteome, &par.alignproteome, COMMAND_ALIGNMENT, - "Within-result all-vs-all gapped local alignment", + "Proteome clustering and alignment", NULL, - "Martin Steinegger ", + "Martin Steinegger & Gyuri Kim ", " ", CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, diff --git a/src/alignment/Matcher.h b/src/alignment/Matcher.h index a89ae9c5a..6e1d0ab8f 100644 --- a/src/alignment/Matcher.h +++ b/src/alignment/Matcher.h @@ -221,7 +221,6 @@ class Matcher{ static size_t resultToBuffer(char * buffer, const result_t &result, bool addBacktrace, bool compress = true, bool addOrfPosition = false); - static size_t resultToBuffer_str(char * buffer, const result_t &result, bool addBacktrace, bool compress, bool addOrfPosition=false); static int computeAlnLength(int anEnd, int start, int dbEnd, int dbStart); static void updateResultByRescoringBacktrace(const char *querySeq, const char *targetSeq, const char **subMat, EvalueComputation &evaluer, diff --git a/src/commons/CMakeLists.txt b/src/commons/CMakeLists.txt index a58b5322a..dc7ab3e74 100644 --- a/src/commons/CMakeLists.txt +++ b/src/commons/CMakeLists.txt @@ -8,7 +8,6 @@ set(commons_header_files commons/Command.h commons/CommandCaller.h commons/Concat.h - # commons/ClusterSpecies.h commons/DBConcat.h commons/DBReader.h commons/DBWriter.h @@ -53,7 +52,6 @@ set(commons_source_files commons/BaseMatrix.cpp commons/Command.cpp commons/CommandCaller.cpp - # commons/ClusterSpecies.cpp commons/DBConcat.cpp commons/DBReader.cpp commons/DBWriter.cpp diff --git a/src/commons/ClusterSpecies.cpp b/src/commons/ClusterSpecies.cpp deleted file mode 100644 index 3216cc210..000000000 --- a/src/commons/ClusterSpecies.cpp +++ /dev/null @@ -1,32 +0,0 @@ -#include "ClusterSpecies.h" -void ClusterSpecies::addProtein(unsigned int speciesId, unsigned int clusterRepId, unsigned int proteinId) { - clusterEntriesPerSpecies[speciesId][clusterRepId].push_back(proteinId); -// speciesClusterMemCounts[speciesId]++; -} - - -void ClusterSpecies::removeSpecies(unsigned int speciesId) { - clusterEntriesPerSpecies.erase(speciesId); - // speciesClusterMemCounts.erase(speciesId); -} - -int ClusterSpecies::findRepSpecies(){ - //find max entry in speciesClusterMemCounts - unsigned int maxSpeciesId = 0; - size_t maxSpeciesSize = 0; - for (auto const& species : clusterEntriesPerSpecies){ - size_t speiciesSize = species.second.size(); - if (speiciesSize > maxSpeciesSize){ - maxSpeciesId = species.first; - } - } - - return maxSpeciesId; - // for (auto const& x : speciesClusterMemCounts) - // { - // if (x.second > maxSpeciesCount){ - // maxSpeciesCount = x.second; - // maxSpeciesId = x.first; - // } - // } -} \ No newline at end of file diff --git a/src/commons/ClusterSpecies.h b/src/commons/ClusterSpecies.h deleted file mode 100644 index 96b1e8688..000000000 --- a/src/commons/ClusterSpecies.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef CLUSTER_SPECIES_H -#define CLUSTER_SPECIES_H - -#include -#include - - -class ClusterSpecies { -public: - - // std::unordered_map>> clusterEntriesPerSpecies; - // // std::unordered_map speciesClusterMemCounts; - // void addProtein(unsigned int speciesId, unsigned int clusterRepId, unsigned int proteinId); - // void removeSpecies(unsigned int speciesId); - // int findRepSpecies(); - - // template - // struct ProteomeLength{ - // T id; - // unsigned int length; - // }; - - // std::vector> proteomeAAToatalLength; - - // void addProteomeLength(unsigned int id, unsigned int length){ - // ProteomeLength pl; - // pl.id = id; - // pl.length = length; - // proteomeAAToatalLength.push_back(pl); - // } - - // unsigned int getProteomeLength(unsigned int id){ - // for (auto const& pl : proteomeAAToatalLength){ - // if (pl.id == id){ - // return pl.length; - // } - // } - // return 0; - // } -}; - -#endif //CLUSTER_SPECIES_H \ No newline at end of file diff --git a/src/commons/DBReader.cpp b/src/commons/DBReader.cpp index 70216cedd..80bca13c5 100644 --- a/src/commons/DBReader.cpp +++ b/src/commons/DBReader.cpp @@ -137,8 +137,6 @@ template bool DBReader::open(int accessType){ } } if (dataMode & USE_LOOKUP || dataMode & USE_LOOKUP_REV) { - Debug(Debug::INFO) << "ReadLookup file: " << dataFileName << "\n"; //gyuri - std::string lookupFilename = (std::string(dataFileName) + ".lookup"); MemoryMapped lookupData(lookupFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); if (lookupData.isValid() == false) { @@ -147,9 +145,7 @@ template bool DBReader::open(int accessType){ } char* lookupDataChar = (char *) lookupData.getData(); size_t lookupDataSize = lookupData.size(); - Debug(Debug::INFO) << "Lookup Data size is " << lookupDataSize << "\n"; //gyuri lookupSize = Util::ompCountLines(lookupDataChar, lookupDataSize, threads); - Debug(Debug::INFO) << "Lookup size is " << lookupSize << "\n"; //gyuri lookup = new(std::nothrow) LookupEntry[this->lookupSize]; incrementMemory(sizeof(LookupEntry) * this->lookupSize); readLookup(lookupDataChar, lookupDataSize, lookup); @@ -1025,19 +1021,6 @@ size_t DBReader::getOffset(size_t id) { return index[id].offset; } -template -size_t DBReader::getIndexLen(size_t id) { - if (id >= size){ - Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << indexFileName << "\n"; - Debug(Debug::ERROR) << "getOffset: local id (" << id << ") >= db size (" << size << ")\n"; - EXIT(EXIT_FAILURE); - } - if (local2id != NULL) { - id = local2id[id]; - } - return index[id].length; -} - template size_t DBReader::findNextOffsetid(size_t id) { size_t idOffset = getOffset(id); diff --git a/src/commons/DBReader.h b/src/commons/DBReader.h index 198fa9ba5..81d28bdf1 100644 --- a/src/commons/DBReader.h +++ b/src/commons/DBReader.h @@ -187,7 +187,7 @@ class DBReader : public MemoryTracker { size_t getSize() const; - unsigned int getProteomeTotalLen(size_t id); //gyuri + unsigned int getProteomeTotalLen(size_t id); unsigned int getMaxSeqLen(){ return (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_HMM_PROFILE ) ) ? @@ -446,8 +446,6 @@ class DBReader : public MemoryTracker { size_t findNextOffsetid(size_t id); - size_t getIndexLen(size_t id); - int isCompressed(){ return isCompressed(dbtype); } diff --git a/src/commons/DBWriter.cpp b/src/commons/DBWriter.cpp index 36aac0eaf..9ae84bae1 100644 --- a/src/commons/DBWriter.cpp +++ b/src/commons/DBWriter.cpp @@ -283,7 +283,6 @@ size_t DBWriter::writeAdd(const char* data, size_t dataSize, unsigned int thrIdx } size_t totalWriten = 0; if(isCompressedDB && (state[thrIdx] == INIT_STATE || state[thrIdx] == COMPRESSED) ) { - Debug(Debug::INFO) << "Compressing data in thread " << thrIdx << "\n"; //gyuri state[thrIdx] = COMPRESSED; // zstd seems to have a hard time with elements < 60 ZSTD_inBuffer input = { data, dataSize, 0 }; diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index ab8ef3ad8..6cb54999b 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -1349,6 +1349,10 @@ Parameters::Parameters(): // easylinclustworkflow easylinclustworkflow = combineList(linclustworkflow, createdb); + // easyalignproteomeworkflow + easyalignproteome = combineList(easylinclustworkflow, alignproteome); + + // clustering workflow clusterworkflow = combineList(prefilter, align); clusterworkflow = combineList(clusterworkflow, rescorediagonal); @@ -2353,7 +2357,7 @@ void Parameters::setDefaults() { maxRejected = INT_MAX; maxAccept = INT_MAX; seqIdThr = 0.0; - proteomeSimThr = 0.0; + proteomeSimThr = 0.9; alnLenThr = 0; altAlignment = 0; gapOpen = MultiParam>(NuclAA(11, 5)); diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 838e0ed69..ee825291e 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -1118,6 +1118,7 @@ class Parameters { std::vector kmersearch; std::vector countkmer; std::vector easylinclustworkflow; + std::vector easyalignproteome; std::vector linclustworkflow; std::vector easysearchworkflow; std::vector searchworkflow; diff --git a/src/util/alignall.cpp b/src/util/alignall.cpp index 3832ef9ee..39f2e199c 100644 --- a/src/util/alignall.cpp +++ b/src/util/alignall.cpp @@ -27,7 +27,6 @@ int alignall(int argc, const char **argv, const Command &command) { } unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr); - // DBReader tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX); DBReader tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX|DBReader::USE_LOOKUP); tdbr.open(DBReader::NOSORT); @@ -58,7 +57,6 @@ int alignall(int argc, const char **argv, const Command &command) { EvalueComputation evaluer(tdbr.getAminoAcidDBSize(), subMat, gapOpen, gapExtend); const size_t flushSize = 100000000; size_t iterations = static_cast(ceil(static_cast(dbr_res.getSize()) / static_cast(flushSize))); - Debug(Debug::INFO) << "Number of iterations: " << iterations << " Size of linclust dbr_res : " << dbr_res.getSize() << '\n'; for (size_t i = 0; i < iterations; i++) { size_t start = (i * flushSize); diff --git a/src/util/alignproteome.cpp b/src/util/alignproteome.cpp index 197955da1..3a7311b81 100644 --- a/src/util/alignproteome.cpp +++ b/src/util/alignproteome.cpp @@ -50,7 +50,7 @@ struct __attribute__((__packed__)) ProteomeEntry{ totalSeqId += seqId; } - void addSeqLength(unsigned int seqLength) { + void addSeqLen(unsigned int seqLength) { proteomeAALen += seqLength; } @@ -64,14 +64,14 @@ struct __attribute__((__packed__)) ProteomeEntry{ } }; -struct __attribute__((__packed__)) memberProteinEntry{ +struct __attribute__((__packed__)) MemberProteinEntry{ unsigned int proteomeKey; unsigned int proteinKey; }; struct ClusterEntry { bool isAvailable; - std::vector memberProteins; + std::vector memberProteins; ClusterEntry() : isAvailable(false) {} @@ -89,7 +89,7 @@ void calculateProteomeLength(std::vector& ProteomeList, DBReader< for (size_t i = 0; i < lookupSize; i++) { const unsigned int ProteomeId = lookup[i].fileNumber; const unsigned int ProteinId = lookup[i].id; - ProteomeList[ProteomeId].addSeqLength(tProteinDB.getIndexLen(ProteinId) - 2); + ProteomeList[ProteomeId].addSeqLen(tProteinDB.getSeqLen(ProteinId)); } } @@ -110,7 +110,7 @@ void initLocalClusterReps(size_t& id, std::vector& localClusterRep for (auto& eachMemberKey : memberKeys){ const unsigned int proteinId = tProteinDB.getId(eachMemberKey); const unsigned int proteomeKey = tProteinDB.getLookupFileNumber(proteinId); - memberProteinEntry mem; + MemberProteinEntry mem; mem.proteomeKey = proteomeKey; mem.proteinKey = proteinId; eachClusterRep.memberProteins.push_back(mem); diff --git a/src/workflow/CMakeLists.txt b/src/workflow/CMakeLists.txt index ce266e9c7..b4e1f0cf1 100644 --- a/src/workflow/CMakeLists.txt +++ b/src/workflow/CMakeLists.txt @@ -14,6 +14,7 @@ set(workflow_source_files workflow/Search.cpp workflow/Taxonomy.cpp workflow/EasyTaxonomy.cpp + workflow/EasyAlignproteome.cpp workflow/CreateIndex.cpp PARENT_SCOPE ) diff --git a/src/workflow/EasyAlignproteome.cpp b/src/workflow/EasyAlignproteome.cpp new file mode 100644 index 000000000..5374da773 --- /dev/null +++ b/src/workflow/EasyAlignproteome.cpp @@ -0,0 +1,41 @@ +#include + +#include "FileUtil.h" +#include "CommandCaller.h" +#include "Util.h" +#include "Debug.h" +#include "Parameters.h" + +#include "easyalignproteome.sh.h" + +void setEasyAlignproteomeDefaults(Parameters *p) { + p->proteomeSimThr = 0.9; +} + +// void setEasyAlignproteomeMustPassAlong(Parameters *p){ + +// } + +int easyalignproteome(int argc, const char **argv, const Command &command) { + Parameters &par = Parameters::getInstance(); + setEasyAlignproteomeDefaults(&par); + par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); + // setEasyAlignproteomeMustPassAlong(&par); + std::string tmpDir = par.filenames.back(); + std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); + if (par.reuseLatest) { + hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); + } + tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); + par.filenames.pop_back(); + + CommandCaller cmd; + cmd.addVariable("ALIGNPROTEOME_PAR", par.createParameterString(par.alignproteome,true).c_str()); // what? + std::string program = tmpDir + "/easyalignproteome.sh"; + FileUtil::writeFile(program, easyalignproteome_sh, easyalignproteome_sh_len); + cmd.execProgram(program.c_str(), par.filenames); + + // Should never get here + assert(false); + return 0; +} From df80f075d49a42e69ec72db0cf5580250f5b82c4 Mon Sep 17 00:00:00 2001 From: Kim Soo Hyun Date: Sun, 25 Aug 2024 18:49:01 +0900 Subject: [PATCH 03/31] Change createdb.cpp so that it takes in ".txt" file containing paths of different fasta files --- src/util/createdb.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) mode change 100644 => 100755 src/util/createdb.cpp diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp old mode 100644 new mode 100755 index 18d2fc7d0..440b2ed06 --- a/src/util/createdb.cpp +++ b/src/util/createdb.cpp @@ -19,6 +19,28 @@ int createdb(int argc, const char **argv, const Command& command) { std::vector filenames(par.filenames); std::string dataFile = filenames.back(); filenames.pop_back(); + if (Util::endsWith(".txt", filenames[0])) { + if (filenames.size() > 1) { + Debug(Debug::ERROR) << "Only one txt file can be given\n"; + EXIT(EXIT_FAILURE); + } + std::string tsv = filenames.back(); + filenames.pop_back(); + + FILE* file = FileUtil::openFileOrDie(tsv.c_str(), "r", true); + char* line = NULL; + size_t len = 0; + ssize_t read; + while ((read = getline(&line, &len, file)) != -1) { + if (line[read - 1] == '\n') { + line[read - 1] = '\0'; + read--; + } + filenames.push_back(line); + } + free(line); + fclose(file); + } for (size_t i = 0; i < filenames.size(); i++) { if (FileUtil::directoryExists(filenames[i].c_str()) == true) { From 3617ea7153b54f7131366660d570d5c68053d58a Mon Sep 17 00:00:00 2001 From: Kim Soo Hyun Date: Mon, 26 Aug 2024 16:11:29 +0900 Subject: [PATCH 04/31] Change filepath format from txt to tsv. Also add explanations of its usage to createdb and easy-search in MMseqsBase.cpp. --- src/MMseqsBase.cpp | 9 ++++++++- src/util/createdb.cpp | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 8325f0d4a..c252ade41 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -11,6 +11,10 @@ std::vector baseCommands = { "Sensitive homology search", "# Search multiple FASTA against FASTA (like BLASTP, TBLASTN, BLASTX, BLASTN --search-type 3, TBLASTX --search-type 2)\n" "mmseqs easy-search examples/QUERY.fasta examples/QUERY.fasta examples/DB.fasta result.m8 tmp\n\n" + "# Search multiple query fasta files against target fasta files using a tsv file containing filepaths\n" + "echo -e \"dir1/QUERY1.fasta\\ndir2/QUERY2.fasta\" > examples/queries.tsv\n" + "echo -e \"dir3/TARGET1.fasta\\ndir4/TARGET2.fasta\" > examples/targets.tsv\n" + "mmseqs easy-search examples/queries.tsv examples/targets.tsv result.m8 tmp\n\n" "# Iterative profile search from stdin (like PSI-BLAST)\n" "cat examples/QUERY.fasta | mmseqs easy-search stdin examples/DB.fasta result.m8 tmp --num-iterations 2\n\n" "# Profile search against small databases (e.g. PFAM, eggNOG)\n" @@ -125,7 +129,10 @@ std::vector baseCommands = { "# Create a seqDB from stdin\n" "cat seq.fasta | mmseqs createdb stdin sequenceDB\n\n" "# Create a seqDB by indexing existing FASTA/Q (for single line fasta entries only)\n" - "mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n", + "mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n\n" + "# Create a seqDB from a tsv file containing filepaths of multiple FASTA files in each line\n" + "echo -e \"dir1/bacteria.fasta\\ndir2/archea.fasta.gz\" > filepaths.tsv\n" + "mmseqs createdb filepaths.tsv sequenceDB\n", "Martin Steinegger ", " ... | ", CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric }, diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp index 440b2ed06..e1d4ead3d 100755 --- a/src/util/createdb.cpp +++ b/src/util/createdb.cpp @@ -19,9 +19,9 @@ int createdb(int argc, const char **argv, const Command& command) { std::vector filenames(par.filenames); std::string dataFile = filenames.back(); filenames.pop_back(); - if (Util::endsWith(".txt", filenames[0])) { + if (Util::endsWith(".tsv", filenames[0])) { if (filenames.size() > 1) { - Debug(Debug::ERROR) << "Only one txt file can be given\n"; + Debug(Debug::ERROR) << "Only one tsv file can be given\n"; EXIT(EXIT_FAILURE); } std::string tsv = filenames.back(); From 3fc7fcdd7ef5a1cfd551e48e7a24b599e8eb93d6 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Tue, 27 Aug 2024 22:55:08 +0900 Subject: [PATCH 05/31] createtsv - handle source identifier --- src/util/createtsv.cpp | 52 +++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/src/util/createtsv.cpp b/src/util/createtsv.cpp index 7b75c0f9a..e8e87089c 100644 --- a/src/util/createtsv.cpp +++ b/src/util/createtsv.cpp @@ -18,18 +18,34 @@ int createtsv(int argc, const char **argv, const Command &command) { Parameters &par = Parameters::getInstance(); par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); + const bool hasTargetDB = par.filenames.size() > 3; + DBReader *reader; + if (hasTargetDB) { + reader = new DBReader(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + } else { + reader = new DBReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + } + reader->open(DBReader::LINEAR_ACCCESS); + const bool useSourceIdentifier = DBReader::getExtendedDbtype(reader->getDbtype()) & Parameters::DBTYPE_EXTENDED_SRC_IDENTIFIER; bool queryNucs = Parameters::isEqualDbtype(FileUtil::parseDbType(par.db1.c_str()), Parameters::DBTYPE_NUCLEOTIDES); bool targetNucs = Parameters::isEqualDbtype(FileUtil::parseDbType(par.db2.c_str()), Parameters::DBTYPE_NUCLEOTIDES); const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); - int queryHeaderType = (queryNucs) ? IndexReader::SRC_HEADERS : IndexReader::HEADERS; + int queryHeaderType = (useSourceIdentifier) ? IndexReader::SOURCE : + (queryNucs) ? IndexReader::SRC_HEADERS : IndexReader::HEADERS; queryHeaderType = (par.idxSeqSrc == 0) ? queryHeaderType : (par.idxSeqSrc == 1) ? IndexReader::HEADERS : IndexReader::SRC_HEADERS; - IndexReader qDbrHeader(par.db1, par.threads, queryHeaderType, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0); + unsigned int preloadMode = (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0; + unsigned int dataMode = (useSourceIdentifier) + ? (DBReader::USE_INDEX | DBReader::USE_DATA | DBReader::USE_SOURCE) + : (DBReader::USE_INDEX | DBReader::USE_DATA); + IndexReader qDbrHeader(par.db1, par.threads, queryHeaderType, preloadMode, dataMode); + + // IndexReader qDbrHeader(par.db1, par.threads, queryHeaderType, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0); IndexReader * tDbrHeader=NULL; DBReader * queryDB = qDbrHeader.sequenceReader; DBReader * targetDB = NULL; bool sameDB = (par.db2.compare(par.db1) == 0); - const bool hasTargetDB = par.filenames.size() > 3; + DBReader::Index * qHeaderIndex = qDbrHeader.sequenceReader->getIndex(); DBReader::Index * tHeaderIndex = NULL; @@ -49,23 +65,13 @@ int createtsv(int argc, const char **argv, const Command &command) { } } - DBReader *reader; - if (hasTargetDB) { - - reader = new DBReader(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); - } else { - - reader = new DBReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); - } - reader->open(DBReader::LINEAR_ACCCESS); - const std::string& dataFile = hasTargetDB ? par.db4 : par.db3; const std::string& indexFile = hasTargetDB ? par.db4Index : par.db3Index; const bool shouldCompress = par.dbOut == true && par.compressed == true; const int dbType = par.dbOut == true ? Parameters::DBTYPE_GENERIC_DB : Parameters::DBTYPE_OMIT_FILE; DBWriter writer(dataFile.c_str(), indexFile.c_str(), par.threads, shouldCompress, dbType); writer.open(); - + const size_t targetColumn = (par.targetTsvColumn == 0) ? SIZE_T_MAX : par.targetTsvColumn - 1; #pragma omp parallel { @@ -84,6 +90,10 @@ int createtsv(int argc, const char **argv, const Command &command) { for (size_t i = 0; i < reader->getSize(); ++i) { unsigned int queryKey = reader->getDbKey(i); size_t queryIndex = queryDB->getId(queryKey); + size_t querySourceIndex = SIZE_T_MAX; + if (useSourceIdentifier){ + querySourceIndex = queryDB->getSourceKey(queryKey); + } char *headerData = queryDB->getData(queryIndex, thread_idx); if (headerData == NULL) { @@ -92,7 +102,10 @@ int createtsv(int argc, const char **argv, const Command &command) { } std::string queryHeader; - if (par.fullHeader) { + if (useSourceIdentifier){ + queryHeader = queryDB->getSourceFileName(querySourceIndex); + } + else if (par.fullHeader) { queryHeader = "\""; queryHeader.append(headerData, qHeaderIndex[queryIndex].length - 2); queryHeader.append("\""); @@ -118,12 +131,19 @@ int createtsv(int argc, const char **argv, const Command &command) { } else if (hasTargetDB) { unsigned int targetKey = (unsigned int) strtoul(dbKey, NULL, 10); size_t targetIndex = targetDB->getId(targetKey); + size_t targetSourceIdex = SIZE_T_MAX; + if (useSourceIdentifier){ + targetSourceIdex = targetDB->getSourceKey(targetKey); + } char *targetData = targetDB->getData(targetIndex, thread_idx); if (targetData == NULL) { Debug(Debug::WARNING) << "Invalid header entry in query " << queryKey << " and target " << targetKey << "!\n"; continue; } - if (par.fullHeader) { + if (useSourceIdentifier){ + targetAccession = targetDB->getSourceFileName(targetSourceIdex); + } + else if (par.fullHeader) { targetAccession = "\""; targetAccession.append(targetData, tHeaderIndex[targetIndex].length - 2); targetAccession.append("\""); From 0487a1f289e4549a4eda9bbd00088ca8d3d4fd67 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Tue, 27 Aug 2024 22:56:37 +0900 Subject: [PATCH 06/31] proteomcluster - change writer / way to select repProteome --- src/util/proteomecluster.cpp | 492 +++++++++++++++++++++++++++++++++++ 1 file changed, 492 insertions(+) create mode 100644 src/util/proteomecluster.cpp diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp new file mode 100644 index 000000000..dcfae8aa3 --- /dev/null +++ b/src/util/proteomecluster.cpp @@ -0,0 +1,492 @@ +#include "Util.h" +#include "Parameters.h" +#include "Matcher.h" +#include "Debug.h" +#include "DBReader.h" +#include "DBWriter.h" +#include "NucleotideMatrix.h" +#include "SubstitutionMatrix.h" +#include "Alignment.h" +#include "itoa.h" + +#ifdef OPENMP +#include +#endif + +struct __attribute__((__packed__)) ProteomeEntry{ + int proteomeKey; + unsigned int proteomeAALen; + int repProtKey; + float protSimScore; + unsigned int memCount; + float totalSeqId; + + ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f) + : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) {} + + void incrementMemCount(unsigned int count) { + memCount += count; + } + void decrementmemCount() { + memCount--; + } + int getRepProtKey() { + return repProtKey; + } + int getProteomeKey() { + return proteomeKey; + } + bool isCovered(){ + if (repProtKey == -1) { + return false; + } + return true; + } + void computeRedundancy() { + protSimScore = totalSeqId / proteomeAALen; + + } + void addSeqId(float seqId) { + totalSeqId += seqId; + } + void addSeqLen(unsigned int seqLength) { + proteomeAALen += seqLength; + } + void resetProteomeInfo(){ + memCount = 0; + totalSeqId = 0.0f; + } + float getProtSimScore() { + return protSimScore; + } + static bool compareByKey(const ProteomeEntry& a, const ProteomeEntry& b) { + + if (a.repProtKey < b.repProtKey){ + return true; + } + if (a.repProtKey > b.repProtKey){ + return false; + } + if (a.proteomeKey == a.repProtKey && b.proteomeKey != b.repProtKey){ + return true; + } + if (a.proteomeKey != a.repProtKey && b.proteomeKey == b.repProtKey){ + return false; + } + return false; + } +}; + +struct __attribute__((__packed__)) MemberProteinEntry{ + unsigned int proteomeKey; + unsigned int proteinKey; +}; + +struct ClusterEntry { + bool isAvailable; + std::vector memberProteins; + + ClusterEntry() : isAvailable(false) {} + + ClusterEntry(size_t memberProteinSize) : isAvailable(true) { + memberProteins.reserve(memberProteinSize); + } + + void resetClusterInfo(){ + memberProteins.clear(); + isAvailable = false; + } +}; + +void calculateProteomeLength(std::vector& ProteomeList, DBReader::LookupEntry* const& lookup, size_t lookupSize, DBReader& tProteinDB) { + for (size_t i = 0; i < lookupSize; i++) { + const unsigned int ProteomeId = lookup[i].fileNumber; + const unsigned int ProteinId = lookup[i].id; + ProteomeList[ProteomeId].addSeqLen(tProteinDB.getSeqLen(ProteinId)); + if (ProteomeList[ProteomeId].proteomeKey == -1){ + ProteomeList[ProteomeId].proteomeKey = ProteomeId; + } + } +} + +void initLocalClusterReps(size_t& id, std::vector& localClusterReps, std::vector& localMemCount, DBReader& tProteinDB, DBReader& linResDB, unsigned int thread_idx){ + std::vector memberKeys; + memberKeys.reserve(50); // store key for every protein in a cluster + + std::vector isProteomeInCluster(localMemCount.size(), false); ; + char buffer[1024 + 32768*4]; + char *clustData = linResDB.getData(id, thread_idx); + + while (*clustData != '\0') { + Util::parseKey(clustData, buffer); + const unsigned int key = (unsigned int) strtoul(buffer, NULL, 10); + memberKeys.push_back(key); + clustData = Util::skipLine(clustData); + } + if (memberKeys.size() > 1) { //If not a singleton cluster + ClusterEntry eachClusterRep(memberKeys.size()); + //init MemberProteinEntry and add it to memberProteins vector + for (auto& eachMemberKey : memberKeys){ + const unsigned int proteinId = tProteinDB.getId(eachMemberKey); + const unsigned int proteomeKey = tProteinDB.getLookupFileNumber(proteinId); + MemberProteinEntry mem; + mem.proteomeKey = proteomeKey; + mem.proteinKey = proteinId; + eachClusterRep.memberProteins.push_back(mem); + isProteomeInCluster[proteomeKey] = true; + } + localClusterReps.push_back(eachClusterRep); + } + + for (size_t i = 0; i < localMemCount.size(); i++) { + if (isProteomeInCluster[i]) { + localMemCount[i]++; + } + } +} + +bool FindNextRepProteome(std::vector& ProteomeList, unsigned int& RepProteomeId) { + bool isAllCovered = true; + + unsigned int maxMemberCount = 0; + unsigned int notCoveredProtCount = 0; + + for (size_t idx = 0; idx < ProteomeList.size(); idx++){ + if (ProteomeList[idx].isCovered()) { + continue; + }else{ + isAllCovered = false; + notCoveredProtCount++; + if (ProteomeList[idx].memCount > maxMemberCount) { + maxMemberCount = ProteomeList[idx].memCount; + RepProteomeId = idx; + } + } + } + + if (isAllCovered){ + return false; + }else if (notCoveredProtCount == 1) { + //last one and it is singleton + ProteomeList[RepProteomeId].repProtKey = RepProteomeId; //todo + ProteomeList[RepProteomeId].protSimScore = 1.0; + return false; + }else{ + return true; + } +} + +void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector& ProteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector& localSeqIds, DBWriter& proteinClustWriter) { + char buffer[1024]; + bool isRepFound = false; + unsigned int lastqLen = 0; + unsigned int qproteinKey = 0; + unsigned int qproteomeKey =0; + //find Rep + for (auto& eachMember : clusterRep.memberProteins){ + if (eachMember.proteomeKey == RepProteomeId) { + isRepFound = true; + const unsigned int queryId = eachMember.proteinKey; + if( lastqLen < tProteinDB.getSeqLen(queryId)){ + lastqLen = tProteinDB.getSeqLen(queryId); + qproteinKey = eachMember.proteinKey; + qproteomeKey = eachMember.proteomeKey; + } + } + } + if (isRepFound){ + proteinClustWriter.writeStart(thread_idx); + const unsigned int queryId = qproteinKey; + const unsigned int queryKey = tProteinDB.getDbKey(queryId); + char* querySeq = tProteinDB.getData(queryId, thread_idx); + query.mapSequence(queryId, queryKey, querySeq, tProteinDB.getSeqLen(queryId)); + matcher.initQuery(&query); + const unsigned int queryProteomeLength = ProteomeList[qproteomeKey].proteomeAALen; + + // representative protein :same query and target (for createtsv) + Matcher::result_t rep_reult = matcher.getSWResult(&query, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, true); + size_t rep_len = Matcher::resultToBuffer(buffer, rep_reult, par.addBacktrace); + proteinClustWriter.writeAdd(buffer, rep_len, thread_idx); + localSeqIds[qproteomeKey] += rep_reult.getSeqId()*query.L; + + for (auto& eachTargetMember : clusterRep.memberProteins){ + if (eachTargetMember.proteomeKey == RepProteomeId) { + continue; + } + // if query Proteome's length < target Proteome's length * 0.9, skip + const unsigned int targetProteomeLength = ProteomeList[eachTargetMember.proteomeKey].proteomeAALen; + if (queryProteomeLength < targetProteomeLength * 0.9) { + continue; + } + const unsigned int targetId = eachTargetMember.proteinKey; + unsigned int targetProteomeId = eachTargetMember.proteomeKey; + const unsigned int targetKey = tProteinDB.getDbKey(targetId); + char* targetSeq = tProteinDB.getData(targetId, thread_idx); + target.mapSequence(targetId, targetKey, targetSeq, tProteinDB.getSeqLen(targetId)); + + if (Util::canBeCovered(par.covThr, par.covMode, query.L, target.L) == false) { + continue; + }; + + const bool isIdentity = (queryId == targetId && par.includeIdentity) ? true : false; + Matcher::result_t result = matcher.getSWResult(&target, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, isIdentity); + + if (Alignment::checkCriteria(result, isIdentity, par.evalThr, par.seqIdThr, par.alnLenThr, par.covMode, par.covThr)) { + if (query.L >= target.L*0.9){ + size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace); + proteinClustWriter.writeAdd(buffer, len, thread_idx); + localSeqIds[targetProteomeId] += result.getSeqId()*target.L; + } + } + } + proteinClustWriter.writeEnd(queryKey, thread_idx); + } +} + +bool updateProteomeList(std::vector& ProteomeList, const unsigned int& RepProteomeId){ + bool isRepSingleton = true; + + // #pragma omp for schedule(dynamic, 1) + for (size_t i = 0; i < ProteomeList.size(); i++) { + if (ProteomeList[i].isCovered() == false) { + if (i == RepProteomeId){ + ProteomeList[i].repProtKey = RepProteomeId; + ProteomeList[i].protSimScore = 1.0; + }else{ + ProteomeList[i].computeRedundancy(); + if (ProteomeList[i].getProtSimScore() >= 0.9) { + ProteomeList[i].repProtKey = RepProteomeId; + isRepSingleton = false; + }else{ + ProteomeList[i].resetProteomeInfo(); + } + } + } + } + return isRepSingleton; +} + +void updateClusterReps(ClusterEntry& clusterRep, std::vector& ProteomeList, std::vector& localMemCount){ + std::vector isProteomeInCluster(localMemCount.size(), false); + if (clusterRep.isAvailable) { + bool isAllMemberCovered = true; + unsigned int notCoveredCount = 0; + unsigned int lastProteomeKey = 0; + //update cluster member info + for (auto& eachMember : clusterRep.memberProteins) { + if (ProteomeList[eachMember.proteomeKey].isCovered() == false) { + notCoveredCount++; + lastProteomeKey = eachMember.proteomeKey; + isAllMemberCovered = false; + isProteomeInCluster[eachMember.proteomeKey] = true; + } + } + if (isAllMemberCovered) { + clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false; + } + if (notCoveredCount == 1) { //singleton Cluster + isProteomeInCluster[lastProteomeKey] = false; + clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false; + } + } + + //update localMemCount + if (clusterRep.isAvailable) { + for (size_t i=0; i < localMemCount.size(); i++) { + if (isProteomeInCluster[i]) { + localMemCount[i]++; + } + } + } + +} + + +void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector &ProteomeList) { + std::vector proteomeClusterIdx; + char proteomeBuffer[1024]; + int repProtIdCluster = ProteomeList[0].getRepProtKey(); + + for (size_t idx = 0; idx < ProteomeList.size(); idx++) { + int repProtId = ProteomeList[idx].getRepProtKey(); + if (repProtIdCluster != repProtId) { + proteomeClustWriter.writeStart(); + for (auto &eachIdx : proteomeClusterIdx) { + char *basePos = proteomeBuffer; + char *tmpProteomeBuffer = Itoa::i32toa_sse2(ProteomeList[eachIdx].getProteomeKey(), proteomeBuffer); + *(tmpProteomeBuffer - 1) = '\t'; + tmpProteomeBuffer = Util::fastSeqIdToBuffer(ProteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer); + *(tmpProteomeBuffer - 1) = '\n'; + proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos); + } + proteomeClustWriter.writeEnd(repProtIdCluster); + // Reset + repProtIdCluster = repProtId; + proteomeClusterIdx.clear(); + proteomeClusterIdx.push_back(idx); + } else { + proteomeClusterIdx.push_back(idx); + } + + if (idx == ProteomeList.size() - 1) { + proteomeClustWriter.writeStart(); + for (auto &eachIdx : proteomeClusterIdx) { + char *basePos = proteomeBuffer; + char *tmpProteomeBuffer = Itoa::i32toa_sse2(ProteomeList[eachIdx].getProteomeKey(), proteomeBuffer); + *(tmpProteomeBuffer - 1) = '\t'; + tmpProteomeBuffer = Util::fastSeqIdToBuffer(ProteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer); + *(tmpProteomeBuffer - 1) = '\n'; + proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos); + } + proteomeClustWriter.writeEnd(repProtIdCluster); + proteomeClusterIdx.clear(); + } + } +} + + +int proteomecluster(int argc, const char **argv, const Command &command){ + //Initialize parameters + Parameters &par = Parameters::getInstance(); + par.overrideParameterDescription(par.PARAM_ALIGNMENT_MODE, "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id", NULL, 0); + par.parseParameters(argc, argv, command, true, 0, 0); + + if (par.alignmentMode == Parameters::ALIGNMENT_MODE_UNGAPPED) { + Debug(Debug::ERROR) << "Use rescorediagonal for ungapped alignment mode.\n"; + EXIT(EXIT_FAILURE); + } + if (par.addBacktrace == true) { + par.alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; + } + + unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr); + + //Open the target protein database + DBReader tProteinDB(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX|DBReader::USE_LOOKUP|DBReader::USE_SOURCE); + tProteinDB.open(DBReader::NOSORT); + const int tProteinSeqType = tProteinDB.getDbtype(); + + if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) { + tProteinDB.readMmapedDataInMemory(); + } + //Open lookup table to get the source of the protein and the protein length + DBReader::LookupEntry* tLookup = tProteinDB.getLookup(); + const size_t tLookupSize = tProteinDB.getLookupSize(); + unsigned int totalProteomeNumber = tLookup[tLookupSize - 1].fileNumber; + std::vector ProteomeList(totalProteomeNumber + 1); + calculateProteomeLength(ProteomeList, tLookup, tLookupSize, tProteinDB); + + //Open the linclust result + DBReader linResDB(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX); + linResDB.open(DBReader::LINEAR_ACCCESS); + + //Open the DBWriter + DBWriter proteinClustWriter(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_GENERIC_DB); + proteinClustWriter.open(); + int proteomeDBType = DBReader::setExtendedDbtype(Parameters::DBTYPE_GENERIC_DB, Parameters::DBTYPE_EXTENDED_SRC_IDENTIFIER); + DBWriter proteomeClustWriter(par.db4.c_str(), par.db4Index.c_str(), 1, par.compressed, proteomeDBType); + proteomeClustWriter.open(); + + std::vector clusterReps; + + int gapOpen, gapExtend; + BaseMatrix *subMat; + subMat = new SubstitutionMatrix(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias); + gapOpen = par.gapOpen.values.aminoacid(); + gapExtend = par.gapExtend.values.aminoacid(); + EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), subMat, gapOpen, gapExtend); + + // Debug(Debug::INFO) << "Start Initialization\n"; + #pragma omp parallel + { + unsigned int thread_idx = 0; + #ifdef OPENMP + thread_idx = (unsigned int) omp_get_thread_num(); + #endif + std::vector localClusterReps; + std::vector localMemCount(ProteomeList.size(), 0); + + #pragma omp for schedule(dynamic, 1) + for (size_t id = 0; id < linResDB.getSize(); id++) { + initLocalClusterReps(id, localClusterReps, localMemCount, tProteinDB, linResDB, thread_idx); + } + + #pragma omp critical + { + for (size_t idx=0; idx < localMemCount.size(); idx++){ + ProteomeList[idx].incrementMemCount(localMemCount[idx]); + } + clusterReps.insert(clusterReps.end(), + std::make_move_iterator(localClusterReps.begin()), + std::make_move_iterator(localClusterReps.end())); + } + + } + + unsigned int RepProteomeId = -1; + // Debug(Debug::INFO) << "Run Alignment\n"; + + while (FindNextRepProteome(ProteomeList, RepProteomeId)) { + #pragma omp parallel + { + unsigned int thread_idx = 0; + #ifdef OPENMP + thread_idx = (unsigned int) omp_get_thread_num(); + #endif + Matcher matcher(tProteinSeqType, tProteinSeqType, par.maxSeqLen, subMat, &evaluer, par.compBiasCorrection, par.compBiasCorrectionScale, gapOpen, gapExtend, 0.0, par.zdrop); + Sequence query(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection); + Sequence target(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection); + std::vector localSeqIds(clusterReps.size(), 0.0f); + #pragma omp for schedule(dynamic, 1) + for (size_t i = 0; i < clusterReps.size(); i++) { + if (clusterReps[i].isAvailable) { + runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, ProteomeList, par, thread_idx, swMode, localSeqIds, proteinClustWriter); + } + + } + #pragma omp critical + { + for (size_t idx = 0; idx < ProteomeList.size(); idx++) { + ProteomeList[idx].addSeqId(localSeqIds[idx]); + } + } + } + + // Debug(Debug::INFO) << "2. Update ProteomeDB. Calculate the similarity score and check redundancy | Rep Proteome id : " << RepProteomeId << "\n"; + bool isRepSingleton = updateProteomeList(ProteomeList, RepProteomeId); + + if (isRepSingleton) { + ProteomeList[RepProteomeId].repProtKey = RepProteomeId; + ProteomeList[RepProteomeId].protSimScore = 1.0; + } + + // Debug(Debug::INFO) << "3. Re-Setup Proteome and ClusterReps\n"; + #pragma omp parallel + { + std::vector localMemCount(ProteomeList.size(), 0); + #pragma omp for schedule(dynamic, 1) + for (size_t i = 0; i < clusterReps.size(); i++){ + updateClusterReps(clusterReps[i], ProteomeList, localMemCount); + } + + #pragma omp critical + { + for (size_t i = 0; i < ProteomeList.size(); i++) { + ProteomeList[i].incrementMemCount(localMemCount[i]); + } + } + } + } + //sort ProteomeList by repProtKey + SORT_PARALLEL(ProteomeList.begin(), ProteomeList.end(), ProteomeEntry::compareByKey); + + //write result of proteome clustering + writeProteomeClusters(proteomeClustWriter, ProteomeList); + + proteomeClustWriter.close(); + proteinClustWriter.close(); + tProteinDB.close(); + delete subMat; + linResDB.close(); + return EXIT_SUCCESS; +} \ No newline at end of file From 3f4fc91338f92616fd4b528181f154116cb91740 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Tue, 27 Aug 2024 22:58:23 +0900 Subject: [PATCH 07/31] easy proteomecluster workflow --- data/workflow/easyproteomecluster.sh | 64 ++++++++++++++++++++++++++++ src/workflow/EasyProteomecluster.cpp | 52 ++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 data/workflow/easyproteomecluster.sh create mode 100644 src/workflow/EasyProteomecluster.cpp diff --git a/data/workflow/easyproteomecluster.sh b/data/workflow/easyproteomecluster.sh new file mode 100644 index 000000000..05b4af940 --- /dev/null +++ b/data/workflow/easyproteomecluster.sh @@ -0,0 +1,64 @@ +#!/bin/sh -e +fail() { + echo "Error: $1" + exit 1 +} + +notExists() { + [ ! -f "$1" ] +} + + +if notExists "${TMP_PATH}/input.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" createdb "$@" "${TMP_PATH}/input" ${CREATEDB_PAR} \ + || fail "query createdb died" +fi + +if notExists "${TMP_PATH}/clu.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" linclust "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/clu_tmp" ${CLUSTER_PAR} \ + || fail "Search died" +fi + +if notExists "${TMP_PATH}/aln.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" ${PROTEOMECLUSTER_PAR} \ + || fail "Convert Alignments died" +fi + +if notExists "${RESULTS}_protein_cluster.tsv"; then + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/clu" "${RESULTS}_protein_cluster.tsv" ${THREADS_PAR} \ + || fail "createtsv protein cluster died" +fi + +if notExists "${RESULTS}_protein_align.tsv"; then + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_protein" "${RESULTS}_protein_align.tsv" ${THREADS_PAR} \ + || fail "createtsv protein align died" +fi + +if notExists "${RESULTS}_proteome_cluster.tsv"; then + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome" "${RESULTS}_proteome_cluster.tsv" ${THREADS_PAR} \ + || fail "createtsv proteome cluster died" +fi + + +if [ -n "${REMOVE_TMP}" ]; then + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/clu" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/aln_protein" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/aln_proteome" ${VERBOSITY_PAR} + rm -rf "${TMP_PATH}/clu_tmp" + rm -f "${TMP_PATH}/easyproteomecluster.sh" +fi diff --git a/src/workflow/EasyProteomecluster.cpp b/src/workflow/EasyProteomecluster.cpp new file mode 100644 index 000000000..c30b4209a --- /dev/null +++ b/src/workflow/EasyProteomecluster.cpp @@ -0,0 +1,52 @@ +#include + +#include "FileUtil.h" +#include "CommandCaller.h" +#include "Util.h" +#include "Debug.h" +#include "Parameters.h" + +#include "easyproteomecluster.sh.h" + +void setEasyproteomeclusterDefaults(Parameters *p) { + p->proteomeSimThr = 0.9; +} + +void setEasyproteomeclusterMustPassAlong(Parameters *p){ + p->PARAM_REMOVE_TMP_FILES.wasSet = true; + +} + +int easyproteomecluster(int argc, const char **argv, const Command &command) { + Parameters &par = Parameters::getInstance(); + setEasyproteomeclusterDefaults(&par); + par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); + setEasyproteomeclusterMustPassAlong(&par); + std::string tmpDir = par.filenames.back(); + std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); + if (par.reuseLatest) { + hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); + } + tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); + par.filenames.pop_back(); + + CommandCaller cmd; + cmd.addVariable("TMP_PATH", tmpDir.c_str()); + cmd.addVariable("RESULTS", par.filenames.back().c_str()); + par.filenames.pop_back(); + cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); + + cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.createdb).c_str()); + cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.clusterworkflow, true).c_str()); + cmd.addVariable("PROTEOMECLUSTER_PAR", par.createParameterString(par.proteomecluster).c_str()); + cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); + cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); + + std::string program = tmpDir + "/easyproteomecluster.sh"; + FileUtil::writeFile(program, easyproteomecluster_sh, easyproteomecluster_sh_len); + cmd.execProgram(program.c_str(), par.filenames); + + // Should never get here + assert(false); + return 0; +} From eeabad3b995f21f874d14dd6a40f1ff5b24bad98 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Tue, 27 Aug 2024 22:59:09 +0900 Subject: [PATCH 08/31] indexreader - handle source identifier --- src/commons/IndexReader.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/commons/IndexReader.h b/src/commons/IndexReader.h index 17d637f12..0ad0ca15e 100644 --- a/src/commons/IndexReader.h +++ b/src/commons/IndexReader.h @@ -22,7 +22,7 @@ class IndexReader { ) : sequenceReader(NULL), index(NULL) { int targetDbtype = FileUtil::parseDbType(dataName.c_str()); if (Parameters::isEqualDbtype(targetDbtype, Parameters::DBTYPE_INDEX_DB)) { - index = new DBReader(dataName.c_str(), (dataName + ".index").c_str(), 1, DBReader::USE_DATA|DBReader::USE_INDEX); + new DBReader(dataName.c_str(), (dataName + ".index").c_str(), 1, dataMode); index->open(DBReader::NOSORT); if (PrefilteringIndexReader::checkIfIndexFile(index)) { PrefilteringIndexReader::printSummary(index); @@ -95,6 +95,8 @@ class IndexReader { }else{ failSuffix = "_aln"; } + } else if (databaseType & SOURCE) { + failSuffix = ""; } } sequenceReader = new DBReader( @@ -115,6 +117,7 @@ class IndexReader { static const unsigned int SRC_HEADERS = 4; static const unsigned int SRC_SEQUENCES = 8; static const unsigned int ALIGNMENTS = 16; + static const unsigned int SOURCE = 32; static const unsigned int USER_SELECT = 1 << 31; static unsigned int makeUserDatabaseType(unsigned int baseKey) { From 9996821a421d6cbff77d777450da00a4106ad5d6 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Tue, 27 Aug 2024 23:03:14 +0900 Subject: [PATCH 09/31] update mmseqsbase and change module name into proteomecluster --- src/commons/DBReader.cpp | 9 +++++ src/commons/DBReader.h | 3 +- src/commons/DBWriter.h | 1 + src/commons/Parameters.cpp | 71 +++++++++++++++++++------------------- src/commons/Parameters.h | 7 ++-- 5 files changed, 50 insertions(+), 41 deletions(-) diff --git a/src/commons/DBReader.cpp b/src/commons/DBReader.cpp index 80bca13c5..598d152f6 100644 --- a/src/commons/DBReader.cpp +++ b/src/commons/DBReader.cpp @@ -733,6 +733,15 @@ template std::string DBReader::getSourceFileName (size_t id){ return source[id].fileName; } +template T DBReader::getSourceKey(size_t id){ + if (id >= sourceSize){ + Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".lookup\n"; + Debug(Debug::ERROR) << "getSource id: local id (" << id << ") >= db size (" << sourceSize << ")\n"; + EXIT(EXIT_FAILURE); + } + return source[id].id; +} + template<> void DBReader::lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry) { buffer.append(SSTR(entry.id)); diff --git a/src/commons/DBReader.h b/src/commons/DBReader.h index 81d28bdf1..2c61664e1 100644 --- a/src/commons/DBReader.h +++ b/src/commons/DBReader.h @@ -187,8 +187,6 @@ class DBReader : public MemoryTracker { size_t getSize() const; - unsigned int getProteomeTotalLen(size_t id); - unsigned int getMaxSeqLen(){ return (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_HMM_PROFILE ) ) ? (std::max(maxSeqLen, 1u)) / Sequence::PROFILE_READIN_SIZE : @@ -254,6 +252,7 @@ class DBReader : public MemoryTracker { LookupEntry* getLookup() { return lookup; }; std::string getSourceFileName(size_t id); + T getSourceKey(size_t id); static const int NOSORT = 0; static const int SORT_BY_LENGTH = 1; static const int LINEAR_ACCCESS = 2; diff --git a/src/commons/DBWriter.h b/src/commons/DBWriter.h index 190015cb3..21e5377bb 100644 --- a/src/commons/DBWriter.h +++ b/src/commons/DBWriter.h @@ -66,6 +66,7 @@ class DBWriter : public MemoryTracker { } static void sortIndex(const char *inFileNameIndex, const char *outFileNameIndex, const bool lexicographicOrder); + private: size_t addToThreadBuffer(const void *data, size_t itmesize, size_t nitems, int threadIdx); void writeThreadBuffer(unsigned int idx, size_t dataSize); diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 6cb54999b..a2db3ef47 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -340,7 +340,7 @@ Parameters::Parameters(): alignall.push_back(&PARAM_SUB_MAT); alignall.push_back(&PARAM_ADD_BACKTRACE); alignall.push_back(&PARAM_ALIGNMENT_MODE); - alignall.push_back(&PARAM_REALIGN_SCORE_BIAS); //gyuri + alignall.push_back(&PARAM_REALIGN_SCORE_BIAS); // alignall.push_back(&PARAM_WRAPPED_SCORING); alignall.push_back(&PARAM_E); alignall.push_back(&PARAM_MIN_SEQ_ID); @@ -368,39 +368,38 @@ Parameters::Parameters(): alignall.push_back(&PARAM_COMPRESSED); alignall.push_back(&PARAM_V); - //alignproteome - // alignproteome - alignproteome.push_back(&PARAM_SUB_MAT); - alignproteome.push_back(&PARAM_ADD_BACKTRACE); - alignproteome.push_back(&PARAM_ALIGNMENT_MODE); - alignproteome.push_back(&PARAM_REALIGN_SCORE_BIAS); //gyuri -// alignproteome.push_back(&PARAM_WRAPPED_SCORING); - alignproteome.push_back(&PARAM_E); - alignproteome.push_back(&PARAM_MIN_SEQ_ID); - alignproteome.push_back(&PARAM_MIN_ALN_LEN); - alignproteome.push_back(&PARAM_SEQ_ID_MODE); -// alignproteome.push_back(&PARAM_ALT_ALIGNMENT); - alignproteome.push_back(&PARAM_C); - alignproteome.push_back(&PARAM_COV_MODE); - alignproteome.push_back(&PARAM_MAX_SEQ_LEN); - alignproteome.push_back(&PARAM_NO_COMP_BIAS_CORR); - alignproteome.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE); - alignproteome.push_back(&PARAM_PROTEOME_SIMILARITY); - - alignproteome.push_back(&PARAM_REALIGN); -// alignproteome.push_back(&PARAM_MAX_REJECTED); -// alignproteome.push_back(&PARAM_MAX_ACCEPT); - alignproteome.push_back(&PARAM_INCLUDE_IDENTITY); - alignproteome.push_back(&PARAM_PRELOAD_MODE); - alignproteome.push_back(&PARAM_PCA); - alignproteome.push_back(&PARAM_PCB); - alignproteome.push_back(&PARAM_SCORE_BIAS); - alignproteome.push_back(&PARAM_GAP_OPEN); - alignproteome.push_back(&PARAM_GAP_EXTEND); - alignproteome.push_back(&PARAM_ZDROP); - alignproteome.push_back(&PARAM_THREADS); - alignproteome.push_back(&PARAM_COMPRESSED); - alignproteome.push_back(&PARAM_V); + //proteomecluster + proteomecluster.push_back(&PARAM_SUB_MAT); + proteomecluster.push_back(&PARAM_ADD_BACKTRACE); + proteomecluster.push_back(&PARAM_ALIGNMENT_MODE); + proteomecluster.push_back(&PARAM_REALIGN_SCORE_BIAS); +// proteomecluster.push_back(&PARAM_WRAPPED_SCORING); + proteomecluster.push_back(&PARAM_E); + proteomecluster.push_back(&PARAM_MIN_SEQ_ID); + proteomecluster.push_back(&PARAM_MIN_ALN_LEN); + proteomecluster.push_back(&PARAM_SEQ_ID_MODE); +// proteomecluster.push_back(&PARAM_ALT_ALIGNMENT); + proteomecluster.push_back(&PARAM_C); + proteomecluster.push_back(&PARAM_COV_MODE); + proteomecluster.push_back(&PARAM_MAX_SEQ_LEN); + proteomecluster.push_back(&PARAM_NO_COMP_BIAS_CORR); + proteomecluster.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE); + proteomecluster.push_back(&PARAM_PROTEOME_SIMILARITY); + + proteomecluster.push_back(&PARAM_REALIGN); +// proteomecluster.push_back(&PARAM_MAX_REJECTED); +// proteomecluster.push_back(&PARAM_MAX_ACCEPT); + proteomecluster.push_back(&PARAM_INCLUDE_IDENTITY); + proteomecluster.push_back(&PARAM_PRELOAD_MODE); + proteomecluster.push_back(&PARAM_PCA); + proteomecluster.push_back(&PARAM_PCB); + proteomecluster.push_back(&PARAM_SCORE_BIAS); + proteomecluster.push_back(&PARAM_GAP_OPEN); + proteomecluster.push_back(&PARAM_GAP_EXTEND); + proteomecluster.push_back(&PARAM_ZDROP); + proteomecluster.push_back(&PARAM_THREADS); + proteomecluster.push_back(&PARAM_COMPRESSED); + proteomecluster.push_back(&PARAM_V); // alignment @@ -1349,8 +1348,8 @@ Parameters::Parameters(): // easylinclustworkflow easylinclustworkflow = combineList(linclustworkflow, createdb); - // easyalignproteomeworkflow - easyalignproteome = combineList(easylinclustworkflow, alignproteome); + // easyproteomeclusterworkflow + easyproteomeclusterworkflow = combineList(easylinclustworkflow, proteomecluster); // clustering workflow diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index ee825291e..b23b771cd 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -90,6 +90,7 @@ class Parameters { static const unsigned int DBTYPE_EXTENDED_COMPRESSED = 1; static const unsigned int DBTYPE_EXTENDED_INDEX_NEED_SRC = 2; static const unsigned int DBTYPE_EXTENDED_CONTEXT_PSEUDO_COUNTS = 4; + static const unsigned int DBTYPE_EXTENDED_SRC_IDENTIFIER = 8; // don't forget to add new database types to DBReader::getDbTypeName and Parameters::PARAM_OUTPUT_DBTYPE @@ -434,7 +435,7 @@ class Parameters { int maxAccept; // after n accepted sequences stop int altAlignment; // show up to this many alternative alignments float seqIdThr; // sequence identity threshold for acceptance - float proteomeSimThr; + float proteomeSimThr; // proteome similarity threshold for acceptance int alnLenThr; // min. alignment length bool addBacktrace; // store backtrace string (M=Match, D=deletion, I=insertion) bool realign; // realign hit with more conservative score @@ -1080,7 +1081,7 @@ class Parameters { std::vector threadsandcompression; std::vector alignall; - std::vector alignproteome; + std::vector proteomecluster; std::vector align; std::vector rescorediagonal; std::vector alignbykmer; @@ -1118,7 +1119,7 @@ class Parameters { std::vector kmersearch; std::vector countkmer; std::vector easylinclustworkflow; - std::vector easyalignproteome; + std::vector easyproteomeclusterworkflow; std::vector linclustworkflow; std::vector easysearchworkflow; std::vector searchworkflow; From 27d9f6802746ba802bf1cfa7537fa1ffcb084947 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Wed, 28 Aug 2024 14:16:44 +0900 Subject: [PATCH 10/31] set alignment mode default --- src/workflow/EasyProteomecluster.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/workflow/EasyProteomecluster.cpp b/src/workflow/EasyProteomecluster.cpp index c30b4209a..f2c2a2ae1 100644 --- a/src/workflow/EasyProteomecluster.cpp +++ b/src/workflow/EasyProteomecluster.cpp @@ -9,11 +9,13 @@ #include "easyproteomecluster.sh.h" void setEasyproteomeclusterDefaults(Parameters *p) { + p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; p->proteomeSimThr = 0.9; } void setEasyproteomeclusterMustPassAlong(Parameters *p){ p->PARAM_REMOVE_TMP_FILES.wasSet = true; + p->PARAM_ALIGNMENT_MODE.wasSet = true; } From e2ef2290befcd032ae942372a895bcfb2f414232 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Wed, 28 Aug 2024 17:32:11 +0900 Subject: [PATCH 11/31] latest --- data/workflow/CMakeLists.txt | 2 +- data/workflow/easyalignproteome.sh | 53 ---- data/workflow/easyproteomecluster.sh | 12 +- src/CommandDeclarations.h | 4 +- src/MMseqsBase.cpp | 32 +- src/commons/Parameters.cpp | 3 +- src/util/CMakeLists.txt | 2 +- src/util/alignproteome.cpp | 443 --------------------------- src/util/proteomecluster.cpp | 123 ++++---- src/workflow/CMakeLists.txt | 2 +- src/workflow/EasyAlignproteome.cpp | 41 --- src/workflow/EasyLinclust.cpp | 1 - 12 files changed, 94 insertions(+), 624 deletions(-) delete mode 100644 data/workflow/easyalignproteome.sh delete mode 100644 src/util/alignproteome.cpp delete mode 100644 src/workflow/EasyAlignproteome.cpp diff --git a/data/workflow/CMakeLists.txt b/data/workflow/CMakeLists.txt index 3e94e1817..3f006a286 100644 --- a/data/workflow/CMakeLists.txt +++ b/data/workflow/CMakeLists.txt @@ -3,7 +3,7 @@ set(GENERATED_WORKFLOWS workflow/easycluster.sh workflow/easytaxonomy.sh workflow/easyrbh.sh - workflow/easyalignproteome.sh + workflow/easyproteomecluster.sh workflow/blastp.sh workflow/blastpgp.sh workflow/map.sh diff --git a/data/workflow/easyalignproteome.sh b/data/workflow/easyalignproteome.sh deleted file mode 100644 index b27b8187a..000000000 --- a/data/workflow/easyalignproteome.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/sh -e -fail() { - echo "Error: $1" - exit 1 -} - -notExists() { - [ ! -f "$1" ] -} - - -if notExists "${TMP_PATH}/input.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" createdb "$@" "${TMP_PATH}/input" ${CREATEDB_PAR} \ - || fail "query createdb died" -fi - -if notExists "${TMP_PATH}/clu.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" linclust "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/clu_tmp" ${CLUSTER_PAR} \ - || fail "Search died" -fi - -if notExists "${TMP_PATH}/aln.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" alignproteome "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" ${ALIGNPROTEOME_PAR} \ - || fail "Convert Alignments died" -fi - -if notExists "${RESULTS}_protein_cluster.tsv"; then - # shellcheck disable=SC2086 - "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/clu" "${RESULTS}_protein_cluster.tsv" ${THREADS_PAR} \ - || fail "createtsv protein died" -fi - -if notExists "${RESULTS}_proteome_cluster.tsv"; then - # shellcheck disable=SC2086 - "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome" "${RESULTS}_proteome_cluster.tsv" ${THREADS_PAR} \ - || fail "createtsv proteome died" -fi - -if [ -n "${REMOVE_TMP}" ]; then - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR} - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR} - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/clu" ${VERBOSITY_PAR} - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY_PAR} - rm -rf "${TMP_PATH}/clu_tmp" - rm -f "${TMP_PATH}/easyproteomecluster.sh" -fi diff --git a/data/workflow/easyproteomecluster.sh b/data/workflow/easyproteomecluster.sh index 05b4af940..2bab198fc 100644 --- a/data/workflow/easyproteomecluster.sh +++ b/data/workflow/easyproteomecluster.sh @@ -21,18 +21,18 @@ if notExists "${TMP_PATH}/clu.dbtype"; then || fail "Search died" fi -if notExists "${TMP_PATH}/aln.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" ${PROTEOMECLUSTER_PAR} \ - || fail "Convert Alignments died" -fi - if notExists "${RESULTS}_protein_cluster.tsv"; then # shellcheck disable=SC2086 "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/clu" "${RESULTS}_protein_cluster.tsv" ${THREADS_PAR} \ || fail "createtsv protein cluster died" fi +if notExists "${TMP_PATH}/aln.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" ${PROTEOMECLUSTER_PAR} \ + || fail "Convert Alignments died" +fi + if notExists "${RESULTS}_protein_align.tsv"; then # shellcheck disable=SC2086 "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_protein" "${RESULTS}_protein_align.tsv" ${THREADS_PAR} \ diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h index 04fbd521d..35939c4e9 100644 --- a/src/CommandDeclarations.h +++ b/src/CommandDeclarations.h @@ -4,7 +4,7 @@ extern int align(int argc, const char **argv, const Command& command); extern int alignall(int argc, const char **argv, const Command& command); -extern int alignproteome(int argc, const char **argv, const Command& command); +extern int proteomecluster(int argc, const char **argv, const Command& command); extern int alignbykmer(int argc, const char **argv, const Command& command); extern int appenddbtoindex(int argc, const char **argv, const Command& command); extern int apply(int argc, const char **argv, const Command& command); @@ -44,7 +44,7 @@ extern int diffseqdbs(int argc, const char **argv, const Command& command); extern int easycluster(int argc, const char **argv, const Command& command); extern int easyrbh(int argc, const char **argv, const Command& command); extern int easylinclust(int argc, const char **argv, const Command& command); -extern int easyalignproteome(int argc, const char **argv, const Command& command); +extern int easyproteomecluster(int argc, const char **argv, const Command& command); extern int easysearch(int argc, const char **argv, const Command& command); extern int easylinsearch(int argc, const char **argv, const Command& command); extern int tsv2exprofiledb(int argc, const char **argv, const Command& command); diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 3b64eb556..5a6280da2 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -79,12 +79,19 @@ std::vector baseCommands = { CITATION_MMSEQS2|CITATION_LINCLUST, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin }, {"clusterPrefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }, {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, - {"easy-alignproteome", easyalignproteome, &par.easyalignproteome, COMMAND_EASY, - "Proteome clustering and alignemnt", - " ... " - // CITATION_MMSEQS2, - - }, + {"easy-proteomecluster", easyproteomecluster, &par.easyproteomeclusterworkflow, COMMAND_EASY, + "Proteome clustering and alignment", + "mmseqs easy-proteomecluster examples/DB.fasta result tmp\n\n" + "# ProteomeCluster output\n" + "# - protein_cluster.tsv: Protein linclust result\n" + "# - proteome_cluster.tsv: Proteome redundancy clustering result\n" + "# - protein_align.tsv: Protein alignment list\n" + "mmseqs easy-proteomecluster examples/DB.fasta result tmp --proteome-similarity 0.9\n", + "Martin Steinegger & Gyuri Kim ", + " ... ", + CITATION_MMSEQS2, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin }, + {"outputReports", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }, + {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, {"easy-taxonomy", easytaxonomy, &par.easytaxonomy, COMMAND_EASY, "Taxonomic classification", "# Assign taxonomic labels to FASTA sequences\n" @@ -638,15 +645,15 @@ std::vector baseCommands = { {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}}, - {"alignproteome", alignproteome, &par.alignproteome, COMMAND_ALIGNMENT, + {"proteomecluster", proteomecluster, &par.proteomecluster, COMMAND_ALIGNMENT, "Proteome clustering and alignment", NULL, "Martin Steinegger & Gyuri Kim ", - " ", + " ", CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, - {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, - {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }, - {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}}, + {"clustresultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, + {"proteinAlignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb}, + {"proteomeAlignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}}, {"alignall", alignall, &par.alignall, COMMAND_ALIGNMENT, "Within-result all-vs-all gapped local alignment", NULL, @@ -1314,5 +1321,6 @@ std::vector baseCommands = { NULL, "", "", - CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}} + CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}}, + }; diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index a2db3ef47..d142bdd82 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -353,7 +353,7 @@ Parameters::Parameters(): alignall.push_back(&PARAM_NO_COMP_BIAS_CORR); alignall.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE); - alignall.push_back(&PARAM_REALIGN); +// alignall.push_back(&PARAM_REALIGN); // alignall.push_back(&PARAM_MAX_REJECTED); // alignall.push_back(&PARAM_MAX_ACCEPT); alignall.push_back(&PARAM_INCLUDE_IDENTITY); @@ -1351,7 +1351,6 @@ Parameters::Parameters(): // easyproteomeclusterworkflow easyproteomeclusterworkflow = combineList(easylinclustworkflow, proteomecluster); - // clustering workflow clusterworkflow = combineList(prefilter, align); clusterworkflow = combineList(clusterworkflow, rescorediagonal); diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 35d72b32f..6a5ee5a36 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -80,6 +80,6 @@ set(util_source_files util/proteinaln2nucl.cpp util/versionstring.cpp util/diskspaceavail.cpp - util/alignproteome.cpp + util/proteomecluster.cpp PARENT_SCOPE ) diff --git a/src/util/alignproteome.cpp b/src/util/alignproteome.cpp deleted file mode 100644 index 3a7311b81..000000000 --- a/src/util/alignproteome.cpp +++ /dev/null @@ -1,443 +0,0 @@ -#include "Util.h" -#include "Parameters.h" -#include "Matcher.h" -#include "Debug.h" -#include "DBReader.h" -#include "DBWriter.h" -#include "NucleotideMatrix.h" -#include "SubstitutionMatrix.h" -#include "Alignment.h" -#include "itoa.h" - -#ifdef OPENMP -#include -#endif - -struct __attribute__((__packed__)) ProteomeEntry{ - unsigned int proteomeAALen; - int repProtKey; - float protSimScore; - unsigned int memCount; - float totalSeqId; - - ProteomeEntry(unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f) - : proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) {} - - void incrementMemCount(int count) { - memCount += count; - } - void decrementmemCount() { - memCount--; - } - - int getRepProtKey() { - return repProtKey; - } - - bool isCovered(){ - if (repProtKey == -1) { - return false; - } - return true; - } - - void computeRedundancy() { - protSimScore = totalSeqId / proteomeAALen; - - } - - void addSeqId(float seqId) { - totalSeqId += seqId; - } - - void addSeqLen(unsigned int seqLength) { - proteomeAALen += seqLength; - } - - void resetProteomeInfo(){ - memCount = 0; - totalSeqId = 0.0f; - } - - float getprotSimScore() { - return protSimScore; - } -}; - -struct __attribute__((__packed__)) MemberProteinEntry{ - unsigned int proteomeKey; - unsigned int proteinKey; -}; - -struct ClusterEntry { - bool isAvailable; - std::vector memberProteins; - - ClusterEntry() : isAvailable(false) {} - - ClusterEntry(size_t memberProteinSize) : isAvailable(true) { - memberProteins.reserve(memberProteinSize); - } - - void resetClusterInfo(){ - memberProteins.clear(); - isAvailable = false; - } -}; - -void calculateProteomeLength(std::vector& ProteomeList, DBReader::LookupEntry* const& lookup, size_t lookupSize, DBReader& tProteinDB) { - for (size_t i = 0; i < lookupSize; i++) { - const unsigned int ProteomeId = lookup[i].fileNumber; - const unsigned int ProteinId = lookup[i].id; - ProteomeList[ProteomeId].addSeqLen(tProteinDB.getSeqLen(ProteinId)); - } -} - -void initLocalClusterReps(size_t& id, std::vector& localClusterReps, std::vector& localMemCount, DBReader& tProteinDB, DBReader& linResDB, unsigned int thread_idx){ - std::vector memberKeys; - char buffer[1024 + 32768*4]; ///why? - memberKeys.reserve(50); // store key for every protein in a cluster - char *clustData = linResDB.getData(id, thread_idx); - while (*clustData != '\0') { - Util::parseKey(clustData, buffer); - const unsigned int key = (unsigned int) strtoul(buffer, NULL, 10); - memberKeys.push_back(key); - clustData = Util::skipLine(clustData); - } - if (memberKeys.size() > 1) { //IF Not a singleton cluster - ClusterEntry eachClusterRep(memberKeys.size()); - - for (auto& eachMemberKey : memberKeys){ - const unsigned int proteinId = tProteinDB.getId(eachMemberKey); - const unsigned int proteomeKey = tProteinDB.getLookupFileNumber(proteinId); - MemberProteinEntry mem; - mem.proteomeKey = proteomeKey; - mem.proteinKey = proteinId; - eachClusterRep.memberProteins.push_back(mem); - localMemCount[proteomeKey]++; - } - localClusterReps.push_back(eachClusterRep); - } -} - -bool FindNextRepProteome(std::vector& ProteomeList, unsigned int& RepProteomeId) { - bool isAllCovered = true; - - unsigned int maxMemberCount = 0; - unsigned int notCoveredProtCount = 0; - - for (size_t idx = 0; idx < ProteomeList.size(); idx++){ - if (ProteomeList[idx].isCovered()) { - continue; - }else{ - isAllCovered = false; - notCoveredProtCount++; - if (ProteomeList[idx].memCount > maxMemberCount) { - maxMemberCount = ProteomeList[idx].memCount; - RepProteomeId = idx; - } - } - } - - if (isAllCovered){ - return false; - }else if (notCoveredProtCount == 1) { - //last one and it is singleton - ProteomeList[RepProteomeId].repProtKey = RepProteomeId; //todo - ProteomeList[RepProteomeId].protSimScore = 1.0; - return false; - }else{ - return true; - } -} - -void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector& ProteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector& localSeqIds, DBWriter& proteinAlignWriter) { - char buffer[1024]; //todo - How to calculate buffer size? - bool isRepFound = false; - unsigned int lastqLen = 0; - unsigned int qproteinKey = 0; - unsigned int qproteomeKey =0; - //find Rep - for (auto& eachMember : clusterRep.memberProteins){ - if (eachMember.proteomeKey == RepProteomeId) { - isRepFound = true; - const unsigned int queryId = eachMember.proteinKey; - if( lastqLen < tProteinDB.getSeqLen(queryId)){ - lastqLen = tProteinDB.getSeqLen(queryId); - qproteinKey = eachMember.proteinKey; - qproteomeKey = eachMember.proteomeKey; - } - } - } - if (isRepFound){ - const unsigned int queryId = qproteinKey; - const unsigned int queryKey = tProteinDB.getDbKey(queryId); - char* querySeq = tProteinDB.getData(queryId, thread_idx); //todo need openmp - query.mapSequence(queryId, queryKey, querySeq, tProteinDB.getSeqLen(queryId)); - matcher.initQuery(&query); - const unsigned int queryProteomeLength = ProteomeList[qproteomeKey].proteomeAALen; - char * tmpBuff = Itoa::u32toa_sse2((uint32_t) queryKey, buffer); - *(tmpBuff-1) = '\t'; - const unsigned int queryIdLen = tmpBuff - buffer; - - for (auto& eachTargetMember : clusterRep.memberProteins){ - // if query Proteome's length < target Proteome's length * 0.9, skip - const unsigned int targetProteomeLength = ProteomeList[eachTargetMember.proteomeKey].proteomeAALen; - if (queryProteomeLength < targetProteomeLength * 0.9) { - continue; - } - const unsigned int targetId = eachTargetMember.proteinKey; - unsigned int targetProteomeId = eachTargetMember.proteomeKey; - const unsigned int targetKey = tProteinDB.getDbKey(targetId); - char* targetSeq = tProteinDB.getData(targetId, thread_idx); - target.mapSequence(targetId, targetKey, targetSeq, tProteinDB.getSeqLen(targetId)); - - if (Util::canBeCovered(par.covThr, par.covMode, query.L, target.L) == false) { - continue; - }; - - const bool isIdentity = (queryId == targetId && par.includeIdentity) ? true : false; - Matcher::result_t result = matcher.getSWResult(&target, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, isIdentity); - - if (Alignment::checkCriteria(result, isIdentity, par.evalThr, par.seqIdThr, par.alnLenThr, par.covMode, par.covThr)) { - // Placeholder for writing results, uncomment and implement as needed - // size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace); - // protRedunWriter.writeAdd(buffer, len, thread_idx); - // ProteomeList[targetProteomeId].addSeqId(result.getSeqId()); - size_t len = Matcher::resultToBuffer(tmpBuff, result, par.addBacktrace); - proteinAlignWriter.writeAdd(buffer, queryIdLen+len, thread_idx); - localSeqIds[targetProteomeId] += result.getSeqId()*target.L; - } - } - - } -} - -bool updateProteomeList(std::vector& ProteomeList, const unsigned int& RepProteomeId){ - bool isRepSingleton = true; - - #pragma omp for schedule(dynamic, 1) - for (size_t i = 0; i < ProteomeList.size(); i++) { - if (ProteomeList[i].isCovered() == false) { - if (i == RepProteomeId){ - ProteomeList[i].repProtKey = RepProteomeId; - ProteomeList[i].protSimScore = 1.0; - }else{ - ProteomeList[i].computeRedundancy(); - if (ProteomeList[i].getprotSimScore() >= 0.9) { - ProteomeList[i].repProtKey = RepProteomeId; - isRepSingleton = false; - }else{ - ProteomeList[i].resetProteomeInfo(); - } - } - } - } - return isRepSingleton; -} - -void updateClusterReps(std::vector& clusterReps, std::vector& ProteomeList, std::vector& localMemCount){ - #pragma omp parallel for schedule(dynamic, 1) - for (size_t i = 0; i < clusterReps.size(); i++) { - if (clusterReps[i].isAvailable) { - bool allMemberCovered = true; - unsigned int notCoveredCount = 0; - unsigned int lastProteomeKey = 0; - for (auto& eachMember : clusterReps[i].memberProteins) { - if (ProteomeList[eachMember.proteomeKey].isCovered() == false) { - notCoveredCount++; - lastProteomeKey = eachMember.proteomeKey; - allMemberCovered = false; - localMemCount[eachMember.proteomeKey]++; - } - } - if (allMemberCovered) { - clusterReps[i].resetClusterInfo(); - } - - if (notCoveredCount == 1) { //singleton Cluster - localMemCount[lastProteomeKey]--; - clusterReps[i].resetClusterInfo(); - } - } - } - -} - -size_t resultToBuffer(const std::string& proteomeName, const std::string& repProtName, float similarityScore, char* buffer){ - char* basePos = buffer; - std::strncpy(buffer, proteomeName.c_str(), proteomeName.length()); - buffer += proteomeName.length()+1; - *(buffer-1) = '\t'; // 탭 구분자 추가 - - // repProtName을 버퍼에 복사 - std::strncpy(buffer, repProtName.c_str(), repProtName.length()); - buffer += repProtName.length()+1; - *(buffer-1) = '\t'; // 탭 구분자 추가 - - // similarityScore를 버퍼에 추가 (Util::fastSeqIdToBuffer 사용) - buffer = Util::fastSeqIdToBuffer(similarityScore, buffer); - *(buffer-1) = '\n'; // 줄바꿈 추가 - // *(buffer) = '\0'; // 문자열 끝 추가 - return buffer - basePos; -} - -int alignproteome(int argc, const char **argv, const Command &command){ - //Initialize parameters - Parameters &par = Parameters::getInstance(); - par.overrideParameterDescription(par.PARAM_ALIGNMENT_MODE, "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id", NULL, 0); - par.parseParameters(argc, argv, command, true, 0, 0); - - if (par.alignmentMode == Parameters::ALIGNMENT_MODE_UNGAPPED) { - Debug(Debug::ERROR) << "Use rescorediagonal for ungapped alignment mode.\n"; - EXIT(EXIT_FAILURE); - } - if (par.addBacktrace == true) { - par.alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; - } - - unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr); - - //Open the target protein database - DBReader tProteinDB(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX|DBReader::USE_LOOKUP|DBReader::USE_SOURCE); - tProteinDB.open(DBReader::NOSORT); - const int tProteinSeqType = tProteinDB.getDbtype(); - - if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) { - tProteinDB.readMmapedDataInMemory(); - } - //Open lookup table to get the source of the protein and the protein length - DBReader::LookupEntry* tLookup = tProteinDB.getLookup(); - const size_t tLookupSize = tProteinDB.getLookupSize(); - unsigned int totalProteomeNumber = tLookup[tLookupSize - 1].fileNumber; - std::vector ProteomeList(totalProteomeNumber + 1); - calculateProteomeLength(ProteomeList, tLookup, tLookupSize, tProteinDB); - - //For Debug - // for (size_t i=0; i < ProteomeList.size(); i++) { - // Debug(Debug::INFO) << "Proteome " << i << " has " << ProteomeList[i].proteomeAALen << " amino acids.\n"; - // } - - //Open the linclust result - DBReader linResDB(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX); - linResDB.open(DBReader::LINEAR_ACCCESS); - - //Open the DBWriter - DBWriter protRedunWriter(par.db3.c_str(), par.db3Index.c_str(), 1, par.compressed, Parameters::DBTYPE_GENERIC_DB); - protRedunWriter.open(); - DBWriter proteinAlignWriter(par.db4.c_str(), par.db4Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_GENERIC_DB); - proteinAlignWriter.open(); - - std::vector clusterReps; - - int gapOpen, gapExtend; - BaseMatrix *subMat; - subMat = new SubstitutionMatrix(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias); - gapOpen = par.gapOpen.values.aminoacid(); - gapExtend = par.gapExtend.values.aminoacid(); - EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), subMat, gapOpen, gapExtend); - - Debug(Debug::INFO) << "Start Initialization\n"; - #pragma omp parallel - { - unsigned int thread_idx = 0; - #ifdef OPENMP - thread_idx = (unsigned int) omp_get_thread_num(); - #endif - std::vector localClusterReps; - std::vector localMemCount(ProteomeList.size(), 0); - - #pragma omp for schedule(dynamic, 1) - for (size_t id = 0; id < linResDB.getSize(); id++) { - initLocalClusterReps(id, localClusterReps, localMemCount, tProteinDB, linResDB, thread_idx); - } - - #pragma omp critical - { - for (size_t idx=0; idx < localMemCount.size(); idx++){ - ProteomeList[idx].incrementMemCount(localMemCount[idx]); - } - clusterReps.insert(clusterReps.end(), - std::make_move_iterator(localClusterReps.begin()), - std::make_move_iterator(localClusterReps.end())); - } - - } - - unsigned int RepProteomeId = -1; - Debug(Debug::INFO) << "Run Alignment\n"; - while (FindNextRepProteome(ProteomeList, RepProteomeId)) { - // if (FindNextRepProteome(ProteomeList, RepProteomeId) == false) { - // Debug(Debug::INFO) << "All Proteome is covered. Done.\n"; - // break; - // } - // Debug(Debug::INFO) << "New Rep Found : " << RepProteomeId << "\n"; - // Debug(Debug::INFO) << "1. Run alignment for each cluster\n"; - #pragma omp parallel - { - unsigned int thread_idx = 0; - #ifdef OPENMP - thread_idx = (unsigned int) omp_get_thread_num(); - #endif - Matcher matcher(tProteinSeqType, tProteinSeqType, par.maxSeqLen, subMat, &evaluer, par.compBiasCorrection, par.compBiasCorrectionScale, gapOpen, gapExtend, 0.0, par.zdrop); - Sequence query(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection); - Sequence target(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection); - std::vector localSeqIds(clusterReps.size(), 0.0f); - proteinAlignWriter.writeStart(thread_idx); - #pragma omp for schedule(dynamic, 1) - for (size_t i = 0; i < clusterReps.size(); i++) { - if (clusterReps[i].isAvailable) { - runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, ProteomeList, par, thread_idx, swMode, localSeqIds, proteinAlignWriter); - } - - } - proteinAlignWriter.writeEnd(RepProteomeId, thread_idx); //gg - #pragma omp critical - { - for (size_t idx = 0; idx < ProteomeList.size(); idx++) { - ProteomeList[idx].addSeqId(localSeqIds[idx]); - } - } - } - - // Debug(Debug::INFO) << "2. Update ProteomeDB. Calculate the similarity score and check redundancy | Rep Proteome id : " << RepProteomeId << "\n"; - - bool isRepSingleton = updateProteomeList(ProteomeList, RepProteomeId); - - if (isRepSingleton) { - ProteomeList[RepProteomeId].repProtKey = RepProteomeId; - ProteomeList[RepProteomeId].protSimScore = 1.0; - } - - // Debug(Debug::INFO) << "3. Re-Setup Proteome and ClusterReps\n"; - std::vector localMemCount(ProteomeList.size(), 0); - - updateClusterReps(clusterReps, ProteomeList, localMemCount); - - #pragma omp critical - { - for (size_t i = 0; i < ProteomeList.size(); i++) { - ProteomeList[i].memCount += localMemCount[i]; - } - } - } - // Debug(Debug::INFO) << "4. Write ProteomeList to file\n"; - char protRedunBuffer[1024]; //todo - How to calculate buffer size? - protRedunWriter.writeStart(); - for (size_t idx=0; idx < ProteomeList.size(); idx++){ - std::string proteomeName = tProteinDB.getSourceFileName(idx); - std::string repProtName = tProteinDB.getSourceFileName(ProteomeList[idx].getRepProtKey()); - float similarityScore = ProteomeList[idx].getprotSimScore(); - // Debug(Debug::INFO) << "Proteome " << idx << " : " << proteomeName << " has similarity score : " << similarityScore << " with " << repProtName << "\n"; - size_t result = resultToBuffer(proteomeName, repProtName, similarityScore,protRedunBuffer); - protRedunWriter.writeAdd(protRedunBuffer, result); - } - protRedunWriter.writeEnd(1); //Don't need index file (todo) - - protRedunWriter.close(); - proteinAlignWriter.close(); - tProteinDB.close(); - delete subMat; - linResDB.close(); - return EXIT_SUCCESS; -} \ No newline at end of file diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index dcfae8aa3..84d7421e0 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -98,13 +98,13 @@ struct ClusterEntry { } }; -void calculateProteomeLength(std::vector& ProteomeList, DBReader::LookupEntry* const& lookup, size_t lookupSize, DBReader& tProteinDB) { +void calculateProteomeLength(std::vector& proteomeList, DBReader::LookupEntry* const& lookup, size_t lookupSize, DBReader& tProteinDB) { for (size_t i = 0; i < lookupSize; i++) { const unsigned int ProteomeId = lookup[i].fileNumber; const unsigned int ProteinId = lookup[i].id; - ProteomeList[ProteomeId].addSeqLen(tProteinDB.getSeqLen(ProteinId)); - if (ProteomeList[ProteomeId].proteomeKey == -1){ - ProteomeList[ProteomeId].proteomeKey = ProteomeId; + proteomeList[ProteomeId].addSeqLen(tProteinDB.getSeqLen(ProteinId)); + if (proteomeList[ProteomeId].proteomeKey == -1) { + proteomeList[ProteomeId].proteomeKey = ProteomeId; } } } @@ -113,7 +113,7 @@ void initLocalClusterReps(size_t& id, std::vector& localClusterRep std::vector memberKeys; memberKeys.reserve(50); // store key for every protein in a cluster - std::vector isProteomeInCluster(localMemCount.size(), false); ; + std::vector isProteomeInCluster(localMemCount.size(), false); char buffer[1024 + 32768*4]; char *clustData = linResDB.getData(id, thread_idx); @@ -126,7 +126,7 @@ void initLocalClusterReps(size_t& id, std::vector& localClusterRep if (memberKeys.size() > 1) { //If not a singleton cluster ClusterEntry eachClusterRep(memberKeys.size()); //init MemberProteinEntry and add it to memberProteins vector - for (auto& eachMemberKey : memberKeys){ + for (auto& eachMemberKey : memberKeys) { const unsigned int proteinId = tProteinDB.getId(eachMemberKey); const unsigned int proteomeKey = tProteinDB.getLookupFileNumber(proteinId); MemberProteinEntry mem; @@ -145,20 +145,20 @@ void initLocalClusterReps(size_t& id, std::vector& localClusterRep } } -bool FindNextRepProteome(std::vector& ProteomeList, unsigned int& RepProteomeId) { +bool FindNextRepProteome(std::vector& proteomeList, unsigned int& RepProteomeId) { bool isAllCovered = true; unsigned int maxMemberCount = 0; unsigned int notCoveredProtCount = 0; - for (size_t idx = 0; idx < ProteomeList.size(); idx++){ - if (ProteomeList[idx].isCovered()) { + for (size_t idx = 0; idx < proteomeList.size(); idx++){ + if (proteomeList[idx].isCovered()) { continue; }else{ isAllCovered = false; notCoveredProtCount++; - if (ProteomeList[idx].memCount > maxMemberCount) { - maxMemberCount = ProteomeList[idx].memCount; + if (proteomeList[idx].memCount > maxMemberCount) { + maxMemberCount = proteomeList[idx].memCount; RepProteomeId = idx; } } @@ -168,15 +168,15 @@ bool FindNextRepProteome(std::vector& ProteomeList, unsigned int& return false; }else if (notCoveredProtCount == 1) { //last one and it is singleton - ProteomeList[RepProteomeId].repProtKey = RepProteomeId; //todo - ProteomeList[RepProteomeId].protSimScore = 1.0; + proteomeList[RepProteomeId].repProtKey = RepProteomeId; + proteomeList[RepProteomeId].protSimScore = 1.0; return false; }else{ return true; } } -void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector& ProteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector& localSeqIds, DBWriter& proteinClustWriter) { +void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector& proteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector& localSeqIds, DBWriter& proteinClustWriter) { char buffer[1024]; bool isRepFound = false; unsigned int lastqLen = 0; @@ -201,7 +201,7 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt char* querySeq = tProteinDB.getData(queryId, thread_idx); query.mapSequence(queryId, queryKey, querySeq, tProteinDB.getSeqLen(queryId)); matcher.initQuery(&query); - const unsigned int queryProteomeLength = ProteomeList[qproteomeKey].proteomeAALen; + const unsigned int queryProteomeLength = proteomeList[qproteomeKey].proteomeAALen; // representative protein :same query and target (for createtsv) Matcher::result_t rep_reult = matcher.getSWResult(&query, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, true); @@ -214,19 +214,20 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt continue; } // if query Proteome's length < target Proteome's length * 0.9, skip - const unsigned int targetProteomeLength = ProteomeList[eachTargetMember.proteomeKey].proteomeAALen; + const unsigned int targetProteomeLength = proteomeList[eachTargetMember.proteomeKey].proteomeAALen; if (queryProteomeLength < targetProteomeLength * 0.9) { continue; } const unsigned int targetId = eachTargetMember.proteinKey; - unsigned int targetProteomeId = eachTargetMember.proteomeKey; const unsigned int targetKey = tProteinDB.getDbKey(targetId); + unsigned int tproteomeKey = eachTargetMember.proteomeKey; + char* targetSeq = tProteinDB.getData(targetId, thread_idx); target.mapSequence(targetId, targetKey, targetSeq, tProteinDB.getSeqLen(targetId)); if (Util::canBeCovered(par.covThr, par.covMode, query.L, target.L) == false) { continue; - }; + } const bool isIdentity = (queryId == targetId && par.includeIdentity) ? true : false; Matcher::result_t result = matcher.getSWResult(&target, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, isIdentity); @@ -235,7 +236,7 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt if (query.L >= target.L*0.9){ size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace); proteinClustWriter.writeAdd(buffer, len, thread_idx); - localSeqIds[targetProteomeId] += result.getSeqId()*target.L; + localSeqIds[tproteomeKey] += result.getSeqId()*target.L; } } } @@ -243,22 +244,22 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt } } -bool updateProteomeList(std::vector& ProteomeList, const unsigned int& RepProteomeId){ +bool updateproteomeList(std::vector& proteomeList, const unsigned int& RepProteomeId){ bool isRepSingleton = true; // #pragma omp for schedule(dynamic, 1) - for (size_t i = 0; i < ProteomeList.size(); i++) { - if (ProteomeList[i].isCovered() == false) { + for (size_t i = 0; i < proteomeList.size(); i++) { + if (proteomeList[i].isCovered() == false) { if (i == RepProteomeId){ - ProteomeList[i].repProtKey = RepProteomeId; - ProteomeList[i].protSimScore = 1.0; + proteomeList[i].repProtKey = RepProteomeId; + proteomeList[i].protSimScore = 1.0; }else{ - ProteomeList[i].computeRedundancy(); - if (ProteomeList[i].getProtSimScore() >= 0.9) { - ProteomeList[i].repProtKey = RepProteomeId; - isRepSingleton = false; + proteomeList[i].computeRedundancy(); + if (proteomeList[i].getProtSimScore() >= 0.9) { + proteomeList[i].repProtKey = RepProteomeId; + isRepSingleton = false; }else{ - ProteomeList[i].resetProteomeInfo(); + proteomeList[i].resetProteomeInfo(); } } } @@ -266,7 +267,7 @@ bool updateProteomeList(std::vector& ProteomeList, const unsigned return isRepSingleton; } -void updateClusterReps(ClusterEntry& clusterRep, std::vector& ProteomeList, std::vector& localMemCount){ +void updateClusterReps(ClusterEntry& clusterRep, std::vector& proteomeList, std::vector& localMemCount){ std::vector isProteomeInCluster(localMemCount.size(), false); if (clusterRep.isAvailable) { bool isAllMemberCovered = true; @@ -274,7 +275,7 @@ void updateClusterReps(ClusterEntry& clusterRep, std::vector& Pro unsigned int lastProteomeKey = 0; //update cluster member info for (auto& eachMember : clusterRep.memberProteins) { - if (ProteomeList[eachMember.proteomeKey].isCovered() == false) { + if (proteomeList[eachMember.proteomeKey].isCovered() == false) { notCoveredCount++; lastProteomeKey = eachMember.proteomeKey; isAllMemberCovered = false; @@ -285,8 +286,8 @@ void updateClusterReps(ClusterEntry& clusterRep, std::vector& Pro clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false; } if (notCoveredCount == 1) { //singleton Cluster - isProteomeInCluster[lastProteomeKey] = false; - clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false; + isProteomeInCluster[lastProteomeKey] = false; + clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false; } } @@ -302,20 +303,20 @@ void updateClusterReps(ClusterEntry& clusterRep, std::vector& Pro } -void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector &ProteomeList) { +void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector &proteomeList) { std::vector proteomeClusterIdx; char proteomeBuffer[1024]; - int repProtIdCluster = ProteomeList[0].getRepProtKey(); + int repProtIdCluster = proteomeList[0].getRepProtKey(); - for (size_t idx = 0; idx < ProteomeList.size(); idx++) { - int repProtId = ProteomeList[idx].getRepProtKey(); + for (size_t idx = 0; idx < proteomeList.size(); idx++) { + int repProtId = proteomeList[idx].getRepProtKey(); if (repProtIdCluster != repProtId) { proteomeClustWriter.writeStart(); for (auto &eachIdx : proteomeClusterIdx) { char *basePos = proteomeBuffer; - char *tmpProteomeBuffer = Itoa::i32toa_sse2(ProteomeList[eachIdx].getProteomeKey(), proteomeBuffer); + char *tmpProteomeBuffer = Itoa::i32toa_sse2(proteomeList[eachIdx].getProteomeKey(), proteomeBuffer); *(tmpProteomeBuffer - 1) = '\t'; - tmpProteomeBuffer = Util::fastSeqIdToBuffer(ProteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer); + tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer); *(tmpProteomeBuffer - 1) = '\n'; proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos); } @@ -328,13 +329,13 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector::LookupEntry* tLookup = tProteinDB.getLookup(); const size_t tLookupSize = tProteinDB.getLookupSize(); unsigned int totalProteomeNumber = tLookup[tLookupSize - 1].fileNumber; - std::vector ProteomeList(totalProteomeNumber + 1); - calculateProteomeLength(ProteomeList, tLookup, tLookupSize, tProteinDB); + std::vector proteomeList(totalProteomeNumber + 1); + calculateProteomeLength(proteomeList, tLookup, tLookupSize, tProteinDB); //Open the linclust result DBReader linResDB(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX); @@ -404,7 +405,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){ thread_idx = (unsigned int) omp_get_thread_num(); #endif std::vector localClusterReps; - std::vector localMemCount(ProteomeList.size(), 0); + std::vector localMemCount(proteomeList.size(), 0); #pragma omp for schedule(dynamic, 1) for (size_t id = 0; id < linResDB.getSize(); id++) { @@ -414,7 +415,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){ #pragma omp critical { for (size_t idx=0; idx < localMemCount.size(); idx++){ - ProteomeList[idx].incrementMemCount(localMemCount[idx]); + proteomeList[idx].incrementMemCount(localMemCount[idx]); } clusterReps.insert(clusterReps.end(), std::make_move_iterator(localClusterReps.begin()), @@ -426,7 +427,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){ unsigned int RepProteomeId = -1; // Debug(Debug::INFO) << "Run Alignment\n"; - while (FindNextRepProteome(ProteomeList, RepProteomeId)) { + while (FindNextRepProteome(proteomeList, RepProteomeId)) { #pragma omp parallel { unsigned int thread_idx = 0; @@ -436,52 +437,52 @@ int proteomecluster(int argc, const char **argv, const Command &command){ Matcher matcher(tProteinSeqType, tProteinSeqType, par.maxSeqLen, subMat, &evaluer, par.compBiasCorrection, par.compBiasCorrectionScale, gapOpen, gapExtend, 0.0, par.zdrop); Sequence query(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection); Sequence target(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection); - std::vector localSeqIds(clusterReps.size(), 0.0f); + std::vector localSeqIds(proteomeList.size(), 0.0f); #pragma omp for schedule(dynamic, 1) for (size_t i = 0; i < clusterReps.size(); i++) { if (clusterReps[i].isAvailable) { - runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, ProteomeList, par, thread_idx, swMode, localSeqIds, proteinClustWriter); + runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, proteomeList, par, thread_idx, swMode, localSeqIds, proteinClustWriter); } } #pragma omp critical { - for (size_t idx = 0; idx < ProteomeList.size(); idx++) { - ProteomeList[idx].addSeqId(localSeqIds[idx]); + for (size_t idx = 0; idx < proteomeList.size(); idx++) { + proteomeList[idx].addSeqId(localSeqIds[idx]); } } } // Debug(Debug::INFO) << "2. Update ProteomeDB. Calculate the similarity score and check redundancy | Rep Proteome id : " << RepProteomeId << "\n"; - bool isRepSingleton = updateProteomeList(ProteomeList, RepProteomeId); + bool isRepSingleton = updateproteomeList(proteomeList, RepProteomeId); if (isRepSingleton) { - ProteomeList[RepProteomeId].repProtKey = RepProteomeId; - ProteomeList[RepProteomeId].protSimScore = 1.0; + proteomeList[RepProteomeId].repProtKey = RepProteomeId; + proteomeList[RepProteomeId].protSimScore = 1.0; } // Debug(Debug::INFO) << "3. Re-Setup Proteome and ClusterReps\n"; #pragma omp parallel { - std::vector localMemCount(ProteomeList.size(), 0); + std::vector localMemCount(proteomeList.size(), 0); #pragma omp for schedule(dynamic, 1) for (size_t i = 0; i < clusterReps.size(); i++){ - updateClusterReps(clusterReps[i], ProteomeList, localMemCount); + updateClusterReps(clusterReps[i], proteomeList, localMemCount); } #pragma omp critical { - for (size_t i = 0; i < ProteomeList.size(); i++) { - ProteomeList[i].incrementMemCount(localMemCount[i]); + for (size_t i = 0; i < proteomeList.size(); i++) { + proteomeList[i].incrementMemCount(localMemCount[i]); } } } } - //sort ProteomeList by repProtKey - SORT_PARALLEL(ProteomeList.begin(), ProteomeList.end(), ProteomeEntry::compareByKey); + //sort proteomeList by repProtKey + SORT_PARALLEL(proteomeList.begin(), proteomeList.end(), ProteomeEntry::compareByKey); //write result of proteome clustering - writeProteomeClusters(proteomeClustWriter, ProteomeList); + writeProteomeClusters(proteomeClustWriter, proteomeList); proteomeClustWriter.close(); proteinClustWriter.close(); diff --git a/src/workflow/CMakeLists.txt b/src/workflow/CMakeLists.txt index b4e1f0cf1..5aa4e0bd9 100644 --- a/src/workflow/CMakeLists.txt +++ b/src/workflow/CMakeLists.txt @@ -14,7 +14,7 @@ set(workflow_source_files workflow/Search.cpp workflow/Taxonomy.cpp workflow/EasyTaxonomy.cpp - workflow/EasyAlignproteome.cpp + workflow/EasyProteomecluster.cpp workflow/CreateIndex.cpp PARENT_SCOPE ) diff --git a/src/workflow/EasyAlignproteome.cpp b/src/workflow/EasyAlignproteome.cpp deleted file mode 100644 index 5374da773..000000000 --- a/src/workflow/EasyAlignproteome.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#include - -#include "FileUtil.h" -#include "CommandCaller.h" -#include "Util.h" -#include "Debug.h" -#include "Parameters.h" - -#include "easyalignproteome.sh.h" - -void setEasyAlignproteomeDefaults(Parameters *p) { - p->proteomeSimThr = 0.9; -} - -// void setEasyAlignproteomeMustPassAlong(Parameters *p){ - -// } - -int easyalignproteome(int argc, const char **argv, const Command &command) { - Parameters &par = Parameters::getInstance(); - setEasyAlignproteomeDefaults(&par); - par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); - // setEasyAlignproteomeMustPassAlong(&par); - std::string tmpDir = par.filenames.back(); - std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); - if (par.reuseLatest) { - hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); - } - tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); - par.filenames.pop_back(); - - CommandCaller cmd; - cmd.addVariable("ALIGNPROTEOME_PAR", par.createParameterString(par.alignproteome,true).c_str()); // what? - std::string program = tmpDir + "/easyalignproteome.sh"; - FileUtil::writeFile(program, easyalignproteome_sh, easyalignproteome_sh_len); - cmd.execProgram(program.c_str(), par.filenames); - - // Should never get here - assert(false); - return 0; -} diff --git a/src/workflow/EasyLinclust.cpp b/src/workflow/EasyLinclust.cpp index 375ff1610..181968a8b 100644 --- a/src/workflow/EasyLinclust.cpp +++ b/src/workflow/EasyLinclust.cpp @@ -28,7 +28,6 @@ void setEasyLinclustMustPassAlong(Parameters *p) { p->PARAM_REMOVE_TMP_FILES.wasSet = true; p->PARAM_C.wasSet = true; p->PARAM_E.wasSet = true; - //p->PARAM_ALIGNMENT_MODE.wasSet = true; p->PARAM_ORF_START_MODE.wasSet = true; p->PARAM_ORF_MIN_LENGTH.wasSet = true; p->PARAM_ORF_MAX_LENGTH.wasSet = true; From fd1f5d6afa3779a30baa88f8bd57515b6d473825 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Wed, 28 Aug 2024 19:30:55 +0900 Subject: [PATCH 12/31] original index --- src/commons/IndexReader.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/commons/IndexReader.h b/src/commons/IndexReader.h index 0ad0ca15e..298c2bb8c 100644 --- a/src/commons/IndexReader.h +++ b/src/commons/IndexReader.h @@ -22,7 +22,7 @@ class IndexReader { ) : sequenceReader(NULL), index(NULL) { int targetDbtype = FileUtil::parseDbType(dataName.c_str()); if (Parameters::isEqualDbtype(targetDbtype, Parameters::DBTYPE_INDEX_DB)) { - new DBReader(dataName.c_str(), (dataName + ".index").c_str(), 1, dataMode); + index = new DBReader(dataName.c_str(), (dataName + ".index").c_str(), 1, DBReader::USE_DATA|DBReader::USE_INDEX); index->open(DBReader::NOSORT); if (PrefilteringIndexReader::checkIfIndexFile(index)) { PrefilteringIndexReader::printSummary(index); From 0d7db0f4e227c9141b5967b69eeaf6076e60eb88 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Wed, 28 Aug 2024 19:31:28 +0900 Subject: [PATCH 13/31] use sync_fetch_and_add when increment memcount --- src/util/proteomecluster.cpp | 43 ++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index 84d7421e0..e1c44c0d0 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -24,12 +24,6 @@ struct __attribute__((__packed__)) ProteomeEntry{ ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f) : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) {} - void incrementMemCount(unsigned int count) { - memCount += count; - } - void decrementmemCount() { - memCount--; - } int getRepProtKey() { return repProtKey; } @@ -113,7 +107,7 @@ void initLocalClusterReps(size_t& id, std::vector& localClusterRep std::vector memberKeys; memberKeys.reserve(50); // store key for every protein in a cluster - std::vector isProteomeInCluster(localMemCount.size(), false); + std::vector isProteomeInCluster(localMemCount.size(), 0); char buffer[1024 + 32768*4]; char *clustData = linResDB.getData(id, thread_idx); @@ -133,7 +127,9 @@ void initLocalClusterReps(size_t& id, std::vector& localClusterRep mem.proteomeKey = proteomeKey; mem.proteinKey = proteinId; eachClusterRep.memberProteins.push_back(mem); - isProteomeInCluster[proteomeKey] = true; + if (isProteomeInCluster[proteomeKey] == 0) { + isProteomeInCluster[proteomeKey] = 1; + } } localClusterReps.push_back(eachClusterRep); } @@ -151,7 +147,7 @@ bool FindNextRepProteome(std::vector& proteomeList, unsigned int& unsigned int maxMemberCount = 0; unsigned int notCoveredProtCount = 0; - for (size_t idx = 0; idx < proteomeList.size(); idx++){ + for (size_t idx = 0; idx < proteomeList.size(); idx++) { if (proteomeList[idx].isCovered()) { continue; }else{ @@ -268,7 +264,7 @@ bool updateproteomeList(std::vector& proteomeList, const unsigned } void updateClusterReps(ClusterEntry& clusterRep, std::vector& proteomeList, std::vector& localMemCount){ - std::vector isProteomeInCluster(localMemCount.size(), false); + std::vector isProteomeInCluster(localMemCount.size(), 0); if (clusterRep.isAvailable) { bool isAllMemberCovered = true; unsigned int notCoveredCount = 0; @@ -279,14 +275,16 @@ void updateClusterReps(ClusterEntry& clusterRep, std::vector& pro notCoveredCount++; lastProteomeKey = eachMember.proteomeKey; isAllMemberCovered = false; - isProteomeInCluster[eachMember.proteomeKey] = true; + if (isProteomeInCluster[eachMember.proteomeKey] == 0) { + isProteomeInCluster[eachMember.proteomeKey] = 1; + } } } if (isAllMemberCovered) { clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false; } if (notCoveredCount == 1) { //singleton Cluster - isProteomeInCluster[lastProteomeKey] = false; + isProteomeInCluster[lastProteomeKey] = 0; clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false; } } @@ -392,7 +390,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){ int gapOpen, gapExtend; BaseMatrix *subMat; - subMat = new SubstitutionMatrix(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias); + SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias); gapOpen = par.gapOpen.values.aminoacid(); gapExtend = par.gapExtend.values.aminoacid(); EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), subMat, gapOpen, gapExtend); @@ -412,11 +410,12 @@ int proteomecluster(int argc, const char **argv, const Command &command){ initLocalClusterReps(id, localClusterReps, localMemCount, tProteinDB, linResDB, thread_idx); } + for (size_t idx = 0; idx < localMemCount.size(); idx++) { + __sync_fetch_and_add(&proteomeList[idx].memCount, localMemCount[idx]); + } + #pragma omp critical { - for (size_t idx=0; idx < localMemCount.size(); idx++){ - proteomeList[idx].incrementMemCount(localMemCount[idx]); - } clusterReps.insert(clusterReps.end(), std::make_move_iterator(localClusterReps.begin()), std::make_move_iterator(localClusterReps.end())); @@ -466,15 +465,12 @@ int proteomecluster(int argc, const char **argv, const Command &command){ { std::vector localMemCount(proteomeList.size(), 0); #pragma omp for schedule(dynamic, 1) - for (size_t i = 0; i < clusterReps.size(); i++){ + for (size_t i = 0; i < clusterReps.size(); i++) { updateClusterReps(clusterReps[i], proteomeList, localMemCount); } - - #pragma omp critical - { - for (size_t i = 0; i < proteomeList.size(); i++) { - proteomeList[i].incrementMemCount(localMemCount[i]); - } + + for (size_t i = 0; i < proteomeList.size(); i++) { + __sync_fetch_and_add(&proteomeList[i].memCount, localMemCount[i]); } } } @@ -487,7 +483,6 @@ int proteomecluster(int argc, const char **argv, const Command &command){ proteomeClustWriter.close(); proteinClustWriter.close(); tProteinDB.close(); - delete subMat; linResDB.close(); return EXIT_SUCCESS; } \ No newline at end of file From 1fe489e7fe1a9dafa746be1ff05ae880a2f5f025 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Wed, 28 Aug 2024 20:10:08 +0900 Subject: [PATCH 14/31] change submat --- src/util/proteomecluster.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index e1c44c0d0..845f18f08 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -243,7 +243,6 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt bool updateproteomeList(std::vector& proteomeList, const unsigned int& RepProteomeId){ bool isRepSingleton = true; - // #pragma omp for schedule(dynamic, 1) for (size_t i = 0; i < proteomeList.size(); i++) { if (proteomeList[i].isCovered() == false) { if (i == RepProteomeId){ @@ -389,11 +388,11 @@ int proteomecluster(int argc, const char **argv, const Command &command){ std::vector clusterReps; int gapOpen, gapExtend; - BaseMatrix *subMat; + // BaseMatrix *subMat; SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias); gapOpen = par.gapOpen.values.aminoacid(); gapExtend = par.gapExtend.values.aminoacid(); - EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), subMat, gapOpen, gapExtend); + EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), &subMat, gapOpen, gapExtend); // Debug(Debug::INFO) << "Start Initialization\n"; #pragma omp parallel @@ -433,9 +432,9 @@ int proteomecluster(int argc, const char **argv, const Command &command){ #ifdef OPENMP thread_idx = (unsigned int) omp_get_thread_num(); #endif - Matcher matcher(tProteinSeqType, tProteinSeqType, par.maxSeqLen, subMat, &evaluer, par.compBiasCorrection, par.compBiasCorrectionScale, gapOpen, gapExtend, 0.0, par.zdrop); - Sequence query(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection); - Sequence target(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection); + Matcher matcher(tProteinSeqType, tProteinSeqType, par.maxSeqLen, &subMat, &evaluer, par.compBiasCorrection, par.compBiasCorrectionScale, gapOpen, gapExtend, 0.0, par.zdrop); + Sequence query(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection); + Sequence target(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection); std::vector localSeqIds(proteomeList.size(), 0.0f); #pragma omp for schedule(dynamic, 1) for (size_t i = 0; i < clusterReps.size(); i++) { From 86f6b15d38dd64b13a475fcf2ce05eaf91f06dbe Mon Sep 17 00:00:00 2001 From: imgyuri Date: Thu, 29 Aug 2024 00:21:14 +0900 Subject: [PATCH 15/31] add timer --- src/util/proteomecluster.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index 845f18f08..c93fd13d4 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -8,6 +8,7 @@ #include "SubstitutionMatrix.h" #include "Alignment.h" #include "itoa.h" +#include "Timer.h" #ifdef OPENMP #include @@ -393,7 +394,8 @@ int proteomecluster(int argc, const char **argv, const Command &command){ gapOpen = par.gapOpen.values.aminoacid(); gapExtend = par.gapExtend.values.aminoacid(); EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), &subMat, gapOpen, gapExtend); - + Debug(Debug::INFO) << "Initilize "; + Timer timer; // Debug(Debug::INFO) << "Start Initialization\n"; #pragma omp parallel { @@ -421,10 +423,12 @@ int proteomecluster(int argc, const char **argv, const Command &command){ } } + Debug(Debug::INFO) << timer.lap() << "\n"; unsigned int RepProteomeId = -1; // Debug(Debug::INFO) << "Run Alignment\n"; - + Debug(Debug::INFO) << "Proteome Clustering "; + timer.reset(); while (FindNextRepProteome(proteomeList, RepProteomeId)) { #pragma omp parallel { @@ -473,6 +477,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){ } } } + Debug(Debug::INFO) << timer.lap() << "\n"; //sort proteomeList by repProtKey SORT_PARALLEL(proteomeList.begin(), proteomeList.end(), ProteomeEntry::compareByKey); From be5a53dc41cb79d4bcf9ca889fba70b4d1553038 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Thu, 29 Aug 2024 00:43:43 +0900 Subject: [PATCH 16/31] Remove __packed__ to resolve sync_fetch_and_add error --- src/util/proteomecluster.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index c93fd13d4..61aca740f 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -14,7 +14,7 @@ #include #endif -struct __attribute__((__packed__)) ProteomeEntry{ +struct ProteomeEntry{ int proteomeKey; unsigned int proteomeAALen; int repProtKey; From 8d3a38f543e0de81fb66523298cf0884f1865351 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Thu, 29 Aug 2024 16:21:28 +0900 Subject: [PATCH 17/31] change chmod in createdb --- src/util/createdb.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 src/util/createdb.cpp diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp old mode 100755 new mode 100644 From bfa1911e211b16a4d0e48eef1a3fe98a746c321d Mon Sep 17 00:00:00 2001 From: imgyuri Date: Thu, 29 Aug 2024 16:50:48 +0900 Subject: [PATCH 18/31] update proteome-similarity threshold parameter --- src/util/proteomecluster.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index 61aca740f..8190aca50 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -210,9 +210,9 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt if (eachTargetMember.proteomeKey == RepProteomeId) { continue; } - // if query Proteome's length < target Proteome's length * 0.9, skip + // if query Proteome's length < target Proteome's length * proteomeSimThr, skip const unsigned int targetProteomeLength = proteomeList[eachTargetMember.proteomeKey].proteomeAALen; - if (queryProteomeLength < targetProteomeLength * 0.9) { + if (queryProteomeLength < targetProteomeLength * par.proteomeSimThr) { continue; } const unsigned int targetId = eachTargetMember.proteinKey; @@ -241,7 +241,7 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt } } -bool updateproteomeList(std::vector& proteomeList, const unsigned int& RepProteomeId){ +bool updateproteomeList(std::vector& proteomeList, const unsigned int& RepProteomeId, Parameters& par) { bool isRepSingleton = true; for (size_t i = 0; i < proteomeList.size(); i++) { @@ -251,7 +251,7 @@ bool updateproteomeList(std::vector& proteomeList, const unsigned proteomeList[i].protSimScore = 1.0; }else{ proteomeList[i].computeRedundancy(); - if (proteomeList[i].getProtSimScore() >= 0.9) { + if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr) { proteomeList[i].repProtKey = RepProteomeId; isRepSingleton = false; }else{ @@ -456,7 +456,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){ } // Debug(Debug::INFO) << "2. Update ProteomeDB. Calculate the similarity score and check redundancy | Rep Proteome id : " << RepProteomeId << "\n"; - bool isRepSingleton = updateproteomeList(proteomeList, RepProteomeId); + bool isRepSingleton = updateproteomeList(proteomeList, RepProteomeId, par); if (isRepSingleton) { proteomeList[RepProteomeId].repProtKey = RepProteomeId; From 7e912d24268cbe4b7f6bb169494c177fa2b8a1cd Mon Sep 17 00:00:00 2001 From: imgyuri Date: Mon, 9 Sep 2024 01:51:06 +0900 Subject: [PATCH 19/31] Sort redundant proteomes by similarity score in proteome_cluster.tsv --- src/util/proteomecluster.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index 8190aca50..d4e79ad4e 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -68,6 +68,9 @@ struct ProteomeEntry{ if (a.proteomeKey != a.repProtKey && b.proteomeKey == b.repProtKey){ return false; } + if (a.proteomeKey != a.repProtKey && b.proteomeKey != b.repProtKey) { + return a.protSimScore > b.protSimScore; + } return false; } }; From 618e2bf567aa0fd4c13b80d09aa536ba138b4091 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Mon, 9 Sep 2024 10:50:03 +0900 Subject: [PATCH 20/31] delete redundant code - checking proteome is singleton(no redudant member proteome is detected) --- src/util/proteomecluster.cpp | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index d4e79ad4e..366ae8e8f 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -21,9 +21,10 @@ struct ProteomeEntry{ float protSimScore; unsigned int memCount; float totalSeqId; + unsigned int reciprocalMemCount; - ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f) - : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) {} + ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f, unsigned int reciprocalMemCount = 0) + : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) , reciprocalMemCount(reciprocalMemCount) {} int getRepProtKey() { return repProtKey; @@ -50,6 +51,7 @@ struct ProteomeEntry{ void resetProteomeInfo(){ memCount = 0; totalSeqId = 0.0f; + reciprocalMemCount = 0; } float getProtSimScore() { return protSimScore; @@ -244,9 +246,7 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt } } -bool updateproteomeList(std::vector& proteomeList, const unsigned int& RepProteomeId, Parameters& par) { - bool isRepSingleton = true; - +void updateProteomeRedundancyInfo(std::vector& proteomeList, const unsigned int& RepProteomeId, Parameters& par) { for (size_t i = 0; i < proteomeList.size(); i++) { if (proteomeList[i].isCovered() == false) { if (i == RepProteomeId){ @@ -256,14 +256,13 @@ bool updateproteomeList(std::vector& proteomeList, const unsigned proteomeList[i].computeRedundancy(); if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr) { proteomeList[i].repProtKey = RepProteomeId; - isRepSingleton = false; }else{ proteomeList[i].resetProteomeInfo(); } } } } - return isRepSingleton; + } void updateClusterReps(ClusterEntry& clusterRep, std::vector& proteomeList, std::vector& localMemCount){ @@ -453,18 +452,18 @@ int proteomecluster(int argc, const char **argv, const Command &command){ #pragma omp critical { for (size_t idx = 0; idx < proteomeList.size(); idx++) { - proteomeList[idx].addSeqId(localSeqIds[idx]); + if (localSeqIds[idx] > 0.0f) { + proteomeList[idx].addSeqId(localSeqIds[idx]); + if (idx != RepProteomeId) { + proteomeList[idx].reciprocalMemCount++; + } + } } } } // Debug(Debug::INFO) << "2. Update ProteomeDB. Calculate the similarity score and check redundancy | Rep Proteome id : " << RepProteomeId << "\n"; - bool isRepSingleton = updateproteomeList(proteomeList, RepProteomeId, par); - - if (isRepSingleton) { - proteomeList[RepProteomeId].repProtKey = RepProteomeId; - proteomeList[RepProteomeId].protSimScore = 1.0; - } + updateProteomeRedundancyInfo(proteomeList, RepProteomeId, par); // Debug(Debug::INFO) << "3. Re-Setup Proteome and ClusterReps\n"; #pragma omp parallel From e990773212b9e04f6eefed415f5b71e9dca6dd5f Mon Sep 17 00:00:00 2001 From: imgyuri Date: Mon, 9 Sep 2024 11:12:53 +0900 Subject: [PATCH 21/31] stop --- src/util/proteomecluster.cpp | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index 366ae8e8f..b4285baf6 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -22,9 +22,10 @@ struct ProteomeEntry{ unsigned int memCount; float totalSeqId; unsigned int reciprocalMemCount; + float jaccardIndex; - ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f, unsigned int reciprocalMemCount = 0) - : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) , reciprocalMemCount(reciprocalMemCount) {} + ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f, unsigned int reciprocalMemCount = 0, float jaccardIndex = 0.0f) + : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) , reciprocalMemCount(reciprocalMemCount), jaccardIndex(jaccardIndex) {} int getRepProtKey() { return repProtKey; @@ -38,9 +39,9 @@ struct ProteomeEntry{ } return true; } - void computeRedundancy() { + void computeRedundancy(unsigned int repProteomeMemCount) { protSimScore = totalSeqId / proteomeAALen; - + jaccardIndex = reciprocalMemCount / (memCount + repProteomeMemCount - reciprocalMemCount); } void addSeqId(float seqId) { totalSeqId += seqId; @@ -52,10 +53,14 @@ struct ProteomeEntry{ memCount = 0; totalSeqId = 0.0f; reciprocalMemCount = 0; + jaccardIndex = 0.0f; } float getProtSimScore() { return protSimScore; } + float getJaccardIndex() { + return jaccardIndex; + } static bool compareByKey(const ProteomeEntry& a, const ProteomeEntry& b) { if (a.repProtKey < b.repProtKey){ @@ -247,15 +252,19 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt } void updateProteomeRedundancyInfo(std::vector& proteomeList, const unsigned int& RepProteomeId, Parameters& par) { + const unsigned int& repProteomeMemCount = proteomeList[RepProteomeId].memCount; + Debug(Debug::INFO) << "Rep Proteome " << RepProteomeId << " has " << repProteomeMemCount << " members\n"; for (size_t i = 0; i < proteomeList.size(); i++) { if (proteomeList[i].isCovered() == false) { if (i == RepProteomeId){ proteomeList[i].repProtKey = RepProteomeId; proteomeList[i].protSimScore = 1.0; }else{ - proteomeList[i].computeRedundancy(); + proteomeList[i].computeRedundancy(repProteomeMemCount); if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr) { proteomeList[i].repProtKey = RepProteomeId; + + Debug(Debug::INFO) << "Proteome " << i << " is covered by " << RepProteomeId << " with ReciprocalMem: " << proteomeList[i].reciprocalMemCount << " with jaccard " << proteomeList[i].getJaccardIndex() << "\n"; }else{ proteomeList[i].resetProteomeInfo(); } @@ -317,8 +326,12 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector Date: Mon, 9 Sep 2024 01:38:00 +0900 Subject: [PATCH 22/31] jaccardindexTrial --- src/commons/DBReader.cpp | 2 +- src/util/proteomecluster.cpp | 173 ++++++++++++++++++++--------------- 2 files changed, 98 insertions(+), 77 deletions(-) diff --git a/src/commons/DBReader.cpp b/src/commons/DBReader.cpp index 598d152f6..1eb80fd06 100644 --- a/src/commons/DBReader.cpp +++ b/src/commons/DBReader.cpp @@ -735,7 +735,7 @@ template std::string DBReader::getSourceFileName (size_t id){ template T DBReader::getSourceKey(size_t id){ if (id >= sourceSize){ - Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".lookup\n"; + Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n"; Debug(Debug::ERROR) << "getSource id: local id (" << id << ") >= db size (" << sourceSize << ")\n"; EXIT(EXIT_FAILURE); } diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index b4285baf6..d88cb5c33 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -19,13 +19,13 @@ struct ProteomeEntry{ unsigned int proteomeAALen; int repProtKey; float protSimScore; - unsigned int memCount; + unsigned int clusterCount; float totalSeqId; - unsigned int reciprocalMemCount; + unsigned int reciprocalClusterCount; float jaccardIndex; - ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f, unsigned int reciprocalMemCount = 0, float jaccardIndex = 0.0f) - : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) , reciprocalMemCount(reciprocalMemCount), jaccardIndex(jaccardIndex) {} + ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int clusterCount = 0, float seqId = 0.0f, unsigned int reciprocalClusterCount = 0, float jaccardIndex = 0.0f) + : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), clusterCount(clusterCount), totalSeqId(seqId) , reciprocalClusterCount(reciprocalClusterCount), jaccardIndex(jaccardIndex) {} int getRepProtKey() { return repProtKey; @@ -39,9 +39,9 @@ struct ProteomeEntry{ } return true; } - void computeRedundancy(unsigned int repProteomeMemCount) { + void computeRedundancy(unsigned int repProteomeclusterCount) { protSimScore = totalSeqId / proteomeAALen; - jaccardIndex = reciprocalMemCount / (memCount + repProteomeMemCount - reciprocalMemCount); + jaccardIndex = reciprocalClusterCount / (clusterCount + repProteomeclusterCount - reciprocalClusterCount); } void addSeqId(float seqId) { totalSeqId += seqId; @@ -50,9 +50,9 @@ struct ProteomeEntry{ proteomeAALen += seqLength; } void resetProteomeInfo(){ - memCount = 0; + clusterCount = 0; totalSeqId = 0.0f; - reciprocalMemCount = 0; + reciprocalClusterCount = 0; jaccardIndex = 0.0f; } float getProtSimScore() { @@ -114,76 +114,89 @@ void calculateProteomeLength(std::vector& proteomeList, DBReader< } } -void initLocalClusterReps(size_t& id, std::vector& localClusterReps, std::vector& localMemCount, DBReader& tProteinDB, DBReader& linResDB, unsigned int thread_idx){ +void initLocalClusterReps(size_t& id, std::vector& localClusterReps, std::vector& localProteomeCount, DBReader& tProteinDB, DBReader& linResDB, unsigned int thread_idx){ std::vector memberKeys; memberKeys.reserve(50); // store key for every protein in a cluster - std::vector isProteomeInCluster(localMemCount.size(), 0); + std::vector isProteomeInCluster(localProteomeCount.size(), 0); char buffer[1024 + 32768*4]; char *clustData = linResDB.getData(id, thread_idx); + while (*clustData != '\0') { Util::parseKey(clustData, buffer); const unsigned int key = (unsigned int) strtoul(buffer, NULL, 10); memberKeys.push_back(key); clustData = Util::skipLine(clustData); } - if (memberKeys.size() > 1) { //If not a singleton cluster - ClusterEntry eachClusterRep(memberKeys.size()); - //init MemberProteinEntry and add it to memberProteins vector - for (auto& eachMemberKey : memberKeys) { - const unsigned int proteinId = tProteinDB.getId(eachMemberKey); - const unsigned int proteomeKey = tProteinDB.getLookupFileNumber(proteinId); - MemberProteinEntry mem; - mem.proteomeKey = proteomeKey; - mem.proteinKey = proteinId; - eachClusterRep.memberProteins.push_back(mem); - if (isProteomeInCluster[proteomeKey] == 0) { - isProteomeInCluster[proteomeKey] = 1; - } + + if (memberKeys.size() <= 1) { + return; //singleton cluster (1 member only) + } + + ClusterEntry eachClusterRep(memberKeys.size()); + int uniqueProteomeCount = 0; + //init MemberProteinEntry and add it to memberProteins vector + for (auto& eachMemberKey : memberKeys) { + const unsigned int proteinId = tProteinDB.getId(eachMemberKey); + const unsigned int proteomeKey = tProteinDB.getLookupFileNumber(proteinId); + MemberProteinEntry mem; + mem.proteomeKey = proteomeKey; + mem.proteinKey = proteinId; + eachClusterRep.memberProteins.push_back(mem); + if (isProteomeInCluster[proteomeKey] == 0) { //If gene is from new proteome(source) + isProteomeInCluster[proteomeKey] = 1; + uniqueProteomeCount++; } - localClusterReps.push_back(eachClusterRep); } - for (size_t i = 0; i < localMemCount.size(); i++) { + if (uniqueProteomeCount <= 1) { + return; //singleton cluster (geneDuplication_1 proteome only with multiple members) + } + + localClusterReps.push_back(eachClusterRep); + for (size_t i = 0; i < localProteomeCount.size(); i++) { if (isProteomeInCluster[i]) { - localMemCount[i]++; + localProteomeCount[i]++; } } } bool FindNextRepProteome(std::vector& proteomeList, unsigned int& RepProteomeId) { - bool isAllCovered = true; + bool isAllProteomeRedundancyCovered = true; unsigned int maxMemberCount = 0; - unsigned int notCoveredProtCount = 0; + unsigned int remainingProteomeCount = 0; + unsigned int lastProteomeKey = -1; for (size_t idx = 0; idx < proteomeList.size(); idx++) { if (proteomeList[idx].isCovered()) { continue; }else{ - isAllCovered = false; - notCoveredProtCount++; - if (proteomeList[idx].memCount > maxMemberCount) { - maxMemberCount = proteomeList[idx].memCount; + isAllProteomeRedundancyCovered = false; + remainingProteomeCount++; + lastProteomeKey = idx; + if (proteomeList[idx].clusterCount > maxMemberCount) { + maxMemberCount = proteomeList[idx].clusterCount; RepProteomeId = idx; } } } - if (isAllCovered){ + if (isAllProteomeRedundancyCovered){ return false; - }else if (notCoveredProtCount == 1) { - //last one and it is singleton - proteomeList[RepProteomeId].repProtKey = RepProteomeId; - proteomeList[RepProteomeId].protSimScore = 1.0; + }else if (remainingProteomeCount == 1) { + //Only proteome is left. It isn't redundant with any other proteome + proteomeList[lastProteomeKey].repProtKey = proteomeList[lastProteomeKey].proteomeKey; + proteomeList[lastProteomeKey].protSimScore = 1.0; + proteomeList[lastProteomeKey].jaccardIndex = 1.0; return false; }else{ return true; } } -void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector& proteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector& localSeqIds, DBWriter& proteinClustWriter) { +void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector& proteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector& localSeqIds, DBWriter& proteinClustWriter, std::vector& localProteomeCount) { char buffer[1024]; bool isRepFound = false; unsigned int lastqLen = 0; @@ -202,6 +215,7 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt } } if (isRepFound){ + std::vector isProteomeInCluster(localProteomeCount.size(), 0); proteinClustWriter.writeStart(thread_idx); const unsigned int queryId = qproteinKey; const unsigned int queryKey = tProteinDB.getDbKey(queryId); @@ -244,27 +258,33 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace); proteinClustWriter.writeAdd(buffer, len, thread_idx); localSeqIds[tproteomeKey] += result.getSeqId()*target.L; + if (isProteomeInCluster[tproteomeKey] == 0) { + isProteomeInCluster[tproteomeKey] = 1; + } } } } + for (size_t i = 0; i < localProteomeCount.size(); i++) { + if (isProteomeInCluster[i]) { + localProteomeCount[i]++; + } + } proteinClustWriter.writeEnd(queryKey, thread_idx); } } void updateProteomeRedundancyInfo(std::vector& proteomeList, const unsigned int& RepProteomeId, Parameters& par) { - const unsigned int& repProteomeMemCount = proteomeList[RepProteomeId].memCount; - Debug(Debug::INFO) << "Rep Proteome " << RepProteomeId << " has " << repProteomeMemCount << " members\n"; + const unsigned int& repProteomeclusterCount = proteomeList[RepProteomeId].clusterCount; for (size_t i = 0; i < proteomeList.size(); i++) { if (proteomeList[i].isCovered() == false) { if (i == RepProteomeId){ proteomeList[i].repProtKey = RepProteomeId; proteomeList[i].protSimScore = 1.0; + proteomeList[i].jaccardIndex = 1.0; }else{ - proteomeList[i].computeRedundancy(repProteomeMemCount); + proteomeList[i].computeRedundancy(repProteomeclusterCount); if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr) { proteomeList[i].repProtKey = RepProteomeId; - - Debug(Debug::INFO) << "Proteome " << i << " is covered by " << RepProteomeId << " with ReciprocalMem: " << proteomeList[i].reciprocalMemCount << " with jaccard " << proteomeList[i].getJaccardIndex() << "\n"; }else{ proteomeList[i].resetProteomeInfo(); } @@ -274,37 +294,37 @@ void updateProteomeRedundancyInfo(std::vector& proteomeList, cons } -void updateClusterReps(ClusterEntry& clusterRep, std::vector& proteomeList, std::vector& localMemCount){ - std::vector isProteomeInCluster(localMemCount.size(), 0); +void updateClusterReps(ClusterEntry& clusterRep, std::vector& proteomeList, std::vector& localProteomeCount){ + std::vector isProteomeInCluster(localProteomeCount.size(), 0); if (clusterRep.isAvailable) { - bool isAllMemberCovered = true; - unsigned int notCoveredCount = 0; - unsigned int lastProteomeKey = 0; + unsigned int notCoveredMemberCount = 0; + // unsigned int lastProteomeKey = 0; //gg : unused + unsigned int uniqueProteomeCount = 0; //update cluster member info for (auto& eachMember : clusterRep.memberProteins) { if (proteomeList[eachMember.proteomeKey].isCovered() == false) { - notCoveredCount++; - lastProteomeKey = eachMember.proteomeKey; - isAllMemberCovered = false; + notCoveredMemberCount++; + // lastProteomeKey = eachMember.proteomeKey;// gg : unused if (isProteomeInCluster[eachMember.proteomeKey] == 0) { isProteomeInCluster[eachMember.proteomeKey] = 1; + uniqueProteomeCount++; } } } - if (isAllMemberCovered) { - clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false; - } - if (notCoveredCount == 1) { //singleton Cluster - isProteomeInCluster[lastProteomeKey] = 0; + if (notCoveredMemberCount <= 1 || uniqueProteomeCount <= 1 ) { + //notCoveredMemberCount : All members(genes)' proteome are covere(0) or only one member(gene) is not covered(1) + //uniqueProteomeCount : Remaining members(genes) are from same(one) proteome clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false; + return; } + } - //update localMemCount + //update localProteomeCount if (clusterRep.isAvailable) { - for (size_t i=0; i < localMemCount.size(); i++) { + for (size_t i=0; i < localProteomeCount.size(); i++) { if (isProteomeInCluster[i]) { - localMemCount[i]++; + localProteomeCount[i]++; } } } @@ -326,9 +346,6 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector localClusterReps; - std::vector localMemCount(proteomeList.size(), 0); + std::vector localProteomeCount(proteomeList.size(), 0); #pragma omp for schedule(dynamic, 1) for (size_t id = 0; id < linResDB.getSize(); id++) { - initLocalClusterReps(id, localClusterReps, localMemCount, tProteinDB, linResDB, thread_idx); + initLocalClusterReps(id, localClusterReps, localProteomeCount, tProteinDB, linResDB, thread_idx); } - for (size_t idx = 0; idx < localMemCount.size(); idx++) { - __sync_fetch_and_add(&proteomeList[idx].memCount, localMemCount[idx]); + for (size_t idx = 0; idx < localProteomeCount.size(); idx++) { + __sync_fetch_and_add(&proteomeList[idx].clusterCount, localProteomeCount[idx]); } #pragma omp critical @@ -440,11 +457,16 @@ int proteomecluster(int argc, const char **argv, const Command &command){ } Debug(Debug::INFO) << timer.lap() << "\n"; + // //for debug gg + // for (size_t i = 0; i < proteomeList.size(); i++) { + // Debug(Debug::INFO) << "Proteome " << i << " has " << proteomeList[i].clusterCount << " members\n"; + // } unsigned int RepProteomeId = -1; // Debug(Debug::INFO) << "Run Alignment\n"; Debug(Debug::INFO) << "Proteome Clustering "; timer.reset(); while (FindNextRepProteome(proteomeList, RepProteomeId)) { + Debug(Debug::INFO) << " | Rep Proteome id : " << RepProteomeId << "\n"; #pragma omp parallel { unsigned int thread_idx = 0; @@ -455,21 +477,20 @@ int proteomecluster(int argc, const char **argv, const Command &command){ Sequence query(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection); Sequence target(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection); std::vector localSeqIds(proteomeList.size(), 0.0f); + std::vector localClusterCount(proteomeList.size(), 0); #pragma omp for schedule(dynamic, 1) for (size_t i = 0; i < clusterReps.size(); i++) { if (clusterReps[i].isAvailable) { - runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, proteomeList, par, thread_idx, swMode, localSeqIds, proteinClustWriter); + runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, proteomeList, par, thread_idx, swMode, localSeqIds, proteinClustWriter, localClusterCount); } } #pragma omp critical { for (size_t idx = 0; idx < proteomeList.size(); idx++) { - if (localSeqIds[idx] > 0.0f) { + if (proteomeList[idx].isCovered() == false) { proteomeList[idx].addSeqId(localSeqIds[idx]); - if (idx != RepProteomeId) { - proteomeList[idx].reciprocalMemCount++; - } + proteomeList[idx].reciprocalClusterCount += localClusterCount[idx]; } } } @@ -481,21 +502,21 @@ int proteomecluster(int argc, const char **argv, const Command &command){ // Debug(Debug::INFO) << "3. Re-Setup Proteome and ClusterReps\n"; #pragma omp parallel { - std::vector localMemCount(proteomeList.size(), 0); + std::vector localProteomeCount(proteomeList.size(), 0); #pragma omp for schedule(dynamic, 1) for (size_t i = 0; i < clusterReps.size(); i++) { - updateClusterReps(clusterReps[i], proteomeList, localMemCount); + updateClusterReps(clusterReps[i], proteomeList, localProteomeCount); } for (size_t i = 0; i < proteomeList.size(); i++) { - __sync_fetch_and_add(&proteomeList[i].memCount, localMemCount[i]); + __sync_fetch_and_add(&proteomeList[i].clusterCount, localProteomeCount[i]); } } } Debug(Debug::INFO) << timer.lap() << "\n"; //sort proteomeList by repProtKey SORT_PARALLEL(proteomeList.begin(), proteomeList.end(), ProteomeEntry::compareByKey); - + //write result of proteome clustering writeProteomeClusters(proteomeClustWriter, proteomeList); From d14217f72b87c98a6550cc7873444af1a1e78519 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Mon, 9 Sep 2024 19:49:42 +0900 Subject: [PATCH 23/31] add normalized scoring --- src/util/proteomecluster.cpp | 55 +++++++++++++++++------------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index d88cb5c33..e94442e47 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -21,11 +21,10 @@ struct ProteomeEntry{ float protSimScore; unsigned int clusterCount; float totalSeqId; - unsigned int reciprocalClusterCount; - float jaccardIndex; + float normalizedSimScore; - ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int clusterCount = 0, float seqId = 0.0f, unsigned int reciprocalClusterCount = 0, float jaccardIndex = 0.0f) - : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), clusterCount(clusterCount), totalSeqId(seqId) , reciprocalClusterCount(reciprocalClusterCount), jaccardIndex(jaccardIndex) {} + ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int clusterCount = 0, float seqId = 0.0f) + : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), clusterCount(clusterCount), totalSeqId(seqId) {} int getRepProtKey() { return repProtKey; @@ -39,9 +38,12 @@ struct ProteomeEntry{ } return true; } - void computeRedundancy(unsigned int repProteomeclusterCount) { + void computeRedundancy(unsigned int repProteomeSize) { protSimScore = totalSeqId / proteomeAALen; - jaccardIndex = reciprocalClusterCount / (clusterCount + repProteomeclusterCount - reciprocalClusterCount); + normalizedSimScore = (totalSeqId * 2)/ (repProteomeSize + proteomeAALen); + if (normalizedSimScore >= 1.0) { + normalizedSimScore = 1.0; + } } void addSeqId(float seqId) { totalSeqId += seqId; @@ -52,15 +54,16 @@ struct ProteomeEntry{ void resetProteomeInfo(){ clusterCount = 0; totalSeqId = 0.0f; - reciprocalClusterCount = 0; - jaccardIndex = 0.0f; + normalizedSimScore = 0.0f; + protSimScore = 0.0f; } float getProtSimScore() { return protSimScore; } - float getJaccardIndex() { - return jaccardIndex; + float getNormalizedSimScore() { + return normalizedSimScore; } + static bool compareByKey(const ProteomeEntry& a, const ProteomeEntry& b) { if (a.repProtKey < b.repProtKey){ @@ -189,14 +192,14 @@ bool FindNextRepProteome(std::vector& proteomeList, unsigned int& //Only proteome is left. It isn't redundant with any other proteome proteomeList[lastProteomeKey].repProtKey = proteomeList[lastProteomeKey].proteomeKey; proteomeList[lastProteomeKey].protSimScore = 1.0; - proteomeList[lastProteomeKey].jaccardIndex = 1.0; + proteomeList[lastProteomeKey].normalizedSimScore = 1.0; return false; }else{ return true; } } -void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector& proteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector& localSeqIds, DBWriter& proteinClustWriter, std::vector& localProteomeCount) { +void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector& proteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector& localSeqIds, DBWriter& proteinClustWriter) { char buffer[1024]; bool isRepFound = false; unsigned int lastqLen = 0; @@ -215,7 +218,6 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt } } if (isRepFound){ - std::vector isProteomeInCluster(localProteomeCount.size(), 0); proteinClustWriter.writeStart(thread_idx); const unsigned int queryId = qproteinKey; const unsigned int queryKey = tProteinDB.getDbKey(queryId); @@ -258,31 +260,23 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace); proteinClustWriter.writeAdd(buffer, len, thread_idx); localSeqIds[tproteomeKey] += result.getSeqId()*target.L; - if (isProteomeInCluster[tproteomeKey] == 0) { - isProteomeInCluster[tproteomeKey] = 1; - } } } } - for (size_t i = 0; i < localProteomeCount.size(); i++) { - if (isProteomeInCluster[i]) { - localProteomeCount[i]++; - } - } proteinClustWriter.writeEnd(queryKey, thread_idx); } } void updateProteomeRedundancyInfo(std::vector& proteomeList, const unsigned int& RepProteomeId, Parameters& par) { - const unsigned int& repProteomeclusterCount = proteomeList[RepProteomeId].clusterCount; + const unsigned int& repProteomeAASize = proteomeList[RepProteomeId].proteomeAALen; for (size_t i = 0; i < proteomeList.size(); i++) { if (proteomeList[i].isCovered() == false) { if (i == RepProteomeId){ proteomeList[i].repProtKey = RepProteomeId; proteomeList[i].protSimScore = 1.0; - proteomeList[i].jaccardIndex = 1.0; + proteomeList[i].normalizedSimScore = 1.0; }else{ - proteomeList[i].computeRedundancy(repProteomeclusterCount); + proteomeList[i].computeRedundancy(repProteomeAASize); if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr) { proteomeList[i].repProtKey = RepProteomeId; }else{ @@ -346,6 +340,9 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector localSeqIds(proteomeList.size(), 0.0f); - std::vector localClusterCount(proteomeList.size(), 0); #pragma omp for schedule(dynamic, 1) for (size_t i = 0; i < clusterReps.size(); i++) { if (clusterReps[i].isAvailable) { - runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, proteomeList, par, thread_idx, swMode, localSeqIds, proteinClustWriter, localClusterCount); + runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, proteomeList, par, thread_idx, swMode, localSeqIds, proteinClustWriter); } } @@ -490,7 +485,6 @@ int proteomecluster(int argc, const char **argv, const Command &command){ for (size_t idx = 0; idx < proteomeList.size(); idx++) { if (proteomeList[idx].isCovered() == false) { proteomeList[idx].addSeqId(localSeqIds[idx]); - proteomeList[idx].reciprocalClusterCount += localClusterCount[idx]; } } } @@ -516,7 +510,10 @@ int proteomecluster(int argc, const char **argv, const Command &command){ Debug(Debug::INFO) << timer.lap() << "\n"; //sort proteomeList by repProtKey SORT_PARALLEL(proteomeList.begin(), proteomeList.end(), ProteomeEntry::compareByKey); - + //debug + // for (size_t i = 0; i < proteomeList.size(); i++) { + // Debug(Debug::INFO) << "Proteome " << proteomeList[i].proteomeKey << " RepKey " << proteomeList[i].repProtKey << " normalizedscore: " << proteomeList[i].normalizedSimScore << " members\n"; + // } //write result of proteome clustering writeProteomeClusters(proteomeClustWriter, proteomeList); From 5e6ee6d58b1cc1114e3972fe785aa9ba1e5238a9 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Mon, 9 Sep 2024 20:28:22 +0900 Subject: [PATCH 24/31] add proteome relative(normalized) similarity threshold parameter --- src/MMseqsBase.cpp | 2 +- src/commons/Parameters.cpp | 4 +++- src/commons/Parameters.h | 2 ++ src/util/proteomecluster.cpp | 29 ++++++++++++++-------------- src/workflow/EasyProteomecluster.cpp | 1 + 5 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 67ec4a76b..cccc9ce94 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -90,7 +90,7 @@ std::vector baseCommands = { "# - protein_cluster.tsv: Protein linclust result\n" "# - proteome_cluster.tsv: Proteome redundancy clustering result\n" "# - protein_align.tsv: Protein alignment list\n" - "mmseqs easy-proteomecluster examples/DB.fasta result tmp --proteome-similarity 0.9\n", + "mmseqs easy-proteomecluster examples/DB.fasta result tmp --proteome-similarity 0.9 --proteome-relative-similarity 0.9\n", "Martin Steinegger & Gyuri Kim ", " ... ", CITATION_MMSEQS2, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin }, diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 6d2418287..b299a2112 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -87,7 +87,7 @@ Parameters::Parameters(): PARAM_GAP_OPEN(PARAM_GAP_OPEN_ID, "--gap-open", "Gap open cost", "Gap open cost", typeid(MultiParam>), (void *) &gapOpen, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT), PARAM_GAP_EXTEND(PARAM_GAP_EXTEND_ID, "--gap-extend", "Gap extension cost", "Gap extension cost", typeid(MultiParam>), (void *) &gapExtend, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT), PARAM_PROTEOME_SIMILARITY(PARAM_PROTEOME_SIMILARITY_ID, "--proteome-similarity", "Proteome similarity", "Proteome similarity threshold", typeid(float), (void *) &proteomeSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN), - + PARAM_PROTEOME_RELATIVE_SIMILARITY(PARAM_PROTEOME_RELATIVE_SIMILARITY_ID, "--proteome-normalized-similarity", "Proteome normalized similarity", "Proteome similarity threshold normalized by proteome size", typeid(float), (void *) &proteomeNormalizedSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN), #ifdef GAP_POS_SCORING PARAM_GAP_PSEUDOCOUNT(PARAM_GAP_PSEUDOCOUNT_ID, "--gap-pc", "Gap pseudo count", "Pseudo count for calculating position-specific gap opening penalties", typeid(int), &gapPseudoCount, "^[0-9]+$", MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_EXPERT), #endif @@ -385,6 +385,7 @@ Parameters::Parameters(): proteomecluster.push_back(&PARAM_NO_COMP_BIAS_CORR); proteomecluster.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE); proteomecluster.push_back(&PARAM_PROTEOME_SIMILARITY); + proteomecluster.push_back(&PARAM_PROTEOME_RELATIVE_SIMILARITY); proteomecluster.push_back(&PARAM_REALIGN); // proteomecluster.push_back(&PARAM_MAX_REJECTED); @@ -2356,6 +2357,7 @@ void Parameters::setDefaults() { maxAccept = INT_MAX; seqIdThr = 0.0; proteomeSimThr = 0.9; + proteomeNormalizedSimThr = 0.9; alnLenThr = 0; altAlignment = 0; gapOpen = MultiParam>(NuclAA(11, 5)); diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index b23b771cd..a19fcfa52 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -436,6 +436,7 @@ class Parameters { int altAlignment; // show up to this many alternative alignments float seqIdThr; // sequence identity threshold for acceptance float proteomeSimThr; // proteome similarity threshold for acceptance + float proteomeNormalizedSimThr; // proteome similarity threshold normalized by proteome size for acceptance int alnLenThr; // min. alignment length bool addBacktrace; // store backtrace string (M=Match, D=deletion, I=insertion) bool realign; // realign hit with more conservative score @@ -790,6 +791,7 @@ class Parameters { PARAMETER(PARAM_GAP_OPEN) PARAMETER(PARAM_GAP_EXTEND) PARAMETER(PARAM_PROTEOME_SIMILARITY) + PARAMETER(PARAM_PROTEOME_RELATIVE_SIMILARITY) #ifdef GAP_POS_SCORING PARAMETER(PARAM_GAP_PSEUDOCOUNT) #endif diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index e94442e47..9faddd439 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -21,7 +21,7 @@ struct ProteomeEntry{ float protSimScore; unsigned int clusterCount; float totalSeqId; - float normalizedSimScore; + float relativeSimScore; ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int clusterCount = 0, float seqId = 0.0f) : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), clusterCount(clusterCount), totalSeqId(seqId) {} @@ -40,9 +40,9 @@ struct ProteomeEntry{ } void computeRedundancy(unsigned int repProteomeSize) { protSimScore = totalSeqId / proteomeAALen; - normalizedSimScore = (totalSeqId * 2)/ (repProteomeSize + proteomeAALen); - if (normalizedSimScore >= 1.0) { - normalizedSimScore = 1.0; + relativeSimScore = (totalSeqId * 2)/ (repProteomeSize + proteomeAALen); + if (relativeSimScore >= 1.0) { + relativeSimScore = 1.0; } } void addSeqId(float seqId) { @@ -54,14 +54,14 @@ struct ProteomeEntry{ void resetProteomeInfo(){ clusterCount = 0; totalSeqId = 0.0f; - normalizedSimScore = 0.0f; + relativeSimScore = 0.0f; protSimScore = 0.0f; } float getProtSimScore() { return protSimScore; } - float getNormalizedSimScore() { - return normalizedSimScore; + float getrelativeSimScore() { + return relativeSimScore; } static bool compareByKey(const ProteomeEntry& a, const ProteomeEntry& b) { @@ -192,7 +192,7 @@ bool FindNextRepProteome(std::vector& proteomeList, unsigned int& //Only proteome is left. It isn't redundant with any other proteome proteomeList[lastProteomeKey].repProtKey = proteomeList[lastProteomeKey].proteomeKey; proteomeList[lastProteomeKey].protSimScore = 1.0; - proteomeList[lastProteomeKey].normalizedSimScore = 1.0; + proteomeList[lastProteomeKey].relativeSimScore = 1.0; return false; }else{ return true; @@ -274,10 +274,10 @@ void updateProteomeRedundancyInfo(std::vector& proteomeList, cons if (i == RepProteomeId){ proteomeList[i].repProtKey = RepProteomeId; proteomeList[i].protSimScore = 1.0; - proteomeList[i].normalizedSimScore = 1.0; + proteomeList[i].relativeSimScore = 1.0; }else{ proteomeList[i].computeRedundancy(repProteomeAASize); - if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr) { + if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr && proteomeList[i].getrelativeSimScore() >= par.proteomeNormalizedSimThr){ proteomeList[i].repProtKey = RepProteomeId; }else{ proteomeList[i].resetProteomeInfo(); @@ -342,7 +342,7 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vectoralignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; p->proteomeSimThr = 0.9; + p->proteomeNormalizedSimThr = 0.0; } void setEasyproteomeclusterMustPassAlong(Parameters *p){ From 743370d355f660cacf2604bb1014e1053e5709f6 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Mon, 9 Sep 2024 20:51:30 +0900 Subject: [PATCH 25/31] relative similarity threshold --- src/commons/Parameters.cpp | 2 +- src/util/proteomecluster.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index b299a2112..860bd5abb 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -87,7 +87,7 @@ Parameters::Parameters(): PARAM_GAP_OPEN(PARAM_GAP_OPEN_ID, "--gap-open", "Gap open cost", "Gap open cost", typeid(MultiParam>), (void *) &gapOpen, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT), PARAM_GAP_EXTEND(PARAM_GAP_EXTEND_ID, "--gap-extend", "Gap extension cost", "Gap extension cost", typeid(MultiParam>), (void *) &gapExtend, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT), PARAM_PROTEOME_SIMILARITY(PARAM_PROTEOME_SIMILARITY_ID, "--proteome-similarity", "Proteome similarity", "Proteome similarity threshold", typeid(float), (void *) &proteomeSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN), - PARAM_PROTEOME_RELATIVE_SIMILARITY(PARAM_PROTEOME_RELATIVE_SIMILARITY_ID, "--proteome-normalized-similarity", "Proteome normalized similarity", "Proteome similarity threshold normalized by proteome size", typeid(float), (void *) &proteomeNormalizedSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN), + PARAM_PROTEOME_RELATIVE_SIMILARITY(PARAM_PROTEOME_RELATIVE_SIMILARITY_ID, "--proteome-relative-similarity", "Proteome relative similarity", "Proteome relative similarity threshold normalized by proteome size", typeid(float), (void *) &proteomeNormalizedSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN), #ifdef GAP_POS_SCORING PARAM_GAP_PSEUDOCOUNT(PARAM_GAP_PSEUDOCOUNT_ID, "--gap-pc", "Gap pseudo count", "Pseudo count for calculating position-specific gap opening penalties", typeid(int), &gapPseudoCount, "^[0-9]+$", MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_EXPERT), #endif diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index 9faddd439..5ddaaceb2 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -425,7 +425,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){ gapOpen = par.gapOpen.values.aminoacid(); gapExtend = par.gapExtend.values.aminoacid(); EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), &subMat, gapOpen, gapExtend); - Debug(Debug::INFO) << "Initialization"; + Debug(Debug::INFO) << "Initialization "; Timer timer; #pragma omp parallel { From 44a6fcab00bdc696526f615d85c7e9bf6c70ec3d Mon Sep 17 00:00:00 2001 From: imgyuri Date: Mon, 9 Sep 2024 22:23:42 +0900 Subject: [PATCH 26/31] latest version-bugfix,relativeProteomeSimilarity added, singleton proteome handling update --- src/util/proteomecluster.cpp | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index 5ddaaceb2..a88511978 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -65,7 +65,6 @@ struct ProteomeEntry{ } static bool compareByKey(const ProteomeEntry& a, const ProteomeEntry& b) { - if (a.repProtKey < b.repProtKey){ return true; } @@ -134,7 +133,7 @@ void initLocalClusterReps(size_t& id, std::vector& localClusterRep } if (memberKeys.size() <= 1) { - return; //singleton cluster (1 member only) + return; //singleton protein member cluster (1 member only) } ClusterEntry eachClusterRep(memberKeys.size()); @@ -154,7 +153,7 @@ void initLocalClusterReps(size_t& id, std::vector& localClusterRep } if (uniqueProteomeCount <= 1) { - return; //singleton cluster (geneDuplication_1 proteome only with multiple members) + return; //singleton proteome cluster (geneDuplication_1 proteome only with multiple members, exclude cloud genome) } localClusterReps.push_back(eachClusterRep); @@ -205,12 +204,12 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt unsigned int lastqLen = 0; unsigned int qproteinKey = 0; unsigned int qproteomeKey =0; - //find Rep + //find representative query protein which has the longest sequence length for (auto& eachMember : clusterRep.memberProteins){ if (eachMember.proteomeKey == RepProteomeId) { isRepFound = true; const unsigned int queryId = eachMember.proteinKey; - if( lastqLen < tProteinDB.getSeqLen(queryId)){ + if( lastqLen < tProteinDB.getSeqLen(queryId)) { lastqLen = tProteinDB.getSeqLen(queryId); qproteinKey = eachMember.proteinKey; qproteomeKey = eachMember.proteomeKey; @@ -226,12 +225,13 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt matcher.initQuery(&query); const unsigned int queryProteomeLength = proteomeList[qproteomeKey].proteomeAALen; - // representative protein :same query and target (for createtsv) + // Alignment by representative protein itself : same query and target (for createtsv) Matcher::result_t rep_reult = matcher.getSWResult(&query, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, true); size_t rep_len = Matcher::resultToBuffer(buffer, rep_reult, par.addBacktrace); proteinClustWriter.writeAdd(buffer, rep_len, thread_idx); localSeqIds[qproteomeKey] += rep_reult.getSeqId()*query.L; + //Alignment by representative protein and other proteins in the cluster for (auto& eachTargetMember : clusterRep.memberProteins){ if (eachTargetMember.proteomeKey == RepProteomeId) { continue; @@ -256,7 +256,7 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt Matcher::result_t result = matcher.getSWResult(&target, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, isIdentity); if (Alignment::checkCriteria(result, isIdentity, par.evalThr, par.seqIdThr, par.alnLenThr, par.covMode, par.covThr)) { - if (query.L >= target.L*0.9){ + if (query.L >= target.L*0.9){ // -s2 lenght difference parameter in cd-hit-2d size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace); proteinClustWriter.writeAdd(buffer, len, thread_idx); localSeqIds[tproteomeKey] += result.getSeqId()*target.L; @@ -292,13 +292,11 @@ void updateClusterReps(ClusterEntry& clusterRep, std::vector& pro std::vector isProteomeInCluster(localProteomeCount.size(), 0); if (clusterRep.isAvailable) { unsigned int notCoveredMemberCount = 0; - // unsigned int lastProteomeKey = 0; //gg : unused unsigned int uniqueProteomeCount = 0; //update cluster member info for (auto& eachMember : clusterRep.memberProteins) { if (proteomeList[eachMember.proteomeKey].isCovered() == false) { notCoveredMemberCount++; - // lastProteomeKey = eachMember.proteomeKey;// gg : unused if (isProteomeInCluster[eachMember.proteomeKey] == 0) { isProteomeInCluster[eachMember.proteomeKey] = 1; uniqueProteomeCount++; @@ -314,7 +312,7 @@ void updateClusterReps(ClusterEntry& clusterRep, std::vector& pro } - //update localProteomeCount + //update localProteomeCount -> update clusterCount for next repProteome finding if (clusterRep.isAvailable) { for (size_t i=0; i < localProteomeCount.size(); i++) { if (isProteomeInCluster[i]) { @@ -340,7 +338,6 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector Date: Tue, 10 Sep 2024 20:50:19 +0900 Subject: [PATCH 27/31] Change Relative Score formula --- src/alignment/Matcher.h | 3 +++ src/commons/Parameters.cpp | 4 ++-- src/commons/Parameters.h | 2 +- src/util/proteomecluster.cpp | 35 +++++++++++++++++++++------- src/workflow/EasyProteomecluster.cpp | 2 +- 5 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/alignment/Matcher.h b/src/alignment/Matcher.h index 6e1d0ab8f..90185de63 100644 --- a/src/alignment/Matcher.h +++ b/src/alignment/Matcher.h @@ -144,6 +144,9 @@ class Matcher{ float getSeqId(){ return seqId; } + unsigned int getAlnLength(){ + return alnLength; + } }; Matcher(int querySeqType, int targetSeqType, int maxSeqLen, BaseMatrix *m, diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 860bd5abb..fe35ab4c2 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -87,7 +87,7 @@ Parameters::Parameters(): PARAM_GAP_OPEN(PARAM_GAP_OPEN_ID, "--gap-open", "Gap open cost", "Gap open cost", typeid(MultiParam>), (void *) &gapOpen, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT), PARAM_GAP_EXTEND(PARAM_GAP_EXTEND_ID, "--gap-extend", "Gap extension cost", "Gap extension cost", typeid(MultiParam>), (void *) &gapExtend, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT), PARAM_PROTEOME_SIMILARITY(PARAM_PROTEOME_SIMILARITY_ID, "--proteome-similarity", "Proteome similarity", "Proteome similarity threshold", typeid(float), (void *) &proteomeSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN), - PARAM_PROTEOME_RELATIVE_SIMILARITY(PARAM_PROTEOME_RELATIVE_SIMILARITY_ID, "--proteome-relative-similarity", "Proteome relative similarity", "Proteome relative similarity threshold normalized by proteome size", typeid(float), (void *) &proteomeNormalizedSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN), + PARAM_PROTEOME_RELATIVE_SIMILARITY(PARAM_PROTEOME_RELATIVE_SIMILARITY_ID, "--proteome-relative-similarity", "Proteome relative similarity", "Proteome relative similarity threshold normalized by proteome size", typeid(float), (void *) &proteomeRelativeSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN), #ifdef GAP_POS_SCORING PARAM_GAP_PSEUDOCOUNT(PARAM_GAP_PSEUDOCOUNT_ID, "--gap-pc", "Gap pseudo count", "Pseudo count for calculating position-specific gap opening penalties", typeid(int), &gapPseudoCount, "^[0-9]+$", MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_EXPERT), #endif @@ -2357,7 +2357,7 @@ void Parameters::setDefaults() { maxAccept = INT_MAX; seqIdThr = 0.0; proteomeSimThr = 0.9; - proteomeNormalizedSimThr = 0.9; + proteomeRelativeSimThr = 0.9; alnLenThr = 0; altAlignment = 0; gapOpen = MultiParam>(NuclAA(11, 5)); diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index a19fcfa52..0d4b3eee3 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -436,7 +436,7 @@ class Parameters { int altAlignment; // show up to this many alternative alignments float seqIdThr; // sequence identity threshold for acceptance float proteomeSimThr; // proteome similarity threshold for acceptance - float proteomeNormalizedSimThr; // proteome similarity threshold normalized by proteome size for acceptance + float proteomeRelativeSimThr; // proteome similarity threshold normalized by proteome size for acceptance int alnLenThr; // min. alignment length bool addBacktrace; // store backtrace string (M=Match, D=deletion, I=insertion) bool realign; // realign hit with more conservative score diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index a88511978..cf54a34f1 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -21,10 +21,11 @@ struct ProteomeEntry{ float protSimScore; unsigned int clusterCount; float totalSeqId; + unsigned int AAMatchCount; float relativeSimScore; - ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int clusterCount = 0, float seqId = 0.0f) - : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), clusterCount(clusterCount), totalSeqId(seqId) {} + ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int clusterCount = 0, float seqId = 0.0f, unsigned int matchCount = 0, float relSimScore = 0.0f) + : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), clusterCount(clusterCount), totalSeqId(seqId), AAMatchCount(matchCount), relativeSimScore(relSimScore) {} int getRepProtKey() { return repProtKey; @@ -40,6 +41,7 @@ struct ProteomeEntry{ } void computeRedundancy(unsigned int repProteomeSize) { protSimScore = totalSeqId / proteomeAALen; + relativeSimScore = static_cast (AAMatchCount * 2) / static_cast (repProteomeSize + proteomeAALen); relativeSimScore = (totalSeqId * 2)/ (repProteomeSize + proteomeAALen); if (relativeSimScore >= 1.0) { relativeSimScore = 1.0; @@ -56,6 +58,7 @@ struct ProteomeEntry{ totalSeqId = 0.0f; relativeSimScore = 0.0f; protSimScore = 0.0f; + AAMatchCount = 0; } float getProtSimScore() { return protSimScore; @@ -198,7 +201,7 @@ bool FindNextRepProteome(std::vector& proteomeList, unsigned int& } } -void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector& proteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector& localSeqIds, DBWriter& proteinClustWriter) { +void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector& proteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector& localSeqIds, std::vector& localMatchCount, DBWriter& proteinClustWriter) { char buffer[1024]; bool isRepFound = false; unsigned int lastqLen = 0; @@ -226,10 +229,12 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt const unsigned int queryProteomeLength = proteomeList[qproteomeKey].proteomeAALen; // Alignment by representative protein itself : same query and target (for createtsv) - Matcher::result_t rep_reult = matcher.getSWResult(&query, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, true); - size_t rep_len = Matcher::resultToBuffer(buffer, rep_reult, par.addBacktrace); + Matcher::result_t rep_result = matcher.getSWResult(&query, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, true); + size_t rep_len = Matcher::resultToBuffer(buffer, rep_result, par.addBacktrace); proteinClustWriter.writeAdd(buffer, rep_len, thread_idx); - localSeqIds[qproteomeKey] += rep_reult.getSeqId()*query.L; + localSeqIds[qproteomeKey] += rep_result.getSeqId()*query.L; + localMatchCount[qproteomeKey] += static_cast (rep_result.getSeqId() * static_cast (rep_result.getAlnLength()) + 0.5); + //Alignment by representative protein and other proteins in the cluster for (auto& eachTargetMember : clusterRep.memberProteins){ @@ -259,7 +264,11 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt if (query.L >= target.L*0.9){ // -s2 lenght difference parameter in cd-hit-2d size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace); proteinClustWriter.writeAdd(buffer, len, thread_idx); - localSeqIds[tproteomeKey] += result.getSeqId()*target.L; + // localSeqIds[tproteomeKey] += result.getSeqId()*target.L; + localSeqIds[tproteomeKey] += result.getSeqId() * target.L; + // localMatchCount[tproteomeKey] += static_cast ((result.getSeqId() * static_cast (result.getAlnLength())) + 0.5); + unsigned int matchCount = static_cast ((result.getSeqId() * static_cast (result.getAlnLength())) + 0.5); + localMatchCount[tproteomeKey] += matchCount; } } } @@ -277,7 +286,7 @@ void updateProteomeRedundancyInfo(std::vector& proteomeList, cons proteomeList[i].relativeSimScore = 1.0; }else{ proteomeList[i].computeRedundancy(repProteomeAASize); - if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr && proteomeList[i].getrelativeSimScore() >= par.proteomeNormalizedSimThr){ + if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr && proteomeList[i].getrelativeSimScore() >= par.proteomeRelativeSimThr){ proteomeList[i].repProtKey = RepProteomeId; }else{ proteomeList[i].resetProteomeInfo(); @@ -466,10 +475,11 @@ int proteomecluster(int argc, const char **argv, const Command &command){ Sequence query(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection); Sequence target(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection); std::vector localSeqIds(proteomeList.size(), 0.0f); + std::vector localMatchCount(proteomeList.size(), 0); #pragma omp for schedule(dynamic, 1) for (size_t i = 0; i < clusterReps.size(); i++) { if (clusterReps[i].isAvailable) { - runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, proteomeList, par, thread_idx, swMode, localSeqIds, proteinClustWriter); + runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, proteomeList, par, thread_idx, swMode, localSeqIds, localMatchCount, proteinClustWriter); } } @@ -478,6 +488,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){ for (size_t idx = 0; idx < proteomeList.size(); idx++) { if (proteomeList[idx].isCovered() == false) { proteomeList[idx].addSeqId(localSeqIds[idx]); + proteomeList[idx].AAMatchCount += localMatchCount[idx]; } } } @@ -501,6 +512,12 @@ int proteomecluster(int argc, const char **argv, const Command &command){ } } Debug(Debug::INFO) << timer.lap() << "\n"; + + + for (size_t i=0; i < proteomeList.size(); i++) { + Debug(Debug::INFO) << "Proteome: " << proteomeList[i].proteomeKey << " relativeScore " << proteomeList[i].relativeSimScore << "\n"; + } + //sort proteomeList by repProtKey SORT_PARALLEL(proteomeList.begin(), proteomeList.end(), ProteomeEntry::compareByKey); //write _proteome_cluster diff --git a/src/workflow/EasyProteomecluster.cpp b/src/workflow/EasyProteomecluster.cpp index d7e7509f9..c8e60e18e 100644 --- a/src/workflow/EasyProteomecluster.cpp +++ b/src/workflow/EasyProteomecluster.cpp @@ -11,7 +11,7 @@ void setEasyproteomeclusterDefaults(Parameters *p) { p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; p->proteomeSimThr = 0.9; - p->proteomeNormalizedSimThr = 0.0; + p->proteomeRelativeSimThr = 0.0; } void setEasyproteomeclusterMustPassAlong(Parameters *p){ From e24b9e3c6d855c8482e2c688c24492d3d34189c9 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Tue, 10 Sep 2024 21:06:50 +0900 Subject: [PATCH 28/31] Remove Debug printout --- src/util/proteomecluster.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index cf54a34f1..3c441426d 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -513,11 +513,6 @@ int proteomecluster(int argc, const char **argv, const Command &command){ } Debug(Debug::INFO) << timer.lap() << "\n"; - - for (size_t i=0; i < proteomeList.size(); i++) { - Debug(Debug::INFO) << "Proteome: " << proteomeList[i].proteomeKey << " relativeScore " << proteomeList[i].relativeSimScore << "\n"; - } - //sort proteomeList by repProtKey SORT_PARALLEL(proteomeList.begin(), proteomeList.end(), ProteomeEntry::compareByKey); //write _proteome_cluster From ad1acdf13d221eb95b9a31f2401d619c85a150fa Mon Sep 17 00:00:00 2001 From: imgyuri Date: Tue, 10 Sep 2024 21:54:58 +0900 Subject: [PATCH 29/31] style change --- src/util/proteomecluster.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index 3c441426d..45589ca01 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -66,7 +66,6 @@ struct ProteomeEntry{ float getrelativeSimScore() { return relativeSimScore; } - static bool compareByKey(const ProteomeEntry& a, const ProteomeEntry& b) { if (a.repProtKey < b.repProtKey){ return true; @@ -212,7 +211,7 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt if (eachMember.proteomeKey == RepProteomeId) { isRepFound = true; const unsigned int queryId = eachMember.proteinKey; - if( lastqLen < tProteinDB.getSeqLen(queryId)) { + if (lastqLen < tProteinDB.getSeqLen(queryId)) { lastqLen = tProteinDB.getSeqLen(queryId); qproteinKey = eachMember.proteinKey; qproteomeKey = eachMember.proteomeKey; @@ -261,12 +260,10 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt Matcher::result_t result = matcher.getSWResult(&target, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, isIdentity); if (Alignment::checkCriteria(result, isIdentity, par.evalThr, par.seqIdThr, par.alnLenThr, par.covMode, par.covThr)) { - if (query.L >= target.L*0.9){ // -s2 lenght difference parameter in cd-hit-2d + if (query.L >= target.L*0.9) { // -s2 lenght difference parameter in cd-hit-2d size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace); proteinClustWriter.writeAdd(buffer, len, thread_idx); - // localSeqIds[tproteomeKey] += result.getSeqId()*target.L; localSeqIds[tproteomeKey] += result.getSeqId() * target.L; - // localMatchCount[tproteomeKey] += static_cast ((result.getSeqId() * static_cast (result.getAlnLength())) + 0.5); unsigned int matchCount = static_cast ((result.getSeqId() * static_cast (result.getAlnLength())) + 0.5); localMatchCount[tproteomeKey] += matchCount; } @@ -351,7 +348,6 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector Date: Tue, 10 Sep 2024 22:50:58 +0900 Subject: [PATCH 30/31] Latest Version --- src/util/proteomecluster.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp index 45589ca01..28859a0cc 100644 --- a/src/util/proteomecluster.cpp +++ b/src/util/proteomecluster.cpp @@ -42,7 +42,6 @@ struct ProteomeEntry{ void computeRedundancy(unsigned int repProteomeSize) { protSimScore = totalSeqId / proteomeAALen; relativeSimScore = static_cast (AAMatchCount * 2) / static_cast (repProteomeSize + proteomeAALen); - relativeSimScore = (totalSeqId * 2)/ (repProteomeSize + proteomeAALen); if (relativeSimScore >= 1.0) { relativeSimScore = 1.0; } From d9213ef1bbad70acc4f5f39797da82547d7b83e2 Mon Sep 17 00:00:00 2001 From: imgyuri Date: Thu, 19 Sep 2024 13:03:25 +0900 Subject: [PATCH 31/31] Add cluster count report --- data/workflow/easyproteomecluster.sh | 8 +++++- src/MMseqsBase.cpp | 7 +++-- src/util/proteomecluster.cpp | 42 ++++++++++++++++++++++++++-- 3 files changed, 50 insertions(+), 7 deletions(-) diff --git a/data/workflow/easyproteomecluster.sh b/data/workflow/easyproteomecluster.sh index 2bab198fc..3a8bc85bd 100644 --- a/data/workflow/easyproteomecluster.sh +++ b/data/workflow/easyproteomecluster.sh @@ -29,7 +29,7 @@ fi if notExists "${TMP_PATH}/aln.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" ${PROTEOMECLUSTER_PAR} \ + "$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" "${TMP_PATH}/cluster_count" ${PROTEOMECLUSTER_PAR} \ || fail "Convert Alignments died" fi @@ -45,6 +45,12 @@ if notExists "${RESULTS}_proteome_cluster.tsv"; then || fail "createtsv proteome cluster died" fi +if notExists "${RESULTS}_cluster_count.tsv"; then + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/cluster_count" "${RESULTS}_cluster_count.tsv" ${THREADS_PAR} \ + || fail "createtsv proteome cluster count report died" +fi + if [ -n "${REMOVE_TMP}" ]; then # shellcheck disable=SC2086 diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index cccc9ce94..84bf73726 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -656,11 +656,12 @@ std::vector baseCommands = { "Proteome clustering and alignment", NULL, "Martin Steinegger & Gyuri Kim ", - " ", + " & proteomeList, DBReader< } } -void initLocalClusterReps(size_t& id, std::vector& localClusterReps, std::vector& localProteomeCount, DBReader& tProteinDB, DBReader& linResDB, unsigned int thread_idx){ +void initLocalClusterReps(size_t& id, std::vector& localClusterReps, std::vector& localProteomeCount, DBReader& tProteinDB, DBReader& linResDB, unsigned int thread_idx) { std::vector memberKeys; memberKeys.reserve(50); // store key for every protein in a cluster @@ -375,6 +375,23 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector::setExtendedDbtype(Parameters::DBTYPE_GENERIC_DB, Parameters::DBTYPE_EXTENDED_SRC_IDENTIFIER); DBWriter proteomeClustWriter(par.db4.c_str(), par.db4Index.c_str(), 1, par.compressed, proteomeDBType); proteomeClustWriter.open(); + DBWriter clusterCountWriter(par.db5.c_str(), par.db5Index.c_str(), 1, par.compressed, proteomeDBType); + clusterCountWriter.open(); std::vector clusterReps; @@ -428,6 +447,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){ EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), &subMat, gapOpen, gapExtend); Debug(Debug::INFO) << "Initialization "; Timer timer; + #pragma omp parallel { unsigned int thread_idx = 0; @@ -436,7 +456,6 @@ int proteomecluster(int argc, const char **argv, const Command &command){ #endif std::vector localClusterReps; std::vector localProteomeCount(proteomeList.size(), 0); - #pragma omp for schedule(dynamic, 1) for (size_t id = 0; id < linResDB.getSize(); id++) { initLocalClusterReps(id, localClusterReps, localProteomeCount, tProteinDB, linResDB, thread_idx); @@ -452,10 +471,26 @@ int proteomecluster(int argc, const char **argv, const Command &command){ std::make_move_iterator(localClusterReps.begin()), std::make_move_iterator(localClusterReps.end())); } - + } Debug(Debug::INFO) << timer.lap() << "\n"; + unsigned int totalClusterCount = clusterReps.size(); + Debug(Debug::INFO) << "Total Cluster Count : " << totalClusterCount << "\n"; + for (size_t idx=0; idx < proteomeList.size(); idx++){ + float clusterCountRatio = static_cast (proteomeList[idx].clusterCount) / static_cast (totalClusterCount); + + char proteomeBuffer[1024]; + clusterCountWriter.writeStart(); + char *basePos = proteomeBuffer; + char *tmpProteomeBuffer = Itoa::i32toa_sse2(proteomeList[idx].clusterCount, proteomeBuffer); + *(tmpProteomeBuffer - 1) = '\t'; + tmpProteomeBuffer = fastfloatToBuffer(clusterCountRatio, tmpProteomeBuffer); + *(tmpProteomeBuffer - 1) = '\n'; + clusterCountWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos); + clusterCountWriter.writeEnd(proteomeList[idx].proteomeKey); + } + unsigned int RepProteomeId = -1; Debug(Debug::INFO) << "Proteome Clustering "; timer.reset(); @@ -515,6 +550,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){ proteomeClustWriter.close(); proteinClustWriter.close(); + clusterCountWriter.close(); tProteinDB.close(); linResDB.close(); return EXIT_SUCCESS;