From 1cb20742c4a4554cc9d55b578a54d25d40c06528 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Fri, 16 Aug 2024 21:34:03 +0900
Subject: [PATCH 01/31] alignproteome added

---
 src/CommandDeclarations.h      |   1 +
 src/MMseqsBase.cpp             |  10 +
 src/alignment/Matcher.h        |   6 +-
 src/commons/CMakeLists.txt     |   2 +
 src/commons/ClusterSpecies.cpp |  32 +++
 src/commons/ClusterSpecies.h   |  42 ++++
 src/commons/DBReader.cpp       |  73 ++++++
 src/commons/DBReader.h         |  15 +-
 src/commons/DBWriter.cpp       |   1 +
 src/commons/Parameters.cpp     |  41 ++-
 src/commons/Parameters.h       |   3 +
 src/util/CMakeLists.txt        |   1 +
 src/util/alignall.cpp          |   6 +-
 src/util/alignproteome.cpp     | 443 +++++++++++++++++++++++++++++++++
 14 files changed, 671 insertions(+), 5 deletions(-)
 create mode 100644 src/commons/ClusterSpecies.cpp
 create mode 100644 src/commons/ClusterSpecies.h
 create mode 100644 src/util/alignproteome.cpp
diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h
index 3c4abb4e0..369034025 100644
--- a/src/CommandDeclarations.h
+++ b/src/CommandDeclarations.h
@@ -4,6 +4,7 @@
 
 extern int align(int argc, const char **argv, const Command& command);
 extern int alignall(int argc, const char **argv, const Command& command);
+extern int alignproteome(int argc, const char **argv, const Command& command);
 extern int alignbykmer(int argc, const char **argv, const Command& command);
 extern int appenddbtoindex(int argc, const char **argv, const Command& command);
 extern int apply(int argc, const char **argv, const Command& command);
diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp
index 720aeaccb..e91d6d0a6 100644
--- a/src/MMseqsBase.cpp
+++ b/src/MMseqsBase.cpp
@@ -632,6 +632,15 @@ std::vector<Command> baseCommands = {
                                                            {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
                                                            {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
                                                            {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+        {"alignproteome",             alignproteome,             &par.alignproteome,             COMMAND_ALIGNMENT,
+                "Within-result all-vs-all gapped local alignment",
+                NULL,
+                "Martin Steinegger <martin.steinegger@snu.ac.kr>",
+                "<i:sequenceDB> <i:resultDB> <o:alignmentDB>",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                                           {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                                           {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
         {"alignall",             alignall,             &par.alignall,             COMMAND_ALIGNMENT,
                 "Within-result all-vs-all gapped local alignment",
                 NULL,
@@ -640,6 +649,7 @@ std::vector<Command> baseCommands = {
                 CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
                                                            {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
                                                            {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+                                                           
         {"transitivealign",      transitivealign,      &par.align,                COMMAND_ALIGNMENT,
                 "Transfer alignments via transitivity",
                 //"Infer the alignment A->C via B, B being the center sequence and A,C each pairwise aligned against B",
diff --git a/src/alignment/Matcher.h b/src/alignment/Matcher.h
index a9c28beb4..a89ae9c5a 100644
--- a/src/alignment/Matcher.h
+++ b/src/alignment/Matcher.h
@@ -140,6 +140,10 @@ class Matcher{
                 }
             }
         }
+
+        float getSeqId(){
+            return seqId;
+        }
     };
 
     Matcher(int querySeqType, int targetSeqType, int maxSeqLen, BaseMatrix *m,
@@ -217,7 +221,7 @@ class Matcher{
 
 
     static size_t resultToBuffer(char * buffer, const result_t &result, bool addBacktrace, bool compress  = true, bool addOrfPosition = false);
-
+    static size_t resultToBuffer_str(char * buffer, const result_t &result, bool addBacktrace, bool compress, bool addOrfPosition=false);
     static int computeAlnLength(int anEnd, int start, int dbEnd, int dbStart);
 
     static void updateResultByRescoringBacktrace(const char *querySeq, const char *targetSeq, const char **subMat, EvalueComputation &evaluer,
diff --git a/src/commons/CMakeLists.txt b/src/commons/CMakeLists.txt
index dc7ab3e74..a58b5322a 100644
--- a/src/commons/CMakeLists.txt
+++ b/src/commons/CMakeLists.txt
@@ -8,6 +8,7 @@ set(commons_header_files
         commons/Command.h
         commons/CommandCaller.h
         commons/Concat.h
+        # commons/ClusterSpecies.h
         commons/DBConcat.h
         commons/DBReader.h
         commons/DBWriter.h
@@ -52,6 +53,7 @@ set(commons_source_files
         commons/BaseMatrix.cpp
         commons/Command.cpp
         commons/CommandCaller.cpp
+        # commons/ClusterSpecies.cpp
         commons/DBConcat.cpp
         commons/DBReader.cpp
         commons/DBWriter.cpp
diff --git a/src/commons/ClusterSpecies.cpp b/src/commons/ClusterSpecies.cpp
new file mode 100644
index 000000000..3216cc210
--- /dev/null
+++ b/src/commons/ClusterSpecies.cpp
@@ -0,0 +1,32 @@
+#include "ClusterSpecies.h"
+void ClusterSpecies::addProtein(unsigned int speciesId, unsigned int clusterRepId, unsigned int proteinId) {
+   clusterEntriesPerSpecies[speciesId][clusterRepId].push_back(proteinId);
+//    speciesClusterMemCounts[speciesId]++;
+}
+
+
+void ClusterSpecies::removeSpecies(unsigned int speciesId) {
+    clusterEntriesPerSpecies.erase(speciesId);
+    // speciesClusterMemCounts.erase(speciesId);
+}
+
+int ClusterSpecies::findRepSpecies(){
+    //find max entry in speciesClusterMemCounts
+    unsigned int maxSpeciesId = 0;
+    size_t maxSpeciesSize = 0;
+    for (auto const& species : clusterEntriesPerSpecies){
+        size_t speiciesSize = species.second.size();
+        if (speiciesSize > maxSpeciesSize){
+            maxSpeciesId = species.first;
+        }
+    }
+
+    return maxSpeciesId;
+    // for (auto const& x : speciesClusterMemCounts)
+    // {
+    //     if (x.second > maxSpeciesCount){
+    //         maxSpeciesCount = x.second;
+    //         maxSpeciesId = x.first;
+    //     }
+    // }
+}
\ No newline at end of file
diff --git a/src/commons/ClusterSpecies.h b/src/commons/ClusterSpecies.h
new file mode 100644
index 000000000..96b1e8688
--- /dev/null
+++ b/src/commons/ClusterSpecies.h
@@ -0,0 +1,42 @@
+#ifndef CLUSTER_SPECIES_H
+#define CLUSTER_SPECIES_H
+
+#include <vector>
+#include <unordered_map>
+
+
+class ClusterSpecies {
+public:
+    
+    // std::unordered_map<unsigned int, std::unordered_map<unsigned int, std::vector<unsigned int>>> clusterEntriesPerSpecies;
+    // // std::unordered_map<unsigned int, size_t> speciesClusterMemCounts;
+    // void addProtein(unsigned int speciesId, unsigned int clusterRepId, unsigned int proteinId);
+    // void removeSpecies(unsigned int speciesId);
+    // int findRepSpecies();
+
+    // template <typename T>
+    // struct ProteomeLength{
+    //     T id;
+    //     unsigned int length;
+    // };
+    
+    // std::vector<ProteomeLength<unsigned int>> proteomeAAToatalLength;
+
+    // void addProteomeLength(unsigned int id, unsigned int length){
+    //     ProteomeLength<unsigned int> pl;
+    //     pl.id = id;
+    //     pl.length = length;
+    //     proteomeAAToatalLength.push_back(pl);
+    // }
+
+    // unsigned int getProteomeLength(unsigned int id){
+    //     for (auto const& pl : proteomeAAToatalLength){
+    //         if (pl.id == id){
+    //             return pl.length;
+    //         }
+    //     }
+    //     return 0;
+    // }
+};
+
+#endif //CLUSTER_SPECIES_H
\ No newline at end of file
diff --git a/src/commons/DBReader.cpp b/src/commons/DBReader.cpp
index f52d2369c..70216cedd 100644
--- a/src/commons/DBReader.cpp
+++ b/src/commons/DBReader.cpp
@@ -112,6 +112,7 @@ template <typename T> bool DBReader<T>::open(int accessType){
         }
         totalDataSize = 0;
         dataFileCnt = dataFileNames.size();
+
         dataSizeOffset = new size_t[dataFileNames.size() + 1];
         dataFiles = new char*[dataFileNames.size()];
         for(size_t fileIdx = 0; fileIdx < dataFileNames.size(); fileIdx++){
@@ -136,6 +137,8 @@ template <typename T> bool DBReader<T>::open(int accessType){
         }
     }
     if (dataMode & USE_LOOKUP || dataMode & USE_LOOKUP_REV) {
+        Debug(Debug::INFO) << "ReadLookup file: " << dataFileName << "\n"; //gyuri
+
         std::string lookupFilename = (std::string(dataFileName) + ".lookup");
         MemoryMapped lookupData(lookupFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
         if (lookupData.isValid() == false) {
@@ -144,7 +147,9 @@ template <typename T> bool DBReader<T>::open(int accessType){
         }
         char* lookupDataChar = (char *) lookupData.getData();
         size_t lookupDataSize = lookupData.size();
+        Debug(Debug::INFO) << "Lookup Data size is " << lookupDataSize << "\n"; //gyuri
         lookupSize = Util::ompCountLines(lookupDataChar, lookupDataSize, threads);
+        Debug(Debug::INFO) << "Lookup size is " << lookupSize << "\n"; //gyuri
         lookup = new(std::nothrow) LookupEntry[this->lookupSize];
         incrementMemory(sizeof(LookupEntry) * this->lookupSize);
         readLookup(lookupDataChar, lookupDataSize, lookup);
@@ -155,6 +160,22 @@ template <typename T> bool DBReader<T>::open(int accessType){
         }
         lookupData.close();
     }
+    if (dataMode & USE_SOURCE){
+        std::string sourceFilename = (std::string(dataFileName) + ".source");
+        MemoryMapped sourceData(sourceFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
+        if (sourceData.isValid() == false) {
+            Debug(Debug::ERROR) << "Cannot open source file " << sourceFilename << "!\n";
+            EXIT(EXIT_FAILURE);
+        }
+        char* sourceDataChar = (char *) sourceData.getData();
+        size_t sourceDataSize = sourceData.size();
+        sourceSize = Util::ompCountLines(sourceDataChar, sourceDataSize, threads);
+        source = new(std::nothrow) SourceEntry[this->sourceSize];
+        incrementMemory(sizeof(SourceEntry) * this->sourceSize);
+        readSource(sourceDataChar, sourceDataSize, source);
+
+    }
+
     bool isSortedById = false;
     if (externalData == false) {
         MemoryMapped indexData(indexFileName, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
@@ -707,6 +728,15 @@ template <typename T> unsigned int DBReader<T>::getLookupFileNumber(size_t id){
     return lookup[id].fileNumber;
 }
 
+template <typename T> std::string DBReader<T>::getSourceFileName (size_t id){
+    if (id >= sourceSize){
+        Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n";
+        Debug(Debug::ERROR) << "getSourceFileName: local id (" << id << ") >= db size (" << sourceSize << ")\n";
+        EXIT(EXIT_FAILURE);
+    }
+    return source[id].fileName;
+}
+
 template<>
 void DBReader<unsigned int>::lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry) {
     buffer.append(SSTR(entry.id));
@@ -995,6 +1025,19 @@ size_t DBReader<T>::getOffset(size_t id) {
     return index[id].offset;
 }
 
+template<typename T>
+size_t DBReader<T>::getIndexLen(size_t id) {
+    if (id >= size){
+        Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << indexFileName << "\n";
+        Debug(Debug::ERROR) << "getOffset: local id (" << id << ") >= db size (" << size << ")\n";
+        EXIT(EXIT_FAILURE);
+    }
+    if (local2id != NULL) {
+        id = local2id[id];
+    }
+    return index[id].length;
+}
+
 template<typename T>
 size_t DBReader<T>::findNextOffsetid(size_t id) {
     size_t idOffset = getOffset(id);
@@ -1048,6 +1091,36 @@ void DBReader<T>::readLookup(char *data, size_t dataSize, DBReader::LookupEntry
         currPos = lookupData - (char *) data;
 
         i++;
+
+        
+    }
+}
+
+template <typename T>
+void DBReader<T>::readSource(char *data, size_t dataSize, DBReader::SourceEntry *source) {
+    size_t i=0;
+    size_t currPos = 0;
+    char* sourceData = (char *) data;
+    const char * cols[3];
+    while (currPos < dataSize){
+        if (i >= this->sourceSize) {
+            Debug(Debug::ERROR) << "Corrupt memory, too many entries!\n";
+            EXIT(EXIT_FAILURE);
+        }
+        Util::getFieldsOfLine(sourceData, cols, 3);
+        source[i].id = Util::fast_atoi<size_t>(cols[0]);
+        std::string fileName = std::string(cols[1], (cols[2] - cols[1]));
+        size_t lastDotPosition = fileName.rfind('.');
+
+        if (lastDotPosition != std::string::npos) {
+            fileName = fileName.substr(0, lastDotPosition);
+        }
+        source[i].fileName = fileName;
+        sourceData = Util::skipLine(sourceData);
+
+        currPos = sourceData - (char *) data;
+
+        i++;
     }
 }
 
diff --git a/src/commons/DBReader.h b/src/commons/DBReader.h
index 57589b1f6..198fa9ba5 100644
--- a/src/commons/DBReader.h
+++ b/src/commons/DBReader.h
@@ -98,7 +98,10 @@ class DBReader : public MemoryTracker {
             return false;
         }
     };
-
+    struct SourceEntry{
+        T id;
+        std::string fileName;
+    };
     struct LookupEntry {
         T id;
         std::string entryName;
@@ -184,6 +187,8 @@ class DBReader : public MemoryTracker {
 
     size_t getSize() const;
 
+    unsigned int getProteomeTotalLen(size_t id); //gyuri
+
     unsigned int getMaxSeqLen(){ 
             return (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_HMM_PROFILE ) ) ?
                     (std::max(maxSeqLen, 1u)) / Sequence::PROFILE_READIN_SIZE :
@@ -248,6 +253,7 @@ class DBReader : public MemoryTracker {
     void lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry);
     LookupEntry* getLookup() { return lookup; };
 
+    std::string getSourceFileName(size_t id);
     static const int NOSORT = 0;
     static const int SORT_BY_LENGTH = 1;
     static const int LINEAR_ACCCESS = 2;
@@ -266,6 +272,7 @@ class DBReader : public MemoryTracker {
     static const unsigned int USE_FREAD      = 4;
     static const unsigned int USE_LOOKUP     = 8;
     static const unsigned int USE_LOOKUP_REV = 16;
+    static const unsigned int USE_SOURCE     = 32;
 
 
     // compressed
@@ -309,6 +316,8 @@ class DBReader : public MemoryTracker {
 
     void readLookup(char *data, size_t dataSize, LookupEntry *lookup);
 
+    void readSource(char *data, size_t dataSize, SourceEntry *source);
+
     void readIndexId(T* id, char * line, const char** cols);
 
     unsigned int indexIdToNum(T* id);
@@ -437,6 +446,8 @@ class DBReader : public MemoryTracker {
 
     size_t findNextOffsetid(size_t id);
 
+    size_t getIndexLen(size_t id);
+
     int isCompressed(){
         return isCompressed(dbtype);
     }
@@ -485,7 +496,9 @@ class DBReader : public MemoryTracker {
 
     Index * index;
     size_t lookupSize;
+    size_t sourceSize;
     LookupEntry * lookup;
+    SourceEntry * source;
     bool sortedByOffset;
 
     unsigned int * id2local;
diff --git a/src/commons/DBWriter.cpp b/src/commons/DBWriter.cpp
index 9ae84bae1..36aac0eaf 100644
--- a/src/commons/DBWriter.cpp
+++ b/src/commons/DBWriter.cpp
@@ -283,6 +283,7 @@ size_t DBWriter::writeAdd(const char* data, size_t dataSize, unsigned int thrIdx
     }
     size_t totalWriten = 0;
     if(isCompressedDB && (state[thrIdx] == INIT_STATE || state[thrIdx] == COMPRESSED) ) {
+        Debug(Debug::INFO) << "Compressing data in thread " << thrIdx << "\n"; //gyuri
         state[thrIdx] = COMPRESSED;
         // zstd seems to have a hard time with elements < 60
         ZSTD_inBuffer input = { data, dataSize, 0 };
diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp
index afc56ba96..ab8ef3ad8 100644
--- a/src/commons/Parameters.cpp
+++ b/src/commons/Parameters.cpp
@@ -86,6 +86,8 @@ Parameters::Parameters():
         PARAM_ALT_ALIGNMENT(PARAM_ALT_ALIGNMENT_ID, "--alt-ali", "Alternative alignments", "Show up to this many alternative alignments", typeid(int), (void *) &altAlignment, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN),
         PARAM_GAP_OPEN(PARAM_GAP_OPEN_ID, "--gap-open", "Gap open cost", "Gap open cost", typeid(MultiParam<NuclAA<int>>), (void *) &gapOpen, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
         PARAM_GAP_EXTEND(PARAM_GAP_EXTEND_ID, "--gap-extend", "Gap extension cost", "Gap extension cost", typeid(MultiParam<NuclAA<int>>), (void *) &gapExtend, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_PROTEOME_SIMILARITY(PARAM_PROTEOME_SIMILARITY_ID, "--proteome-similarity", "Proteome similarity", "Proteome similarity threshold", typeid(float), (void *) &proteomeSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN),
+
 #ifdef GAP_POS_SCORING
         PARAM_GAP_PSEUDOCOUNT(PARAM_GAP_PSEUDOCOUNT_ID, "--gap-pc", "Gap pseudo count", "Pseudo count for calculating position-specific gap opening penalties", typeid(int), &gapPseudoCount, "^[0-9]+$", MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_EXPERT),
 #endif
@@ -338,6 +340,7 @@ Parameters::Parameters():
     alignall.push_back(&PARAM_SUB_MAT);
     alignall.push_back(&PARAM_ADD_BACKTRACE);
     alignall.push_back(&PARAM_ALIGNMENT_MODE);
+    alignall.push_back(&PARAM_REALIGN_SCORE_BIAS); //gyuri
 //    alignall.push_back(&PARAM_WRAPPED_SCORING);
     alignall.push_back(&PARAM_E);
     alignall.push_back(&PARAM_MIN_SEQ_ID);
@@ -350,7 +353,7 @@ Parameters::Parameters():
     alignall.push_back(&PARAM_NO_COMP_BIAS_CORR);
     alignall.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
 
-//    alignall.push_back(&PARAM_REALIGN);
+    alignall.push_back(&PARAM_REALIGN);
 //    alignall.push_back(&PARAM_MAX_REJECTED);
 //    alignall.push_back(&PARAM_MAX_ACCEPT);
     alignall.push_back(&PARAM_INCLUDE_IDENTITY);
@@ -365,6 +368,41 @@ Parameters::Parameters():
     alignall.push_back(&PARAM_COMPRESSED);
     alignall.push_back(&PARAM_V);
 
+    //alignproteome
+        // alignproteome
+    alignproteome.push_back(&PARAM_SUB_MAT);
+    alignproteome.push_back(&PARAM_ADD_BACKTRACE);
+    alignproteome.push_back(&PARAM_ALIGNMENT_MODE);
+    alignproteome.push_back(&PARAM_REALIGN_SCORE_BIAS); //gyuri
+//    alignproteome.push_back(&PARAM_WRAPPED_SCORING);
+    alignproteome.push_back(&PARAM_E);
+    alignproteome.push_back(&PARAM_MIN_SEQ_ID);
+    alignproteome.push_back(&PARAM_MIN_ALN_LEN);
+    alignproteome.push_back(&PARAM_SEQ_ID_MODE);
+//    alignproteome.push_back(&PARAM_ALT_ALIGNMENT);
+    alignproteome.push_back(&PARAM_C);
+    alignproteome.push_back(&PARAM_COV_MODE);
+    alignproteome.push_back(&PARAM_MAX_SEQ_LEN);
+    alignproteome.push_back(&PARAM_NO_COMP_BIAS_CORR);
+    alignproteome.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
+    alignproteome.push_back(&PARAM_PROTEOME_SIMILARITY);
+
+    alignproteome.push_back(&PARAM_REALIGN);
+//    alignproteome.push_back(&PARAM_MAX_REJECTED);
+//    alignproteome.push_back(&PARAM_MAX_ACCEPT);
+    alignproteome.push_back(&PARAM_INCLUDE_IDENTITY);
+    alignproteome.push_back(&PARAM_PRELOAD_MODE);
+    alignproteome.push_back(&PARAM_PCA);
+    alignproteome.push_back(&PARAM_PCB);
+    alignproteome.push_back(&PARAM_SCORE_BIAS);
+    alignproteome.push_back(&PARAM_GAP_OPEN);
+    alignproteome.push_back(&PARAM_GAP_EXTEND);
+    alignproteome.push_back(&PARAM_ZDROP);
+    alignproteome.push_back(&PARAM_THREADS);
+    alignproteome.push_back(&PARAM_COMPRESSED);
+    alignproteome.push_back(&PARAM_V);
+    
+
     // alignment
     align.push_back(&PARAM_SUB_MAT);
     align.push_back(&PARAM_ADD_BACKTRACE);
@@ -2315,6 +2353,7 @@ void Parameters::setDefaults() {
     maxRejected = INT_MAX;
     maxAccept   = INT_MAX;
     seqIdThr = 0.0;
+    proteomeSimThr = 0.0;
     alnLenThr = 0;
     altAlignment = 0;
     gapOpen = MultiParam<NuclAA<int>>(NuclAA<int>(11, 5));
diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h
index 745d2f809..838e0ed69 100644
--- a/src/commons/Parameters.h
+++ b/src/commons/Parameters.h
@@ -434,6 +434,7 @@ class Parameters {
     int    maxAccept;                    // after n accepted sequences stop
     int    altAlignment;                 // show up to this many alternative alignments
     float  seqIdThr;                     // sequence identity threshold for acceptance
+    float  proteomeSimThr;
     int    alnLenThr;                    // min. alignment length
     bool   addBacktrace;                 // store backtrace string (M=Match, D=deletion, I=insertion)
     bool   realign;                      // realign hit with more conservative score
@@ -787,6 +788,7 @@ class Parameters {
     PARAMETER(PARAM_ALT_ALIGNMENT)
     PARAMETER(PARAM_GAP_OPEN)
     PARAMETER(PARAM_GAP_EXTEND)
+    PARAMETER(PARAM_PROTEOME_SIMILARITY)
 #ifdef GAP_POS_SCORING
     PARAMETER(PARAM_GAP_PSEUDOCOUNT)
 #endif
@@ -1078,6 +1080,7 @@ class Parameters {
     std::vector<MMseqsParameter*> threadsandcompression;
 
     std::vector<MMseqsParameter*> alignall;
+    std::vector<MMseqsParameter*> alignproteome;
     std::vector<MMseqsParameter*> align;
     std::vector<MMseqsParameter*> rescorediagonal;
     std::vector<MMseqsParameter*> alignbykmer;
diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
index c43c3356d..35d72b32f 100644
--- a/src/util/CMakeLists.txt
+++ b/src/util/CMakeLists.txt
@@ -80,5 +80,6 @@ set(util_source_files
         util/proteinaln2nucl.cpp
         util/versionstring.cpp
         util/diskspaceavail.cpp
+        util/alignproteome.cpp
         PARENT_SCOPE
         )
diff --git a/src/util/alignall.cpp b/src/util/alignall.cpp
index 3a07596f6..3832ef9ee 100644
--- a/src/util/alignall.cpp
+++ b/src/util/alignall.cpp
@@ -27,7 +27,9 @@ int alignall(int argc, const char **argv, const Command &command) {
     }
     unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr);
 
-    DBReader<unsigned int> tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
+    // DBReader<unsigned int> tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
+    DBReader<unsigned int> tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_LOOKUP);
+
     tdbr.open(DBReader<unsigned int>::NOSORT);
     if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) {
         tdbr.readMmapedDataInMemory();
@@ -56,6 +58,7 @@ int alignall(int argc, const char **argv, const Command &command) {
     EvalueComputation evaluer(tdbr.getAminoAcidDBSize(), subMat, gapOpen, gapExtend);
     const size_t flushSize = 100000000;
     size_t iterations = static_cast<int>(ceil(static_cast<double>(dbr_res.getSize()) / static_cast<double>(flushSize)));
+    Debug(Debug::INFO) << "Number of iterations: " << iterations << " Size of linclust dbr_res : " <<  dbr_res.getSize() <<  '\n';
 
     for (size_t i = 0; i < iterations; i++) {
         size_t start = (i * flushSize);
@@ -80,7 +83,6 @@ int alignall(int argc, const char **argv, const Command &command) {
 #pragma omp for schedule(dynamic, 1)
             for (size_t id = start; id < (start + bucketSize); id++) {
                 progress.updateProgress();
-
                 const unsigned int key = dbr_res.getDbKey(id);
                 char *data = dbr_res.getData(id, thread_idx);
 
diff --git a/src/util/alignproteome.cpp b/src/util/alignproteome.cpp
new file mode 100644
index 000000000..197955da1
--- /dev/null
+++ b/src/util/alignproteome.cpp
@@ -0,0 +1,443 @@
+#include "Util.h"
+#include "Parameters.h"
+#include "Matcher.h"
+#include "Debug.h"
+#include "DBReader.h"
+#include "DBWriter.h"
+#include "NucleotideMatrix.h"
+#include "SubstitutionMatrix.h"
+#include "Alignment.h"
+#include "itoa.h"
+
+#ifdef OPENMP
+#include <omp.h>
+#endif
+
+struct __attribute__((__packed__)) ProteomeEntry{
+    unsigned int proteomeAALen;
+    int repProtKey;
+    float protSimScore;
+    unsigned int memCount;
+    float totalSeqId;
+
+    ProteomeEntry(unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f)
+        : proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) {}
+
+    void incrementMemCount(int count) {
+        memCount += count;
+    }
+    void decrementmemCount() {
+        memCount--;
+    }
+
+    int getRepProtKey() {
+        return repProtKey;
+    }
+
+    bool isCovered(){
+        if (repProtKey == -1) {
+            return false;
+        }
+        return true;
+    }
+
+    void computeRedundancy() {
+        protSimScore = totalSeqId / proteomeAALen;
+
+    }
+
+    void addSeqId(float seqId) {
+        totalSeqId += seqId;
+    }
+
+    void addSeqLength(unsigned int seqLength) {
+        proteomeAALen += seqLength;
+    }
+
+    void resetProteomeInfo(){
+        memCount = 0;
+        totalSeqId = 0.0f;
+    }
+
+    float getprotSimScore() {
+        return protSimScore;
+    }
+};
+
+struct __attribute__((__packed__)) memberProteinEntry{
+    unsigned int proteomeKey;
+    unsigned int proteinKey;
+};
+
+struct ClusterEntry {
+    bool isAvailable;
+    std::vector<memberProteinEntry> memberProteins;
+
+    ClusterEntry() : isAvailable(false) {}
+
+    ClusterEntry(size_t memberProteinSize) : isAvailable(true) {
+        memberProteins.reserve(memberProteinSize);
+    }
+
+    void resetClusterInfo(){
+        memberProteins.clear();
+        isAvailable = false;
+    }
+};
+
+void calculateProteomeLength(std::vector<ProteomeEntry>& ProteomeList, DBReader<unsigned int>::LookupEntry* const& lookup, size_t lookupSize, DBReader<unsigned int>& tProteinDB) {
+    for (size_t i = 0; i < lookupSize; i++) {
+        const unsigned int ProteomeId = lookup[i].fileNumber;
+        const unsigned int ProteinId = lookup[i].id;
+        ProteomeList[ProteomeId].addSeqLength(tProteinDB.getIndexLen(ProteinId) - 2);
+    }
+}
+
+void initLocalClusterReps(size_t& id, std::vector<ClusterEntry>& localClusterReps, std::vector<unsigned int>& localMemCount, DBReader<unsigned int>& tProteinDB, DBReader<unsigned int>& linResDB, unsigned int thread_idx){
+    std::vector<unsigned int> memberKeys;
+    char buffer[1024 + 32768*4]; ///why?
+    memberKeys.reserve(50); // store key for every protein in a cluster
+    char *clustData = linResDB.getData(id, thread_idx);
+    while (*clustData != '\0') {
+        Util::parseKey(clustData, buffer);
+        const unsigned int key = (unsigned int) strtoul(buffer, NULL, 10);
+        memberKeys.push_back(key);
+        clustData = Util::skipLine(clustData);
+    }
+    if (memberKeys.size() > 1) { //IF Not a singleton cluster
+        ClusterEntry eachClusterRep(memberKeys.size());
+
+        for (auto& eachMemberKey : memberKeys){
+            const unsigned int proteinId = tProteinDB.getId(eachMemberKey);
+            const unsigned int proteomeKey = tProteinDB.getLookupFileNumber(proteinId);
+            memberProteinEntry mem;
+            mem.proteomeKey = proteomeKey;
+            mem.proteinKey = proteinId;
+            eachClusterRep.memberProteins.push_back(mem);
+            localMemCount[proteomeKey]++;
+        }
+        localClusterReps.push_back(eachClusterRep);
+    }
+}
+
+bool FindNextRepProteome(std::vector<ProteomeEntry>& ProteomeList, unsigned int& RepProteomeId) {
+    bool isAllCovered = true;
+    
+    unsigned int maxMemberCount = 0;
+    unsigned int notCoveredProtCount = 0;
+
+    for (size_t idx = 0; idx < ProteomeList.size(); idx++){
+        if (ProteomeList[idx].isCovered()) {
+            continue;
+        }else{
+            isAllCovered = false;
+            notCoveredProtCount++;
+            if (ProteomeList[idx].memCount > maxMemberCount) {
+                maxMemberCount = ProteomeList[idx].memCount;
+                RepProteomeId = idx;
+            }
+        }
+    }
+
+    if (isAllCovered){
+        return false;
+    }else if (notCoveredProtCount == 1) {
+        //last one and it is singleton
+        ProteomeList[RepProteomeId].repProtKey = RepProteomeId; //todo 
+        ProteomeList[RepProteomeId].protSimScore = 1.0;
+        return false;
+    }else{
+        return true;
+    }
+}
+
+void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader<unsigned int>& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector<ProteomeEntry>& ProteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector<float>& localSeqIds, DBWriter& proteinAlignWriter) {
+    char buffer[1024]; //todo - How to calculate buffer size?
+    bool isRepFound = false;
+    unsigned int lastqLen = 0;
+    unsigned int qproteinKey = 0;
+    unsigned int qproteomeKey =0;
+    //find Rep
+    for (auto& eachMember : clusterRep.memberProteins){
+        if (eachMember.proteomeKey == RepProteomeId) {
+            isRepFound = true;
+            const unsigned int queryId = eachMember.proteinKey;
+            if( lastqLen < tProteinDB.getSeqLen(queryId)){
+                lastqLen = tProteinDB.getSeqLen(queryId);
+                qproteinKey = eachMember.proteinKey;
+                qproteomeKey = eachMember.proteomeKey;
+            }
+        }
+    }
+    if (isRepFound){
+        const unsigned int queryId = qproteinKey;
+        const unsigned int queryKey = tProteinDB.getDbKey(queryId);
+        char* querySeq = tProteinDB.getData(queryId, thread_idx); //todo need openmp
+        query.mapSequence(queryId, queryKey, querySeq, tProteinDB.getSeqLen(queryId));
+        matcher.initQuery(&query);
+        const unsigned int queryProteomeLength = ProteomeList[qproteomeKey].proteomeAALen;
+        char * tmpBuff = Itoa::u32toa_sse2((uint32_t) queryKey, buffer);
+        *(tmpBuff-1) = '\t';
+        const unsigned int queryIdLen = tmpBuff - buffer;
+
+        for (auto& eachTargetMember : clusterRep.memberProteins){
+            // if query Proteome's length < target Proteome's length * 0.9, skip
+            const unsigned int targetProteomeLength = ProteomeList[eachTargetMember.proteomeKey].proteomeAALen;
+            if (queryProteomeLength < targetProteomeLength * 0.9) {
+                continue;
+            }
+            const unsigned int targetId = eachTargetMember.proteinKey;
+            unsigned int targetProteomeId = eachTargetMember.proteomeKey;
+            const unsigned int targetKey = tProteinDB.getDbKey(targetId);
+            char* targetSeq = tProteinDB.getData(targetId, thread_idx);
+            target.mapSequence(targetId, targetKey, targetSeq, tProteinDB.getSeqLen(targetId));
+
+            if (Util::canBeCovered(par.covThr, par.covMode, query.L, target.L) == false) {
+                continue;
+            };
+
+            const bool isIdentity = (queryId == targetId && par.includeIdentity) ? true : false;
+            Matcher::result_t result = matcher.getSWResult(&target, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, isIdentity);
+
+            if (Alignment::checkCriteria(result, isIdentity, par.evalThr, par.seqIdThr, par.alnLenThr, par.covMode, par.covThr)) {
+                // Placeholder for writing results, uncomment and implement as needed
+                // size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace);
+                // protRedunWriter.writeAdd(buffer, len, thread_idx);
+                // ProteomeList[targetProteomeId].addSeqId(result.getSeqId());
+                size_t len = Matcher::resultToBuffer(tmpBuff, result, par.addBacktrace);
+                proteinAlignWriter.writeAdd(buffer, queryIdLen+len, thread_idx);
+                localSeqIds[targetProteomeId] += result.getSeqId()*target.L;
+            }
+        }
+
+    }
+}
+
+bool updateProteomeList(std::vector<ProteomeEntry>& ProteomeList, const unsigned int& RepProteomeId){
+    bool isRepSingleton = true;
+
+    #pragma omp for schedule(dynamic, 1)
+    for (size_t i = 0; i < ProteomeList.size(); i++) {
+        if (ProteomeList[i].isCovered() == false) {
+            if (i == RepProteomeId){
+                ProteomeList[i].repProtKey = RepProteomeId;
+                ProteomeList[i].protSimScore = 1.0;
+            }else{
+                ProteomeList[i].computeRedundancy();
+                if (ProteomeList[i].getprotSimScore() >= 0.9) {
+                ProteomeList[i].repProtKey = RepProteomeId;
+                isRepSingleton = false;
+                }else{
+                    ProteomeList[i].resetProteomeInfo();
+                }
+            }
+        }
+    }
+    return isRepSingleton;
+}
+
+void updateClusterReps(std::vector<ClusterEntry>& clusterReps, std::vector<ProteomeEntry>& ProteomeList, std::vector<unsigned int>& localMemCount){
+    #pragma omp parallel for schedule(dynamic, 1)
+    for (size_t i = 0; i < clusterReps.size(); i++) {
+        if (clusterReps[i].isAvailable) {
+            bool allMemberCovered = true;
+            unsigned int notCoveredCount = 0;
+            unsigned int lastProteomeKey = 0;
+            for (auto& eachMember : clusterReps[i].memberProteins) {
+                if (ProteomeList[eachMember.proteomeKey].isCovered() == false) {
+                    notCoveredCount++;
+                    lastProteomeKey = eachMember.proteomeKey;
+                    allMemberCovered = false;
+                    localMemCount[eachMember.proteomeKey]++;
+                }
+            }
+            if (allMemberCovered) {
+                clusterReps[i].resetClusterInfo();
+            }
+
+            if (notCoveredCount == 1) { //singleton Cluster
+                localMemCount[lastProteomeKey]--;
+                clusterReps[i].resetClusterInfo();
+            }
+        }
+    }
+
+}
+
+size_t resultToBuffer(const std::string& proteomeName, const std::string& repProtName, float similarityScore, char* buffer){
+    char* basePos = buffer;
+    std::strncpy(buffer, proteomeName.c_str(), proteomeName.length());
+    buffer += proteomeName.length()+1;
+    *(buffer-1) = '\t';  // 탭 구분자 추가
+
+    // repProtName을 버퍼에 복사
+    std::strncpy(buffer, repProtName.c_str(), repProtName.length());
+    buffer += repProtName.length()+1;
+    *(buffer-1)  = '\t';  // 탭 구분자 추가
+
+    // similarityScore를 버퍼에 추가 (Util::fastSeqIdToBuffer 사용)
+    buffer = Util::fastSeqIdToBuffer(similarityScore, buffer);
+    *(buffer-1) = '\n';  // 줄바꿈 추가
+    // *(buffer) = '\0';  // 문자열 끝 추가
+    return buffer - basePos; 
+}
+
+int alignproteome(int argc, const char **argv, const Command &command){
+    //Initialize parameters
+    Parameters &par = Parameters::getInstance();
+    par.overrideParameterDescription(par.PARAM_ALIGNMENT_MODE, "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id", NULL, 0);
+    par.parseParameters(argc, argv, command, true, 0, 0);
+
+    if (par.alignmentMode == Parameters::ALIGNMENT_MODE_UNGAPPED) {
+        Debug(Debug::ERROR) << "Use rescorediagonal for ungapped alignment mode.\n";
+        EXIT(EXIT_FAILURE);
+    }
+    if (par.addBacktrace == true) {
+        par.alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
+    }
+    
+    unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr);
+
+    //Open the target protein database
+    DBReader<unsigned int> tProteinDB(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_LOOKUP|DBReader<unsigned int>::USE_SOURCE);
+    tProteinDB.open(DBReader<unsigned int>::NOSORT);
+    const int tProteinSeqType = tProteinDB.getDbtype();
+
+    if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) {
+        tProteinDB.readMmapedDataInMemory();
+    }
+    //Open lookup table to get the source of the protein and the protein length
+    DBReader<unsigned int>::LookupEntry* tLookup = tProteinDB.getLookup();
+    const size_t tLookupSize = tProteinDB.getLookupSize();
+    unsigned int totalProteomeNumber = tLookup[tLookupSize - 1].fileNumber;
+    std::vector<ProteomeEntry> ProteomeList(totalProteomeNumber + 1);
+    calculateProteomeLength(ProteomeList, tLookup, tLookupSize, tProteinDB);
+
+    //For Debug
+    // for (size_t i=0; i < ProteomeList.size(); i++) {
+    //     Debug(Debug::INFO) << "Proteome " << i << " has " << ProteomeList[i].proteomeAALen << " amino acids.\n";
+    // }
+
+    //Open the linclust result
+    DBReader<unsigned int> linResDB(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
+    linResDB.open(DBReader<unsigned int>::LINEAR_ACCCESS);
+
+    //Open the DBWriter
+    DBWriter protRedunWriter(par.db3.c_str(), par.db3Index.c_str(), 1, par.compressed, Parameters::DBTYPE_GENERIC_DB);
+    protRedunWriter.open();
+    DBWriter proteinAlignWriter(par.db4.c_str(), par.db4Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_GENERIC_DB);
+    proteinAlignWriter.open();
+
+    std::vector<ClusterEntry> clusterReps; 
+
+    int gapOpen, gapExtend;
+    BaseMatrix *subMat;
+    subMat = new SubstitutionMatrix(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias);
+    gapOpen = par.gapOpen.values.aminoacid();
+    gapExtend = par.gapExtend.values.aminoacid();
+    EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), subMat, gapOpen, gapExtend);
+
+    Debug(Debug::INFO) << "Start Initialization\n";
+    #pragma omp parallel
+    {
+        unsigned int thread_idx = 0;
+    #ifdef OPENMP
+        thread_idx = (unsigned int) omp_get_thread_num();
+    #endif    
+        std::vector<ClusterEntry> localClusterReps;
+        std::vector<unsigned int> localMemCount(ProteomeList.size(), 0);
+
+        #pragma omp for schedule(dynamic, 1)
+        for (size_t id = 0; id < linResDB.getSize(); id++) {
+            initLocalClusterReps(id, localClusterReps, localMemCount, tProteinDB, linResDB, thread_idx);
+        }
+
+        #pragma omp critical
+        {
+            for (size_t idx=0; idx < localMemCount.size(); idx++){
+                ProteomeList[idx].incrementMemCount(localMemCount[idx]);
+            }
+            clusterReps.insert(clusterReps.end(),
+                               std::make_move_iterator(localClusterReps.begin()),
+                               std::make_move_iterator(localClusterReps.end()));
+        }
+
+    }
+
+    unsigned int RepProteomeId = -1;
+    Debug(Debug::INFO) << "Run Alignment\n";
+    while (FindNextRepProteome(ProteomeList, RepProteomeId)) {
+        // if (FindNextRepProteome(ProteomeList, RepProteomeId) == false) {
+        //     Debug(Debug::INFO) << "All Proteome is covered. Done.\n";
+        //     break;
+        // }
+        // Debug(Debug::INFO) << "New Rep Found : " << RepProteomeId << "\n";
+        // Debug(Debug::INFO) << "1. Run alignment for each cluster\n";
+        #pragma omp parallel
+        {
+            unsigned int thread_idx = 0;
+        #ifdef OPENMP
+            thread_idx = (unsigned int) omp_get_thread_num();
+        #endif  
+            Matcher matcher(tProteinSeqType, tProteinSeqType, par.maxSeqLen, subMat, &evaluer, par.compBiasCorrection, par.compBiasCorrectionScale, gapOpen, gapExtend, 0.0, par.zdrop);
+            Sequence query(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection);
+            Sequence target(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection);
+            std::vector <float> localSeqIds(clusterReps.size(), 0.0f);
+            proteinAlignWriter.writeStart(thread_idx);
+            #pragma omp for schedule(dynamic, 1)
+            for (size_t i = 0; i < clusterReps.size(); i++) {
+                if (clusterReps[i].isAvailable) {
+                    runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, ProteomeList, par, thread_idx, swMode, localSeqIds, proteinAlignWriter);
+                }
+                
+            }
+            proteinAlignWriter.writeEnd(RepProteomeId, thread_idx); //gg
+            #pragma omp critical
+            {
+                for (size_t idx = 0; idx < ProteomeList.size(); idx++) {
+                    ProteomeList[idx].addSeqId(localSeqIds[idx]);
+                }
+            }
+        }
+
+        // Debug(Debug::INFO) << "2. Update ProteomeDB. Calculate the similarity score and check redundancy | Rep Proteome id : " << RepProteomeId << "\n";
+        
+        bool isRepSingleton = updateProteomeList(ProteomeList, RepProteomeId);
+
+        if (isRepSingleton) {
+            ProteomeList[RepProteomeId].repProtKey = RepProteomeId;
+            ProteomeList[RepProteomeId].protSimScore = 1.0;
+        }
+
+        // Debug(Debug::INFO) << "3. Re-Setup Proteome and ClusterReps\n";
+        std::vector<unsigned int> localMemCount(ProteomeList.size(), 0);
+        
+        updateClusterReps(clusterReps, ProteomeList, localMemCount);
+
+        #pragma omp critical
+        {
+            for (size_t i = 0; i < ProteomeList.size(); i++) {
+                ProteomeList[i].memCount += localMemCount[i];
+            }
+        }
+    }
+    // Debug(Debug::INFO) << "4. Write ProteomeList to file\n";
+    char protRedunBuffer[1024]; //todo - How to calculate buffer size?
+    protRedunWriter.writeStart();
+    for (size_t idx=0; idx < ProteomeList.size(); idx++){
+        std::string proteomeName = tProteinDB.getSourceFileName(idx);
+        std::string repProtName = tProteinDB.getSourceFileName(ProteomeList[idx].getRepProtKey());
+        float similarityScore = ProteomeList[idx].getprotSimScore();
+        // Debug(Debug::INFO) << "Proteome " << idx << " : " << proteomeName << " has similarity score : " << similarityScore << " with " << repProtName << "\n";
+        size_t result = resultToBuffer(proteomeName, repProtName, similarityScore,protRedunBuffer);
+        protRedunWriter.writeAdd(protRedunBuffer, result);
+    }
+    protRedunWriter.writeEnd(1); //Don't need index file (todo)
+
+    protRedunWriter.close();
+    proteinAlignWriter.close();
+    tProteinDB.close();
+    delete subMat;
+    linResDB.close();
+    return EXIT_SUCCESS;
+}
\ No newline at end of file

From 12c0c81bd66b3ab9cc077fef07728d5a2ba3cde5 Mon Sep 17 00:00:00 2001
From: Gyuuul2 <gyuribio@snu.ac.kr>
Date: Mon, 19 Aug 2024 17:51:44 +0900
Subject: [PATCH 02/31] add workflow, revise based on  feedback

---
 data/workflow/CMakeLists.txt       |  1 +
 data/workflow/easyalignproteome.sh | 53 ++++++++++++++++++++++++++++++
 src/CommandDeclarations.h          |  1 +
 src/MMseqsBase.cpp                 | 10 ++++--
 src/alignment/Matcher.h            |  1 -
 src/commons/CMakeLists.txt         |  2 --
 src/commons/ClusterSpecies.cpp     | 32 ------------------
 src/commons/ClusterSpecies.h       | 42 -----------------------
 src/commons/DBReader.cpp           | 17 ----------
 src/commons/DBReader.h             |  4 +--
 src/commons/DBWriter.cpp           |  1 -
 src/commons/Parameters.cpp         |  6 +++-
 src/commons/Parameters.h           |  1 +
 src/util/alignall.cpp              |  2 --
 src/util/alignproteome.cpp         | 10 +++---
 src/workflow/CMakeLists.txt        |  1 +
 src/workflow/EasyAlignproteome.cpp | 41 +++++++++++++++++++++++
 17 files changed, 117 insertions(+), 108 deletions(-)
 create mode 100644 data/workflow/easyalignproteome.sh
 delete mode 100644 src/commons/ClusterSpecies.cpp
 delete mode 100644 src/commons/ClusterSpecies.h
 create mode 100644 src/workflow/EasyAlignproteome.cpp

diff --git a/data/workflow/CMakeLists.txt b/data/workflow/CMakeLists.txt
index f002a27db..3e94e1817 100644
--- a/data/workflow/CMakeLists.txt
+++ b/data/workflow/CMakeLists.txt
@@ -3,6 +3,7 @@ set(GENERATED_WORKFLOWS
         workflow/easycluster.sh
         workflow/easytaxonomy.sh
         workflow/easyrbh.sh
+        workflow/easyalignproteome.sh
         workflow/blastp.sh
         workflow/blastpgp.sh
         workflow/map.sh
diff --git a/data/workflow/easyalignproteome.sh b/data/workflow/easyalignproteome.sh
new file mode 100644
index 000000000..b27b8187a
--- /dev/null
+++ b/data/workflow/easyalignproteome.sh
@@ -0,0 +1,53 @@
+#!/bin/sh -e
+fail() {
+    echo "Error: $1"
+    exit 1
+}
+
+notExists() {
+   [ ! -f "$1" ]
+}
+
+
+if notExists "${TMP_PATH}/input.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createdb "$@" "${TMP_PATH}/input" ${CREATEDB_PAR} \
+        || fail "query createdb died"
+fi
+
+if notExists "${TMP_PATH}/clu.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" linclust "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/clu_tmp" ${CLUSTER_PAR} \
+        || fail "Search died"
+fi
+
+if notExists "${TMP_PATH}/aln.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" alignproteome "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" ${ALIGNPROTEOME_PAR} \
+        || fail "Convert Alignments died"
+fi
+
+if notExists "${RESULTS}_protein_cluster.tsv"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/clu" "${RESULTS}_protein_cluster.tsv" ${THREADS_PAR} \
+            || fail "createtsv protein died"
+fi
+
+if notExists "${RESULTS}_proteome_cluster.tsv"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome" "${RESULTS}_proteome_cluster.tsv" ${THREADS_PAR} \
+            || fail "createtsv proteome died"
+fi
+
+if [ -n "${REMOVE_TMP}" ]; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/clu" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY_PAR}
+    rm -rf "${TMP_PATH}/clu_tmp"
+    rm -f "${TMP_PATH}/easyproteomecluster.sh"
+fi
diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h
index 369034025..04fbd521d 100644
--- a/src/CommandDeclarations.h
+++ b/src/CommandDeclarations.h
@@ -44,6 +44,7 @@ extern int diffseqdbs(int argc, const char **argv, const Command& command);
 extern int easycluster(int argc, const char **argv, const Command& command);
 extern int easyrbh(int argc, const char **argv, const Command& command);
 extern int easylinclust(int argc, const char **argv, const Command& command);
+extern int easyalignproteome(int argc, const char **argv, const Command& command);
 extern int easysearch(int argc, const char **argv, const Command& command);
 extern int easylinsearch(int argc, const char **argv, const Command& command);
 extern int tsv2exprofiledb(int argc, const char **argv, const Command& command);
diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp
index e91d6d0a6..3b64eb556 100644
--- a/src/MMseqsBase.cpp
+++ b/src/MMseqsBase.cpp
@@ -79,6 +79,12 @@ std::vector<Command> baseCommands = {
                 CITATION_MMSEQS2|CITATION_LINCLUST, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
                                                             {"clusterPrefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
                                                             {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"easy-alignproteome", easyalignproteome, &par.easyalignproteome, COMMAND_EASY,
+                "Proteome clustering and alignemnt",
+                "<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]>"
+                // CITATION_MMSEQS2,
+        
+        },
         {"easy-taxonomy",        easytaxonomy,         &par.easytaxonomy,         COMMAND_EASY,
                 "Taxonomic classification",
                 "# Assign taxonomic labels to FASTA sequences\n"
@@ -633,9 +639,9 @@ std::vector<Command> baseCommands = {
                                                            {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
                                                            {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
         {"alignproteome",             alignproteome,             &par.alignproteome,             COMMAND_ALIGNMENT,
-                "Within-result all-vs-all gapped local alignment",
+                "Proteome clustering and alignment",
                 NULL,
-                "Martin Steinegger <martin.steinegger@snu.ac.kr>",
+                "Martin Steinegger <martin.steinegger@snu.ac.kr> & Gyuri Kim <gyuribio@snu.ac.kr>",
                 "<i:sequenceDB> <i:resultDB> <o:alignmentDB>",
                 CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
                                                            {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
diff --git a/src/alignment/Matcher.h b/src/alignment/Matcher.h
index a89ae9c5a..6e1d0ab8f 100644
--- a/src/alignment/Matcher.h
+++ b/src/alignment/Matcher.h
@@ -221,7 +221,6 @@ class Matcher{
 
 
     static size_t resultToBuffer(char * buffer, const result_t &result, bool addBacktrace, bool compress  = true, bool addOrfPosition = false);
-    static size_t resultToBuffer_str(char * buffer, const result_t &result, bool addBacktrace, bool compress, bool addOrfPosition=false);
     static int computeAlnLength(int anEnd, int start, int dbEnd, int dbStart);
 
     static void updateResultByRescoringBacktrace(const char *querySeq, const char *targetSeq, const char **subMat, EvalueComputation &evaluer,
diff --git a/src/commons/CMakeLists.txt b/src/commons/CMakeLists.txt
index a58b5322a..dc7ab3e74 100644
--- a/src/commons/CMakeLists.txt
+++ b/src/commons/CMakeLists.txt
@@ -8,7 +8,6 @@ set(commons_header_files
         commons/Command.h
         commons/CommandCaller.h
         commons/Concat.h
-        # commons/ClusterSpecies.h
         commons/DBConcat.h
         commons/DBReader.h
         commons/DBWriter.h
@@ -53,7 +52,6 @@ set(commons_source_files
         commons/BaseMatrix.cpp
         commons/Command.cpp
         commons/CommandCaller.cpp
-        # commons/ClusterSpecies.cpp
         commons/DBConcat.cpp
         commons/DBReader.cpp
         commons/DBWriter.cpp
diff --git a/src/commons/ClusterSpecies.cpp b/src/commons/ClusterSpecies.cpp
deleted file mode 100644
index 3216cc210..000000000
--- a/src/commons/ClusterSpecies.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-#include "ClusterSpecies.h"
-void ClusterSpecies::addProtein(unsigned int speciesId, unsigned int clusterRepId, unsigned int proteinId) {
-   clusterEntriesPerSpecies[speciesId][clusterRepId].push_back(proteinId);
-//    speciesClusterMemCounts[speciesId]++;
-}
-
-
-void ClusterSpecies::removeSpecies(unsigned int speciesId) {
-    clusterEntriesPerSpecies.erase(speciesId);
-    // speciesClusterMemCounts.erase(speciesId);
-}
-
-int ClusterSpecies::findRepSpecies(){
-    //find max entry in speciesClusterMemCounts
-    unsigned int maxSpeciesId = 0;
-    size_t maxSpeciesSize = 0;
-    for (auto const& species : clusterEntriesPerSpecies){
-        size_t speiciesSize = species.second.size();
-        if (speiciesSize > maxSpeciesSize){
-            maxSpeciesId = species.first;
-        }
-    }
-
-    return maxSpeciesId;
-    // for (auto const& x : speciesClusterMemCounts)
-    // {
-    //     if (x.second > maxSpeciesCount){
-    //         maxSpeciesCount = x.second;
-    //         maxSpeciesId = x.first;
-    //     }
-    // }
-}
\ No newline at end of file
diff --git a/src/commons/ClusterSpecies.h b/src/commons/ClusterSpecies.h
deleted file mode 100644
index 96b1e8688..000000000
--- a/src/commons/ClusterSpecies.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef CLUSTER_SPECIES_H
-#define CLUSTER_SPECIES_H
-
-#include <vector>
-#include <unordered_map>
-
-
-class ClusterSpecies {
-public:
-    
-    // std::unordered_map<unsigned int, std::unordered_map<unsigned int, std::vector<unsigned int>>> clusterEntriesPerSpecies;
-    // // std::unordered_map<unsigned int, size_t> speciesClusterMemCounts;
-    // void addProtein(unsigned int speciesId, unsigned int clusterRepId, unsigned int proteinId);
-    // void removeSpecies(unsigned int speciesId);
-    // int findRepSpecies();
-
-    // template <typename T>
-    // struct ProteomeLength{
-    //     T id;
-    //     unsigned int length;
-    // };
-    
-    // std::vector<ProteomeLength<unsigned int>> proteomeAAToatalLength;
-
-    // void addProteomeLength(unsigned int id, unsigned int length){
-    //     ProteomeLength<unsigned int> pl;
-    //     pl.id = id;
-    //     pl.length = length;
-    //     proteomeAAToatalLength.push_back(pl);
-    // }
-
-    // unsigned int getProteomeLength(unsigned int id){
-    //     for (auto const& pl : proteomeAAToatalLength){
-    //         if (pl.id == id){
-    //             return pl.length;
-    //         }
-    //     }
-    //     return 0;
-    // }
-};
-
-#endif //CLUSTER_SPECIES_H
\ No newline at end of file
diff --git a/src/commons/DBReader.cpp b/src/commons/DBReader.cpp
index 70216cedd..80bca13c5 100644
--- a/src/commons/DBReader.cpp
+++ b/src/commons/DBReader.cpp
@@ -137,8 +137,6 @@ template <typename T> bool DBReader<T>::open(int accessType){
         }
     }
     if (dataMode & USE_LOOKUP || dataMode & USE_LOOKUP_REV) {
-        Debug(Debug::INFO) << "ReadLookup file: " << dataFileName << "\n"; //gyuri
-
         std::string lookupFilename = (std::string(dataFileName) + ".lookup");
         MemoryMapped lookupData(lookupFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
         if (lookupData.isValid() == false) {
@@ -147,9 +145,7 @@ template <typename T> bool DBReader<T>::open(int accessType){
         }
         char* lookupDataChar = (char *) lookupData.getData();
         size_t lookupDataSize = lookupData.size();
-        Debug(Debug::INFO) << "Lookup Data size is " << lookupDataSize << "\n"; //gyuri
         lookupSize = Util::ompCountLines(lookupDataChar, lookupDataSize, threads);
-        Debug(Debug::INFO) << "Lookup size is " << lookupSize << "\n"; //gyuri
         lookup = new(std::nothrow) LookupEntry[this->lookupSize];
         incrementMemory(sizeof(LookupEntry) * this->lookupSize);
         readLookup(lookupDataChar, lookupDataSize, lookup);
@@ -1025,19 +1021,6 @@ size_t DBReader<T>::getOffset(size_t id) {
     return index[id].offset;
 }
 
-template<typename T>
-size_t DBReader<T>::getIndexLen(size_t id) {
-    if (id >= size){
-        Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << indexFileName << "\n";
-        Debug(Debug::ERROR) << "getOffset: local id (" << id << ") >= db size (" << size << ")\n";
-        EXIT(EXIT_FAILURE);
-    }
-    if (local2id != NULL) {
-        id = local2id[id];
-    }
-    return index[id].length;
-}
-
 template<typename T>
 size_t DBReader<T>::findNextOffsetid(size_t id) {
     size_t idOffset = getOffset(id);
diff --git a/src/commons/DBReader.h b/src/commons/DBReader.h
index 198fa9ba5..81d28bdf1 100644
--- a/src/commons/DBReader.h
+++ b/src/commons/DBReader.h
@@ -187,7 +187,7 @@ class DBReader : public MemoryTracker {
 
     size_t getSize() const;
 
-    unsigned int getProteomeTotalLen(size_t id); //gyuri
+    unsigned int getProteomeTotalLen(size_t id); 
 
     unsigned int getMaxSeqLen(){ 
             return (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_HMM_PROFILE ) ) ?
@@ -446,8 +446,6 @@ class DBReader : public MemoryTracker {
 
     size_t findNextOffsetid(size_t id);
 
-    size_t getIndexLen(size_t id);
-
     int isCompressed(){
         return isCompressed(dbtype);
     }
diff --git a/src/commons/DBWriter.cpp b/src/commons/DBWriter.cpp
index 36aac0eaf..9ae84bae1 100644
--- a/src/commons/DBWriter.cpp
+++ b/src/commons/DBWriter.cpp
@@ -283,7 +283,6 @@ size_t DBWriter::writeAdd(const char* data, size_t dataSize, unsigned int thrIdx
     }
     size_t totalWriten = 0;
     if(isCompressedDB && (state[thrIdx] == INIT_STATE || state[thrIdx] == COMPRESSED) ) {
-        Debug(Debug::INFO) << "Compressing data in thread " << thrIdx << "\n"; //gyuri
         state[thrIdx] = COMPRESSED;
         // zstd seems to have a hard time with elements < 60
         ZSTD_inBuffer input = { data, dataSize, 0 };
diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp
index ab8ef3ad8..6cb54999b 100644
--- a/src/commons/Parameters.cpp
+++ b/src/commons/Parameters.cpp
@@ -1349,6 +1349,10 @@ Parameters::Parameters():
     // easylinclustworkflow
     easylinclustworkflow = combineList(linclustworkflow, createdb);
 
+    // easyalignproteomeworkflow
+    easyalignproteome = combineList(easylinclustworkflow, alignproteome);
+
+
     // clustering workflow
     clusterworkflow = combineList(prefilter, align);
     clusterworkflow = combineList(clusterworkflow, rescorediagonal);
@@ -2353,7 +2357,7 @@ void Parameters::setDefaults() {
     maxRejected = INT_MAX;
     maxAccept   = INT_MAX;
     seqIdThr = 0.0;
-    proteomeSimThr = 0.0;
+    proteomeSimThr = 0.9;
     alnLenThr = 0;
     altAlignment = 0;
     gapOpen = MultiParam<NuclAA<int>>(NuclAA<int>(11, 5));
diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h
index 838e0ed69..ee825291e 100644
--- a/src/commons/Parameters.h
+++ b/src/commons/Parameters.h
@@ -1118,6 +1118,7 @@ class Parameters {
     std::vector<MMseqsParameter*> kmersearch;
     std::vector<MMseqsParameter*> countkmer;
     std::vector<MMseqsParameter*> easylinclustworkflow;
+    std::vector<MMseqsParameter*> easyalignproteome;
     std::vector<MMseqsParameter*> linclustworkflow;
     std::vector<MMseqsParameter*> easysearchworkflow;
     std::vector<MMseqsParameter*> searchworkflow;
diff --git a/src/util/alignall.cpp b/src/util/alignall.cpp
index 3832ef9ee..39f2e199c 100644
--- a/src/util/alignall.cpp
+++ b/src/util/alignall.cpp
@@ -27,7 +27,6 @@ int alignall(int argc, const char **argv, const Command &command) {
     }
     unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr);
 
-    // DBReader<unsigned int> tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
     DBReader<unsigned int> tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_LOOKUP);
 
     tdbr.open(DBReader<unsigned int>::NOSORT);
@@ -58,7 +57,6 @@ int alignall(int argc, const char **argv, const Command &command) {
     EvalueComputation evaluer(tdbr.getAminoAcidDBSize(), subMat, gapOpen, gapExtend);
     const size_t flushSize = 100000000;
     size_t iterations = static_cast<int>(ceil(static_cast<double>(dbr_res.getSize()) / static_cast<double>(flushSize)));
-    Debug(Debug::INFO) << "Number of iterations: " << iterations << " Size of linclust dbr_res : " <<  dbr_res.getSize() <<  '\n';
 
     for (size_t i = 0; i < iterations; i++) {
         size_t start = (i * flushSize);
diff --git a/src/util/alignproteome.cpp b/src/util/alignproteome.cpp
index 197955da1..3a7311b81 100644
--- a/src/util/alignproteome.cpp
+++ b/src/util/alignproteome.cpp
@@ -50,7 +50,7 @@ struct __attribute__((__packed__)) ProteomeEntry{
         totalSeqId += seqId;
     }
 
-    void addSeqLength(unsigned int seqLength) {
+    void addSeqLen(unsigned int seqLength) {
         proteomeAALen += seqLength;
     }
 
@@ -64,14 +64,14 @@ struct __attribute__((__packed__)) ProteomeEntry{
     }
 };
 
-struct __attribute__((__packed__)) memberProteinEntry{
+struct __attribute__((__packed__)) MemberProteinEntry{
     unsigned int proteomeKey;
     unsigned int proteinKey;
 };
 
 struct ClusterEntry {
     bool isAvailable;
-    std::vector<memberProteinEntry> memberProteins;
+    std::vector<MemberProteinEntry> memberProteins;
 
     ClusterEntry() : isAvailable(false) {}
 
@@ -89,7 +89,7 @@ void calculateProteomeLength(std::vector<ProteomeEntry>& ProteomeList, DBReader<
     for (size_t i = 0; i < lookupSize; i++) {
         const unsigned int ProteomeId = lookup[i].fileNumber;
         const unsigned int ProteinId = lookup[i].id;
-        ProteomeList[ProteomeId].addSeqLength(tProteinDB.getIndexLen(ProteinId) - 2);
+        ProteomeList[ProteomeId].addSeqLen(tProteinDB.getSeqLen(ProteinId));
     }
 }
 
@@ -110,7 +110,7 @@ void initLocalClusterReps(size_t& id, std::vector<ClusterEntry>& localClusterRep
         for (auto& eachMemberKey : memberKeys){
             const unsigned int proteinId = tProteinDB.getId(eachMemberKey);
             const unsigned int proteomeKey = tProteinDB.getLookupFileNumber(proteinId);
-            memberProteinEntry mem;
+            MemberProteinEntry mem;
             mem.proteomeKey = proteomeKey;
             mem.proteinKey = proteinId;
             eachClusterRep.memberProteins.push_back(mem);
diff --git a/src/workflow/CMakeLists.txt b/src/workflow/CMakeLists.txt
index ce266e9c7..b4e1f0cf1 100644
--- a/src/workflow/CMakeLists.txt
+++ b/src/workflow/CMakeLists.txt
@@ -14,6 +14,7 @@ set(workflow_source_files
         workflow/Search.cpp
         workflow/Taxonomy.cpp
         workflow/EasyTaxonomy.cpp
+        workflow/EasyAlignproteome.cpp
         workflow/CreateIndex.cpp
         PARENT_SCOPE
         )
diff --git a/src/workflow/EasyAlignproteome.cpp b/src/workflow/EasyAlignproteome.cpp
new file mode 100644
index 000000000..5374da773
--- /dev/null
+++ b/src/workflow/EasyAlignproteome.cpp
@@ -0,0 +1,41 @@
+#include <cassert>
+
+#include "FileUtil.h"
+#include "CommandCaller.h"
+#include "Util.h"
+#include "Debug.h"
+#include "Parameters.h"
+
+#include "easyalignproteome.sh.h"
+
+void setEasyAlignproteomeDefaults(Parameters *p) {
+    p->proteomeSimThr = 0.9;
+}
+
+// void setEasyAlignproteomeMustPassAlong(Parameters *p){
+
+// }
+
+int easyalignproteome(int argc, const char **argv, const Command &command) {
+    Parameters &par = Parameters::getInstance();
+    setEasyAlignproteomeDefaults(&par);
+    par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
+    // setEasyAlignproteomeMustPassAlong(&par);
+    std::string tmpDir = par.filenames.back();
+    std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params));
+    if (par.reuseLatest) {
+        hash = FileUtil::getHashFromSymLink(tmpDir + "/latest");
+    }
+    tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash);
+    par.filenames.pop_back();
+
+    CommandCaller cmd;
+    cmd.addVariable("ALIGNPROTEOME_PAR", par.createParameterString(par.alignproteome,true).c_str()); // what?
+    std::string program = tmpDir + "/easyalignproteome.sh";
+    FileUtil::writeFile(program, easyalignproteome_sh, easyalignproteome_sh_len);
+    cmd.execProgram(program.c_str(), par.filenames);
+    
+    // Should never get here
+    assert(false);
+    return 0;
+}

From df80f075d49a42e69ec72db0cf5580250f5b82c4 Mon Sep 17 00:00:00 2001
From: Kim Soo Hyun <sd20163818@snu.ac.kr>
Date: Sun, 25 Aug 2024 18:49:01 +0900
Subject: [PATCH 03/31] Change createdb.cpp so that it takes in ".txt" file
 containing paths of different fasta files

---
 src/util/createdb.cpp | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 mode change 100644 => 100755 src/util/createdb.cpp

diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp
old mode 100644
new mode 100755
index 18d2fc7d0..440b2ed06
--- a/src/util/createdb.cpp
+++ b/src/util/createdb.cpp
@@ -19,6 +19,28 @@ int createdb(int argc, const char **argv, const Command& command) {
     std::vector<std::string> filenames(par.filenames);
     std::string dataFile = filenames.back();
     filenames.pop_back();
+    if (Util::endsWith(".txt", filenames[0])) {
+	    if (filenames.size() > 1) {
+		    Debug(Debug::ERROR) << "Only one txt file can be given\n";
+		    EXIT(EXIT_FAILURE);
+	    }
+	    std::string tsv = filenames.back();
+	    filenames.pop_back();
+
+	    FILE* file = FileUtil::openFileOrDie(tsv.c_str(), "r", true);
+	    char* line = NULL;
+	    size_t len = 0;
+	    ssize_t read;
+	    while ((read = getline(&line, &len, file)) != -1) {
+		    if (line[read - 1] == '\n') {
+			    line[read - 1] = '\0';
+			    read--;
+		    }
+		    filenames.push_back(line);
+	    }
+	    free(line);
+	    fclose(file);
+    }
 
     for (size_t i = 0; i < filenames.size(); i++) {
         if (FileUtil::directoryExists(filenames[i].c_str()) == true) {

From 3617ea7153b54f7131366660d570d5c68053d58a Mon Sep 17 00:00:00 2001
From: Kim Soo Hyun <sd20163818@snu.ac.kr>
Date: Mon, 26 Aug 2024 16:11:29 +0900
Subject: [PATCH 04/31] Change filepath format from txt to tsv. Also add
 explanations of its usage to createdb and easy-search in MMseqsBase.cpp.

---
 src/MMseqsBase.cpp    | 9 ++++++++-
 src/util/createdb.cpp | 4 ++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp
index 8325f0d4a..c252ade41 100644
--- a/src/MMseqsBase.cpp
+++ b/src/MMseqsBase.cpp
@@ -11,6 +11,10 @@ std::vector<Command> baseCommands = {
                 "Sensitive homology search",
                 "# Search multiple FASTA against FASTA (like BLASTP, TBLASTN, BLASTX, BLASTN --search-type 3, TBLASTX --search-type 2)\n"
                 "mmseqs easy-search examples/QUERY.fasta examples/QUERY.fasta examples/DB.fasta result.m8 tmp\n\n"
+                "# Search multiple query fasta files against target fasta files using a tsv file containing filepaths\n"
+                "echo -e \"dir1/QUERY1.fasta\\ndir2/QUERY2.fasta\" > examples/queries.tsv\n"
+                "echo -e \"dir3/TARGET1.fasta\\ndir4/TARGET2.fasta\" > examples/targets.tsv\n"
+                "mmseqs easy-search examples/queries.tsv examples/targets.tsv result.m8 tmp\n\n"
                 "# Iterative profile search from stdin (like PSI-BLAST)\n"
                 "cat examples/QUERY.fasta | mmseqs easy-search stdin examples/DB.fasta result.m8 tmp --num-iterations 2\n\n"
                 "# Profile search against small databases (e.g. PFAM, eggNOG)\n"
@@ -125,7 +129,10 @@ std::vector<Command> baseCommands = {
                 "# Create a seqDB from stdin\n"
                 "cat seq.fasta | mmseqs createdb stdin sequenceDB\n\n"
                 "# Create a seqDB by indexing existing FASTA/Q (for single line fasta entries only)\n"
-                "mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n",
+                "mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n\n"
+                "# Create a seqDB from a tsv file containing filepaths of multiple FASTA files in each line\n"
+                "echo -e \"dir1/bacteria.fasta\\ndir2/archea.fasta.gz\" > filepaths.tsv\n"
+                "mmseqs createdb filepaths.tsv sequenceDB\n",
                 "Martin Steinegger <martin.steinegger@snu.ac.kr>",
                 "<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]>|<i:stdin> <o:sequenceDB>",
                 CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric },
diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp
index 440b2ed06..e1d4ead3d 100755
--- a/src/util/createdb.cpp
+++ b/src/util/createdb.cpp
@@ -19,9 +19,9 @@ int createdb(int argc, const char **argv, const Command& command) {
     std::vector<std::string> filenames(par.filenames);
     std::string dataFile = filenames.back();
     filenames.pop_back();
-    if (Util::endsWith(".txt", filenames[0])) {
+    if (Util::endsWith(".tsv", filenames[0])) {
 	    if (filenames.size() > 1) {
-		    Debug(Debug::ERROR) << "Only one txt file can be given\n";
+		    Debug(Debug::ERROR) << "Only one tsv file can be given\n";
 		    EXIT(EXIT_FAILURE);
 	    }
 	    std::string tsv = filenames.back();

From 3fc7fcdd7ef5a1cfd551e48e7a24b599e8eb93d6 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Tue, 27 Aug 2024 22:55:08 +0900
Subject: [PATCH 05/31] createtsv - handle source identifier

---
 src/util/createtsv.cpp | 52 +++++++++++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/src/util/createtsv.cpp b/src/util/createtsv.cpp
index 7b75c0f9a..e8e87089c 100644
--- a/src/util/createtsv.cpp
+++ b/src/util/createtsv.cpp
@@ -18,18 +18,34 @@
 int createtsv(int argc, const char **argv, const Command &command) {
     Parameters &par = Parameters::getInstance();
     par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
+    const bool hasTargetDB = par.filenames.size() > 3;
+    DBReader<unsigned int> *reader;
+    if (hasTargetDB) {
+        reader = new DBReader<unsigned int>(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
+    } else {
+        reader = new DBReader<unsigned int>(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
+    }
+    reader->open(DBReader<unsigned int>::LINEAR_ACCCESS);
+    const bool useSourceIdentifier = DBReader<unsigned int>::getExtendedDbtype(reader->getDbtype()) & Parameters::DBTYPE_EXTENDED_SRC_IDENTIFIER;
 
     bool queryNucs = Parameters::isEqualDbtype(FileUtil::parseDbType(par.db1.c_str()), Parameters::DBTYPE_NUCLEOTIDES);
     bool targetNucs = Parameters::isEqualDbtype(FileUtil::parseDbType(par.db2.c_str()), Parameters::DBTYPE_NUCLEOTIDES);
     const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP);
-    int queryHeaderType = (queryNucs) ? IndexReader::SRC_HEADERS : IndexReader::HEADERS;
+    int queryHeaderType = (useSourceIdentifier) ? IndexReader::SOURCE : 
+                        (queryNucs) ? IndexReader::SRC_HEADERS : IndexReader::HEADERS;
     queryHeaderType = (par.idxSeqSrc == 0) ? queryHeaderType :  (par.idxSeqSrc == 1) ?  IndexReader::HEADERS : IndexReader::SRC_HEADERS;
-    IndexReader qDbrHeader(par.db1, par.threads, queryHeaderType, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0);
+    unsigned int preloadMode = (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0;
+    unsigned int dataMode = (useSourceIdentifier)
+        ? (DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_SOURCE)
+        : (DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
+    IndexReader qDbrHeader(par.db1, par.threads, queryHeaderType, preloadMode, dataMode);
+
+    // IndexReader qDbrHeader(par.db1, par.threads, queryHeaderType, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0);
     IndexReader * tDbrHeader=NULL;
     DBReader<unsigned int> * queryDB = qDbrHeader.sequenceReader;
     DBReader<unsigned int> * targetDB = NULL;
     bool sameDB = (par.db2.compare(par.db1) == 0);
-    const bool hasTargetDB = par.filenames.size() > 3;
+    
     DBReader<unsigned int>::Index * qHeaderIndex = qDbrHeader.sequenceReader->getIndex();
     DBReader<unsigned int>::Index * tHeaderIndex = NULL;
 
@@ -49,23 +65,13 @@ int createtsv(int argc, const char **argv, const Command &command) {
         }
     }
 
-    DBReader<unsigned int> *reader;
-    if (hasTargetDB) {
-
-        reader = new DBReader<unsigned int>(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
-    } else {
-
-        reader = new DBReader<unsigned int>(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
-    }
-    reader->open(DBReader<unsigned int>::LINEAR_ACCCESS);
-
     const std::string& dataFile = hasTargetDB ? par.db4 : par.db3;
     const std::string& indexFile = hasTargetDB ? par.db4Index : par.db3Index;
     const bool shouldCompress = par.dbOut == true && par.compressed == true;
     const int dbType = par.dbOut == true ? Parameters::DBTYPE_GENERIC_DB : Parameters::DBTYPE_OMIT_FILE;
     DBWriter writer(dataFile.c_str(), indexFile.c_str(), par.threads, shouldCompress, dbType);
     writer.open();
-
+    
     const size_t targetColumn = (par.targetTsvColumn == 0) ? SIZE_T_MAX :  par.targetTsvColumn - 1;
 #pragma omp parallel
     {
@@ -84,6 +90,10 @@ int createtsv(int argc, const char **argv, const Command &command) {
         for (size_t i = 0; i < reader->getSize(); ++i) {
             unsigned int queryKey = reader->getDbKey(i);
             size_t queryIndex = queryDB->getId(queryKey);
+            size_t querySourceIndex = SIZE_T_MAX;
+            if (useSourceIdentifier){
+                querySourceIndex = queryDB->getSourceKey(queryKey);
+            }
 
             char *headerData = queryDB->getData(queryIndex, thread_idx);
             if (headerData == NULL) {
@@ -92,7 +102,10 @@ int createtsv(int argc, const char **argv, const Command &command) {
             }
 
             std::string queryHeader;
-            if (par.fullHeader) {
+            if (useSourceIdentifier){
+                queryHeader = queryDB->getSourceFileName(querySourceIndex);
+            }
+            else if (par.fullHeader) {
                 queryHeader = "\"";
                 queryHeader.append(headerData, qHeaderIndex[queryIndex].length - 2);
                 queryHeader.append("\"");
@@ -118,12 +131,19 @@ int createtsv(int argc, const char **argv, const Command &command) {
                 } else if (hasTargetDB) {
                     unsigned int targetKey = (unsigned int) strtoul(dbKey, NULL, 10);
                     size_t targetIndex = targetDB->getId(targetKey);
+                    size_t targetSourceIdex = SIZE_T_MAX;
+                    if (useSourceIdentifier){
+                        targetSourceIdex = targetDB->getSourceKey(targetKey);
+                    }
                     char *targetData = targetDB->getData(targetIndex, thread_idx);
                     if (targetData == NULL) {
                         Debug(Debug::WARNING) << "Invalid header entry in query " << queryKey << " and target " << targetKey << "!\n";
                         continue;
                     }
-                    if (par.fullHeader) {
+                    if (useSourceIdentifier){
+                        targetAccession = targetDB->getSourceFileName(targetSourceIdex);
+                    }
+                    else if (par.fullHeader) {
                         targetAccession = "\"";
                         targetAccession.append(targetData, tHeaderIndex[targetIndex].length - 2);
                         targetAccession.append("\"");

From 0487a1f289e4549a4eda9bbd00088ca8d3d4fd67 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Tue, 27 Aug 2024 22:56:37 +0900
Subject: [PATCH 06/31] proteomcluster - change writer / way to select
 repProteome

---
 src/util/proteomecluster.cpp | 492 +++++++++++++++++++++++++++++++++++
 1 file changed, 492 insertions(+)
 create mode 100644 src/util/proteomecluster.cpp

diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
new file mode 100644
index 000000000..dcfae8aa3
--- /dev/null
+++ b/src/util/proteomecluster.cpp
@@ -0,0 +1,492 @@
+#include "Util.h"
+#include "Parameters.h"
+#include "Matcher.h"
+#include "Debug.h"
+#include "DBReader.h"
+#include "DBWriter.h"
+#include "NucleotideMatrix.h"
+#include "SubstitutionMatrix.h"
+#include "Alignment.h"
+#include "itoa.h"
+
+#ifdef OPENMP
+#include <omp.h>
+#endif
+
+struct __attribute__((__packed__)) ProteomeEntry{
+    int proteomeKey;
+    unsigned int proteomeAALen;
+    int repProtKey;
+    float protSimScore;
+    unsigned int memCount;
+    float totalSeqId;
+
+    ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f)
+        : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) {}
+
+    void incrementMemCount(unsigned int count) {
+        memCount += count;
+    }
+    void decrementmemCount() {
+        memCount--;
+    }
+    int getRepProtKey() {
+        return repProtKey;
+    }
+    int getProteomeKey() {
+        return proteomeKey;
+    }
+    bool isCovered(){
+        if (repProtKey == -1) {
+            return false;
+        }
+        return true;
+    }
+    void computeRedundancy() {
+        protSimScore = totalSeqId / proteomeAALen;
+
+    }
+    void addSeqId(float seqId) {
+        totalSeqId += seqId;
+    }
+    void addSeqLen(unsigned int seqLength) {
+        proteomeAALen += seqLength;
+    }
+    void resetProteomeInfo(){
+        memCount = 0;
+        totalSeqId = 0.0f;
+    }
+    float getProtSimScore() {
+        return protSimScore;
+    }
+    static bool compareByKey(const ProteomeEntry& a, const ProteomeEntry& b) {
+        
+        if (a.repProtKey < b.repProtKey){
+            return true;
+        }
+        if (a.repProtKey > b.repProtKey){
+            return false;
+        }
+        if (a.proteomeKey == a.repProtKey && b.proteomeKey != b.repProtKey){
+            return true;
+        }
+        if (a.proteomeKey != a.repProtKey && b.proteomeKey == b.repProtKey){
+            return false;
+        }
+        return false;
+    }
+};
+
+struct __attribute__((__packed__)) MemberProteinEntry{
+    unsigned int proteomeKey;
+    unsigned int proteinKey;
+};
+
+struct ClusterEntry {
+    bool isAvailable;
+    std::vector<MemberProteinEntry> memberProteins;
+
+    ClusterEntry() : isAvailable(false) {}
+
+    ClusterEntry(size_t memberProteinSize) : isAvailable(true) {
+        memberProteins.reserve(memberProteinSize);
+    }
+
+    void resetClusterInfo(){
+        memberProteins.clear();
+        isAvailable = false;
+    }
+};
+
+void calculateProteomeLength(std::vector<ProteomeEntry>& ProteomeList, DBReader<unsigned int>::LookupEntry* const& lookup, size_t lookupSize, DBReader<unsigned int>& tProteinDB) {
+    for (size_t i = 0; i < lookupSize; i++) {
+        const unsigned int ProteomeId = lookup[i].fileNumber;
+        const unsigned int ProteinId = lookup[i].id;
+        ProteomeList[ProteomeId].addSeqLen(tProteinDB.getSeqLen(ProteinId));
+        if (ProteomeList[ProteomeId].proteomeKey == -1){
+            ProteomeList[ProteomeId].proteomeKey = ProteomeId;
+        }
+    }
+}
+
+void initLocalClusterReps(size_t& id, std::vector<ClusterEntry>& localClusterReps, std::vector<unsigned int>& localMemCount, DBReader<unsigned int>& tProteinDB, DBReader<unsigned int>& linResDB, unsigned int thread_idx){
+    std::vector<unsigned int> memberKeys;
+    memberKeys.reserve(50); // store key for every protein in a cluster
+
+    std::vector<bool> isProteomeInCluster(localMemCount.size(), false); ;
+    char buffer[1024 + 32768*4];
+    char *clustData = linResDB.getData(id, thread_idx);
+    
+    while (*clustData != '\0') {
+        Util::parseKey(clustData, buffer);
+        const unsigned int key = (unsigned int) strtoul(buffer, NULL, 10);
+        memberKeys.push_back(key);
+        clustData = Util::skipLine(clustData);
+    }
+    if (memberKeys.size() > 1) { //If not a singleton cluster
+        ClusterEntry eachClusterRep(memberKeys.size());
+        //init MemberProteinEntry and add it to memberProteins vector
+        for (auto& eachMemberKey : memberKeys){
+            const unsigned int proteinId = tProteinDB.getId(eachMemberKey);
+            const unsigned int proteomeKey = tProteinDB.getLookupFileNumber(proteinId);
+            MemberProteinEntry mem;
+            mem.proteomeKey = proteomeKey;
+            mem.proteinKey = proteinId;
+            eachClusterRep.memberProteins.push_back(mem);
+            isProteomeInCluster[proteomeKey] = true;
+        }
+        localClusterReps.push_back(eachClusterRep);
+    }
+
+    for (size_t i = 0; i < localMemCount.size(); i++) {
+        if (isProteomeInCluster[i]) {
+            localMemCount[i]++;
+        }
+    }
+}
+
+bool FindNextRepProteome(std::vector<ProteomeEntry>& ProteomeList, unsigned int& RepProteomeId) {
+    bool isAllCovered = true;
+    
+    unsigned int maxMemberCount = 0;
+    unsigned int notCoveredProtCount = 0;
+
+    for (size_t idx = 0; idx < ProteomeList.size(); idx++){
+        if (ProteomeList[idx].isCovered()) {
+            continue;
+        }else{
+            isAllCovered = false;
+            notCoveredProtCount++;
+            if (ProteomeList[idx].memCount > maxMemberCount) {
+                maxMemberCount = ProteomeList[idx].memCount;
+                RepProteomeId = idx;
+            }
+        }
+    }
+
+    if (isAllCovered){
+        return false;
+    }else if (notCoveredProtCount == 1) {
+        //last one and it is singleton
+        ProteomeList[RepProteomeId].repProtKey = RepProteomeId; //todo 
+        ProteomeList[RepProteomeId].protSimScore = 1.0;
+        return false;
+    }else{
+        return true;
+    }
+}
+
+void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader<unsigned int>& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector<ProteomeEntry>& ProteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector<float>& localSeqIds, DBWriter& proteinClustWriter) {
+    char buffer[1024]; 
+    bool isRepFound = false;
+    unsigned int lastqLen = 0;
+    unsigned int qproteinKey = 0;
+    unsigned int qproteomeKey =0;
+    //find Rep
+    for (auto& eachMember : clusterRep.memberProteins){
+        if (eachMember.proteomeKey == RepProteomeId) {
+            isRepFound = true;
+            const unsigned int queryId = eachMember.proteinKey;
+            if( lastqLen < tProteinDB.getSeqLen(queryId)){
+                lastqLen = tProteinDB.getSeqLen(queryId);
+                qproteinKey = eachMember.proteinKey;
+                qproteomeKey = eachMember.proteomeKey;
+            }
+        }
+    }
+    if (isRepFound){
+        proteinClustWriter.writeStart(thread_idx);
+        const unsigned int queryId = qproteinKey;
+        const unsigned int queryKey = tProteinDB.getDbKey(queryId);
+        char* querySeq = tProteinDB.getData(queryId, thread_idx); 
+        query.mapSequence(queryId, queryKey, querySeq, tProteinDB.getSeqLen(queryId));
+        matcher.initQuery(&query);
+        const unsigned int queryProteomeLength = ProteomeList[qproteomeKey].proteomeAALen;
+
+        // representative protein :same query and target (for createtsv)
+        Matcher::result_t rep_reult = matcher.getSWResult(&query, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, true);
+        size_t rep_len = Matcher::resultToBuffer(buffer, rep_reult, par.addBacktrace);
+        proteinClustWriter.writeAdd(buffer, rep_len, thread_idx);
+        localSeqIds[qproteomeKey] += rep_reult.getSeqId()*query.L;
+
+        for (auto& eachTargetMember : clusterRep.memberProteins){
+            if (eachTargetMember.proteomeKey == RepProteomeId) {
+                continue;
+            }
+            // if query Proteome's length < target Proteome's length * 0.9, skip
+            const unsigned int targetProteomeLength = ProteomeList[eachTargetMember.proteomeKey].proteomeAALen;
+            if (queryProteomeLength < targetProteomeLength * 0.9) {
+                continue;
+            }
+            const unsigned int targetId = eachTargetMember.proteinKey;
+            unsigned int targetProteomeId = eachTargetMember.proteomeKey;
+            const unsigned int targetKey = tProteinDB.getDbKey(targetId);
+            char* targetSeq = tProteinDB.getData(targetId, thread_idx);
+            target.mapSequence(targetId, targetKey, targetSeq, tProteinDB.getSeqLen(targetId));
+
+            if (Util::canBeCovered(par.covThr, par.covMode, query.L, target.L) == false) {
+                continue;
+            };
+
+            const bool isIdentity = (queryId == targetId && par.includeIdentity) ? true : false;
+            Matcher::result_t result = matcher.getSWResult(&target, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, isIdentity);
+
+            if (Alignment::checkCriteria(result, isIdentity, par.evalThr, par.seqIdThr, par.alnLenThr, par.covMode, par.covThr)) {
+                if (query.L >= target.L*0.9){
+                    size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace);
+                    proteinClustWriter.writeAdd(buffer, len, thread_idx);
+                    localSeqIds[targetProteomeId] += result.getSeqId()*target.L;
+                }
+            }
+        }
+        proteinClustWriter.writeEnd(queryKey, thread_idx);
+    }
+}
+
+bool updateProteomeList(std::vector<ProteomeEntry>& ProteomeList, const unsigned int& RepProteomeId){
+    bool isRepSingleton = true;
+
+    // #pragma omp for schedule(dynamic, 1)
+    for (size_t i = 0; i < ProteomeList.size(); i++) {
+        if (ProteomeList[i].isCovered() == false) {
+            if (i == RepProteomeId){
+                ProteomeList[i].repProtKey = RepProteomeId;
+                ProteomeList[i].protSimScore = 1.0;
+            }else{
+                ProteomeList[i].computeRedundancy();
+                if (ProteomeList[i].getProtSimScore() >= 0.9) {
+                ProteomeList[i].repProtKey = RepProteomeId;
+                isRepSingleton = false;
+                }else{
+                    ProteomeList[i].resetProteomeInfo();
+                }
+            }
+        }
+    }
+    return isRepSingleton;
+}
+
+void updateClusterReps(ClusterEntry& clusterRep, std::vector<ProteomeEntry>& ProteomeList, std::vector<unsigned int>& localMemCount){
+    std::vector<bool> isProteomeInCluster(localMemCount.size(), false);
+    if (clusterRep.isAvailable) {
+        bool isAllMemberCovered = true;
+        unsigned int notCoveredCount = 0;
+        unsigned int lastProteomeKey = 0;
+        //update cluster member info
+        for (auto& eachMember : clusterRep.memberProteins) {
+            if (ProteomeList[eachMember.proteomeKey].isCovered() == false) {
+                notCoveredCount++;
+                lastProteomeKey = eachMember.proteomeKey;
+                isAllMemberCovered = false;
+                isProteomeInCluster[eachMember.proteomeKey] = true;
+            }
+        }
+        if (isAllMemberCovered) {
+            clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false;
+        }
+        if (notCoveredCount == 1) { //singleton Cluster
+                isProteomeInCluster[lastProteomeKey] = false;
+                clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false;
+        }
+    }
+
+    //update localMemCount
+    if (clusterRep.isAvailable) {
+        for (size_t i=0; i < localMemCount.size(); i++) {
+            if (isProteomeInCluster[i]) {
+                localMemCount[i]++;
+            }
+        }
+    }
+
+}
+
+
+void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector<ProteomeEntry> &ProteomeList) {
+    std::vector<size_t> proteomeClusterIdx;
+    char proteomeBuffer[1024];
+    int repProtIdCluster = ProteomeList[0].getRepProtKey();
+
+    for (size_t idx = 0; idx < ProteomeList.size(); idx++) {
+        int repProtId = ProteomeList[idx].getRepProtKey();
+        if (repProtIdCluster != repProtId) {
+            proteomeClustWriter.writeStart();
+            for (auto &eachIdx : proteomeClusterIdx) {
+                char *basePos = proteomeBuffer;
+                char *tmpProteomeBuffer = Itoa::i32toa_sse2(ProteomeList[eachIdx].getProteomeKey(), proteomeBuffer);
+                *(tmpProteomeBuffer - 1) = '\t';
+                tmpProteomeBuffer = Util::fastSeqIdToBuffer(ProteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer);
+                *(tmpProteomeBuffer - 1) = '\n';
+                proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos);
+            }
+            proteomeClustWriter.writeEnd(repProtIdCluster);
+            // Reset
+            repProtIdCluster = repProtId;
+            proteomeClusterIdx.clear();
+            proteomeClusterIdx.push_back(idx);
+        } else {
+            proteomeClusterIdx.push_back(idx);
+        }
+
+        if (idx == ProteomeList.size() - 1) {
+            proteomeClustWriter.writeStart();
+            for (auto &eachIdx : proteomeClusterIdx) {
+                char *basePos = proteomeBuffer;
+                char *tmpProteomeBuffer = Itoa::i32toa_sse2(ProteomeList[eachIdx].getProteomeKey(), proteomeBuffer);
+                *(tmpProteomeBuffer - 1) = '\t';
+                tmpProteomeBuffer = Util::fastSeqIdToBuffer(ProteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer);
+                *(tmpProteomeBuffer - 1) = '\n';
+                proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos);
+            }
+            proteomeClustWriter.writeEnd(repProtIdCluster);
+            proteomeClusterIdx.clear();
+        }
+    }
+}
+
+
+int proteomecluster(int argc, const char **argv, const Command &command){
+    //Initialize parameters
+    Parameters &par = Parameters::getInstance();
+    par.overrideParameterDescription(par.PARAM_ALIGNMENT_MODE, "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id", NULL, 0);
+    par.parseParameters(argc, argv, command, true, 0, 0);
+
+    if (par.alignmentMode == Parameters::ALIGNMENT_MODE_UNGAPPED) {
+        Debug(Debug::ERROR) << "Use rescorediagonal for ungapped alignment mode.\n";
+        EXIT(EXIT_FAILURE);
+    }
+    if (par.addBacktrace == true) {
+        par.alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
+    }
+    
+    unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr);
+
+    //Open the target protein database
+    DBReader<unsigned int> tProteinDB(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_LOOKUP|DBReader<unsigned int>::USE_SOURCE);
+    tProteinDB.open(DBReader<unsigned int>::NOSORT);
+    const int tProteinSeqType = tProteinDB.getDbtype();
+
+    if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) {
+        tProteinDB.readMmapedDataInMemory();
+    }
+    //Open lookup table to get the source of the protein and the protein length
+    DBReader<unsigned int>::LookupEntry* tLookup = tProteinDB.getLookup();
+    const size_t tLookupSize = tProteinDB.getLookupSize();
+    unsigned int totalProteomeNumber = tLookup[tLookupSize - 1].fileNumber;
+    std::vector<ProteomeEntry> ProteomeList(totalProteomeNumber + 1);
+    calculateProteomeLength(ProteomeList, tLookup, tLookupSize, tProteinDB);
+
+    //Open the linclust result
+    DBReader<unsigned int> linResDB(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
+    linResDB.open(DBReader<unsigned int>::LINEAR_ACCCESS);
+
+    //Open the DBWriter
+    DBWriter proteinClustWriter(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_GENERIC_DB);
+    proteinClustWriter.open();
+    int proteomeDBType = DBReader<unsigned int>::setExtendedDbtype(Parameters::DBTYPE_GENERIC_DB, Parameters::DBTYPE_EXTENDED_SRC_IDENTIFIER);
+    DBWriter proteomeClustWriter(par.db4.c_str(), par.db4Index.c_str(), 1, par.compressed, proteomeDBType);
+    proteomeClustWriter.open();
+
+    std::vector<ClusterEntry> clusterReps; 
+
+    int gapOpen, gapExtend;
+    BaseMatrix *subMat;
+    subMat = new SubstitutionMatrix(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias);
+    gapOpen = par.gapOpen.values.aminoacid();
+    gapExtend = par.gapExtend.values.aminoacid();
+    EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), subMat, gapOpen, gapExtend);
+
+    // Debug(Debug::INFO) << "Start Initialization\n";
+    #pragma omp parallel
+    {
+        unsigned int thread_idx = 0;
+    #ifdef OPENMP
+        thread_idx = (unsigned int) omp_get_thread_num();
+    #endif    
+        std::vector<ClusterEntry> localClusterReps;
+        std::vector<unsigned int> localMemCount(ProteomeList.size(), 0);
+
+        #pragma omp for schedule(dynamic, 1)
+        for (size_t id = 0; id < linResDB.getSize(); id++) {
+            initLocalClusterReps(id, localClusterReps, localMemCount, tProteinDB, linResDB, thread_idx);
+        }
+
+        #pragma omp critical
+        {
+            for (size_t idx=0; idx < localMemCount.size(); idx++){
+                ProteomeList[idx].incrementMemCount(localMemCount[idx]);
+            }
+            clusterReps.insert(clusterReps.end(),
+                               std::make_move_iterator(localClusterReps.begin()),
+                               std::make_move_iterator(localClusterReps.end()));
+        }
+
+    }
+
+    unsigned int RepProteomeId = -1;
+    // Debug(Debug::INFO) << "Run Alignment\n";
+
+    while (FindNextRepProteome(ProteomeList, RepProteomeId)) {
+        #pragma omp parallel
+        {
+            unsigned int thread_idx = 0;
+        #ifdef OPENMP
+            thread_idx = (unsigned int) omp_get_thread_num();
+        #endif  
+            Matcher matcher(tProteinSeqType, tProteinSeqType, par.maxSeqLen, subMat, &evaluer, par.compBiasCorrection, par.compBiasCorrectionScale, gapOpen, gapExtend, 0.0, par.zdrop);
+            Sequence query(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection);
+            Sequence target(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection);
+            std::vector <float> localSeqIds(clusterReps.size(), 0.0f);
+            #pragma omp for schedule(dynamic, 1)
+            for (size_t i = 0; i < clusterReps.size(); i++) {
+                if (clusterReps[i].isAvailable) {
+                    runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, ProteomeList, par, thread_idx, swMode, localSeqIds, proteinClustWriter);
+                }
+                
+            }
+            #pragma omp critical
+            {
+                for (size_t idx = 0; idx < ProteomeList.size(); idx++) {
+                    ProteomeList[idx].addSeqId(localSeqIds[idx]);
+                }
+            }
+        }
+
+        // Debug(Debug::INFO) << "2. Update ProteomeDB. Calculate the similarity score and check redundancy | Rep Proteome id : " << RepProteomeId << "\n";
+        bool isRepSingleton = updateProteomeList(ProteomeList, RepProteomeId);
+
+        if (isRepSingleton) {
+            ProteomeList[RepProteomeId].repProtKey = RepProteomeId;
+            ProteomeList[RepProteomeId].protSimScore = 1.0;
+        }
+
+        // Debug(Debug::INFO) << "3. Re-Setup Proteome and ClusterReps\n";
+        #pragma omp parallel
+        {
+            std::vector<unsigned int> localMemCount(ProteomeList.size(), 0);
+            #pragma omp for schedule(dynamic, 1)
+            for (size_t i = 0; i < clusterReps.size(); i++){
+                updateClusterReps(clusterReps[i], ProteomeList, localMemCount);
+            }
+            
+            #pragma omp critical
+            {
+                for (size_t i = 0; i < ProteomeList.size(); i++) {
+                    ProteomeList[i].incrementMemCount(localMemCount[i]);
+                }
+            }
+        }
+    }
+    //sort ProteomeList by repProtKey
+    SORT_PARALLEL(ProteomeList.begin(), ProteomeList.end(), ProteomeEntry::compareByKey);
+
+    //write result of proteome clustering
+    writeProteomeClusters(proteomeClustWriter, ProteomeList);
+
+    proteomeClustWriter.close();
+    proteinClustWriter.close();
+    tProteinDB.close();
+    delete subMat;
+    linResDB.close();
+    return EXIT_SUCCESS;
+}
\ No newline at end of file

From 3f4fc91338f92616fd4b528181f154116cb91740 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Tue, 27 Aug 2024 22:58:23 +0900
Subject: [PATCH 07/31] easy proteomecluster workflow

---
 data/workflow/easyproteomecluster.sh | 64 ++++++++++++++++++++++++++++
 src/workflow/EasyProteomecluster.cpp | 52 ++++++++++++++++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 data/workflow/easyproteomecluster.sh
 create mode 100644 src/workflow/EasyProteomecluster.cpp

diff --git a/data/workflow/easyproteomecluster.sh b/data/workflow/easyproteomecluster.sh
new file mode 100644
index 000000000..05b4af940
--- /dev/null
+++ b/data/workflow/easyproteomecluster.sh
@@ -0,0 +1,64 @@
+#!/bin/sh -e
+fail() {
+    echo "Error: $1"
+    exit 1
+}
+
+notExists() {
+   [ ! -f "$1" ]
+}
+
+
+if notExists "${TMP_PATH}/input.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createdb "$@" "${TMP_PATH}/input" ${CREATEDB_PAR} \
+        || fail "query createdb died"
+fi
+
+if notExists "${TMP_PATH}/clu.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" linclust "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/clu_tmp" ${CLUSTER_PAR} \
+        || fail "Search died"
+fi
+
+if notExists "${TMP_PATH}/aln.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" ${PROTEOMECLUSTER_PAR} \
+        || fail "Convert Alignments died"
+fi
+
+if notExists "${RESULTS}_protein_cluster.tsv"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/clu" "${RESULTS}_protein_cluster.tsv" ${THREADS_PAR} \
+            || fail "createtsv protein cluster died"
+fi
+
+if notExists "${RESULTS}_protein_align.tsv"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_protein" "${RESULTS}_protein_align.tsv" ${THREADS_PAR} \
+            || fail "createtsv protein align died"
+fi
+
+if notExists "${RESULTS}_proteome_cluster.tsv"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome" "${RESULTS}_proteome_cluster.tsv" ${THREADS_PAR} \
+            || fail "createtsv proteome cluster died"
+fi
+
+
+if [ -n "${REMOVE_TMP}" ]; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/clu" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/aln_protein" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/aln_proteome" ${VERBOSITY_PAR}
+    rm -rf "${TMP_PATH}/clu_tmp"
+    rm -f "${TMP_PATH}/easyproteomecluster.sh"
+fi
diff --git a/src/workflow/EasyProteomecluster.cpp b/src/workflow/EasyProteomecluster.cpp
new file mode 100644
index 000000000..c30b4209a
--- /dev/null
+++ b/src/workflow/EasyProteomecluster.cpp
@@ -0,0 +1,52 @@
+#include <cassert>
+
+#include "FileUtil.h"
+#include "CommandCaller.h"
+#include "Util.h"
+#include "Debug.h"
+#include "Parameters.h"
+
+#include "easyproteomecluster.sh.h"
+
+void setEasyproteomeclusterDefaults(Parameters *p) {
+    p->proteomeSimThr = 0.9;
+}
+
+void setEasyproteomeclusterMustPassAlong(Parameters *p){
+    p->PARAM_REMOVE_TMP_FILES.wasSet = true;
+
+}
+
+int easyproteomecluster(int argc, const char **argv, const Command &command) {
+    Parameters &par = Parameters::getInstance();
+    setEasyproteomeclusterDefaults(&par);
+    par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
+    setEasyproteomeclusterMustPassAlong(&par);
+    std::string tmpDir = par.filenames.back();
+    std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params));
+    if (par.reuseLatest) {
+        hash = FileUtil::getHashFromSymLink(tmpDir + "/latest");
+    }
+    tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash);
+    par.filenames.pop_back();
+
+    CommandCaller cmd;
+    cmd.addVariable("TMP_PATH", tmpDir.c_str());
+    cmd.addVariable("RESULTS", par.filenames.back().c_str());
+    par.filenames.pop_back();
+    cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
+
+    cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.createdb).c_str());
+    cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.clusterworkflow, true).c_str());
+    cmd.addVariable("PROTEOMECLUSTER_PAR", par.createParameterString(par.proteomecluster).c_str());
+    cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str());
+    cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str());
+
+    std::string program = tmpDir + "/easyproteomecluster.sh";
+    FileUtil::writeFile(program, easyproteomecluster_sh, easyproteomecluster_sh_len);
+    cmd.execProgram(program.c_str(), par.filenames);
+    
+    // Should never get here
+    assert(false);
+    return 0;
+}

From eeabad3b995f21f874d14dd6a40f1ff5b24bad98 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Tue, 27 Aug 2024 22:59:09 +0900
Subject: [PATCH 08/31] indexreader - handle source identifier

---
 src/commons/IndexReader.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/commons/IndexReader.h b/src/commons/IndexReader.h
index 17d637f12..0ad0ca15e 100644
--- a/src/commons/IndexReader.h
+++ b/src/commons/IndexReader.h
@@ -22,7 +22,7 @@ class IndexReader {
     ) : sequenceReader(NULL), index(NULL) {
         int targetDbtype = FileUtil::parseDbType(dataName.c_str());
         if (Parameters::isEqualDbtype(targetDbtype, Parameters::DBTYPE_INDEX_DB)) {
-            index = new DBReader<unsigned int>(dataName.c_str(), (dataName + ".index").c_str(), 1, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
+            new DBReader<unsigned int>(dataName.c_str(), (dataName + ".index").c_str(), 1, dataMode);
             index->open(DBReader<unsigned int>::NOSORT);
             if (PrefilteringIndexReader::checkIfIndexFile(index)) {
                 PrefilteringIndexReader::printSummary(index);
@@ -95,6 +95,8 @@ class IndexReader {
                     }else{
                         failSuffix = "_aln";
                     }
+                } else if (databaseType & SOURCE) {
+                    failSuffix = "";
                 }
             }
             sequenceReader = new DBReader<unsigned int>(
@@ -115,6 +117,7 @@ class IndexReader {
     static const unsigned int SRC_HEADERS = 4;
     static const unsigned int SRC_SEQUENCES =  8;
     static const unsigned int ALIGNMENTS = 16;
+    static const unsigned int SOURCE = 32;
     static const unsigned int USER_SELECT = 1 << 31;
 
     static unsigned int makeUserDatabaseType(unsigned int baseKey) {

From 9996821a421d6cbff77d777450da00a4106ad5d6 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Tue, 27 Aug 2024 23:03:14 +0900
Subject: [PATCH 09/31] update mmseqsbase and change module name into
 proteomecluster

---
 src/commons/DBReader.cpp   |  9 +++++
 src/commons/DBReader.h     |  3 +-
 src/commons/DBWriter.h     |  1 +
 src/commons/Parameters.cpp | 71 +++++++++++++++++++-------------------
 src/commons/Parameters.h   |  7 ++--
 5 files changed, 50 insertions(+), 41 deletions(-)

diff --git a/src/commons/DBReader.cpp b/src/commons/DBReader.cpp
index 80bca13c5..598d152f6 100644
--- a/src/commons/DBReader.cpp
+++ b/src/commons/DBReader.cpp
@@ -733,6 +733,15 @@ template <typename T> std::string DBReader<T>::getSourceFileName (size_t id){
     return source[id].fileName;
 }
 
+template <typename T> T DBReader<T>::getSourceKey(size_t id){
+    if (id >= sourceSize){
+        Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".lookup\n";
+        Debug(Debug::ERROR) << "getSource id: local id (" << id << ") >= db size (" << sourceSize << ")\n";
+        EXIT(EXIT_FAILURE);
+    }
+    return source[id].id;
+}
+
 template<>
 void DBReader<unsigned int>::lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry) {
     buffer.append(SSTR(entry.id));
diff --git a/src/commons/DBReader.h b/src/commons/DBReader.h
index 81d28bdf1..2c61664e1 100644
--- a/src/commons/DBReader.h
+++ b/src/commons/DBReader.h
@@ -187,8 +187,6 @@ class DBReader : public MemoryTracker {
 
     size_t getSize() const;
 
-    unsigned int getProteomeTotalLen(size_t id); 
-
     unsigned int getMaxSeqLen(){ 
             return (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_HMM_PROFILE ) ) ?
                     (std::max(maxSeqLen, 1u)) / Sequence::PROFILE_READIN_SIZE :
@@ -254,6 +252,7 @@ class DBReader : public MemoryTracker {
     LookupEntry* getLookup() { return lookup; };
 
     std::string getSourceFileName(size_t id);
+    T getSourceKey(size_t id);
     static const int NOSORT = 0;
     static const int SORT_BY_LENGTH = 1;
     static const int LINEAR_ACCCESS = 2;
diff --git a/src/commons/DBWriter.h b/src/commons/DBWriter.h
index 190015cb3..21e5377bb 100644
--- a/src/commons/DBWriter.h
+++ b/src/commons/DBWriter.h
@@ -66,6 +66,7 @@ class DBWriter : public MemoryTracker  {
     }
 
     static void sortIndex(const char *inFileNameIndex, const char *outFileNameIndex, const bool lexicographicOrder);
+    
 private:
     size_t addToThreadBuffer(const void *data, size_t itmesize, size_t nitems, int threadIdx);
     void writeThreadBuffer(unsigned int idx, size_t dataSize);
diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp
index 6cb54999b..a2db3ef47 100644
--- a/src/commons/Parameters.cpp
+++ b/src/commons/Parameters.cpp
@@ -340,7 +340,7 @@ Parameters::Parameters():
     alignall.push_back(&PARAM_SUB_MAT);
     alignall.push_back(&PARAM_ADD_BACKTRACE);
     alignall.push_back(&PARAM_ALIGNMENT_MODE);
-    alignall.push_back(&PARAM_REALIGN_SCORE_BIAS); //gyuri
+    alignall.push_back(&PARAM_REALIGN_SCORE_BIAS);
 //    alignall.push_back(&PARAM_WRAPPED_SCORING);
     alignall.push_back(&PARAM_E);
     alignall.push_back(&PARAM_MIN_SEQ_ID);
@@ -368,39 +368,38 @@ Parameters::Parameters():
     alignall.push_back(&PARAM_COMPRESSED);
     alignall.push_back(&PARAM_V);
 
-    //alignproteome
-        // alignproteome
-    alignproteome.push_back(&PARAM_SUB_MAT);
-    alignproteome.push_back(&PARAM_ADD_BACKTRACE);
-    alignproteome.push_back(&PARAM_ALIGNMENT_MODE);
-    alignproteome.push_back(&PARAM_REALIGN_SCORE_BIAS); //gyuri
-//    alignproteome.push_back(&PARAM_WRAPPED_SCORING);
-    alignproteome.push_back(&PARAM_E);
-    alignproteome.push_back(&PARAM_MIN_SEQ_ID);
-    alignproteome.push_back(&PARAM_MIN_ALN_LEN);
-    alignproteome.push_back(&PARAM_SEQ_ID_MODE);
-//    alignproteome.push_back(&PARAM_ALT_ALIGNMENT);
-    alignproteome.push_back(&PARAM_C);
-    alignproteome.push_back(&PARAM_COV_MODE);
-    alignproteome.push_back(&PARAM_MAX_SEQ_LEN);
-    alignproteome.push_back(&PARAM_NO_COMP_BIAS_CORR);
-    alignproteome.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
-    alignproteome.push_back(&PARAM_PROTEOME_SIMILARITY);
-
-    alignproteome.push_back(&PARAM_REALIGN);
-//    alignproteome.push_back(&PARAM_MAX_REJECTED);
-//    alignproteome.push_back(&PARAM_MAX_ACCEPT);
-    alignproteome.push_back(&PARAM_INCLUDE_IDENTITY);
-    alignproteome.push_back(&PARAM_PRELOAD_MODE);
-    alignproteome.push_back(&PARAM_PCA);
-    alignproteome.push_back(&PARAM_PCB);
-    alignproteome.push_back(&PARAM_SCORE_BIAS);
-    alignproteome.push_back(&PARAM_GAP_OPEN);
-    alignproteome.push_back(&PARAM_GAP_EXTEND);
-    alignproteome.push_back(&PARAM_ZDROP);
-    alignproteome.push_back(&PARAM_THREADS);
-    alignproteome.push_back(&PARAM_COMPRESSED);
-    alignproteome.push_back(&PARAM_V);
+    //proteomecluster
+    proteomecluster.push_back(&PARAM_SUB_MAT);
+    proteomecluster.push_back(&PARAM_ADD_BACKTRACE);
+    proteomecluster.push_back(&PARAM_ALIGNMENT_MODE);
+    proteomecluster.push_back(&PARAM_REALIGN_SCORE_BIAS); 
+//    proteomecluster.push_back(&PARAM_WRAPPED_SCORING);
+    proteomecluster.push_back(&PARAM_E);
+    proteomecluster.push_back(&PARAM_MIN_SEQ_ID);
+    proteomecluster.push_back(&PARAM_MIN_ALN_LEN);
+    proteomecluster.push_back(&PARAM_SEQ_ID_MODE);
+//    proteomecluster.push_back(&PARAM_ALT_ALIGNMENT);
+    proteomecluster.push_back(&PARAM_C);
+    proteomecluster.push_back(&PARAM_COV_MODE);
+    proteomecluster.push_back(&PARAM_MAX_SEQ_LEN);
+    proteomecluster.push_back(&PARAM_NO_COMP_BIAS_CORR);
+    proteomecluster.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
+    proteomecluster.push_back(&PARAM_PROTEOME_SIMILARITY);
+
+    proteomecluster.push_back(&PARAM_REALIGN);
+//    proteomecluster.push_back(&PARAM_MAX_REJECTED);
+//    proteomecluster.push_back(&PARAM_MAX_ACCEPT);
+    proteomecluster.push_back(&PARAM_INCLUDE_IDENTITY);
+    proteomecluster.push_back(&PARAM_PRELOAD_MODE);
+    proteomecluster.push_back(&PARAM_PCA);
+    proteomecluster.push_back(&PARAM_PCB);
+    proteomecluster.push_back(&PARAM_SCORE_BIAS);
+    proteomecluster.push_back(&PARAM_GAP_OPEN);
+    proteomecluster.push_back(&PARAM_GAP_EXTEND);
+    proteomecluster.push_back(&PARAM_ZDROP);
+    proteomecluster.push_back(&PARAM_THREADS);
+    proteomecluster.push_back(&PARAM_COMPRESSED);
+    proteomecluster.push_back(&PARAM_V);
     
 
     // alignment
@@ -1349,8 +1348,8 @@ Parameters::Parameters():
     // easylinclustworkflow
     easylinclustworkflow = combineList(linclustworkflow, createdb);
 
-    // easyalignproteomeworkflow
-    easyalignproteome = combineList(easylinclustworkflow, alignproteome);
+    // easyproteomeclusterworkflow
+    easyproteomeclusterworkflow = combineList(easylinclustworkflow, proteomecluster);
 
 
     // clustering workflow
diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h
index ee825291e..b23b771cd 100644
--- a/src/commons/Parameters.h
+++ b/src/commons/Parameters.h
@@ -90,6 +90,7 @@ class Parameters {
     static const unsigned int DBTYPE_EXTENDED_COMPRESSED = 1;
     static const unsigned int DBTYPE_EXTENDED_INDEX_NEED_SRC = 2;
     static const unsigned int DBTYPE_EXTENDED_CONTEXT_PSEUDO_COUNTS = 4;
+    static const unsigned int DBTYPE_EXTENDED_SRC_IDENTIFIER = 8;
 
     // don't forget to add new database types to DBReader::getDbTypeName and Parameters::PARAM_OUTPUT_DBTYPE
 
@@ -434,7 +435,7 @@ class Parameters {
     int    maxAccept;                    // after n accepted sequences stop
     int    altAlignment;                 // show up to this many alternative alignments
     float  seqIdThr;                     // sequence identity threshold for acceptance
-    float  proteomeSimThr;
+    float  proteomeSimThr;              // proteome similarity threshold for acceptance
     int    alnLenThr;                    // min. alignment length
     bool   addBacktrace;                 // store backtrace string (M=Match, D=deletion, I=insertion)
     bool   realign;                      // realign hit with more conservative score
@@ -1080,7 +1081,7 @@ class Parameters {
     std::vector<MMseqsParameter*> threadsandcompression;
 
     std::vector<MMseqsParameter*> alignall;
-    std::vector<MMseqsParameter*> alignproteome;
+    std::vector<MMseqsParameter*> proteomecluster;
     std::vector<MMseqsParameter*> align;
     std::vector<MMseqsParameter*> rescorediagonal;
     std::vector<MMseqsParameter*> alignbykmer;
@@ -1118,7 +1119,7 @@ class Parameters {
     std::vector<MMseqsParameter*> kmersearch;
     std::vector<MMseqsParameter*> countkmer;
     std::vector<MMseqsParameter*> easylinclustworkflow;
-    std::vector<MMseqsParameter*> easyalignproteome;
+    std::vector<MMseqsParameter*> easyproteomeclusterworkflow;
     std::vector<MMseqsParameter*> linclustworkflow;
     std::vector<MMseqsParameter*> easysearchworkflow;
     std::vector<MMseqsParameter*> searchworkflow;

From 27d9f6802746ba802bf1cfa7537fa1ffcb084947 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Wed, 28 Aug 2024 14:16:44 +0900
Subject: [PATCH 10/31] set alignment mode default

---
 src/workflow/EasyProteomecluster.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/workflow/EasyProteomecluster.cpp b/src/workflow/EasyProteomecluster.cpp
index c30b4209a..f2c2a2ae1 100644
--- a/src/workflow/EasyProteomecluster.cpp
+++ b/src/workflow/EasyProteomecluster.cpp
@@ -9,11 +9,13 @@
 #include "easyproteomecluster.sh.h"
 
 void setEasyproteomeclusterDefaults(Parameters *p) {
+    p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
     p->proteomeSimThr = 0.9;
 }
 
 void setEasyproteomeclusterMustPassAlong(Parameters *p){
     p->PARAM_REMOVE_TMP_FILES.wasSet = true;
+    p->PARAM_ALIGNMENT_MODE.wasSet = true;
 
 }
 

From e2ef2290befcd032ae942372a895bcfb2f414232 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Wed, 28 Aug 2024 17:32:11 +0900
Subject: [PATCH 11/31] latest

---
 data/workflow/CMakeLists.txt         |   2 +-
 data/workflow/easyalignproteome.sh   |  53 ----
 data/workflow/easyproteomecluster.sh |  12 +-
 src/CommandDeclarations.h            |   4 +-
 src/MMseqsBase.cpp                   |  32 +-
 src/commons/Parameters.cpp           |   3 +-
 src/util/CMakeLists.txt              |   2 +-
 src/util/alignproteome.cpp           | 443 ---------------------------
 src/util/proteomecluster.cpp         | 123 ++++----
 src/workflow/CMakeLists.txt          |   2 +-
 src/workflow/EasyAlignproteome.cpp   |  41 ---
 src/workflow/EasyLinclust.cpp        |   1 -
 12 files changed, 94 insertions(+), 624 deletions(-)
 delete mode 100644 data/workflow/easyalignproteome.sh
 delete mode 100644 src/util/alignproteome.cpp
 delete mode 100644 src/workflow/EasyAlignproteome.cpp

diff --git a/data/workflow/CMakeLists.txt b/data/workflow/CMakeLists.txt
index 3e94e1817..3f006a286 100644
--- a/data/workflow/CMakeLists.txt
+++ b/data/workflow/CMakeLists.txt
@@ -3,7 +3,7 @@ set(GENERATED_WORKFLOWS
         workflow/easycluster.sh
         workflow/easytaxonomy.sh
         workflow/easyrbh.sh
-        workflow/easyalignproteome.sh
+        workflow/easyproteomecluster.sh
         workflow/blastp.sh
         workflow/blastpgp.sh
         workflow/map.sh
diff --git a/data/workflow/easyalignproteome.sh b/data/workflow/easyalignproteome.sh
deleted file mode 100644
index b27b8187a..000000000
--- a/data/workflow/easyalignproteome.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/sh -e
-fail() {
-    echo "Error: $1"
-    exit 1
-}
-
-notExists() {
-   [ ! -f "$1" ]
-}
-
-
-if notExists "${TMP_PATH}/input.dbtype"; then
-    # shellcheck disable=SC2086
-    "$MMSEQS" createdb "$@" "${TMP_PATH}/input" ${CREATEDB_PAR} \
-        || fail "query createdb died"
-fi
-
-if notExists "${TMP_PATH}/clu.dbtype"; then
-    # shellcheck disable=SC2086
-    "$MMSEQS" linclust "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/clu_tmp" ${CLUSTER_PAR} \
-        || fail "Search died"
-fi
-
-if notExists "${TMP_PATH}/aln.dbtype"; then
-    # shellcheck disable=SC2086
-    "$MMSEQS" alignproteome "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" ${ALIGNPROTEOME_PAR} \
-        || fail "Convert Alignments died"
-fi
-
-if notExists "${RESULTS}_protein_cluster.tsv"; then
-    # shellcheck disable=SC2086
-    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/clu" "${RESULTS}_protein_cluster.tsv" ${THREADS_PAR} \
-            || fail "createtsv protein died"
-fi
-
-if notExists "${RESULTS}_proteome_cluster.tsv"; then
-    # shellcheck disable=SC2086
-    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome" "${RESULTS}_proteome_cluster.tsv" ${THREADS_PAR} \
-            || fail "createtsv proteome died"
-fi
-
-if [ -n "${REMOVE_TMP}" ]; then
-    # shellcheck disable=SC2086
-    "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR}
-    # shellcheck disable=SC2086
-    "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR}
-    # shellcheck disable=SC2086
-    "$MMSEQS" rmdb "${TMP_PATH}/clu" ${VERBOSITY_PAR}
-    # shellcheck disable=SC2086
-    "$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY_PAR}
-    rm -rf "${TMP_PATH}/clu_tmp"
-    rm -f "${TMP_PATH}/easyproteomecluster.sh"
-fi
diff --git a/data/workflow/easyproteomecluster.sh b/data/workflow/easyproteomecluster.sh
index 05b4af940..2bab198fc 100644
--- a/data/workflow/easyproteomecluster.sh
+++ b/data/workflow/easyproteomecluster.sh
@@ -21,18 +21,18 @@ if notExists "${TMP_PATH}/clu.dbtype"; then
         || fail "Search died"
 fi
 
-if notExists "${TMP_PATH}/aln.dbtype"; then
-    # shellcheck disable=SC2086
-    "$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" ${PROTEOMECLUSTER_PAR} \
-        || fail "Convert Alignments died"
-fi
-
 if notExists "${RESULTS}_protein_cluster.tsv"; then
     # shellcheck disable=SC2086
     "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/clu" "${RESULTS}_protein_cluster.tsv" ${THREADS_PAR} \
             || fail "createtsv protein cluster died"
 fi
 
+if notExists "${TMP_PATH}/aln.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" ${PROTEOMECLUSTER_PAR} \
+        || fail "Convert Alignments died"
+fi
+
 if notExists "${RESULTS}_protein_align.tsv"; then
     # shellcheck disable=SC2086
     "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_protein" "${RESULTS}_protein_align.tsv" ${THREADS_PAR} \
diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h
index 04fbd521d..35939c4e9 100644
--- a/src/CommandDeclarations.h
+++ b/src/CommandDeclarations.h
@@ -4,7 +4,7 @@
 
 extern int align(int argc, const char **argv, const Command& command);
 extern int alignall(int argc, const char **argv, const Command& command);
-extern int alignproteome(int argc, const char **argv, const Command& command);
+extern int proteomecluster(int argc, const char **argv, const Command& command);
 extern int alignbykmer(int argc, const char **argv, const Command& command);
 extern int appenddbtoindex(int argc, const char **argv, const Command& command);
 extern int apply(int argc, const char **argv, const Command& command);
@@ -44,7 +44,7 @@ extern int diffseqdbs(int argc, const char **argv, const Command& command);
 extern int easycluster(int argc, const char **argv, const Command& command);
 extern int easyrbh(int argc, const char **argv, const Command& command);
 extern int easylinclust(int argc, const char **argv, const Command& command);
-extern int easyalignproteome(int argc, const char **argv, const Command& command);
+extern int easyproteomecluster(int argc, const char **argv, const Command& command);
 extern int easysearch(int argc, const char **argv, const Command& command);
 extern int easylinsearch(int argc, const char **argv, const Command& command);
 extern int tsv2exprofiledb(int argc, const char **argv, const Command& command);
diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp
index 3b64eb556..5a6280da2 100644
--- a/src/MMseqsBase.cpp
+++ b/src/MMseqsBase.cpp
@@ -79,12 +79,19 @@ std::vector<Command> baseCommands = {
                 CITATION_MMSEQS2|CITATION_LINCLUST, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
                                                             {"clusterPrefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
                                                             {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
-        {"easy-alignproteome", easyalignproteome, &par.easyalignproteome, COMMAND_EASY,
-                "Proteome clustering and alignemnt",
-                "<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]>"
-                // CITATION_MMSEQS2,
-        
-        },
+        {"easy-proteomecluster", easyproteomecluster, &par.easyproteomeclusterworkflow, COMMAND_EASY,
+                "Proteome clustering and alignment",
+                "mmseqs easy-proteomecluster examples/DB.fasta result tmp\n\n"
+                "# ProteomeCluster output\n"
+                "#  - protein_cluster.tsv:  Protein linclust result\n"
+                "#  - proteome_cluster.tsv: Proteome redundancy clustering result\n"
+                "#  - protein_align.tsv: Protein alignment list\n"
+                "mmseqs easy-proteomecluster examples/DB.fasta result tmp --proteome-similarity 0.9\n",
+                "Martin Steinegger <martin.steinegger@snu.ac.kr> & Gyuri Kim <gyuribio@snu.ac.kr>",
+                "<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]> <o:clusterPrefix> <tmpDir>",
+                CITATION_MMSEQS2, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
+                                        {"outputReports", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                        {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
         {"easy-taxonomy",        easytaxonomy,         &par.easytaxonomy,         COMMAND_EASY,
                 "Taxonomic classification",
                 "# Assign taxonomic labels to FASTA sequences\n"
@@ -638,15 +645,15 @@ std::vector<Command> baseCommands = {
                                                            {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
                                                            {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
                                                            {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
-        {"alignproteome",             alignproteome,             &par.alignproteome,             COMMAND_ALIGNMENT,
+        {"proteomecluster",             proteomecluster,             &par.proteomecluster,             COMMAND_ALIGNMENT,
                 "Proteome clustering and alignment",
                 NULL,
                 "Martin Steinegger <martin.steinegger@snu.ac.kr> & Gyuri Kim <gyuribio@snu.ac.kr>",
-                "<i:sequenceDB> <i:resultDB> <o:alignmentDB>",
+                "<i:sequenceDB> <i:clustresultDB> <o:proteinAlignmentDB> <o:proteomeAlignmentDB>",
                 CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                                           {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                                           {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
-                                                           {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+                                                           {"clustresultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                                           {"proteinAlignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb},
+                                                           {"proteomeAlignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
         {"alignall",             alignall,             &par.alignall,             COMMAND_ALIGNMENT,
                 "Within-result all-vs-all gapped local alignment",
                 NULL,
@@ -1314,5 +1321,6 @@ std::vector<Command> baseCommands = {
                 NULL,
                 "",
                 "",
-                CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}}
+                CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},
+        
 };
diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp
index a2db3ef47..d142bdd82 100644
--- a/src/commons/Parameters.cpp
+++ b/src/commons/Parameters.cpp
@@ -353,7 +353,7 @@ Parameters::Parameters():
     alignall.push_back(&PARAM_NO_COMP_BIAS_CORR);
     alignall.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
 
-    alignall.push_back(&PARAM_REALIGN);
+//    alignall.push_back(&PARAM_REALIGN);
 //    alignall.push_back(&PARAM_MAX_REJECTED);
 //    alignall.push_back(&PARAM_MAX_ACCEPT);
     alignall.push_back(&PARAM_INCLUDE_IDENTITY);
@@ -1351,7 +1351,6 @@ Parameters::Parameters():
     // easyproteomeclusterworkflow
     easyproteomeclusterworkflow = combineList(easylinclustworkflow, proteomecluster);
 
-
     // clustering workflow
     clusterworkflow = combineList(prefilter, align);
     clusterworkflow = combineList(clusterworkflow, rescorediagonal);
diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
index 35d72b32f..6a5ee5a36 100644
--- a/src/util/CMakeLists.txt
+++ b/src/util/CMakeLists.txt
@@ -80,6 +80,6 @@ set(util_source_files
         util/proteinaln2nucl.cpp
         util/versionstring.cpp
         util/diskspaceavail.cpp
-        util/alignproteome.cpp
+        util/proteomecluster.cpp
         PARENT_SCOPE
         )
diff --git a/src/util/alignproteome.cpp b/src/util/alignproteome.cpp
deleted file mode 100644
index 3a7311b81..000000000
--- a/src/util/alignproteome.cpp
+++ /dev/null
@@ -1,443 +0,0 @@
-#include "Util.h"
-#include "Parameters.h"
-#include "Matcher.h"
-#include "Debug.h"
-#include "DBReader.h"
-#include "DBWriter.h"
-#include "NucleotideMatrix.h"
-#include "SubstitutionMatrix.h"
-#include "Alignment.h"
-#include "itoa.h"
-
-#ifdef OPENMP
-#include <omp.h>
-#endif
-
-struct __attribute__((__packed__)) ProteomeEntry{
-    unsigned int proteomeAALen;
-    int repProtKey;
-    float protSimScore;
-    unsigned int memCount;
-    float totalSeqId;
-
-    ProteomeEntry(unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f)
-        : proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) {}
-
-    void incrementMemCount(int count) {
-        memCount += count;
-    }
-    void decrementmemCount() {
-        memCount--;
-    }
-
-    int getRepProtKey() {
-        return repProtKey;
-    }
-
-    bool isCovered(){
-        if (repProtKey == -1) {
-            return false;
-        }
-        return true;
-    }
-
-    void computeRedundancy() {
-        protSimScore = totalSeqId / proteomeAALen;
-
-    }
-
-    void addSeqId(float seqId) {
-        totalSeqId += seqId;
-    }
-
-    void addSeqLen(unsigned int seqLength) {
-        proteomeAALen += seqLength;
-    }
-
-    void resetProteomeInfo(){
-        memCount = 0;
-        totalSeqId = 0.0f;
-    }
-
-    float getprotSimScore() {
-        return protSimScore;
-    }
-};
-
-struct __attribute__((__packed__)) MemberProteinEntry{
-    unsigned int proteomeKey;
-    unsigned int proteinKey;
-};
-
-struct ClusterEntry {
-    bool isAvailable;
-    std::vector<MemberProteinEntry> memberProteins;
-
-    ClusterEntry() : isAvailable(false) {}
-
-    ClusterEntry(size_t memberProteinSize) : isAvailable(true) {
-        memberProteins.reserve(memberProteinSize);
-    }
-
-    void resetClusterInfo(){
-        memberProteins.clear();
-        isAvailable = false;
-    }
-};
-
-void calculateProteomeLength(std::vector<ProteomeEntry>& ProteomeList, DBReader<unsigned int>::LookupEntry* const& lookup, size_t lookupSize, DBReader<unsigned int>& tProteinDB) {
-    for (size_t i = 0; i < lookupSize; i++) {
-        const unsigned int ProteomeId = lookup[i].fileNumber;
-        const unsigned int ProteinId = lookup[i].id;
-        ProteomeList[ProteomeId].addSeqLen(tProteinDB.getSeqLen(ProteinId));
-    }
-}
-
-void initLocalClusterReps(size_t& id, std::vector<ClusterEntry>& localClusterReps, std::vector<unsigned int>& localMemCount, DBReader<unsigned int>& tProteinDB, DBReader<unsigned int>& linResDB, unsigned int thread_idx){
-    std::vector<unsigned int> memberKeys;
-    char buffer[1024 + 32768*4]; ///why?
-    memberKeys.reserve(50); // store key for every protein in a cluster
-    char *clustData = linResDB.getData(id, thread_idx);
-    while (*clustData != '\0') {
-        Util::parseKey(clustData, buffer);
-        const unsigned int key = (unsigned int) strtoul(buffer, NULL, 10);
-        memberKeys.push_back(key);
-        clustData = Util::skipLine(clustData);
-    }
-    if (memberKeys.size() > 1) { //IF Not a singleton cluster
-        ClusterEntry eachClusterRep(memberKeys.size());
-
-        for (auto& eachMemberKey : memberKeys){
-            const unsigned int proteinId = tProteinDB.getId(eachMemberKey);
-            const unsigned int proteomeKey = tProteinDB.getLookupFileNumber(proteinId);
-            MemberProteinEntry mem;
-            mem.proteomeKey = proteomeKey;
-            mem.proteinKey = proteinId;
-            eachClusterRep.memberProteins.push_back(mem);
-            localMemCount[proteomeKey]++;
-        }
-        localClusterReps.push_back(eachClusterRep);
-    }
-}
-
-bool FindNextRepProteome(std::vector<ProteomeEntry>& ProteomeList, unsigned int& RepProteomeId) {
-    bool isAllCovered = true;
-    
-    unsigned int maxMemberCount = 0;
-    unsigned int notCoveredProtCount = 0;
-
-    for (size_t idx = 0; idx < ProteomeList.size(); idx++){
-        if (ProteomeList[idx].isCovered()) {
-            continue;
-        }else{
-            isAllCovered = false;
-            notCoveredProtCount++;
-            if (ProteomeList[idx].memCount > maxMemberCount) {
-                maxMemberCount = ProteomeList[idx].memCount;
-                RepProteomeId = idx;
-            }
-        }
-    }
-
-    if (isAllCovered){
-        return false;
-    }else if (notCoveredProtCount == 1) {
-        //last one and it is singleton
-        ProteomeList[RepProteomeId].repProtKey = RepProteomeId; //todo 
-        ProteomeList[RepProteomeId].protSimScore = 1.0;
-        return false;
-    }else{
-        return true;
-    }
-}
-
-void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader<unsigned int>& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector<ProteomeEntry>& ProteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector<float>& localSeqIds, DBWriter& proteinAlignWriter) {
-    char buffer[1024]; //todo - How to calculate buffer size?
-    bool isRepFound = false;
-    unsigned int lastqLen = 0;
-    unsigned int qproteinKey = 0;
-    unsigned int qproteomeKey =0;
-    //find Rep
-    for (auto& eachMember : clusterRep.memberProteins){
-        if (eachMember.proteomeKey == RepProteomeId) {
-            isRepFound = true;
-            const unsigned int queryId = eachMember.proteinKey;
-            if( lastqLen < tProteinDB.getSeqLen(queryId)){
-                lastqLen = tProteinDB.getSeqLen(queryId);
-                qproteinKey = eachMember.proteinKey;
-                qproteomeKey = eachMember.proteomeKey;
-            }
-        }
-    }
-    if (isRepFound){
-        const unsigned int queryId = qproteinKey;
-        const unsigned int queryKey = tProteinDB.getDbKey(queryId);
-        char* querySeq = tProteinDB.getData(queryId, thread_idx); //todo need openmp
-        query.mapSequence(queryId, queryKey, querySeq, tProteinDB.getSeqLen(queryId));
-        matcher.initQuery(&query);
-        const unsigned int queryProteomeLength = ProteomeList[qproteomeKey].proteomeAALen;
-        char * tmpBuff = Itoa::u32toa_sse2((uint32_t) queryKey, buffer);
-        *(tmpBuff-1) = '\t';
-        const unsigned int queryIdLen = tmpBuff - buffer;
-
-        for (auto& eachTargetMember : clusterRep.memberProteins){
-            // if query Proteome's length < target Proteome's length * 0.9, skip
-            const unsigned int targetProteomeLength = ProteomeList[eachTargetMember.proteomeKey].proteomeAALen;
-            if (queryProteomeLength < targetProteomeLength * 0.9) {
-                continue;
-            }
-            const unsigned int targetId = eachTargetMember.proteinKey;
-            unsigned int targetProteomeId = eachTargetMember.proteomeKey;
-            const unsigned int targetKey = tProteinDB.getDbKey(targetId);
-            char* targetSeq = tProteinDB.getData(targetId, thread_idx);
-            target.mapSequence(targetId, targetKey, targetSeq, tProteinDB.getSeqLen(targetId));
-
-            if (Util::canBeCovered(par.covThr, par.covMode, query.L, target.L) == false) {
-                continue;
-            };
-
-            const bool isIdentity = (queryId == targetId && par.includeIdentity) ? true : false;
-            Matcher::result_t result = matcher.getSWResult(&target, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, isIdentity);
-
-            if (Alignment::checkCriteria(result, isIdentity, par.evalThr, par.seqIdThr, par.alnLenThr, par.covMode, par.covThr)) {
-                // Placeholder for writing results, uncomment and implement as needed
-                // size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace);
-                // protRedunWriter.writeAdd(buffer, len, thread_idx);
-                // ProteomeList[targetProteomeId].addSeqId(result.getSeqId());
-                size_t len = Matcher::resultToBuffer(tmpBuff, result, par.addBacktrace);
-                proteinAlignWriter.writeAdd(buffer, queryIdLen+len, thread_idx);
-                localSeqIds[targetProteomeId] += result.getSeqId()*target.L;
-            }
-        }
-
-    }
-}
-
-bool updateProteomeList(std::vector<ProteomeEntry>& ProteomeList, const unsigned int& RepProteomeId){
-    bool isRepSingleton = true;
-
-    #pragma omp for schedule(dynamic, 1)
-    for (size_t i = 0; i < ProteomeList.size(); i++) {
-        if (ProteomeList[i].isCovered() == false) {
-            if (i == RepProteomeId){
-                ProteomeList[i].repProtKey = RepProteomeId;
-                ProteomeList[i].protSimScore = 1.0;
-            }else{
-                ProteomeList[i].computeRedundancy();
-                if (ProteomeList[i].getprotSimScore() >= 0.9) {
-                ProteomeList[i].repProtKey = RepProteomeId;
-                isRepSingleton = false;
-                }else{
-                    ProteomeList[i].resetProteomeInfo();
-                }
-            }
-        }
-    }
-    return isRepSingleton;
-}
-
-void updateClusterReps(std::vector<ClusterEntry>& clusterReps, std::vector<ProteomeEntry>& ProteomeList, std::vector<unsigned int>& localMemCount){
-    #pragma omp parallel for schedule(dynamic, 1)
-    for (size_t i = 0; i < clusterReps.size(); i++) {
-        if (clusterReps[i].isAvailable) {
-            bool allMemberCovered = true;
-            unsigned int notCoveredCount = 0;
-            unsigned int lastProteomeKey = 0;
-            for (auto& eachMember : clusterReps[i].memberProteins) {
-                if (ProteomeList[eachMember.proteomeKey].isCovered() == false) {
-                    notCoveredCount++;
-                    lastProteomeKey = eachMember.proteomeKey;
-                    allMemberCovered = false;
-                    localMemCount[eachMember.proteomeKey]++;
-                }
-            }
-            if (allMemberCovered) {
-                clusterReps[i].resetClusterInfo();
-            }
-
-            if (notCoveredCount == 1) { //singleton Cluster
-                localMemCount[lastProteomeKey]--;
-                clusterReps[i].resetClusterInfo();
-            }
-        }
-    }
-
-}
-
-size_t resultToBuffer(const std::string& proteomeName, const std::string& repProtName, float similarityScore, char* buffer){
-    char* basePos = buffer;
-    std::strncpy(buffer, proteomeName.c_str(), proteomeName.length());
-    buffer += proteomeName.length()+1;
-    *(buffer-1) = '\t';  // 탭 구분자 추가
-
-    // repProtName을 버퍼에 복사
-    std::strncpy(buffer, repProtName.c_str(), repProtName.length());
-    buffer += repProtName.length()+1;
-    *(buffer-1)  = '\t';  // 탭 구분자 추가
-
-    // similarityScore를 버퍼에 추가 (Util::fastSeqIdToBuffer 사용)
-    buffer = Util::fastSeqIdToBuffer(similarityScore, buffer);
-    *(buffer-1) = '\n';  // 줄바꿈 추가
-    // *(buffer) = '\0';  // 문자열 끝 추가
-    return buffer - basePos; 
-}
-
-int alignproteome(int argc, const char **argv, const Command &command){
-    //Initialize parameters
-    Parameters &par = Parameters::getInstance();
-    par.overrideParameterDescription(par.PARAM_ALIGNMENT_MODE, "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id", NULL, 0);
-    par.parseParameters(argc, argv, command, true, 0, 0);
-
-    if (par.alignmentMode == Parameters::ALIGNMENT_MODE_UNGAPPED) {
-        Debug(Debug::ERROR) << "Use rescorediagonal for ungapped alignment mode.\n";
-        EXIT(EXIT_FAILURE);
-    }
-    if (par.addBacktrace == true) {
-        par.alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
-    }
-    
-    unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr);
-
-    //Open the target protein database
-    DBReader<unsigned int> tProteinDB(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_LOOKUP|DBReader<unsigned int>::USE_SOURCE);
-    tProteinDB.open(DBReader<unsigned int>::NOSORT);
-    const int tProteinSeqType = tProteinDB.getDbtype();
-
-    if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) {
-        tProteinDB.readMmapedDataInMemory();
-    }
-    //Open lookup table to get the source of the protein and the protein length
-    DBReader<unsigned int>::LookupEntry* tLookup = tProteinDB.getLookup();
-    const size_t tLookupSize = tProteinDB.getLookupSize();
-    unsigned int totalProteomeNumber = tLookup[tLookupSize - 1].fileNumber;
-    std::vector<ProteomeEntry> ProteomeList(totalProteomeNumber + 1);
-    calculateProteomeLength(ProteomeList, tLookup, tLookupSize, tProteinDB);
-
-    //For Debug
-    // for (size_t i=0; i < ProteomeList.size(); i++) {
-    //     Debug(Debug::INFO) << "Proteome " << i << " has " << ProteomeList[i].proteomeAALen << " amino acids.\n";
-    // }
-
-    //Open the linclust result
-    DBReader<unsigned int> linResDB(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
-    linResDB.open(DBReader<unsigned int>::LINEAR_ACCCESS);
-
-    //Open the DBWriter
-    DBWriter protRedunWriter(par.db3.c_str(), par.db3Index.c_str(), 1, par.compressed, Parameters::DBTYPE_GENERIC_DB);
-    protRedunWriter.open();
-    DBWriter proteinAlignWriter(par.db4.c_str(), par.db4Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_GENERIC_DB);
-    proteinAlignWriter.open();
-
-    std::vector<ClusterEntry> clusterReps; 
-
-    int gapOpen, gapExtend;
-    BaseMatrix *subMat;
-    subMat = new SubstitutionMatrix(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias);
-    gapOpen = par.gapOpen.values.aminoacid();
-    gapExtend = par.gapExtend.values.aminoacid();
-    EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), subMat, gapOpen, gapExtend);
-
-    Debug(Debug::INFO) << "Start Initialization\n";
-    #pragma omp parallel
-    {
-        unsigned int thread_idx = 0;
-    #ifdef OPENMP
-        thread_idx = (unsigned int) omp_get_thread_num();
-    #endif    
-        std::vector<ClusterEntry> localClusterReps;
-        std::vector<unsigned int> localMemCount(ProteomeList.size(), 0);
-
-        #pragma omp for schedule(dynamic, 1)
-        for (size_t id = 0; id < linResDB.getSize(); id++) {
-            initLocalClusterReps(id, localClusterReps, localMemCount, tProteinDB, linResDB, thread_idx);
-        }
-
-        #pragma omp critical
-        {
-            for (size_t idx=0; idx < localMemCount.size(); idx++){
-                ProteomeList[idx].incrementMemCount(localMemCount[idx]);
-            }
-            clusterReps.insert(clusterReps.end(),
-                               std::make_move_iterator(localClusterReps.begin()),
-                               std::make_move_iterator(localClusterReps.end()));
-        }
-
-    }
-
-    unsigned int RepProteomeId = -1;
-    Debug(Debug::INFO) << "Run Alignment\n";
-    while (FindNextRepProteome(ProteomeList, RepProteomeId)) {
-        // if (FindNextRepProteome(ProteomeList, RepProteomeId) == false) {
-        //     Debug(Debug::INFO) << "All Proteome is covered. Done.\n";
-        //     break;
-        // }
-        // Debug(Debug::INFO) << "New Rep Found : " << RepProteomeId << "\n";
-        // Debug(Debug::INFO) << "1. Run alignment for each cluster\n";
-        #pragma omp parallel
-        {
-            unsigned int thread_idx = 0;
-        #ifdef OPENMP
-            thread_idx = (unsigned int) omp_get_thread_num();
-        #endif  
-            Matcher matcher(tProteinSeqType, tProteinSeqType, par.maxSeqLen, subMat, &evaluer, par.compBiasCorrection, par.compBiasCorrectionScale, gapOpen, gapExtend, 0.0, par.zdrop);
-            Sequence query(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection);
-            Sequence target(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection);
-            std::vector <float> localSeqIds(clusterReps.size(), 0.0f);
-            proteinAlignWriter.writeStart(thread_idx);
-            #pragma omp for schedule(dynamic, 1)
-            for (size_t i = 0; i < clusterReps.size(); i++) {
-                if (clusterReps[i].isAvailable) {
-                    runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, ProteomeList, par, thread_idx, swMode, localSeqIds, proteinAlignWriter);
-                }
-                
-            }
-            proteinAlignWriter.writeEnd(RepProteomeId, thread_idx); //gg
-            #pragma omp critical
-            {
-                for (size_t idx = 0; idx < ProteomeList.size(); idx++) {
-                    ProteomeList[idx].addSeqId(localSeqIds[idx]);
-                }
-            }
-        }
-
-        // Debug(Debug::INFO) << "2. Update ProteomeDB. Calculate the similarity score and check redundancy | Rep Proteome id : " << RepProteomeId << "\n";
-        
-        bool isRepSingleton = updateProteomeList(ProteomeList, RepProteomeId);
-
-        if (isRepSingleton) {
-            ProteomeList[RepProteomeId].repProtKey = RepProteomeId;
-            ProteomeList[RepProteomeId].protSimScore = 1.0;
-        }
-
-        // Debug(Debug::INFO) << "3. Re-Setup Proteome and ClusterReps\n";
-        std::vector<unsigned int> localMemCount(ProteomeList.size(), 0);
-        
-        updateClusterReps(clusterReps, ProteomeList, localMemCount);
-
-        #pragma omp critical
-        {
-            for (size_t i = 0; i < ProteomeList.size(); i++) {
-                ProteomeList[i].memCount += localMemCount[i];
-            }
-        }
-    }
-    // Debug(Debug::INFO) << "4. Write ProteomeList to file\n";
-    char protRedunBuffer[1024]; //todo - How to calculate buffer size?
-    protRedunWriter.writeStart();
-    for (size_t idx=0; idx < ProteomeList.size(); idx++){
-        std::string proteomeName = tProteinDB.getSourceFileName(idx);
-        std::string repProtName = tProteinDB.getSourceFileName(ProteomeList[idx].getRepProtKey());
-        float similarityScore = ProteomeList[idx].getprotSimScore();
-        // Debug(Debug::INFO) << "Proteome " << idx << " : " << proteomeName << " has similarity score : " << similarityScore << " with " << repProtName << "\n";
-        size_t result = resultToBuffer(proteomeName, repProtName, similarityScore,protRedunBuffer);
-        protRedunWriter.writeAdd(protRedunBuffer, result);
-    }
-    protRedunWriter.writeEnd(1); //Don't need index file (todo)
-
-    protRedunWriter.close();
-    proteinAlignWriter.close();
-    tProteinDB.close();
-    delete subMat;
-    linResDB.close();
-    return EXIT_SUCCESS;
-}
\ No newline at end of file
diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index dcfae8aa3..84d7421e0 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -98,13 +98,13 @@ struct ClusterEntry {
     }
 };
 
-void calculateProteomeLength(std::vector<ProteomeEntry>& ProteomeList, DBReader<unsigned int>::LookupEntry* const& lookup, size_t lookupSize, DBReader<unsigned int>& tProteinDB) {
+void calculateProteomeLength(std::vector<ProteomeEntry>& proteomeList, DBReader<unsigned int>::LookupEntry* const& lookup, size_t lookupSize, DBReader<unsigned int>& tProteinDB) {
     for (size_t i = 0; i < lookupSize; i++) {
         const unsigned int ProteomeId = lookup[i].fileNumber;
         const unsigned int ProteinId = lookup[i].id;
-        ProteomeList[ProteomeId].addSeqLen(tProteinDB.getSeqLen(ProteinId));
-        if (ProteomeList[ProteomeId].proteomeKey == -1){
-            ProteomeList[ProteomeId].proteomeKey = ProteomeId;
+        proteomeList[ProteomeId].addSeqLen(tProteinDB.getSeqLen(ProteinId));
+        if (proteomeList[ProteomeId].proteomeKey == -1) {
+            proteomeList[ProteomeId].proteomeKey = ProteomeId;
         }
     }
 }
@@ -113,7 +113,7 @@ void initLocalClusterReps(size_t& id, std::vector<ClusterEntry>& localClusterRep
     std::vector<unsigned int> memberKeys;
     memberKeys.reserve(50); // store key for every protein in a cluster
 
-    std::vector<bool> isProteomeInCluster(localMemCount.size(), false); ;
+    std::vector<bool> isProteomeInCluster(localMemCount.size(), false);
     char buffer[1024 + 32768*4];
     char *clustData = linResDB.getData(id, thread_idx);
     
@@ -126,7 +126,7 @@ void initLocalClusterReps(size_t& id, std::vector<ClusterEntry>& localClusterRep
     if (memberKeys.size() > 1) { //If not a singleton cluster
         ClusterEntry eachClusterRep(memberKeys.size());
         //init MemberProteinEntry and add it to memberProteins vector
-        for (auto& eachMemberKey : memberKeys){
+        for (auto& eachMemberKey : memberKeys) {
             const unsigned int proteinId = tProteinDB.getId(eachMemberKey);
             const unsigned int proteomeKey = tProteinDB.getLookupFileNumber(proteinId);
             MemberProteinEntry mem;
@@ -145,20 +145,20 @@ void initLocalClusterReps(size_t& id, std::vector<ClusterEntry>& localClusterRep
     }
 }
 
-bool FindNextRepProteome(std::vector<ProteomeEntry>& ProteomeList, unsigned int& RepProteomeId) {
+bool FindNextRepProteome(std::vector<ProteomeEntry>& proteomeList, unsigned int& RepProteomeId) {
     bool isAllCovered = true;
     
     unsigned int maxMemberCount = 0;
     unsigned int notCoveredProtCount = 0;
 
-    for (size_t idx = 0; idx < ProteomeList.size(); idx++){
-        if (ProteomeList[idx].isCovered()) {
+    for (size_t idx = 0; idx < proteomeList.size(); idx++){
+        if (proteomeList[idx].isCovered()) {
             continue;
         }else{
             isAllCovered = false;
             notCoveredProtCount++;
-            if (ProteomeList[idx].memCount > maxMemberCount) {
-                maxMemberCount = ProteomeList[idx].memCount;
+            if (proteomeList[idx].memCount > maxMemberCount) {
+                maxMemberCount = proteomeList[idx].memCount;
                 RepProteomeId = idx;
             }
         }
@@ -168,15 +168,15 @@ bool FindNextRepProteome(std::vector<ProteomeEntry>& ProteomeList, unsigned int&
         return false;
     }else if (notCoveredProtCount == 1) {
         //last one and it is singleton
-        ProteomeList[RepProteomeId].repProtKey = RepProteomeId; //todo 
-        ProteomeList[RepProteomeId].protSimScore = 1.0;
+        proteomeList[RepProteomeId].repProtKey = RepProteomeId;
+        proteomeList[RepProteomeId].protSimScore = 1.0;
         return false;
     }else{
         return true;
     }
 }
 
-void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader<unsigned int>& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector<ProteomeEntry>& ProteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector<float>& localSeqIds, DBWriter& proteinClustWriter) {
+void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader<unsigned int>& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector<ProteomeEntry>& proteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector<float>& localSeqIds, DBWriter& proteinClustWriter) {
     char buffer[1024]; 
     bool isRepFound = false;
     unsigned int lastqLen = 0;
@@ -201,7 +201,7 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
         char* querySeq = tProteinDB.getData(queryId, thread_idx); 
         query.mapSequence(queryId, queryKey, querySeq, tProteinDB.getSeqLen(queryId));
         matcher.initQuery(&query);
-        const unsigned int queryProteomeLength = ProteomeList[qproteomeKey].proteomeAALen;
+        const unsigned int queryProteomeLength = proteomeList[qproteomeKey].proteomeAALen;
 
         // representative protein :same query and target (for createtsv)
         Matcher::result_t rep_reult = matcher.getSWResult(&query, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, true);
@@ -214,19 +214,20 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
                 continue;
             }
             // if query Proteome's length < target Proteome's length * 0.9, skip
-            const unsigned int targetProteomeLength = ProteomeList[eachTargetMember.proteomeKey].proteomeAALen;
+            const unsigned int targetProteomeLength = proteomeList[eachTargetMember.proteomeKey].proteomeAALen;
             if (queryProteomeLength < targetProteomeLength * 0.9) {
                 continue;
             }
             const unsigned int targetId = eachTargetMember.proteinKey;
-            unsigned int targetProteomeId = eachTargetMember.proteomeKey;
             const unsigned int targetKey = tProteinDB.getDbKey(targetId);
+            unsigned int tproteomeKey = eachTargetMember.proteomeKey;
+
             char* targetSeq = tProteinDB.getData(targetId, thread_idx);
             target.mapSequence(targetId, targetKey, targetSeq, tProteinDB.getSeqLen(targetId));
 
             if (Util::canBeCovered(par.covThr, par.covMode, query.L, target.L) == false) {
                 continue;
-            };
+            }
 
             const bool isIdentity = (queryId == targetId && par.includeIdentity) ? true : false;
             Matcher::result_t result = matcher.getSWResult(&target, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, isIdentity);
@@ -235,7 +236,7 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
                 if (query.L >= target.L*0.9){
                     size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace);
                     proteinClustWriter.writeAdd(buffer, len, thread_idx);
-                    localSeqIds[targetProteomeId] += result.getSeqId()*target.L;
+                    localSeqIds[tproteomeKey] += result.getSeqId()*target.L;
                 }
             }
         }
@@ -243,22 +244,22 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
     }
 }
 
-bool updateProteomeList(std::vector<ProteomeEntry>& ProteomeList, const unsigned int& RepProteomeId){
+bool updateproteomeList(std::vector<ProteomeEntry>& proteomeList, const unsigned int& RepProteomeId){
     bool isRepSingleton = true;
 
     // #pragma omp for schedule(dynamic, 1)
-    for (size_t i = 0; i < ProteomeList.size(); i++) {
-        if (ProteomeList[i].isCovered() == false) {
+    for (size_t i = 0; i < proteomeList.size(); i++) {
+        if (proteomeList[i].isCovered() == false) {
             if (i == RepProteomeId){
-                ProteomeList[i].repProtKey = RepProteomeId;
-                ProteomeList[i].protSimScore = 1.0;
+                proteomeList[i].repProtKey = RepProteomeId;
+                proteomeList[i].protSimScore = 1.0;
             }else{
-                ProteomeList[i].computeRedundancy();
-                if (ProteomeList[i].getProtSimScore() >= 0.9) {
-                ProteomeList[i].repProtKey = RepProteomeId;
-                isRepSingleton = false;
+                proteomeList[i].computeRedundancy();
+                if (proteomeList[i].getProtSimScore() >= 0.9) {
+                    proteomeList[i].repProtKey = RepProteomeId;
+                    isRepSingleton = false;
                 }else{
-                    ProteomeList[i].resetProteomeInfo();
+                    proteomeList[i].resetProteomeInfo();
                 }
             }
         }
@@ -266,7 +267,7 @@ bool updateProteomeList(std::vector<ProteomeEntry>& ProteomeList, const unsigned
     return isRepSingleton;
 }
 
-void updateClusterReps(ClusterEntry& clusterRep, std::vector<ProteomeEntry>& ProteomeList, std::vector<unsigned int>& localMemCount){
+void updateClusterReps(ClusterEntry& clusterRep, std::vector<ProteomeEntry>& proteomeList, std::vector<unsigned int>& localMemCount){
     std::vector<bool> isProteomeInCluster(localMemCount.size(), false);
     if (clusterRep.isAvailable) {
         bool isAllMemberCovered = true;
@@ -274,7 +275,7 @@ void updateClusterReps(ClusterEntry& clusterRep, std::vector<ProteomeEntry>& Pro
         unsigned int lastProteomeKey = 0;
         //update cluster member info
         for (auto& eachMember : clusterRep.memberProteins) {
-            if (ProteomeList[eachMember.proteomeKey].isCovered() == false) {
+            if (proteomeList[eachMember.proteomeKey].isCovered() == false) {
                 notCoveredCount++;
                 lastProteomeKey = eachMember.proteomeKey;
                 isAllMemberCovered = false;
@@ -285,8 +286,8 @@ void updateClusterReps(ClusterEntry& clusterRep, std::vector<ProteomeEntry>& Pro
             clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false;
         }
         if (notCoveredCount == 1) { //singleton Cluster
-                isProteomeInCluster[lastProteomeKey] = false;
-                clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false;
+            isProteomeInCluster[lastProteomeKey] = false;
+            clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false;
         }
     }
 
@@ -302,20 +303,20 @@ void updateClusterReps(ClusterEntry& clusterRep, std::vector<ProteomeEntry>& Pro
 }
 
 
-void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector<ProteomeEntry> &ProteomeList) {
+void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector<ProteomeEntry> &proteomeList) {
     std::vector<size_t> proteomeClusterIdx;
     char proteomeBuffer[1024];
-    int repProtIdCluster = ProteomeList[0].getRepProtKey();
+    int repProtIdCluster = proteomeList[0].getRepProtKey();
 
-    for (size_t idx = 0; idx < ProteomeList.size(); idx++) {
-        int repProtId = ProteomeList[idx].getRepProtKey();
+    for (size_t idx = 0; idx < proteomeList.size(); idx++) {
+        int repProtId = proteomeList[idx].getRepProtKey();
         if (repProtIdCluster != repProtId) {
             proteomeClustWriter.writeStart();
             for (auto &eachIdx : proteomeClusterIdx) {
                 char *basePos = proteomeBuffer;
-                char *tmpProteomeBuffer = Itoa::i32toa_sse2(ProteomeList[eachIdx].getProteomeKey(), proteomeBuffer);
+                char *tmpProteomeBuffer = Itoa::i32toa_sse2(proteomeList[eachIdx].getProteomeKey(), proteomeBuffer);
                 *(tmpProteomeBuffer - 1) = '\t';
-                tmpProteomeBuffer = Util::fastSeqIdToBuffer(ProteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer);
+                tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer);
                 *(tmpProteomeBuffer - 1) = '\n';
                 proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos);
             }
@@ -328,13 +329,13 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector<ProteomeEn
             proteomeClusterIdx.push_back(idx);
         }
 
-        if (idx == ProteomeList.size() - 1) {
+        if (idx == proteomeList.size() - 1) {
             proteomeClustWriter.writeStart();
             for (auto &eachIdx : proteomeClusterIdx) {
                 char *basePos = proteomeBuffer;
-                char *tmpProteomeBuffer = Itoa::i32toa_sse2(ProteomeList[eachIdx].getProteomeKey(), proteomeBuffer);
+                char *tmpProteomeBuffer = Itoa::i32toa_sse2(proteomeList[eachIdx].getProteomeKey(), proteomeBuffer);
                 *(tmpProteomeBuffer - 1) = '\t';
-                tmpProteomeBuffer = Util::fastSeqIdToBuffer(ProteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer);
+                tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer);
                 *(tmpProteomeBuffer - 1) = '\n';
                 proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos);
             }
@@ -373,8 +374,8 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     DBReader<unsigned int>::LookupEntry* tLookup = tProteinDB.getLookup();
     const size_t tLookupSize = tProteinDB.getLookupSize();
     unsigned int totalProteomeNumber = tLookup[tLookupSize - 1].fileNumber;
-    std::vector<ProteomeEntry> ProteomeList(totalProteomeNumber + 1);
-    calculateProteomeLength(ProteomeList, tLookup, tLookupSize, tProteinDB);
+    std::vector<ProteomeEntry> proteomeList(totalProteomeNumber + 1);
+    calculateProteomeLength(proteomeList, tLookup, tLookupSize, tProteinDB);
 
     //Open the linclust result
     DBReader<unsigned int> linResDB(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
@@ -404,7 +405,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){
         thread_idx = (unsigned int) omp_get_thread_num();
     #endif    
         std::vector<ClusterEntry> localClusterReps;
-        std::vector<unsigned int> localMemCount(ProteomeList.size(), 0);
+        std::vector<unsigned int> localMemCount(proteomeList.size(), 0);
 
         #pragma omp for schedule(dynamic, 1)
         for (size_t id = 0; id < linResDB.getSize(); id++) {
@@ -414,7 +415,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){
         #pragma omp critical
         {
             for (size_t idx=0; idx < localMemCount.size(); idx++){
-                ProteomeList[idx].incrementMemCount(localMemCount[idx]);
+                proteomeList[idx].incrementMemCount(localMemCount[idx]);
             }
             clusterReps.insert(clusterReps.end(),
                                std::make_move_iterator(localClusterReps.begin()),
@@ -426,7 +427,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     unsigned int RepProteomeId = -1;
     // Debug(Debug::INFO) << "Run Alignment\n";
 
-    while (FindNextRepProteome(ProteomeList, RepProteomeId)) {
+    while (FindNextRepProteome(proteomeList, RepProteomeId)) {
         #pragma omp parallel
         {
             unsigned int thread_idx = 0;
@@ -436,52 +437,52 @@ int proteomecluster(int argc, const char **argv, const Command &command){
             Matcher matcher(tProteinSeqType, tProteinSeqType, par.maxSeqLen, subMat, &evaluer, par.compBiasCorrection, par.compBiasCorrectionScale, gapOpen, gapExtend, 0.0, par.zdrop);
             Sequence query(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection);
             Sequence target(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection);
-            std::vector <float> localSeqIds(clusterReps.size(), 0.0f);
+            std::vector <float> localSeqIds(proteomeList.size(), 0.0f);
             #pragma omp for schedule(dynamic, 1)
             for (size_t i = 0; i < clusterReps.size(); i++) {
                 if (clusterReps[i].isAvailable) {
-                    runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, ProteomeList, par, thread_idx, swMode, localSeqIds, proteinClustWriter);
+                    runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, proteomeList, par, thread_idx, swMode, localSeqIds, proteinClustWriter);
                 }
                 
             }
             #pragma omp critical
             {
-                for (size_t idx = 0; idx < ProteomeList.size(); idx++) {
-                    ProteomeList[idx].addSeqId(localSeqIds[idx]);
+                for (size_t idx = 0; idx < proteomeList.size(); idx++) {
+                    proteomeList[idx].addSeqId(localSeqIds[idx]);
                 }
             }
         }
 
         // Debug(Debug::INFO) << "2. Update ProteomeDB. Calculate the similarity score and check redundancy | Rep Proteome id : " << RepProteomeId << "\n";
-        bool isRepSingleton = updateProteomeList(ProteomeList, RepProteomeId);
+        bool isRepSingleton = updateproteomeList(proteomeList, RepProteomeId);
 
         if (isRepSingleton) {
-            ProteomeList[RepProteomeId].repProtKey = RepProteomeId;
-            ProteomeList[RepProteomeId].protSimScore = 1.0;
+            proteomeList[RepProteomeId].repProtKey = RepProteomeId;
+            proteomeList[RepProteomeId].protSimScore = 1.0;
         }
 
         // Debug(Debug::INFO) << "3. Re-Setup Proteome and ClusterReps\n";
         #pragma omp parallel
         {
-            std::vector<unsigned int> localMemCount(ProteomeList.size(), 0);
+            std::vector<unsigned int> localMemCount(proteomeList.size(), 0);
             #pragma omp for schedule(dynamic, 1)
             for (size_t i = 0; i < clusterReps.size(); i++){
-                updateClusterReps(clusterReps[i], ProteomeList, localMemCount);
+                updateClusterReps(clusterReps[i], proteomeList, localMemCount);
             }
             
             #pragma omp critical
             {
-                for (size_t i = 0; i < ProteomeList.size(); i++) {
-                    ProteomeList[i].incrementMemCount(localMemCount[i]);
+                for (size_t i = 0; i < proteomeList.size(); i++) {
+                    proteomeList[i].incrementMemCount(localMemCount[i]);
                 }
             }
         }
     }
-    //sort ProteomeList by repProtKey
-    SORT_PARALLEL(ProteomeList.begin(), ProteomeList.end(), ProteomeEntry::compareByKey);
+    //sort proteomeList by repProtKey
+    SORT_PARALLEL(proteomeList.begin(), proteomeList.end(), ProteomeEntry::compareByKey);
 
     //write result of proteome clustering
-    writeProteomeClusters(proteomeClustWriter, ProteomeList);
+    writeProteomeClusters(proteomeClustWriter, proteomeList);
 
     proteomeClustWriter.close();
     proteinClustWriter.close();
diff --git a/src/workflow/CMakeLists.txt b/src/workflow/CMakeLists.txt
index b4e1f0cf1..5aa4e0bd9 100644
--- a/src/workflow/CMakeLists.txt
+++ b/src/workflow/CMakeLists.txt
@@ -14,7 +14,7 @@ set(workflow_source_files
         workflow/Search.cpp
         workflow/Taxonomy.cpp
         workflow/EasyTaxonomy.cpp
-        workflow/EasyAlignproteome.cpp
+        workflow/EasyProteomecluster.cpp
         workflow/CreateIndex.cpp
         PARENT_SCOPE
         )
diff --git a/src/workflow/EasyAlignproteome.cpp b/src/workflow/EasyAlignproteome.cpp
deleted file mode 100644
index 5374da773..000000000
--- a/src/workflow/EasyAlignproteome.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#include <cassert>
-
-#include "FileUtil.h"
-#include "CommandCaller.h"
-#include "Util.h"
-#include "Debug.h"
-#include "Parameters.h"
-
-#include "easyalignproteome.sh.h"
-
-void setEasyAlignproteomeDefaults(Parameters *p) {
-    p->proteomeSimThr = 0.9;
-}
-
-// void setEasyAlignproteomeMustPassAlong(Parameters *p){
-
-// }
-
-int easyalignproteome(int argc, const char **argv, const Command &command) {
-    Parameters &par = Parameters::getInstance();
-    setEasyAlignproteomeDefaults(&par);
-    par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
-    // setEasyAlignproteomeMustPassAlong(&par);
-    std::string tmpDir = par.filenames.back();
-    std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params));
-    if (par.reuseLatest) {
-        hash = FileUtil::getHashFromSymLink(tmpDir + "/latest");
-    }
-    tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash);
-    par.filenames.pop_back();
-
-    CommandCaller cmd;
-    cmd.addVariable("ALIGNPROTEOME_PAR", par.createParameterString(par.alignproteome,true).c_str()); // what?
-    std::string program = tmpDir + "/easyalignproteome.sh";
-    FileUtil::writeFile(program, easyalignproteome_sh, easyalignproteome_sh_len);
-    cmd.execProgram(program.c_str(), par.filenames);
-    
-    // Should never get here
-    assert(false);
-    return 0;
-}
diff --git a/src/workflow/EasyLinclust.cpp b/src/workflow/EasyLinclust.cpp
index 375ff1610..181968a8b 100644
--- a/src/workflow/EasyLinclust.cpp
+++ b/src/workflow/EasyLinclust.cpp
@@ -28,7 +28,6 @@ void setEasyLinclustMustPassAlong(Parameters *p) {
     p->PARAM_REMOVE_TMP_FILES.wasSet = true;
     p->PARAM_C.wasSet = true;
     p->PARAM_E.wasSet = true;
-    //p->PARAM_ALIGNMENT_MODE.wasSet = true;
     p->PARAM_ORF_START_MODE.wasSet = true;
     p->PARAM_ORF_MIN_LENGTH.wasSet = true;
     p->PARAM_ORF_MAX_LENGTH.wasSet = true;

From fd1f5d6afa3779a30baa88f8bd57515b6d473825 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Wed, 28 Aug 2024 19:30:55 +0900
Subject: [PATCH 12/31] original index

---
 src/commons/IndexReader.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/commons/IndexReader.h b/src/commons/IndexReader.h
index 0ad0ca15e..298c2bb8c 100644
--- a/src/commons/IndexReader.h
+++ b/src/commons/IndexReader.h
@@ -22,7 +22,7 @@ class IndexReader {
     ) : sequenceReader(NULL), index(NULL) {
         int targetDbtype = FileUtil::parseDbType(dataName.c_str());
         if (Parameters::isEqualDbtype(targetDbtype, Parameters::DBTYPE_INDEX_DB)) {
-            new DBReader<unsigned int>(dataName.c_str(), (dataName + ".index").c_str(), 1, dataMode);
+            index = new DBReader<unsigned int>(dataName.c_str(), (dataName + ".index").c_str(), 1, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
             index->open(DBReader<unsigned int>::NOSORT);
             if (PrefilteringIndexReader::checkIfIndexFile(index)) {
                 PrefilteringIndexReader::printSummary(index);

From 0d7db0f4e227c9141b5967b69eeaf6076e60eb88 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Wed, 28 Aug 2024 19:31:28 +0900
Subject: [PATCH 13/31] use sync_fetch_and_add when increment memcount

---
 src/util/proteomecluster.cpp | 43 ++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index 84d7421e0..e1c44c0d0 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -24,12 +24,6 @@ struct __attribute__((__packed__)) ProteomeEntry{
     ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f)
         : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) {}
 
-    void incrementMemCount(unsigned int count) {
-        memCount += count;
-    }
-    void decrementmemCount() {
-        memCount--;
-    }
     int getRepProtKey() {
         return repProtKey;
     }
@@ -113,7 +107,7 @@ void initLocalClusterReps(size_t& id, std::vector<ClusterEntry>& localClusterRep
     std::vector<unsigned int> memberKeys;
     memberKeys.reserve(50); // store key for every protein in a cluster
 
-    std::vector<bool> isProteomeInCluster(localMemCount.size(), false);
+    std::vector<uint8_t> isProteomeInCluster(localMemCount.size(), 0);
     char buffer[1024 + 32768*4];
     char *clustData = linResDB.getData(id, thread_idx);
     
@@ -133,7 +127,9 @@ void initLocalClusterReps(size_t& id, std::vector<ClusterEntry>& localClusterRep
             mem.proteomeKey = proteomeKey;
             mem.proteinKey = proteinId;
             eachClusterRep.memberProteins.push_back(mem);
-            isProteomeInCluster[proteomeKey] = true;
+            if (isProteomeInCluster[proteomeKey] == 0) {
+                isProteomeInCluster[proteomeKey] = 1;
+            }
         }
         localClusterReps.push_back(eachClusterRep);
     }
@@ -151,7 +147,7 @@ bool FindNextRepProteome(std::vector<ProteomeEntry>& proteomeList, unsigned int&
     unsigned int maxMemberCount = 0;
     unsigned int notCoveredProtCount = 0;
 
-    for (size_t idx = 0; idx < proteomeList.size(); idx++){
+    for (size_t idx = 0; idx < proteomeList.size(); idx++) {
         if (proteomeList[idx].isCovered()) {
             continue;
         }else{
@@ -268,7 +264,7 @@ bool updateproteomeList(std::vector<ProteomeEntry>& proteomeList, const unsigned
 }
 
 void updateClusterReps(ClusterEntry& clusterRep, std::vector<ProteomeEntry>& proteomeList, std::vector<unsigned int>& localMemCount){
-    std::vector<bool> isProteomeInCluster(localMemCount.size(), false);
+    std::vector<uint8_t> isProteomeInCluster(localMemCount.size(), 0);
     if (clusterRep.isAvailable) {
         bool isAllMemberCovered = true;
         unsigned int notCoveredCount = 0;
@@ -279,14 +275,16 @@ void updateClusterReps(ClusterEntry& clusterRep, std::vector<ProteomeEntry>& pro
                 notCoveredCount++;
                 lastProteomeKey = eachMember.proteomeKey;
                 isAllMemberCovered = false;
-                isProteomeInCluster[eachMember.proteomeKey] = true;
+                if (isProteomeInCluster[eachMember.proteomeKey] == 0) {
+                    isProteomeInCluster[eachMember.proteomeKey] = 1;
+                }
             }
         }
         if (isAllMemberCovered) {
             clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false;
         }
         if (notCoveredCount == 1) { //singleton Cluster
-            isProteomeInCluster[lastProteomeKey] = false;
+            isProteomeInCluster[lastProteomeKey] = 0;
             clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false;
         }
     }
@@ -392,7 +390,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){
 
     int gapOpen, gapExtend;
     BaseMatrix *subMat;
-    subMat = new SubstitutionMatrix(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias);
+    SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias);
     gapOpen = par.gapOpen.values.aminoacid();
     gapExtend = par.gapExtend.values.aminoacid();
     EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), subMat, gapOpen, gapExtend);
@@ -412,11 +410,12 @@ int proteomecluster(int argc, const char **argv, const Command &command){
             initLocalClusterReps(id, localClusterReps, localMemCount, tProteinDB, linResDB, thread_idx);
         }
 
+        for (size_t idx = 0; idx < localMemCount.size(); idx++) {
+        __sync_fetch_and_add(&proteomeList[idx].memCount, localMemCount[idx]);
+        }
+
         #pragma omp critical
         {
-            for (size_t idx=0; idx < localMemCount.size(); idx++){
-                proteomeList[idx].incrementMemCount(localMemCount[idx]);
-            }
             clusterReps.insert(clusterReps.end(),
                                std::make_move_iterator(localClusterReps.begin()),
                                std::make_move_iterator(localClusterReps.end()));
@@ -466,15 +465,12 @@ int proteomecluster(int argc, const char **argv, const Command &command){
         {
             std::vector<unsigned int> localMemCount(proteomeList.size(), 0);
             #pragma omp for schedule(dynamic, 1)
-            for (size_t i = 0; i < clusterReps.size(); i++){
+            for (size_t i = 0; i < clusterReps.size(); i++) {
                 updateClusterReps(clusterReps[i], proteomeList, localMemCount);
             }
-            
-            #pragma omp critical
-            {
-                for (size_t i = 0; i < proteomeList.size(); i++) {
-                    proteomeList[i].incrementMemCount(localMemCount[i]);
-                }
+
+            for (size_t i = 0; i < proteomeList.size(); i++) {
+                __sync_fetch_and_add(&proteomeList[i].memCount, localMemCount[i]);
             }
         }
     }
@@ -487,7 +483,6 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     proteomeClustWriter.close();
     proteinClustWriter.close();
     tProteinDB.close();
-    delete subMat;
     linResDB.close();
     return EXIT_SUCCESS;
 }
\ No newline at end of file

From 1fe489e7fe1a9dafa746be1ff05ae880a2f5f025 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Wed, 28 Aug 2024 20:10:08 +0900
Subject: [PATCH 14/31] change submat

---
 src/util/proteomecluster.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index e1c44c0d0..845f18f08 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -243,7 +243,6 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
 bool updateproteomeList(std::vector<ProteomeEntry>& proteomeList, const unsigned int& RepProteomeId){
     bool isRepSingleton = true;
 
-    // #pragma omp for schedule(dynamic, 1)
     for (size_t i = 0; i < proteomeList.size(); i++) {
         if (proteomeList[i].isCovered() == false) {
             if (i == RepProteomeId){
@@ -389,11 +388,11 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     std::vector<ClusterEntry> clusterReps; 
 
     int gapOpen, gapExtend;
-    BaseMatrix *subMat;
+    // BaseMatrix *subMat;
     SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias);
     gapOpen = par.gapOpen.values.aminoacid();
     gapExtend = par.gapExtend.values.aminoacid();
-    EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), subMat, gapOpen, gapExtend);
+    EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), &subMat, gapOpen, gapExtend);
 
     // Debug(Debug::INFO) << "Start Initialization\n";
     #pragma omp parallel
@@ -433,9 +432,9 @@ int proteomecluster(int argc, const char **argv, const Command &command){
         #ifdef OPENMP
             thread_idx = (unsigned int) omp_get_thread_num();
         #endif  
-            Matcher matcher(tProteinSeqType, tProteinSeqType, par.maxSeqLen, subMat, &evaluer, par.compBiasCorrection, par.compBiasCorrectionScale, gapOpen, gapExtend, 0.0, par.zdrop);
-            Sequence query(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection);
-            Sequence target(par.maxSeqLen, tProteinSeqType, subMat, 0, false, par.compBiasCorrection);
+            Matcher matcher(tProteinSeqType, tProteinSeqType, par.maxSeqLen, &subMat, &evaluer, par.compBiasCorrection, par.compBiasCorrectionScale, gapOpen, gapExtend, 0.0, par.zdrop);
+            Sequence query(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection);
+            Sequence target(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection);
             std::vector <float> localSeqIds(proteomeList.size(), 0.0f);
             #pragma omp for schedule(dynamic, 1)
             for (size_t i = 0; i < clusterReps.size(); i++) {

From 86f6b15d38dd64b13a475fcf2ce05eaf91f06dbe Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Thu, 29 Aug 2024 00:21:14 +0900
Subject: [PATCH 15/31] add timer

---
 src/util/proteomecluster.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index 845f18f08..c93fd13d4 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -8,6 +8,7 @@
 #include "SubstitutionMatrix.h"
 #include "Alignment.h"
 #include "itoa.h"
+#include "Timer.h"
 
 #ifdef OPENMP
 #include <omp.h>
@@ -393,7 +394,8 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     gapOpen = par.gapOpen.values.aminoacid();
     gapExtend = par.gapExtend.values.aminoacid();
     EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), &subMat, gapOpen, gapExtend);
-
+    Debug(Debug::INFO) << "Initilize ";
+    Timer timer;
     // Debug(Debug::INFO) << "Start Initialization\n";
     #pragma omp parallel
     {
@@ -421,10 +423,12 @@ int proteomecluster(int argc, const char **argv, const Command &command){
         }
 
     }
+    Debug(Debug::INFO) << timer.lap() << "\n";
 
     unsigned int RepProteomeId = -1;
     // Debug(Debug::INFO) << "Run Alignment\n";
-
+    Debug(Debug::INFO) << "Proteome Clustering ";
+    timer.reset();
     while (FindNextRepProteome(proteomeList, RepProteomeId)) {
         #pragma omp parallel
         {
@@ -473,6 +477,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){
             }
         }
     }
+    Debug(Debug::INFO) << timer.lap() << "\n";
     //sort proteomeList by repProtKey
     SORT_PARALLEL(proteomeList.begin(), proteomeList.end(), ProteomeEntry::compareByKey);
 

From be5a53dc41cb79d4bcf9ca889fba70b4d1553038 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Thu, 29 Aug 2024 00:43:43 +0900
Subject: [PATCH 16/31] Remove __packed__ to resolve sync_fetch_and_add error

---
 src/util/proteomecluster.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index c93fd13d4..61aca740f 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -14,7 +14,7 @@
 #include <omp.h>
 #endif
 
-struct __attribute__((__packed__)) ProteomeEntry{
+struct ProteomeEntry{
     int proteomeKey;
     unsigned int proteomeAALen;
     int repProtKey;

From 8d3a38f543e0de81fb66523298cf0884f1865351 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Thu, 29 Aug 2024 16:21:28 +0900
Subject: [PATCH 17/31] change chmod in createdb

---
 src/util/createdb.cpp | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 src/util/createdb.cpp

diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp
old mode 100755
new mode 100644

From bfa1911e211b16a4d0e48eef1a3fe98a746c321d Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Thu, 29 Aug 2024 16:50:48 +0900
Subject: [PATCH 18/31] update proteome-similarity threshold parameter

---
 src/util/proteomecluster.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index 61aca740f..8190aca50 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -210,9 +210,9 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
             if (eachTargetMember.proteomeKey == RepProteomeId) {
                 continue;
             }
-            // if query Proteome's length < target Proteome's length * 0.9, skip
+            // if query Proteome's length < target Proteome's length * proteomeSimThr, skip
             const unsigned int targetProteomeLength = proteomeList[eachTargetMember.proteomeKey].proteomeAALen;
-            if (queryProteomeLength < targetProteomeLength * 0.9) {
+            if (queryProteomeLength < targetProteomeLength * par.proteomeSimThr) {
                 continue;
             }
             const unsigned int targetId = eachTargetMember.proteinKey;
@@ -241,7 +241,7 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
     }
 }
 
-bool updateproteomeList(std::vector<ProteomeEntry>& proteomeList, const unsigned int& RepProteomeId){
+bool updateproteomeList(std::vector<ProteomeEntry>& proteomeList, const unsigned int& RepProteomeId, Parameters& par) {
     bool isRepSingleton = true;
 
     for (size_t i = 0; i < proteomeList.size(); i++) {
@@ -251,7 +251,7 @@ bool updateproteomeList(std::vector<ProteomeEntry>& proteomeList, const unsigned
                 proteomeList[i].protSimScore = 1.0;
             }else{
                 proteomeList[i].computeRedundancy();
-                if (proteomeList[i].getProtSimScore() >= 0.9) {
+                if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr) {
                     proteomeList[i].repProtKey = RepProteomeId;
                     isRepSingleton = false;
                 }else{
@@ -456,7 +456,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){
         }
 
         // Debug(Debug::INFO) << "2. Update ProteomeDB. Calculate the similarity score and check redundancy | Rep Proteome id : " << RepProteomeId << "\n";
-        bool isRepSingleton = updateproteomeList(proteomeList, RepProteomeId);
+        bool isRepSingleton = updateproteomeList(proteomeList, RepProteomeId, par);
 
         if (isRepSingleton) {
             proteomeList[RepProteomeId].repProtKey = RepProteomeId;

From 7e912d24268cbe4b7f6bb169494c177fa2b8a1cd Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Mon, 9 Sep 2024 01:51:06 +0900
Subject: [PATCH 19/31] Sort redundant proteomes by similarity score in
 proteome_cluster.tsv

---
 src/util/proteomecluster.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index 8190aca50..d4e79ad4e 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -68,6 +68,9 @@ struct ProteomeEntry{
         if (a.proteomeKey != a.repProtKey && b.proteomeKey == b.repProtKey){
             return false;
         }
+        if (a.proteomeKey != a.repProtKey && b.proteomeKey != b.repProtKey) {
+            return a.protSimScore > b.protSimScore;
+        }
         return false;
     }
 };

From 618e2bf567aa0fd4c13b80d09aa536ba138b4091 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Mon, 9 Sep 2024 10:50:03 +0900
Subject: [PATCH 20/31] delete redundant code - checking proteome is
 singleton(no redudant member proteome is detected)

---
 src/util/proteomecluster.cpp | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index d4e79ad4e..366ae8e8f 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -21,9 +21,10 @@ struct ProteomeEntry{
     float protSimScore;
     unsigned int memCount;
     float totalSeqId;
+    unsigned int reciprocalMemCount;
 
-    ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f)
-        : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) {}
+    ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f, unsigned int reciprocalMemCount = 0)
+        : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) , reciprocalMemCount(reciprocalMemCount) {}
 
     int getRepProtKey() {
         return repProtKey;
@@ -50,6 +51,7 @@ struct ProteomeEntry{
     void resetProteomeInfo(){
         memCount = 0;
         totalSeqId = 0.0f;
+        reciprocalMemCount = 0;
     }
     float getProtSimScore() {
         return protSimScore;
@@ -244,9 +246,7 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
     }
 }
 
-bool updateproteomeList(std::vector<ProteomeEntry>& proteomeList, const unsigned int& RepProteomeId, Parameters& par) {
-    bool isRepSingleton = true;
-
+void updateProteomeRedundancyInfo(std::vector<ProteomeEntry>& proteomeList, const unsigned int& RepProteomeId, Parameters& par) {
     for (size_t i = 0; i < proteomeList.size(); i++) {
         if (proteomeList[i].isCovered() == false) {
             if (i == RepProteomeId){
@@ -256,14 +256,13 @@ bool updateproteomeList(std::vector<ProteomeEntry>& proteomeList, const unsigned
                 proteomeList[i].computeRedundancy();
                 if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr) {
                     proteomeList[i].repProtKey = RepProteomeId;
-                    isRepSingleton = false;
                 }else{
                     proteomeList[i].resetProteomeInfo();
                 }
             }
         }
     }
-    return isRepSingleton;
+    
 }
 
 void updateClusterReps(ClusterEntry& clusterRep, std::vector<ProteomeEntry>& proteomeList, std::vector<unsigned int>& localMemCount){
@@ -453,18 +452,18 @@ int proteomecluster(int argc, const char **argv, const Command &command){
             #pragma omp critical
             {
                 for (size_t idx = 0; idx < proteomeList.size(); idx++) {
-                    proteomeList[idx].addSeqId(localSeqIds[idx]);
+                    if (localSeqIds[idx] > 0.0f) {
+                        proteomeList[idx].addSeqId(localSeqIds[idx]);
+                        if (idx != RepProteomeId) {
+                            proteomeList[idx].reciprocalMemCount++;
+                        }
+                    }
                 }
             }
         }
 
         // Debug(Debug::INFO) << "2. Update ProteomeDB. Calculate the similarity score and check redundancy | Rep Proteome id : " << RepProteomeId << "\n";
-        bool isRepSingleton = updateproteomeList(proteomeList, RepProteomeId, par);
-
-        if (isRepSingleton) {
-            proteomeList[RepProteomeId].repProtKey = RepProteomeId;
-            proteomeList[RepProteomeId].protSimScore = 1.0;
-        }
+        updateProteomeRedundancyInfo(proteomeList, RepProteomeId, par);
 
         // Debug(Debug::INFO) << "3. Re-Setup Proteome and ClusterReps\n";
         #pragma omp parallel

From e990773212b9e04f6eefed415f5b71e9dca6dd5f Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Mon, 9 Sep 2024 11:12:53 +0900
Subject: [PATCH 21/31] stop

---
 src/util/proteomecluster.cpp | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index 366ae8e8f..b4285baf6 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -22,9 +22,10 @@ struct ProteomeEntry{
     unsigned int memCount;
     float totalSeqId;
     unsigned int reciprocalMemCount;
+    float jaccardIndex;
 
-    ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f, unsigned int reciprocalMemCount = 0)
-        : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) , reciprocalMemCount(reciprocalMemCount) {}
+    ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f, unsigned int reciprocalMemCount = 0, float jaccardIndex = 0.0f)
+        : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) , reciprocalMemCount(reciprocalMemCount), jaccardIndex(jaccardIndex) {}
 
     int getRepProtKey() {
         return repProtKey;
@@ -38,9 +39,9 @@ struct ProteomeEntry{
         }
         return true;
     }
-    void computeRedundancy() {
+    void computeRedundancy(unsigned int repProteomeMemCount) {
         protSimScore = totalSeqId / proteomeAALen;
-
+        jaccardIndex = reciprocalMemCount / (memCount + repProteomeMemCount - reciprocalMemCount);
     }
     void addSeqId(float seqId) {
         totalSeqId += seqId;
@@ -52,10 +53,14 @@ struct ProteomeEntry{
         memCount = 0;
         totalSeqId = 0.0f;
         reciprocalMemCount = 0;
+        jaccardIndex = 0.0f;
     }
     float getProtSimScore() {
         return protSimScore;
     }
+    float getJaccardIndex() {
+        return jaccardIndex;
+    }
     static bool compareByKey(const ProteomeEntry& a, const ProteomeEntry& b) {
         
         if (a.repProtKey < b.repProtKey){
@@ -247,15 +252,19 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
 }
 
 void updateProteomeRedundancyInfo(std::vector<ProteomeEntry>& proteomeList, const unsigned int& RepProteomeId, Parameters& par) {
+    const unsigned int& repProteomeMemCount = proteomeList[RepProteomeId].memCount;
+    Debug(Debug::INFO) << "Rep Proteome " << RepProteomeId << " has " << repProteomeMemCount << " members\n";
     for (size_t i = 0; i < proteomeList.size(); i++) {
         if (proteomeList[i].isCovered() == false) {
             if (i == RepProteomeId){
                 proteomeList[i].repProtKey = RepProteomeId;
                 proteomeList[i].protSimScore = 1.0;
             }else{
-                proteomeList[i].computeRedundancy();
+                proteomeList[i].computeRedundancy(repProteomeMemCount);
                 if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr) {
                     proteomeList[i].repProtKey = RepProteomeId;
+                    
+                    Debug(Debug::INFO) << "Proteome " << i << " is covered by " << RepProteomeId << " with ReciprocalMem: " << proteomeList[i].reciprocalMemCount << " with jaccard " << proteomeList[i].getJaccardIndex() << "\n";
                 }else{
                     proteomeList[i].resetProteomeInfo();
                 }
@@ -317,8 +326,12 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector<ProteomeEn
                 char *tmpProteomeBuffer = Itoa::i32toa_sse2(proteomeList[eachIdx].getProteomeKey(), proteomeBuffer);
                 *(tmpProteomeBuffer - 1) = '\t';
                 tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer);
+                //jaccard
+                *(tmpProteomeBuffer - 1) = '\t';
+                tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getJaccardIndex(), tmpProteomeBuffer);
                 *(tmpProteomeBuffer - 1) = '\n';
                 proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos);
+                
             }
             proteomeClustWriter.writeEnd(repProtIdCluster);
             // Reset

From e9ffe220a6f4f60107300646f07a05cc9d46baf9 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Mon, 9 Sep 2024 01:38:00 +0900
Subject: [PATCH 22/31] jaccardindexTrial

---
 src/commons/DBReader.cpp     |   2 +-
 src/util/proteomecluster.cpp | 173 ++++++++++++++++++++---------------
 2 files changed, 98 insertions(+), 77 deletions(-)

diff --git a/src/commons/DBReader.cpp b/src/commons/DBReader.cpp
index 598d152f6..1eb80fd06 100644
--- a/src/commons/DBReader.cpp
+++ b/src/commons/DBReader.cpp
@@ -735,7 +735,7 @@ template <typename T> std::string DBReader<T>::getSourceFileName (size_t id){
 
 template <typename T> T DBReader<T>::getSourceKey(size_t id){
     if (id >= sourceSize){
-        Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".lookup\n";
+        Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n";
         Debug(Debug::ERROR) << "getSource id: local id (" << id << ") >= db size (" << sourceSize << ")\n";
         EXIT(EXIT_FAILURE);
     }
diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index b4285baf6..d88cb5c33 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -19,13 +19,13 @@ struct ProteomeEntry{
     unsigned int proteomeAALen;
     int repProtKey;
     float protSimScore;
-    unsigned int memCount;
+    unsigned int clusterCount;
     float totalSeqId;
-    unsigned int reciprocalMemCount;
+    unsigned int reciprocalClusterCount;
     float jaccardIndex;
 
-    ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int memCount = 0, float seqId = 0.0f, unsigned int reciprocalMemCount = 0, float jaccardIndex = 0.0f)
-        : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), memCount(memCount), totalSeqId(seqId) , reciprocalMemCount(reciprocalMemCount), jaccardIndex(jaccardIndex) {}
+    ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int clusterCount = 0, float seqId = 0.0f, unsigned int reciprocalClusterCount = 0, float jaccardIndex = 0.0f)
+        : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), clusterCount(clusterCount), totalSeqId(seqId) , reciprocalClusterCount(reciprocalClusterCount), jaccardIndex(jaccardIndex) {}
 
     int getRepProtKey() {
         return repProtKey;
@@ -39,9 +39,9 @@ struct ProteomeEntry{
         }
         return true;
     }
-    void computeRedundancy(unsigned int repProteomeMemCount) {
+    void computeRedundancy(unsigned int repProteomeclusterCount) {
         protSimScore = totalSeqId / proteomeAALen;
-        jaccardIndex = reciprocalMemCount / (memCount + repProteomeMemCount - reciprocalMemCount);
+        jaccardIndex = reciprocalClusterCount / (clusterCount + repProteomeclusterCount - reciprocalClusterCount);
     }
     void addSeqId(float seqId) {
         totalSeqId += seqId;
@@ -50,9 +50,9 @@ struct ProteomeEntry{
         proteomeAALen += seqLength;
     }
     void resetProteomeInfo(){
-        memCount = 0;
+        clusterCount = 0;
         totalSeqId = 0.0f;
-        reciprocalMemCount = 0;
+        reciprocalClusterCount = 0;
         jaccardIndex = 0.0f;
     }
     float getProtSimScore() {
@@ -114,76 +114,89 @@ void calculateProteomeLength(std::vector<ProteomeEntry>& proteomeList, DBReader<
     }
 }
 
-void initLocalClusterReps(size_t& id, std::vector<ClusterEntry>& localClusterReps, std::vector<unsigned int>& localMemCount, DBReader<unsigned int>& tProteinDB, DBReader<unsigned int>& linResDB, unsigned int thread_idx){
+void initLocalClusterReps(size_t& id, std::vector<ClusterEntry>& localClusterReps, std::vector<unsigned int>& localProteomeCount, DBReader<unsigned int>& tProteinDB, DBReader<unsigned int>& linResDB, unsigned int thread_idx){
     std::vector<unsigned int> memberKeys;
     memberKeys.reserve(50); // store key for every protein in a cluster
 
-    std::vector<uint8_t> isProteomeInCluster(localMemCount.size(), 0);
+    std::vector<uint8_t> isProteomeInCluster(localProteomeCount.size(), 0);
     char buffer[1024 + 32768*4];
     char *clustData = linResDB.getData(id, thread_idx);
     
+    
     while (*clustData != '\0') {
         Util::parseKey(clustData, buffer);
         const unsigned int key = (unsigned int) strtoul(buffer, NULL, 10);
         memberKeys.push_back(key);
         clustData = Util::skipLine(clustData);
     }
-    if (memberKeys.size() > 1) { //If not a singleton cluster
-        ClusterEntry eachClusterRep(memberKeys.size());
-        //init MemberProteinEntry and add it to memberProteins vector
-        for (auto& eachMemberKey : memberKeys) {
-            const unsigned int proteinId = tProteinDB.getId(eachMemberKey);
-            const unsigned int proteomeKey = tProteinDB.getLookupFileNumber(proteinId);
-            MemberProteinEntry mem;
-            mem.proteomeKey = proteomeKey;
-            mem.proteinKey = proteinId;
-            eachClusterRep.memberProteins.push_back(mem);
-            if (isProteomeInCluster[proteomeKey] == 0) {
-                isProteomeInCluster[proteomeKey] = 1;
-            }
+
+    if (memberKeys.size() <= 1) {
+        return; //singleton cluster (1 member only)
+    }
+
+    ClusterEntry eachClusterRep(memberKeys.size());
+    int uniqueProteomeCount = 0;
+    //init MemberProteinEntry and add it to memberProteins vector
+    for (auto& eachMemberKey : memberKeys) {
+        const unsigned int proteinId = tProteinDB.getId(eachMemberKey);
+        const unsigned int proteomeKey = tProteinDB.getLookupFileNumber(proteinId);
+        MemberProteinEntry mem;
+        mem.proteomeKey = proteomeKey;
+        mem.proteinKey = proteinId;
+        eachClusterRep.memberProteins.push_back(mem);
+        if (isProteomeInCluster[proteomeKey] == 0) { //If gene is from new proteome(source)
+            isProteomeInCluster[proteomeKey] = 1;
+            uniqueProteomeCount++;
         }
-        localClusterReps.push_back(eachClusterRep);
     }
 
-    for (size_t i = 0; i < localMemCount.size(); i++) {
+    if (uniqueProteomeCount <= 1) {
+        return; //singleton cluster (geneDuplication_1 proteome only with multiple members)
+    }
+
+    localClusterReps.push_back(eachClusterRep);
+    for (size_t i = 0; i < localProteomeCount.size(); i++) {
         if (isProteomeInCluster[i]) {
-            localMemCount[i]++;
+            localProteomeCount[i]++;
         }
     }
 }
 
 bool FindNextRepProteome(std::vector<ProteomeEntry>& proteomeList, unsigned int& RepProteomeId) {
-    bool isAllCovered = true;
+    bool isAllProteomeRedundancyCovered = true;
     
     unsigned int maxMemberCount = 0;
-    unsigned int notCoveredProtCount = 0;
+    unsigned int remainingProteomeCount = 0;
+    unsigned int lastProteomeKey = -1;
 
     for (size_t idx = 0; idx < proteomeList.size(); idx++) {
         if (proteomeList[idx].isCovered()) {
             continue;
         }else{
-            isAllCovered = false;
-            notCoveredProtCount++;
-            if (proteomeList[idx].memCount > maxMemberCount) {
-                maxMemberCount = proteomeList[idx].memCount;
+            isAllProteomeRedundancyCovered = false;
+            remainingProteomeCount++;
+            lastProteomeKey = idx;
+            if (proteomeList[idx].clusterCount > maxMemberCount) {
+                maxMemberCount = proteomeList[idx].clusterCount;
                 RepProteomeId = idx;
             }
         }
     }
 
-    if (isAllCovered){
+    if (isAllProteomeRedundancyCovered){
         return false;
-    }else if (notCoveredProtCount == 1) {
-        //last one and it is singleton
-        proteomeList[RepProteomeId].repProtKey = RepProteomeId;
-        proteomeList[RepProteomeId].protSimScore = 1.0;
+    }else if (remainingProteomeCount == 1) {
+        //Only proteome is left. It isn't redundant with any other proteome
+        proteomeList[lastProteomeKey].repProtKey = proteomeList[lastProteomeKey].proteomeKey;
+        proteomeList[lastProteomeKey].protSimScore = 1.0;
+        proteomeList[lastProteomeKey].jaccardIndex = 1.0;
         return false;
     }else{
         return true;
     }
 }
 
-void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader<unsigned int>& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector<ProteomeEntry>& proteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector<float>& localSeqIds, DBWriter& proteinClustWriter) {
+void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader<unsigned int>& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector<ProteomeEntry>& proteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector<float>& localSeqIds, DBWriter& proteinClustWriter, std::vector<unsigned int>& localProteomeCount) {
     char buffer[1024]; 
     bool isRepFound = false;
     unsigned int lastqLen = 0;
@@ -202,6 +215,7 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
         }
     }
     if (isRepFound){
+        std::vector<uint8_t> isProteomeInCluster(localProteomeCount.size(), 0);
         proteinClustWriter.writeStart(thread_idx);
         const unsigned int queryId = qproteinKey;
         const unsigned int queryKey = tProteinDB.getDbKey(queryId);
@@ -244,27 +258,33 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
                     size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace);
                     proteinClustWriter.writeAdd(buffer, len, thread_idx);
                     localSeqIds[tproteomeKey] += result.getSeqId()*target.L;
+                    if (isProteomeInCluster[tproteomeKey] == 0) {
+                        isProteomeInCluster[tproteomeKey] = 1;
+                    }
                 }
             }
         }
+        for (size_t i = 0; i < localProteomeCount.size(); i++) {
+            if (isProteomeInCluster[i]) {
+                localProteomeCount[i]++;
+            }
+        }
         proteinClustWriter.writeEnd(queryKey, thread_idx);
     }
 }
 
 void updateProteomeRedundancyInfo(std::vector<ProteomeEntry>& proteomeList, const unsigned int& RepProteomeId, Parameters& par) {
-    const unsigned int& repProteomeMemCount = proteomeList[RepProteomeId].memCount;
-    Debug(Debug::INFO) << "Rep Proteome " << RepProteomeId << " has " << repProteomeMemCount << " members\n";
+    const unsigned int& repProteomeclusterCount = proteomeList[RepProteomeId].clusterCount;
     for (size_t i = 0; i < proteomeList.size(); i++) {
         if (proteomeList[i].isCovered() == false) {
             if (i == RepProteomeId){
                 proteomeList[i].repProtKey = RepProteomeId;
                 proteomeList[i].protSimScore = 1.0;
+                proteomeList[i].jaccardIndex = 1.0;
             }else{
-                proteomeList[i].computeRedundancy(repProteomeMemCount);
+                proteomeList[i].computeRedundancy(repProteomeclusterCount);
                 if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr) {
                     proteomeList[i].repProtKey = RepProteomeId;
-                    
-                    Debug(Debug::INFO) << "Proteome " << i << " is covered by " << RepProteomeId << " with ReciprocalMem: " << proteomeList[i].reciprocalMemCount << " with jaccard " << proteomeList[i].getJaccardIndex() << "\n";
                 }else{
                     proteomeList[i].resetProteomeInfo();
                 }
@@ -274,37 +294,37 @@ void updateProteomeRedundancyInfo(std::vector<ProteomeEntry>& proteomeList, cons
     
 }
 
-void updateClusterReps(ClusterEntry& clusterRep, std::vector<ProteomeEntry>& proteomeList, std::vector<unsigned int>& localMemCount){
-    std::vector<uint8_t> isProteomeInCluster(localMemCount.size(), 0);
+void updateClusterReps(ClusterEntry& clusterRep, std::vector<ProteomeEntry>& proteomeList, std::vector<unsigned int>& localProteomeCount){
+    std::vector<uint8_t> isProteomeInCluster(localProteomeCount.size(), 0);
     if (clusterRep.isAvailable) {
-        bool isAllMemberCovered = true;
-        unsigned int notCoveredCount = 0;
-        unsigned int lastProteomeKey = 0;
+        unsigned int notCoveredMemberCount = 0;
+        // unsigned int lastProteomeKey = 0; //gg : unused
+        unsigned int uniqueProteomeCount = 0;
         //update cluster member info
         for (auto& eachMember : clusterRep.memberProteins) {
             if (proteomeList[eachMember.proteomeKey].isCovered() == false) {
-                notCoveredCount++;
-                lastProteomeKey = eachMember.proteomeKey;
-                isAllMemberCovered = false;
+                notCoveredMemberCount++; 
+                // lastProteomeKey = eachMember.proteomeKey;// gg : unused
                 if (isProteomeInCluster[eachMember.proteomeKey] == 0) {
                     isProteomeInCluster[eachMember.proteomeKey] = 1;
+                    uniqueProteomeCount++;
                 }
             }
         }
-        if (isAllMemberCovered) {
-            clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false;
-        }
-        if (notCoveredCount == 1) { //singleton Cluster
-            isProteomeInCluster[lastProteomeKey] = 0;
+        if (notCoveredMemberCount <= 1 || uniqueProteomeCount <= 1 ) { 
+            //notCoveredMemberCount : All members(genes)' proteome are covere(0) or only one member(gene) is not covered(1)
+            //uniqueProteomeCount : Remaining members(genes) are from same(one) proteome
             clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false;
+            return;
         }
+        
     }
 
-    //update localMemCount
+    //update localProteomeCount
     if (clusterRep.isAvailable) {
-        for (size_t i=0; i < localMemCount.size(); i++) {
+        for (size_t i=0; i < localProteomeCount.size(); i++) {
             if (isProteomeInCluster[i]) {
-                localMemCount[i]++;
+                localProteomeCount[i]++;
             }
         }
     }
@@ -326,9 +346,6 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector<ProteomeEn
                 char *tmpProteomeBuffer = Itoa::i32toa_sse2(proteomeList[eachIdx].getProteomeKey(), proteomeBuffer);
                 *(tmpProteomeBuffer - 1) = '\t';
                 tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer);
-                //jaccard
-                *(tmpProteomeBuffer - 1) = '\t';
-                tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getJaccardIndex(), tmpProteomeBuffer);
                 *(tmpProteomeBuffer - 1) = '\n';
                 proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos);
                 
@@ -409,7 +426,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     gapOpen = par.gapOpen.values.aminoacid();
     gapExtend = par.gapExtend.values.aminoacid();
     EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), &subMat, gapOpen, gapExtend);
-    Debug(Debug::INFO) << "Initilize ";
+    Debug(Debug::INFO) << "Initialization\n";
     Timer timer;
     // Debug(Debug::INFO) << "Start Initialization\n";
     #pragma omp parallel
@@ -419,15 +436,15 @@ int proteomecluster(int argc, const char **argv, const Command &command){
         thread_idx = (unsigned int) omp_get_thread_num();
     #endif    
         std::vector<ClusterEntry> localClusterReps;
-        std::vector<unsigned int> localMemCount(proteomeList.size(), 0);
+        std::vector<unsigned int> localProteomeCount(proteomeList.size(), 0);
 
         #pragma omp for schedule(dynamic, 1)
         for (size_t id = 0; id < linResDB.getSize(); id++) {
-            initLocalClusterReps(id, localClusterReps, localMemCount, tProteinDB, linResDB, thread_idx);
+            initLocalClusterReps(id, localClusterReps, localProteomeCount, tProteinDB, linResDB, thread_idx);
         }
 
-        for (size_t idx = 0; idx < localMemCount.size(); idx++) {
-        __sync_fetch_and_add(&proteomeList[idx].memCount, localMemCount[idx]);
+        for (size_t idx = 0; idx < localProteomeCount.size(); idx++) {
+        __sync_fetch_and_add(&proteomeList[idx].clusterCount, localProteomeCount[idx]);
         }
 
         #pragma omp critical
@@ -440,11 +457,16 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     }
     Debug(Debug::INFO) << timer.lap() << "\n";
 
+    // //for debug gg
+    // for (size_t i = 0; i < proteomeList.size(); i++) {
+    //     Debug(Debug::INFO) << "Proteome " << i << " has " << proteomeList[i].clusterCount << " members\n";
+    // }
     unsigned int RepProteomeId = -1;
     // Debug(Debug::INFO) << "Run Alignment\n";
     Debug(Debug::INFO) << "Proteome Clustering ";
     timer.reset();
     while (FindNextRepProteome(proteomeList, RepProteomeId)) {
+        Debug(Debug::INFO) << " | Rep Proteome id : " << RepProteomeId << "\n";
         #pragma omp parallel
         {
             unsigned int thread_idx = 0;
@@ -455,21 +477,20 @@ int proteomecluster(int argc, const char **argv, const Command &command){
             Sequence query(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection);
             Sequence target(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection);
             std::vector <float> localSeqIds(proteomeList.size(), 0.0f);
+            std::vector <unsigned int> localClusterCount(proteomeList.size(), 0);
             #pragma omp for schedule(dynamic, 1)
             for (size_t i = 0; i < clusterReps.size(); i++) {
                 if (clusterReps[i].isAvailable) {
-                    runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, proteomeList, par, thread_idx, swMode, localSeqIds, proteinClustWriter);
+                    runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, proteomeList, par, thread_idx, swMode, localSeqIds, proteinClustWriter, localClusterCount);
                 }
                 
             }
             #pragma omp critical
             {
                 for (size_t idx = 0; idx < proteomeList.size(); idx++) {
-                    if (localSeqIds[idx] > 0.0f) {
+                    if (proteomeList[idx].isCovered() == false) {
                         proteomeList[idx].addSeqId(localSeqIds[idx]);
-                        if (idx != RepProteomeId) {
-                            proteomeList[idx].reciprocalMemCount++;
-                        }
+                        proteomeList[idx].reciprocalClusterCount += localClusterCount[idx];
                     }
                 }
             }
@@ -481,21 +502,21 @@ int proteomecluster(int argc, const char **argv, const Command &command){
         // Debug(Debug::INFO) << "3. Re-Setup Proteome and ClusterReps\n";
         #pragma omp parallel
         {
-            std::vector<unsigned int> localMemCount(proteomeList.size(), 0);
+            std::vector<unsigned int> localProteomeCount(proteomeList.size(), 0);
             #pragma omp for schedule(dynamic, 1)
             for (size_t i = 0; i < clusterReps.size(); i++) {
-                updateClusterReps(clusterReps[i], proteomeList, localMemCount);
+                updateClusterReps(clusterReps[i], proteomeList, localProteomeCount);
             }
 
             for (size_t i = 0; i < proteomeList.size(); i++) {
-                __sync_fetch_and_add(&proteomeList[i].memCount, localMemCount[i]);
+                __sync_fetch_and_add(&proteomeList[i].clusterCount, localProteomeCount[i]);
             }
         }
     }
     Debug(Debug::INFO) << timer.lap() << "\n";
     //sort proteomeList by repProtKey
     SORT_PARALLEL(proteomeList.begin(), proteomeList.end(), ProteomeEntry::compareByKey);
-
+    
     //write result of proteome clustering
     writeProteomeClusters(proteomeClustWriter, proteomeList);
 

From d14217f72b87c98a6550cc7873444af1a1e78519 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Mon, 9 Sep 2024 19:49:42 +0900
Subject: [PATCH 23/31] add normalized scoring

---
 src/util/proteomecluster.cpp | 55 +++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 29 deletions(-)

diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index d88cb5c33..e94442e47 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -21,11 +21,10 @@ struct ProteomeEntry{
     float protSimScore;
     unsigned int clusterCount;
     float totalSeqId;
-    unsigned int reciprocalClusterCount;
-    float jaccardIndex;
+    float normalizedSimScore;
 
-    ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int clusterCount = 0, float seqId = 0.0f, unsigned int reciprocalClusterCount = 0, float jaccardIndex = 0.0f)
-        : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), clusterCount(clusterCount), totalSeqId(seqId) , reciprocalClusterCount(reciprocalClusterCount), jaccardIndex(jaccardIndex) {}
+    ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int clusterCount = 0, float seqId = 0.0f)
+        : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), clusterCount(clusterCount), totalSeqId(seqId) {}
 
     int getRepProtKey() {
         return repProtKey;
@@ -39,9 +38,12 @@ struct ProteomeEntry{
         }
         return true;
     }
-    void computeRedundancy(unsigned int repProteomeclusterCount) {
+    void computeRedundancy(unsigned int repProteomeSize) {
         protSimScore = totalSeqId / proteomeAALen;
-        jaccardIndex = reciprocalClusterCount / (clusterCount + repProteomeclusterCount - reciprocalClusterCount);
+        normalizedSimScore = (totalSeqId * 2)/ (repProteomeSize + proteomeAALen);
+        if (normalizedSimScore >= 1.0) {
+            normalizedSimScore = 1.0;
+        }
     }
     void addSeqId(float seqId) {
         totalSeqId += seqId;
@@ -52,15 +54,16 @@ struct ProteomeEntry{
     void resetProteomeInfo(){
         clusterCount = 0;
         totalSeqId = 0.0f;
-        reciprocalClusterCount = 0;
-        jaccardIndex = 0.0f;
+        normalizedSimScore = 0.0f;
+        protSimScore = 0.0f;
     }
     float getProtSimScore() {
         return protSimScore;
     }
-    float getJaccardIndex() {
-        return jaccardIndex;
+    float getNormalizedSimScore() {
+        return normalizedSimScore;
     }
+
     static bool compareByKey(const ProteomeEntry& a, const ProteomeEntry& b) {
         
         if (a.repProtKey < b.repProtKey){
@@ -189,14 +192,14 @@ bool FindNextRepProteome(std::vector<ProteomeEntry>& proteomeList, unsigned int&
         //Only proteome is left. It isn't redundant with any other proteome
         proteomeList[lastProteomeKey].repProtKey = proteomeList[lastProteomeKey].proteomeKey;
         proteomeList[lastProteomeKey].protSimScore = 1.0;
-        proteomeList[lastProteomeKey].jaccardIndex = 1.0;
+        proteomeList[lastProteomeKey].normalizedSimScore = 1.0;
         return false;
     }else{
         return true;
     }
 }
 
-void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader<unsigned int>& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector<ProteomeEntry>& proteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector<float>& localSeqIds, DBWriter& proteinClustWriter, std::vector<unsigned int>& localProteomeCount) {
+void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader<unsigned int>& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector<ProteomeEntry>& proteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector<float>& localSeqIds, DBWriter& proteinClustWriter) {
     char buffer[1024]; 
     bool isRepFound = false;
     unsigned int lastqLen = 0;
@@ -215,7 +218,6 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
         }
     }
     if (isRepFound){
-        std::vector<uint8_t> isProteomeInCluster(localProteomeCount.size(), 0);
         proteinClustWriter.writeStart(thread_idx);
         const unsigned int queryId = qproteinKey;
         const unsigned int queryKey = tProteinDB.getDbKey(queryId);
@@ -258,31 +260,23 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
                     size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace);
                     proteinClustWriter.writeAdd(buffer, len, thread_idx);
                     localSeqIds[tproteomeKey] += result.getSeqId()*target.L;
-                    if (isProteomeInCluster[tproteomeKey] == 0) {
-                        isProteomeInCluster[tproteomeKey] = 1;
-                    }
                 }
             }
         }
-        for (size_t i = 0; i < localProteomeCount.size(); i++) {
-            if (isProteomeInCluster[i]) {
-                localProteomeCount[i]++;
-            }
-        }
         proteinClustWriter.writeEnd(queryKey, thread_idx);
     }
 }
 
 void updateProteomeRedundancyInfo(std::vector<ProteomeEntry>& proteomeList, const unsigned int& RepProteomeId, Parameters& par) {
-    const unsigned int& repProteomeclusterCount = proteomeList[RepProteomeId].clusterCount;
+    const unsigned int& repProteomeAASize = proteomeList[RepProteomeId].proteomeAALen;
     for (size_t i = 0; i < proteomeList.size(); i++) {
         if (proteomeList[i].isCovered() == false) {
             if (i == RepProteomeId){
                 proteomeList[i].repProtKey = RepProteomeId;
                 proteomeList[i].protSimScore = 1.0;
-                proteomeList[i].jaccardIndex = 1.0;
+                proteomeList[i].normalizedSimScore = 1.0;
             }else{
-                proteomeList[i].computeRedundancy(repProteomeclusterCount);
+                proteomeList[i].computeRedundancy(repProteomeAASize);
                 if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr) {
                     proteomeList[i].repProtKey = RepProteomeId;
                 }else{
@@ -346,6 +340,9 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector<ProteomeEn
                 char *tmpProteomeBuffer = Itoa::i32toa_sse2(proteomeList[eachIdx].getProteomeKey(), proteomeBuffer);
                 *(tmpProteomeBuffer - 1) = '\t';
                 tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer);
+                //gg
+                *(tmpProteomeBuffer - 1) = '\t';
+                tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getNormalizedSimScore(), tmpProteomeBuffer);
                 *(tmpProteomeBuffer - 1) = '\n';
                 proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos);
                 
@@ -466,7 +463,6 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     Debug(Debug::INFO) << "Proteome Clustering ";
     timer.reset();
     while (FindNextRepProteome(proteomeList, RepProteomeId)) {
-        Debug(Debug::INFO) << " | Rep Proteome id : " << RepProteomeId << "\n";
         #pragma omp parallel
         {
             unsigned int thread_idx = 0;
@@ -477,11 +473,10 @@ int proteomecluster(int argc, const char **argv, const Command &command){
             Sequence query(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection);
             Sequence target(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection);
             std::vector <float> localSeqIds(proteomeList.size(), 0.0f);
-            std::vector <unsigned int> localClusterCount(proteomeList.size(), 0);
             #pragma omp for schedule(dynamic, 1)
             for (size_t i = 0; i < clusterReps.size(); i++) {
                 if (clusterReps[i].isAvailable) {
-                    runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, proteomeList, par, thread_idx, swMode, localSeqIds, proteinClustWriter, localClusterCount);
+                    runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, proteomeList, par, thread_idx, swMode, localSeqIds, proteinClustWriter);
                 }
                 
             }
@@ -490,7 +485,6 @@ int proteomecluster(int argc, const char **argv, const Command &command){
                 for (size_t idx = 0; idx < proteomeList.size(); idx++) {
                     if (proteomeList[idx].isCovered() == false) {
                         proteomeList[idx].addSeqId(localSeqIds[idx]);
-                        proteomeList[idx].reciprocalClusterCount += localClusterCount[idx];
                     }
                 }
             }
@@ -516,7 +510,10 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     Debug(Debug::INFO) << timer.lap() << "\n";
     //sort proteomeList by repProtKey
     SORT_PARALLEL(proteomeList.begin(), proteomeList.end(), ProteomeEntry::compareByKey);
-    
+    //debug
+    // for (size_t i = 0; i < proteomeList.size(); i++) {
+    //     Debug(Debug::INFO) << "Proteome " << proteomeList[i].proteomeKey << " RepKey " << proteomeList[i].repProtKey << " normalizedscore: " << proteomeList[i].normalizedSimScore << " members\n";
+    // }
     //write result of proteome clustering
     writeProteomeClusters(proteomeClustWriter, proteomeList);
 

From 5e6ee6d58b1cc1114e3972fe785aa9ba1e5238a9 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Mon, 9 Sep 2024 20:28:22 +0900
Subject: [PATCH 24/31] add proteome relative(normalized) similarity threshold
 parameter

---
 src/MMseqsBase.cpp                   |  2 +-
 src/commons/Parameters.cpp           |  4 +++-
 src/commons/Parameters.h             |  2 ++
 src/util/proteomecluster.cpp         | 29 ++++++++++++++--------------
 src/workflow/EasyProteomecluster.cpp |  1 +
 5 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp
index 67ec4a76b..cccc9ce94 100644
--- a/src/MMseqsBase.cpp
+++ b/src/MMseqsBase.cpp
@@ -90,7 +90,7 @@ std::vector<Command> baseCommands = {
                 "#  - protein_cluster.tsv:  Protein linclust result\n"
                 "#  - proteome_cluster.tsv: Proteome redundancy clustering result\n"
                 "#  - protein_align.tsv: Protein alignment list\n"
-                "mmseqs easy-proteomecluster examples/DB.fasta result tmp --proteome-similarity 0.9\n",
+                "mmseqs easy-proteomecluster examples/DB.fasta result tmp --proteome-similarity 0.9 --proteome-relative-similarity 0.9\n",
                 "Martin Steinegger <martin.steinegger@snu.ac.kr> & Gyuri Kim <gyuribio@snu.ac.kr>",
                 "<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]> <o:clusterPrefix> <tmpDir>",
                 CITATION_MMSEQS2, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp
index 6d2418287..b299a2112 100644
--- a/src/commons/Parameters.cpp
+++ b/src/commons/Parameters.cpp
@@ -87,7 +87,7 @@ Parameters::Parameters():
         PARAM_GAP_OPEN(PARAM_GAP_OPEN_ID, "--gap-open", "Gap open cost", "Gap open cost", typeid(MultiParam<NuclAA<int>>), (void *) &gapOpen, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
         PARAM_GAP_EXTEND(PARAM_GAP_EXTEND_ID, "--gap-extend", "Gap extension cost", "Gap extension cost", typeid(MultiParam<NuclAA<int>>), (void *) &gapExtend, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
         PARAM_PROTEOME_SIMILARITY(PARAM_PROTEOME_SIMILARITY_ID, "--proteome-similarity", "Proteome similarity", "Proteome similarity threshold", typeid(float), (void *) &proteomeSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN),
-
+        PARAM_PROTEOME_RELATIVE_SIMILARITY(PARAM_PROTEOME_RELATIVE_SIMILARITY_ID, "--proteome-normalized-similarity", "Proteome normalized similarity", "Proteome similarity threshold normalized by proteome size", typeid(float), (void *) &proteomeNormalizedSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN),
 #ifdef GAP_POS_SCORING
         PARAM_GAP_PSEUDOCOUNT(PARAM_GAP_PSEUDOCOUNT_ID, "--gap-pc", "Gap pseudo count", "Pseudo count for calculating position-specific gap opening penalties", typeid(int), &gapPseudoCount, "^[0-9]+$", MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_EXPERT),
 #endif
@@ -385,6 +385,7 @@ Parameters::Parameters():
     proteomecluster.push_back(&PARAM_NO_COMP_BIAS_CORR);
     proteomecluster.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
     proteomecluster.push_back(&PARAM_PROTEOME_SIMILARITY);
+    proteomecluster.push_back(&PARAM_PROTEOME_RELATIVE_SIMILARITY);
 
     proteomecluster.push_back(&PARAM_REALIGN);
 //    proteomecluster.push_back(&PARAM_MAX_REJECTED);
@@ -2356,6 +2357,7 @@ void Parameters::setDefaults() {
     maxAccept   = INT_MAX;
     seqIdThr = 0.0;
     proteomeSimThr = 0.9;
+    proteomeNormalizedSimThr = 0.9;
     alnLenThr = 0;
     altAlignment = 0;
     gapOpen = MultiParam<NuclAA<int>>(NuclAA<int>(11, 5));
diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h
index b23b771cd..a19fcfa52 100644
--- a/src/commons/Parameters.h
+++ b/src/commons/Parameters.h
@@ -436,6 +436,7 @@ class Parameters {
     int    altAlignment;                 // show up to this many alternative alignments
     float  seqIdThr;                     // sequence identity threshold for acceptance
     float  proteomeSimThr;              // proteome similarity threshold for acceptance
+    float proteomeNormalizedSimThr;     // proteome similarity threshold normalized by proteome size for acceptance
     int    alnLenThr;                    // min. alignment length
     bool   addBacktrace;                 // store backtrace string (M=Match, D=deletion, I=insertion)
     bool   realign;                      // realign hit with more conservative score
@@ -790,6 +791,7 @@ class Parameters {
     PARAMETER(PARAM_GAP_OPEN)
     PARAMETER(PARAM_GAP_EXTEND)
     PARAMETER(PARAM_PROTEOME_SIMILARITY)
+    PARAMETER(PARAM_PROTEOME_RELATIVE_SIMILARITY)
 #ifdef GAP_POS_SCORING
     PARAMETER(PARAM_GAP_PSEUDOCOUNT)
 #endif
diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index e94442e47..9faddd439 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -21,7 +21,7 @@ struct ProteomeEntry{
     float protSimScore;
     unsigned int clusterCount;
     float totalSeqId;
-    float normalizedSimScore;
+    float relativeSimScore;
 
     ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int clusterCount = 0, float seqId = 0.0f)
         : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), clusterCount(clusterCount), totalSeqId(seqId) {}
@@ -40,9 +40,9 @@ struct ProteomeEntry{
     }
     void computeRedundancy(unsigned int repProteomeSize) {
         protSimScore = totalSeqId / proteomeAALen;
-        normalizedSimScore = (totalSeqId * 2)/ (repProteomeSize + proteomeAALen);
-        if (normalizedSimScore >= 1.0) {
-            normalizedSimScore = 1.0;
+        relativeSimScore = (totalSeqId * 2)/ (repProteomeSize + proteomeAALen);
+        if (relativeSimScore >= 1.0) {
+            relativeSimScore = 1.0;
         }
     }
     void addSeqId(float seqId) {
@@ -54,14 +54,14 @@ struct ProteomeEntry{
     void resetProteomeInfo(){
         clusterCount = 0;
         totalSeqId = 0.0f;
-        normalizedSimScore = 0.0f;
+        relativeSimScore = 0.0f;
         protSimScore = 0.0f;
     }
     float getProtSimScore() {
         return protSimScore;
     }
-    float getNormalizedSimScore() {
-        return normalizedSimScore;
+    float getrelativeSimScore() {
+        return relativeSimScore;
     }
 
     static bool compareByKey(const ProteomeEntry& a, const ProteomeEntry& b) {
@@ -192,7 +192,7 @@ bool FindNextRepProteome(std::vector<ProteomeEntry>& proteomeList, unsigned int&
         //Only proteome is left. It isn't redundant with any other proteome
         proteomeList[lastProteomeKey].repProtKey = proteomeList[lastProteomeKey].proteomeKey;
         proteomeList[lastProteomeKey].protSimScore = 1.0;
-        proteomeList[lastProteomeKey].normalizedSimScore = 1.0;
+        proteomeList[lastProteomeKey].relativeSimScore = 1.0;
         return false;
     }else{
         return true;
@@ -274,10 +274,10 @@ void updateProteomeRedundancyInfo(std::vector<ProteomeEntry>& proteomeList, cons
             if (i == RepProteomeId){
                 proteomeList[i].repProtKey = RepProteomeId;
                 proteomeList[i].protSimScore = 1.0;
-                proteomeList[i].normalizedSimScore = 1.0;
+                proteomeList[i].relativeSimScore = 1.0;
             }else{
                 proteomeList[i].computeRedundancy(repProteomeAASize);
-                if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr) {
+                if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr && proteomeList[i].getrelativeSimScore() >= par.proteomeNormalizedSimThr){
                     proteomeList[i].repProtKey = RepProteomeId;
                 }else{
                     proteomeList[i].resetProteomeInfo();
@@ -342,7 +342,7 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector<ProteomeEn
                 tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer);
                 //gg
                 *(tmpProteomeBuffer - 1) = '\t';
-                tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getNormalizedSimScore(), tmpProteomeBuffer);
+                tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getrelativeSimScore(), tmpProteomeBuffer);
                 *(tmpProteomeBuffer - 1) = '\n';
                 proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos);
                 
@@ -363,6 +363,8 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector<ProteomeEn
                 char *tmpProteomeBuffer = Itoa::i32toa_sse2(proteomeList[eachIdx].getProteomeKey(), proteomeBuffer);
                 *(tmpProteomeBuffer - 1) = '\t';
                 tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer);
+                *(tmpProteomeBuffer - 1) = '\t';
+                tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getrelativeSimScore(), tmpProteomeBuffer);
                 *(tmpProteomeBuffer - 1) = '\n';
                 proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos);
             }
@@ -423,9 +425,8 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     gapOpen = par.gapOpen.values.aminoacid();
     gapExtend = par.gapExtend.values.aminoacid();
     EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), &subMat, gapOpen, gapExtend);
-    Debug(Debug::INFO) << "Initialization\n";
+    Debug(Debug::INFO) << "Initialization";
     Timer timer;
-    // Debug(Debug::INFO) << "Start Initialization\n";
     #pragma omp parallel
     {
         unsigned int thread_idx = 0;
@@ -512,7 +513,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     SORT_PARALLEL(proteomeList.begin(), proteomeList.end(), ProteomeEntry::compareByKey);
     //debug
     // for (size_t i = 0; i < proteomeList.size(); i++) {
-    //     Debug(Debug::INFO) << "Proteome " << proteomeList[i].proteomeKey << " RepKey " << proteomeList[i].repProtKey << " normalizedscore: " << proteomeList[i].normalizedSimScore << " members\n";
+    //     Debug(Debug::INFO) << "Proteome " << proteomeList[i].proteomeKey << " RepKey " << proteomeList[i].repProtKey << " normalizedscore: " << proteomeList[i].relativeSimScore << " members\n";
     // }
     //write result of proteome clustering
     writeProteomeClusters(proteomeClustWriter, proteomeList);
diff --git a/src/workflow/EasyProteomecluster.cpp b/src/workflow/EasyProteomecluster.cpp
index f2c2a2ae1..d7e7509f9 100644
--- a/src/workflow/EasyProteomecluster.cpp
+++ b/src/workflow/EasyProteomecluster.cpp
@@ -11,6 +11,7 @@
 void setEasyproteomeclusterDefaults(Parameters *p) {
     p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
     p->proteomeSimThr = 0.9;
+    p->proteomeNormalizedSimThr = 0.0;
 }
 
 void setEasyproteomeclusterMustPassAlong(Parameters *p){

From 743370d355f660cacf2604bb1014e1053e5709f6 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Mon, 9 Sep 2024 20:51:30 +0900
Subject: [PATCH 25/31] relative similarity threshold

---
 src/commons/Parameters.cpp   | 2 +-
 src/util/proteomecluster.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp
index b299a2112..860bd5abb 100644
--- a/src/commons/Parameters.cpp
+++ b/src/commons/Parameters.cpp
@@ -87,7 +87,7 @@ Parameters::Parameters():
         PARAM_GAP_OPEN(PARAM_GAP_OPEN_ID, "--gap-open", "Gap open cost", "Gap open cost", typeid(MultiParam<NuclAA<int>>), (void *) &gapOpen, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
         PARAM_GAP_EXTEND(PARAM_GAP_EXTEND_ID, "--gap-extend", "Gap extension cost", "Gap extension cost", typeid(MultiParam<NuclAA<int>>), (void *) &gapExtend, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
         PARAM_PROTEOME_SIMILARITY(PARAM_PROTEOME_SIMILARITY_ID, "--proteome-similarity", "Proteome similarity", "Proteome similarity threshold", typeid(float), (void *) &proteomeSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN),
-        PARAM_PROTEOME_RELATIVE_SIMILARITY(PARAM_PROTEOME_RELATIVE_SIMILARITY_ID, "--proteome-normalized-similarity", "Proteome normalized similarity", "Proteome similarity threshold normalized by proteome size", typeid(float), (void *) &proteomeNormalizedSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN),
+        PARAM_PROTEOME_RELATIVE_SIMILARITY(PARAM_PROTEOME_RELATIVE_SIMILARITY_ID, "--proteome-relative-similarity", "Proteome relative similarity", "Proteome relative similarity threshold normalized by proteome size", typeid(float), (void *) &proteomeNormalizedSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN),
 #ifdef GAP_POS_SCORING
         PARAM_GAP_PSEUDOCOUNT(PARAM_GAP_PSEUDOCOUNT_ID, "--gap-pc", "Gap pseudo count", "Pseudo count for calculating position-specific gap opening penalties", typeid(int), &gapPseudoCount, "^[0-9]+$", MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_EXPERT),
 #endif
diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index 9faddd439..5ddaaceb2 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -425,7 +425,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     gapOpen = par.gapOpen.values.aminoacid();
     gapExtend = par.gapExtend.values.aminoacid();
     EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), &subMat, gapOpen, gapExtend);
-    Debug(Debug::INFO) << "Initialization";
+    Debug(Debug::INFO) << "Initialization ";
     Timer timer;
     #pragma omp parallel
     {

From 44a6fcab00bdc696526f615d85c7e9bf6c70ec3d Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Mon, 9 Sep 2024 22:23:42 +0900
Subject: [PATCH 26/31] latest version-bugfix,relativeProteomeSimilarity added,
 singleton proteome handling update

---
 src/util/proteomecluster.cpp | 30 +++++++++---------------------
 1 file changed, 9 insertions(+), 21 deletions(-)

diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index 5ddaaceb2..a88511978 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -65,7 +65,6 @@ struct ProteomeEntry{
     }
 
     static bool compareByKey(const ProteomeEntry& a, const ProteomeEntry& b) {
-        
         if (a.repProtKey < b.repProtKey){
             return true;
         }
@@ -134,7 +133,7 @@ void initLocalClusterReps(size_t& id, std::vector<ClusterEntry>& localClusterRep
     }
 
     if (memberKeys.size() <= 1) {
-        return; //singleton cluster (1 member only)
+        return; //singleton protein member cluster (1 member only)
     }
 
     ClusterEntry eachClusterRep(memberKeys.size());
@@ -154,7 +153,7 @@ void initLocalClusterReps(size_t& id, std::vector<ClusterEntry>& localClusterRep
     }
 
     if (uniqueProteomeCount <= 1) {
-        return; //singleton cluster (geneDuplication_1 proteome only with multiple members)
+        return; //singleton proteome cluster (geneDuplication_1 proteome only with multiple members, exclude cloud genome)
     }
 
     localClusterReps.push_back(eachClusterRep);
@@ -205,12 +204,12 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
     unsigned int lastqLen = 0;
     unsigned int qproteinKey = 0;
     unsigned int qproteomeKey =0;
-    //find Rep
+    //find representative query protein which has the longest sequence length
     for (auto& eachMember : clusterRep.memberProteins){
         if (eachMember.proteomeKey == RepProteomeId) {
             isRepFound = true;
             const unsigned int queryId = eachMember.proteinKey;
-            if( lastqLen < tProteinDB.getSeqLen(queryId)){
+            if( lastqLen < tProteinDB.getSeqLen(queryId)) {
                 lastqLen = tProteinDB.getSeqLen(queryId);
                 qproteinKey = eachMember.proteinKey;
                 qproteomeKey = eachMember.proteomeKey;
@@ -226,12 +225,13 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
         matcher.initQuery(&query);
         const unsigned int queryProteomeLength = proteomeList[qproteomeKey].proteomeAALen;
 
-        // representative protein :same query and target (for createtsv)
+        // Alignment by representative protein itself : same query and target (for createtsv)
         Matcher::result_t rep_reult = matcher.getSWResult(&query, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, true);
         size_t rep_len = Matcher::resultToBuffer(buffer, rep_reult, par.addBacktrace);
         proteinClustWriter.writeAdd(buffer, rep_len, thread_idx);
         localSeqIds[qproteomeKey] += rep_reult.getSeqId()*query.L;
 
+        //Alignment by representative protein and other proteins in the cluster
         for (auto& eachTargetMember : clusterRep.memberProteins){
             if (eachTargetMember.proteomeKey == RepProteomeId) {
                 continue;
@@ -256,7 +256,7 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
             Matcher::result_t result = matcher.getSWResult(&target, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, isIdentity);
 
             if (Alignment::checkCriteria(result, isIdentity, par.evalThr, par.seqIdThr, par.alnLenThr, par.covMode, par.covThr)) {
-                if (query.L >= target.L*0.9){
+                if (query.L >= target.L*0.9){ // -s2 lenght difference parameter in cd-hit-2d
                     size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace);
                     proteinClustWriter.writeAdd(buffer, len, thread_idx);
                     localSeqIds[tproteomeKey] += result.getSeqId()*target.L;
@@ -292,13 +292,11 @@ void updateClusterReps(ClusterEntry& clusterRep, std::vector<ProteomeEntry>& pro
     std::vector<uint8_t> isProteomeInCluster(localProteomeCount.size(), 0);
     if (clusterRep.isAvailable) {
         unsigned int notCoveredMemberCount = 0;
-        // unsigned int lastProteomeKey = 0; //gg : unused
         unsigned int uniqueProteomeCount = 0;
         //update cluster member info
         for (auto& eachMember : clusterRep.memberProteins) {
             if (proteomeList[eachMember.proteomeKey].isCovered() == false) {
                 notCoveredMemberCount++; 
-                // lastProteomeKey = eachMember.proteomeKey;// gg : unused
                 if (isProteomeInCluster[eachMember.proteomeKey] == 0) {
                     isProteomeInCluster[eachMember.proteomeKey] = 1;
                     uniqueProteomeCount++;
@@ -314,7 +312,7 @@ void updateClusterReps(ClusterEntry& clusterRep, std::vector<ProteomeEntry>& pro
         
     }
 
-    //update localProteomeCount
+    //update localProteomeCount -> update clusterCount for next repProteome finding
     if (clusterRep.isAvailable) {
         for (size_t i=0; i < localProteomeCount.size(); i++) {
             if (isProteomeInCluster[i]) {
@@ -340,7 +338,6 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector<ProteomeEn
                 char *tmpProteomeBuffer = Itoa::i32toa_sse2(proteomeList[eachIdx].getProteomeKey(), proteomeBuffer);
                 *(tmpProteomeBuffer - 1) = '\t';
                 tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer);
-                //gg
                 *(tmpProteomeBuffer - 1) = '\t';
                 tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getrelativeSimScore(), tmpProteomeBuffer);
                 *(tmpProteomeBuffer - 1) = '\n';
@@ -455,12 +452,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     }
     Debug(Debug::INFO) << timer.lap() << "\n";
 
-    // //for debug gg
-    // for (size_t i = 0; i < proteomeList.size(); i++) {
-    //     Debug(Debug::INFO) << "Proteome " << i << " has " << proteomeList[i].clusterCount << " members\n";
-    // }
     unsigned int RepProteomeId = -1;
-    // Debug(Debug::INFO) << "Run Alignment\n";
     Debug(Debug::INFO) << "Proteome Clustering ";
     timer.reset();
     while (FindNextRepProteome(proteomeList, RepProteomeId)) {
@@ -511,11 +503,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     Debug(Debug::INFO) << timer.lap() << "\n";
     //sort proteomeList by repProtKey
     SORT_PARALLEL(proteomeList.begin(), proteomeList.end(), ProteomeEntry::compareByKey);
-    //debug
-    // for (size_t i = 0; i < proteomeList.size(); i++) {
-    //     Debug(Debug::INFO) << "Proteome " << proteomeList[i].proteomeKey << " RepKey " << proteomeList[i].repProtKey << " normalizedscore: " << proteomeList[i].relativeSimScore << " members\n";
-    // }
-    //write result of proteome clustering
+    //write _proteome_cluster
     writeProteomeClusters(proteomeClustWriter, proteomeList);
 
     proteomeClustWriter.close();

From 910bb7f9e6137e44f6243bc2d12b953cfe80a0f6 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Tue, 10 Sep 2024 20:50:19 +0900
Subject: [PATCH 27/31] Change Relative Score formula

---
 src/alignment/Matcher.h              |  3 +++
 src/commons/Parameters.cpp           |  4 ++--
 src/commons/Parameters.h             |  2 +-
 src/util/proteomecluster.cpp         | 35 +++++++++++++++++++++-------
 src/workflow/EasyProteomecluster.cpp |  2 +-
 5 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/src/alignment/Matcher.h b/src/alignment/Matcher.h
index 6e1d0ab8f..90185de63 100644
--- a/src/alignment/Matcher.h
+++ b/src/alignment/Matcher.h
@@ -144,6 +144,9 @@ class Matcher{
         float getSeqId(){
             return seqId;
         }
+        unsigned int getAlnLength(){
+            return alnLength;
+        }
     };
 
     Matcher(int querySeqType, int targetSeqType, int maxSeqLen, BaseMatrix *m,
diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp
index 860bd5abb..fe35ab4c2 100644
--- a/src/commons/Parameters.cpp
+++ b/src/commons/Parameters.cpp
@@ -87,7 +87,7 @@ Parameters::Parameters():
         PARAM_GAP_OPEN(PARAM_GAP_OPEN_ID, "--gap-open", "Gap open cost", "Gap open cost", typeid(MultiParam<NuclAA<int>>), (void *) &gapOpen, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
         PARAM_GAP_EXTEND(PARAM_GAP_EXTEND_ID, "--gap-extend", "Gap extension cost", "Gap extension cost", typeid(MultiParam<NuclAA<int>>), (void *) &gapExtend, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
         PARAM_PROTEOME_SIMILARITY(PARAM_PROTEOME_SIMILARITY_ID, "--proteome-similarity", "Proteome similarity", "Proteome similarity threshold", typeid(float), (void *) &proteomeSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN),
-        PARAM_PROTEOME_RELATIVE_SIMILARITY(PARAM_PROTEOME_RELATIVE_SIMILARITY_ID, "--proteome-relative-similarity", "Proteome relative similarity", "Proteome relative similarity threshold normalized by proteome size", typeid(float), (void *) &proteomeNormalizedSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN),
+        PARAM_PROTEOME_RELATIVE_SIMILARITY(PARAM_PROTEOME_RELATIVE_SIMILARITY_ID, "--proteome-relative-similarity", "Proteome relative similarity", "Proteome relative similarity threshold normalized by proteome size", typeid(float), (void *) &proteomeRelativeSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN),
 #ifdef GAP_POS_SCORING
         PARAM_GAP_PSEUDOCOUNT(PARAM_GAP_PSEUDOCOUNT_ID, "--gap-pc", "Gap pseudo count", "Pseudo count for calculating position-specific gap opening penalties", typeid(int), &gapPseudoCount, "^[0-9]+$", MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_EXPERT),
 #endif
@@ -2357,7 +2357,7 @@ void Parameters::setDefaults() {
     maxAccept   = INT_MAX;
     seqIdThr = 0.0;
     proteomeSimThr = 0.9;
-    proteomeNormalizedSimThr = 0.9;
+    proteomeRelativeSimThr = 0.9;
     alnLenThr = 0;
     altAlignment = 0;
     gapOpen = MultiParam<NuclAA<int>>(NuclAA<int>(11, 5));
diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h
index a19fcfa52..0d4b3eee3 100644
--- a/src/commons/Parameters.h
+++ b/src/commons/Parameters.h
@@ -436,7 +436,7 @@ class Parameters {
     int    altAlignment;                 // show up to this many alternative alignments
     float  seqIdThr;                     // sequence identity threshold for acceptance
     float  proteomeSimThr;              // proteome similarity threshold for acceptance
-    float proteomeNormalizedSimThr;     // proteome similarity threshold normalized by proteome size for acceptance
+    float proteomeRelativeSimThr;     // proteome similarity threshold normalized by proteome size for acceptance
     int    alnLenThr;                    // min. alignment length
     bool   addBacktrace;                 // store backtrace string (M=Match, D=deletion, I=insertion)
     bool   realign;                      // realign hit with more conservative score
diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index a88511978..cf54a34f1 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -21,10 +21,11 @@ struct ProteomeEntry{
     float protSimScore;
     unsigned int clusterCount;
     float totalSeqId;
+    unsigned int AAMatchCount;
     float relativeSimScore;
 
-    ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int clusterCount = 0, float seqId = 0.0f)
-        : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), clusterCount(clusterCount), totalSeqId(seqId) {}
+    ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int clusterCount = 0, float seqId = 0.0f, unsigned int matchCount = 0, float relSimScore = 0.0f)
+        : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), clusterCount(clusterCount), totalSeqId(seqId), AAMatchCount(matchCount), relativeSimScore(relSimScore) {}
 
     int getRepProtKey() {
         return repProtKey;
@@ -40,6 +41,7 @@ struct ProteomeEntry{
     }
     void computeRedundancy(unsigned int repProteomeSize) {
         protSimScore = totalSeqId / proteomeAALen;
+        relativeSimScore = static_cast<float> (AAMatchCount * 2) / static_cast<float> (repProteomeSize + proteomeAALen);
         relativeSimScore = (totalSeqId * 2)/ (repProteomeSize + proteomeAALen);
         if (relativeSimScore >= 1.0) {
             relativeSimScore = 1.0;
@@ -56,6 +58,7 @@ struct ProteomeEntry{
         totalSeqId = 0.0f;
         relativeSimScore = 0.0f;
         protSimScore = 0.0f;
+        AAMatchCount = 0;
     }
     float getProtSimScore() {
         return protSimScore;
@@ -198,7 +201,7 @@ bool FindNextRepProteome(std::vector<ProteomeEntry>& proteomeList, unsigned int&
     }
 }
 
-void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader<unsigned int>& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector<ProteomeEntry>& proteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector<float>& localSeqIds, DBWriter& proteinClustWriter) {
+void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader<unsigned int>& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector<ProteomeEntry>& proteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector<float>& localSeqIds, std::vector<unsigned int>& localMatchCount, DBWriter& proteinClustWriter) {
     char buffer[1024]; 
     bool isRepFound = false;
     unsigned int lastqLen = 0;
@@ -226,10 +229,12 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
         const unsigned int queryProteomeLength = proteomeList[qproteomeKey].proteomeAALen;
 
         // Alignment by representative protein itself : same query and target (for createtsv)
-        Matcher::result_t rep_reult = matcher.getSWResult(&query, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, true);
-        size_t rep_len = Matcher::resultToBuffer(buffer, rep_reult, par.addBacktrace);
+        Matcher::result_t rep_result = matcher.getSWResult(&query, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, true);
+        size_t rep_len = Matcher::resultToBuffer(buffer, rep_result, par.addBacktrace);
         proteinClustWriter.writeAdd(buffer, rep_len, thread_idx);
-        localSeqIds[qproteomeKey] += rep_reult.getSeqId()*query.L;
+        localSeqIds[qproteomeKey] += rep_result.getSeqId()*query.L;
+        localMatchCount[qproteomeKey] += static_cast<unsigned int> (rep_result.getSeqId() * static_cast<float> (rep_result.getAlnLength()) + 0.5);
+
 
         //Alignment by representative protein and other proteins in the cluster
         for (auto& eachTargetMember : clusterRep.memberProteins){
@@ -259,7 +264,11 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
                 if (query.L >= target.L*0.9){ // -s2 lenght difference parameter in cd-hit-2d
                     size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace);
                     proteinClustWriter.writeAdd(buffer, len, thread_idx);
-                    localSeqIds[tproteomeKey] += result.getSeqId()*target.L;
+                    // localSeqIds[tproteomeKey] += result.getSeqId()*target.L;
+                    localSeqIds[tproteomeKey] += result.getSeqId() * target.L;
+                    // localMatchCount[tproteomeKey] += static_cast<unsigned int> ((result.getSeqId() * static_cast<float> (result.getAlnLength())) + 0.5);
+                    unsigned int matchCount = static_cast<unsigned int> ((result.getSeqId() * static_cast<float> (result.getAlnLength())) + 0.5);
+                    localMatchCount[tproteomeKey] += matchCount;
                 }
             }
         }
@@ -277,7 +286,7 @@ void updateProteomeRedundancyInfo(std::vector<ProteomeEntry>& proteomeList, cons
                 proteomeList[i].relativeSimScore = 1.0;
             }else{
                 proteomeList[i].computeRedundancy(repProteomeAASize);
-                if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr && proteomeList[i].getrelativeSimScore() >= par.proteomeNormalizedSimThr){
+                if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr && proteomeList[i].getrelativeSimScore() >= par.proteomeRelativeSimThr){
                     proteomeList[i].repProtKey = RepProteomeId;
                 }else{
                     proteomeList[i].resetProteomeInfo();
@@ -466,10 +475,11 @@ int proteomecluster(int argc, const char **argv, const Command &command){
             Sequence query(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection);
             Sequence target(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection);
             std::vector <float> localSeqIds(proteomeList.size(), 0.0f);
+            std::vector <unsigned int> localMatchCount(proteomeList.size(), 0);
             #pragma omp for schedule(dynamic, 1)
             for (size_t i = 0; i < clusterReps.size(); i++) {
                 if (clusterReps[i].isAvailable) {
-                    runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, proteomeList, par, thread_idx, swMode, localSeqIds, proteinClustWriter);
+                    runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, proteomeList, par, thread_idx, swMode, localSeqIds, localMatchCount, proteinClustWriter);
                 }
                 
             }
@@ -478,6 +488,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){
                 for (size_t idx = 0; idx < proteomeList.size(); idx++) {
                     if (proteomeList[idx].isCovered() == false) {
                         proteomeList[idx].addSeqId(localSeqIds[idx]);
+                        proteomeList[idx].AAMatchCount += localMatchCount[idx];
                     }
                 }
             }
@@ -501,6 +512,12 @@ int proteomecluster(int argc, const char **argv, const Command &command){
         }
     }
     Debug(Debug::INFO) << timer.lap() << "\n";
+
+
+    for (size_t i=0; i < proteomeList.size(); i++) {
+        Debug(Debug::INFO) << "Proteome: " << proteomeList[i].proteomeKey << " relativeScore " << proteomeList[i].relativeSimScore << "\n";
+    }
+
     //sort proteomeList by repProtKey
     SORT_PARALLEL(proteomeList.begin(), proteomeList.end(), ProteomeEntry::compareByKey);
     //write _proteome_cluster
diff --git a/src/workflow/EasyProteomecluster.cpp b/src/workflow/EasyProteomecluster.cpp
index d7e7509f9..c8e60e18e 100644
--- a/src/workflow/EasyProteomecluster.cpp
+++ b/src/workflow/EasyProteomecluster.cpp
@@ -11,7 +11,7 @@
 void setEasyproteomeclusterDefaults(Parameters *p) {
     p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
     p->proteomeSimThr = 0.9;
-    p->proteomeNormalizedSimThr = 0.0;
+    p->proteomeRelativeSimThr = 0.0;
 }
 
 void setEasyproteomeclusterMustPassAlong(Parameters *p){

From e24b9e3c6d855c8482e2c688c24492d3d34189c9 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Tue, 10 Sep 2024 21:06:50 +0900
Subject: [PATCH 28/31] Remove Debug printout

---
 src/util/proteomecluster.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index cf54a34f1..3c441426d 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -513,11 +513,6 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     }
     Debug(Debug::INFO) << timer.lap() << "\n";
 
-
-    for (size_t i=0; i < proteomeList.size(); i++) {
-        Debug(Debug::INFO) << "Proteome: " << proteomeList[i].proteomeKey << " relativeScore " << proteomeList[i].relativeSimScore << "\n";
-    }
-
     //sort proteomeList by repProtKey
     SORT_PARALLEL(proteomeList.begin(), proteomeList.end(), ProteomeEntry::compareByKey);
     //write _proteome_cluster

From ad1acdf13d221eb95b9a31f2401d619c85a150fa Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Tue, 10 Sep 2024 21:54:58 +0900
Subject: [PATCH 29/31] style change

---
 src/util/proteomecluster.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index 3c441426d..45589ca01 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -66,7 +66,6 @@ struct ProteomeEntry{
     float getrelativeSimScore() {
         return relativeSimScore;
     }
-
     static bool compareByKey(const ProteomeEntry& a, const ProteomeEntry& b) {
         if (a.repProtKey < b.repProtKey){
             return true;
@@ -212,7 +211,7 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
         if (eachMember.proteomeKey == RepProteomeId) {
             isRepFound = true;
             const unsigned int queryId = eachMember.proteinKey;
-            if( lastqLen < tProteinDB.getSeqLen(queryId)) {
+            if (lastqLen < tProteinDB.getSeqLen(queryId)) {
                 lastqLen = tProteinDB.getSeqLen(queryId);
                 qproteinKey = eachMember.proteinKey;
                 qproteomeKey = eachMember.proteomeKey;
@@ -261,12 +260,10 @@ void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProt
             Matcher::result_t result = matcher.getSWResult(&target, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, isIdentity);
 
             if (Alignment::checkCriteria(result, isIdentity, par.evalThr, par.seqIdThr, par.alnLenThr, par.covMode, par.covThr)) {
-                if (query.L >= target.L*0.9){ // -s2 lenght difference parameter in cd-hit-2d
+                if (query.L >= target.L*0.9) { // -s2 lenght difference parameter in cd-hit-2d
                     size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace);
                     proteinClustWriter.writeAdd(buffer, len, thread_idx);
-                    // localSeqIds[tproteomeKey] += result.getSeqId()*target.L;
                     localSeqIds[tproteomeKey] += result.getSeqId() * target.L;
-                    // localMatchCount[tproteomeKey] += static_cast<unsigned int> ((result.getSeqId() * static_cast<float> (result.getAlnLength())) + 0.5);
                     unsigned int matchCount = static_cast<unsigned int> ((result.getSeqId() * static_cast<float> (result.getAlnLength())) + 0.5);
                     localMatchCount[tproteomeKey] += matchCount;
                 }
@@ -351,7 +348,6 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector<ProteomeEn
                 tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getrelativeSimScore(), tmpProteomeBuffer);
                 *(tmpProteomeBuffer - 1) = '\n';
                 proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos);
-                
             }
             proteomeClustWriter.writeEnd(repProtIdCluster);
             // Reset

From e02626098424ede2cea81de2a328a89916c685b9 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Tue, 10 Sep 2024 22:50:58 +0900
Subject: [PATCH 30/31] Latest Version

---
 src/util/proteomecluster.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index 45589ca01..28859a0cc 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -42,7 +42,6 @@ struct ProteomeEntry{
     void computeRedundancy(unsigned int repProteomeSize) {
         protSimScore = totalSeqId / proteomeAALen;
         relativeSimScore = static_cast<float> (AAMatchCount * 2) / static_cast<float> (repProteomeSize + proteomeAALen);
-        relativeSimScore = (totalSeqId * 2)/ (repProteomeSize + proteomeAALen);
         if (relativeSimScore >= 1.0) {
             relativeSimScore = 1.0;
         }

From d9213ef1bbad70acc4f5f39797da82547d7b83e2 Mon Sep 17 00:00:00 2001
From: imgyuri <gyuribio@snu.ac.kr>
Date: Thu, 19 Sep 2024 13:03:25 +0900
Subject: [PATCH 31/31] Add cluster count report

---
 data/workflow/easyproteomecluster.sh |  8 +++++-
 src/MMseqsBase.cpp                   |  7 +++--
 src/util/proteomecluster.cpp         | 42 ++++++++++++++++++++++++++--
 3 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/data/workflow/easyproteomecluster.sh b/data/workflow/easyproteomecluster.sh
index 2bab198fc..3a8bc85bd 100644
--- a/data/workflow/easyproteomecluster.sh
+++ b/data/workflow/easyproteomecluster.sh
@@ -29,7 +29,7 @@ fi
 
 if notExists "${TMP_PATH}/aln.dbtype"; then
     # shellcheck disable=SC2086
-    "$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" ${PROTEOMECLUSTER_PAR} \
+    "$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" "${TMP_PATH}/cluster_count" ${PROTEOMECLUSTER_PAR} \
         || fail "Convert Alignments died"
 fi
 
@@ -45,6 +45,12 @@ if notExists "${RESULTS}_proteome_cluster.tsv"; then
             || fail "createtsv proteome cluster died"
 fi
 
+if notExists "${RESULTS}_cluster_count.tsv"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/cluster_count" "${RESULTS}_cluster_count.tsv" ${THREADS_PAR} \
+            || fail "createtsv proteome cluster count report died"
+fi
+
 
 if [ -n "${REMOVE_TMP}" ]; then
     # shellcheck disable=SC2086
diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp
index cccc9ce94..84bf73726 100644
--- a/src/MMseqsBase.cpp
+++ b/src/MMseqsBase.cpp
@@ -656,11 +656,12 @@ std::vector<Command> baseCommands = {
                 "Proteome clustering and alignment",
                 NULL,
                 "Martin Steinegger <martin.steinegger@snu.ac.kr> & Gyuri Kim <gyuribio@snu.ac.kr>",
-                "<i:sequenceDB> <i:clustresultDB> <o:proteinAlignmentDB> <o:proteomeAlignmentDB>",
+                "<i:sequenceDB> <i:clustresultDB> <o:proteinAlignmentDB> <o:proteomeAlignmentDB> <o:proteomeClusterCountReport",
                 CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
                                                            {"clustresultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                                           {"proteinAlignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb},
-                                                           {"proteomeAlignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+                                                           {"proteinAlignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                                           {"proteomeAlignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                                           {"proteomeClusterCountReport", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
         {"alignall",             alignall,             &par.alignall,             COMMAND_ALIGNMENT,
                 "Within-result all-vs-all gapped local alignment",
                 NULL,
diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
index 28859a0cc..40fb104ec 100644
--- a/src/util/proteomecluster.cpp
+++ b/src/util/proteomecluster.cpp
@@ -117,7 +117,7 @@ void calculateProteomeLength(std::vector<ProteomeEntry>& proteomeList, DBReader<
     }
 }
 
-void initLocalClusterReps(size_t& id, std::vector<ClusterEntry>& localClusterReps, std::vector<unsigned int>& localProteomeCount, DBReader<unsigned int>& tProteinDB, DBReader<unsigned int>& linResDB, unsigned int thread_idx){
+void initLocalClusterReps(size_t& id, std::vector<ClusterEntry>& localClusterReps, std::vector<unsigned int>& localProteomeCount, DBReader<unsigned int>& tProteinDB, DBReader<unsigned int>& linResDB, unsigned int thread_idx) {
     std::vector<unsigned int> memberKeys;
     memberKeys.reserve(50); // store key for every protein in a cluster
 
@@ -375,6 +375,23 @@ void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector<ProteomeEn
     }
 }
 
+static char* fastfloatToBuffer(float value, char* buffer) {
+    value *= 100;
+    int integerPart = (int)value;  
+    buffer = Itoa::i32toa_sse2(integerPart, buffer);  
+    *(buffer-1) = '.';
+    float fractionalPart = (value - integerPart) * 100;
+    int fractionalInt = (int)fractionalPart;  
+    if (fractionalInt < 10) {
+        *(buffer) = '0'; 
+        buffer++;
+    }
+    buffer = Itoa::i32toa_sse2(fractionalInt, buffer);  
+
+    *(buffer-1) = '%';
+    buffer++;
+    return buffer;
+}
 
 int proteomecluster(int argc, const char **argv, const Command &command){
     //Initialize parameters
@@ -417,6 +434,8 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     int proteomeDBType = DBReader<unsigned int>::setExtendedDbtype(Parameters::DBTYPE_GENERIC_DB, Parameters::DBTYPE_EXTENDED_SRC_IDENTIFIER);
     DBWriter proteomeClustWriter(par.db4.c_str(), par.db4Index.c_str(), 1, par.compressed, proteomeDBType);
     proteomeClustWriter.open();
+    DBWriter clusterCountWriter(par.db5.c_str(), par.db5Index.c_str(), 1, par.compressed, proteomeDBType);
+    clusterCountWriter.open();
 
     std::vector<ClusterEntry> clusterReps; 
 
@@ -428,6 +447,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), &subMat, gapOpen, gapExtend);
     Debug(Debug::INFO) << "Initialization ";
     Timer timer;
+
     #pragma omp parallel
     {
         unsigned int thread_idx = 0;
@@ -436,7 +456,6 @@ int proteomecluster(int argc, const char **argv, const Command &command){
     #endif    
         std::vector<ClusterEntry> localClusterReps;
         std::vector<unsigned int> localProteomeCount(proteomeList.size(), 0);
-
         #pragma omp for schedule(dynamic, 1)
         for (size_t id = 0; id < linResDB.getSize(); id++) {
             initLocalClusterReps(id, localClusterReps, localProteomeCount, tProteinDB, linResDB, thread_idx);
@@ -452,10 +471,26 @@ int proteomecluster(int argc, const char **argv, const Command &command){
                                std::make_move_iterator(localClusterReps.begin()),
                                std::make_move_iterator(localClusterReps.end()));
         }
-
+        
     }
     Debug(Debug::INFO) << timer.lap() << "\n";
 
+    unsigned int totalClusterCount = clusterReps.size();
+    Debug(Debug::INFO) << "Total Cluster Count : " << totalClusterCount << "\n";
+    for (size_t idx=0; idx < proteomeList.size(); idx++){
+        float clusterCountRatio = static_cast<float> (proteomeList[idx].clusterCount) / static_cast<float> (totalClusterCount);
+
+        char proteomeBuffer[1024];
+        clusterCountWriter.writeStart();
+        char *basePos = proteomeBuffer;
+        char *tmpProteomeBuffer = Itoa::i32toa_sse2(proteomeList[idx].clusterCount, proteomeBuffer);
+        *(tmpProteomeBuffer - 1) = '\t';
+        tmpProteomeBuffer = fastfloatToBuffer(clusterCountRatio, tmpProteomeBuffer);
+        *(tmpProteomeBuffer - 1) = '\n';
+        clusterCountWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos);
+        clusterCountWriter.writeEnd(proteomeList[idx].proteomeKey);
+    }
+
     unsigned int RepProteomeId = -1;
     Debug(Debug::INFO) << "Proteome Clustering ";
     timer.reset();
@@ -515,6 +550,7 @@ int proteomecluster(int argc, const char **argv, const Command &command){
 
     proteomeClustWriter.close();
     proteinClustWriter.close();
+    clusterCountWriter.close();
     tProteinDB.close();
     linResDB.close();
     return EXIT_SUCCESS;