diff --git a/data/workflow/CMakeLists.txt b/data/workflow/CMakeLists.txt
index f002a27db..3f006a286 100644
--- a/data/workflow/CMakeLists.txt
+++ b/data/workflow/CMakeLists.txt
@@ -3,6 +3,7 @@ set(GENERATED_WORKFLOWS
         workflow/easycluster.sh
         workflow/easytaxonomy.sh
         workflow/easyrbh.sh
+        workflow/easyproteomecluster.sh
         workflow/blastp.sh
         workflow/blastpgp.sh
         workflow/map.sh
diff --git a/data/workflow/easyproteomecluster.sh b/data/workflow/easyproteomecluster.sh
new file mode 100644
index 000000000..3a8bc85bd
--- /dev/null
+++ b/data/workflow/easyproteomecluster.sh
@@ -0,0 +1,70 @@
+#!/bin/sh -e
+fail() {
+    echo "Error: $1"
+    exit 1
+}
+
+notExists() {
+   [ ! -f "$1" ]
+}
+
+
+if notExists "${TMP_PATH}/input.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createdb "$@" "${TMP_PATH}/input" ${CREATEDB_PAR} \
+        || fail "query createdb died"
+fi
+
+if notExists "${TMP_PATH}/clu.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" linclust "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/clu_tmp" ${CLUSTER_PAR} \
+        || fail "Search died"
+fi
+
+if notExists "${RESULTS}_protein_cluster.tsv"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/clu" "${RESULTS}_protein_cluster.tsv" ${THREADS_PAR} \
+            || fail "createtsv protein cluster died"
+fi
+
+if notExists "${TMP_PATH}/aln.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" "${TMP_PATH}/cluster_count" ${PROTEOMECLUSTER_PAR} \
+        || fail "Convert Alignments died"
+fi
+
+if notExists "${RESULTS}_protein_align.tsv"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_protein" "${RESULTS}_protein_align.tsv" ${THREADS_PAR} \
+            || fail "createtsv protein align died"
+fi
+
+if notExists "${RESULTS}_proteome_cluster.tsv"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome" "${RESULTS}_proteome_cluster.tsv" ${THREADS_PAR} \
+            || fail "createtsv proteome cluster died"
+fi
+
+if notExists "${RESULTS}_cluster_count.tsv"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/cluster_count" "${RESULTS}_cluster_count.tsv" ${THREADS_PAR} \
+            || fail "createtsv proteome cluster count report died"
+fi
+
+
+if [ -n "${REMOVE_TMP}" ]; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/clu" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/aln_protein" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/aln_proteome" ${VERBOSITY_PAR}
+    rm -rf "${TMP_PATH}/clu_tmp"
+    rm -f "${TMP_PATH}/easyproteomecluster.sh"
+fi
diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h
index 3c4abb4e0..35939c4e9 100644
--- a/src/CommandDeclarations.h
+++ b/src/CommandDeclarations.h
@@ -4,6 +4,7 @@
 
 extern int align(int argc, const char **argv, const Command& command);
 extern int alignall(int argc, const char **argv, const Command& command);
+extern int proteomecluster(int argc, const char **argv, const Command& command);
 extern int alignbykmer(int argc, const char **argv, const Command& command);
 extern int appenddbtoindex(int argc, const char **argv, const Command& command);
 extern int apply(int argc, const char **argv, const Command& command);
@@ -43,6 +44,7 @@ extern int diffseqdbs(int argc, const char **argv, const Command& command);
 extern int easycluster(int argc, const char **argv, const Command& command);
 extern int easyrbh(int argc, const char **argv, const Command& command);
 extern int easylinclust(int argc, const char **argv, const Command& command);
+extern int easyproteomecluster(int argc, const char **argv, const Command& command);
 extern int easysearch(int argc, const char **argv, const Command& command);
 extern int easylinsearch(int argc, const char **argv, const Command& command);
 extern int tsv2exprofiledb(int argc, const char **argv, const Command& command);
diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp
index 8325f0d4a..84bf73726 100644
--- a/src/MMseqsBase.cpp
+++ b/src/MMseqsBase.cpp
@@ -11,6 +11,10 @@ std::vector<Command> baseCommands = {
                 "Sensitive homology search",
                 "# Search multiple FASTA against FASTA (like BLASTP, TBLASTN, BLASTX, BLASTN --search-type 3, TBLASTX --search-type 2)\n"
                 "mmseqs easy-search examples/QUERY.fasta examples/QUERY.fasta examples/DB.fasta result.m8 tmp\n\n"
+                "# Search multiple query fasta files against target fasta files using a tsv file containing filepaths\n"
+                "echo -e \"dir1/QUERY1.fasta\\ndir2/QUERY2.fasta\" > examples/queries.tsv\n"
+                "echo -e \"dir3/TARGET1.fasta\\ndir4/TARGET2.fasta\" > examples/targets.tsv\n"
+                "mmseqs easy-search examples/queries.tsv examples/targets.tsv result.m8 tmp\n\n"
                 "# Iterative profile search from stdin (like PSI-BLAST)\n"
                 "cat examples/QUERY.fasta | mmseqs easy-search stdin examples/DB.fasta result.m8 tmp --num-iterations 2\n\n"
                 "# Profile search against small databases (e.g. PFAM, eggNOG)\n"
@@ -79,6 +83,19 @@ std::vector<Command> baseCommands = {
                 CITATION_MMSEQS2|CITATION_LINCLUST, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
                                                             {"clusterPrefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
                                                             {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"easy-proteomecluster", easyproteomecluster, &par.easyproteomeclusterworkflow, COMMAND_EASY,
+                "Proteome clustering and alignment",
+                "mmseqs easy-proteomecluster examples/DB.fasta result tmp\n\n"
+                "# ProteomeCluster output\n"
+                "#  - protein_cluster.tsv:  Protein linclust result\n"
+                "#  - proteome_cluster.tsv: Proteome redundancy clustering result\n"
+                "#  - protein_align.tsv: Protein alignment list\n"
+                "mmseqs easy-proteomecluster examples/DB.fasta result tmp --proteome-similarity 0.9 --proteome-relative-similarity 0.9\n",
+                "Martin Steinegger <martin.steinegger@snu.ac.kr> & Gyuri Kim <gyuribio@snu.ac.kr>",
+                "<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]> <o:clusterPrefix> <tmpDir>",
+                CITATION_MMSEQS2, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
+                                        {"outputReports", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                        {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
         {"easy-taxonomy",        easytaxonomy,         &par.easytaxonomy,         COMMAND_EASY,
                 "Taxonomic classification",
                 "# Assign taxonomic labels to FASTA sequences\n"
@@ -125,7 +142,10 @@ std::vector<Command> baseCommands = {
                 "# Create a seqDB from stdin\n"
                 "cat seq.fasta | mmseqs createdb stdin sequenceDB\n\n"
                 "# Create a seqDB by indexing existing FASTA/Q (for single line fasta entries only)\n"
-                "mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n",
+                "mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n\n"
+                "# Create a seqDB from a tsv file containing filepaths of multiple FASTA files in each line\n"
+                "echo -e \"dir1/bacteria.fasta\\ndir2/archea.fasta.gz\" > filepaths.tsv\n"
+                "mmseqs createdb filepaths.tsv sequenceDB\n",
                 "Martin Steinegger <martin.steinegger@snu.ac.kr>",
                 "<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]>|<i:stdin> <o:sequenceDB>",
                 CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric },
@@ -632,6 +652,16 @@ std::vector<Command> baseCommands = {
                                                            {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
                                                            {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
                                                            {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+        {"proteomecluster",             proteomecluster,             &par.proteomecluster,             COMMAND_ALIGNMENT,
+                "Proteome clustering and alignment",
+                NULL,
+                "Martin Steinegger <martin.steinegger@snu.ac.kr> & Gyuri Kim <gyuribio@snu.ac.kr>",
+                "<i:sequenceDB> <i:clustresultDB> <o:proteinAlignmentDB> <o:proteomeAlignmentDB> <o:proteomeClusterCountReport",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"clustresultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                                           {"proteinAlignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                                           {"proteomeAlignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                                           {"proteomeClusterCountReport", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
         {"alignall",             alignall,             &par.alignall,             COMMAND_ALIGNMENT,
                 "Within-result all-vs-all gapped local alignment",
                 NULL,
@@ -640,6 +670,7 @@ std::vector<Command> baseCommands = {
                 CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
                                                            {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
                                                            {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+                                                           
         {"transitivealign",      transitivealign,      &par.align,                COMMAND_ALIGNMENT,
                 "Transfer alignments via transitivity",
                 //"Infer the alignment A->C via B, B being the center sequence and A,C each pairwise aligned against B",
@@ -1298,5 +1329,6 @@ std::vector<Command> baseCommands = {
                 NULL,
                 "",
                 "",
-                CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}}
+                CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},
+        
 };
diff --git a/src/alignment/Matcher.h b/src/alignment/Matcher.h
index a9c28beb4..90185de63 100644
--- a/src/alignment/Matcher.h
+++ b/src/alignment/Matcher.h
@@ -140,6 +140,13 @@ class Matcher{
                 }
             }
         }
+
+        float getSeqId(){
+            return seqId;
+        }
+        unsigned int getAlnLength(){
+            return alnLength;
+        }
     };
 
     Matcher(int querySeqType, int targetSeqType, int maxSeqLen, BaseMatrix *m,
@@ -217,7 +224,6 @@ class Matcher{
 
 
     static size_t resultToBuffer(char * buffer, const result_t &result, bool addBacktrace, bool compress  = true, bool addOrfPosition = false);
-
     static int computeAlnLength(int anEnd, int start, int dbEnd, int dbStart);
 
     static void updateResultByRescoringBacktrace(const char *querySeq, const char *targetSeq, const char **subMat, EvalueComputation &evaluer,
diff --git a/src/commons/DBReader.cpp b/src/commons/DBReader.cpp
index f52d2369c..1eb80fd06 100644
--- a/src/commons/DBReader.cpp
+++ b/src/commons/DBReader.cpp
@@ -112,6 +112,7 @@ template <typename T> bool DBReader<T>::open(int accessType){
         }
         totalDataSize = 0;
         dataFileCnt = dataFileNames.size();
+
         dataSizeOffset = new size_t[dataFileNames.size() + 1];
         dataFiles = new char*[dataFileNames.size()];
         for(size_t fileIdx = 0; fileIdx < dataFileNames.size(); fileIdx++){
@@ -155,6 +156,22 @@ template <typename T> bool DBReader<T>::open(int accessType){
         }
         lookupData.close();
     }
+    if (dataMode & USE_SOURCE){
+        std::string sourceFilename = (std::string(dataFileName) + ".source");
+        MemoryMapped sourceData(sourceFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
+        if (sourceData.isValid() == false) {
+            Debug(Debug::ERROR) << "Cannot open source file " << sourceFilename << "!\n";
+            EXIT(EXIT_FAILURE);
+        }
+        char* sourceDataChar = (char *) sourceData.getData();
+        size_t sourceDataSize = sourceData.size();
+        sourceSize = Util::ompCountLines(sourceDataChar, sourceDataSize, threads);
+        source = new(std::nothrow) SourceEntry[this->sourceSize];
+        incrementMemory(sizeof(SourceEntry) * this->sourceSize);
+        readSource(sourceDataChar, sourceDataSize, source);
+
+    }
+
     bool isSortedById = false;
     if (externalData == false) {
         MemoryMapped indexData(indexFileName, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
@@ -707,6 +724,24 @@ template <typename T> unsigned int DBReader<T>::getLookupFileNumber(size_t id){
     return lookup[id].fileNumber;
 }
 
+template <typename T> std::string DBReader<T>::getSourceFileName (size_t id){
+    if (id >= sourceSize){
+        Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n";
+        Debug(Debug::ERROR) << "getSourceFileName: local id (" << id << ") >= db size (" << sourceSize << ")\n";
+        EXIT(EXIT_FAILURE);
+    }
+    return source[id].fileName;
+}
+
+template <typename T> T DBReader<T>::getSourceKey(size_t id){
+    if (id >= sourceSize){
+        Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n";
+        Debug(Debug::ERROR) << "getSource id: local id (" << id << ") >= db size (" << sourceSize << ")\n";
+        EXIT(EXIT_FAILURE);
+    }
+    return source[id].id;
+}
+
 template<>
 void DBReader<unsigned int>::lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry) {
     buffer.append(SSTR(entry.id));
@@ -1048,6 +1083,36 @@ void DBReader<T>::readLookup(char *data, size_t dataSize, DBReader::LookupEntry
         currPos = lookupData - (char *) data;
 
         i++;
+
+        
+    }
+}
+
+template <typename T>
+void DBReader<T>::readSource(char *data, size_t dataSize, DBReader::SourceEntry *source) {
+    size_t i=0;
+    size_t currPos = 0;
+    char* sourceData = (char *) data;
+    const char * cols[3];
+    while (currPos < dataSize){
+        if (i >= this->sourceSize) {
+            Debug(Debug::ERROR) << "Corrupt memory, too many entries!\n";
+            EXIT(EXIT_FAILURE);
+        }
+        Util::getFieldsOfLine(sourceData, cols, 3);
+        source[i].id = Util::fast_atoi<size_t>(cols[0]);
+        std::string fileName = std::string(cols[1], (cols[2] - cols[1]));
+        size_t lastDotPosition = fileName.rfind('.');
+
+        if (lastDotPosition != std::string::npos) {
+            fileName = fileName.substr(0, lastDotPosition);
+        }
+        source[i].fileName = fileName;
+        sourceData = Util::skipLine(sourceData);
+
+        currPos = sourceData - (char *) data;
+
+        i++;
     }
 }
 
diff --git a/src/commons/DBReader.h b/src/commons/DBReader.h
index 57589b1f6..2c61664e1 100644
--- a/src/commons/DBReader.h
+++ b/src/commons/DBReader.h
@@ -98,7 +98,10 @@ class DBReader : public MemoryTracker {
             return false;
         }
     };
-
+    struct SourceEntry{
+        T id;
+        std::string fileName;
+    };
     struct LookupEntry {
         T id;
         std::string entryName;
@@ -248,6 +251,8 @@ class DBReader : public MemoryTracker {
     void lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry);
     LookupEntry* getLookup() { return lookup; };
 
+    std::string getSourceFileName(size_t id);
+    T getSourceKey(size_t id);
     static const int NOSORT = 0;
     static const int SORT_BY_LENGTH = 1;
     static const int LINEAR_ACCCESS = 2;
@@ -266,6 +271,7 @@ class DBReader : public MemoryTracker {
     static const unsigned int USE_FREAD      = 4;
     static const unsigned int USE_LOOKUP     = 8;
     static const unsigned int USE_LOOKUP_REV = 16;
+    static const unsigned int USE_SOURCE     = 32;
 
 
     // compressed
@@ -309,6 +315,8 @@ class DBReader : public MemoryTracker {
 
     void readLookup(char *data, size_t dataSize, LookupEntry *lookup);
 
+    void readSource(char *data, size_t dataSize, SourceEntry *source);
+
     void readIndexId(T* id, char * line, const char** cols);
 
     unsigned int indexIdToNum(T* id);
@@ -485,7 +493,9 @@ class DBReader : public MemoryTracker {
 
     Index * index;
     size_t lookupSize;
+    size_t sourceSize;
     LookupEntry * lookup;
+    SourceEntry * source;
     bool sortedByOffset;
 
     unsigned int * id2local;
diff --git a/src/commons/DBWriter.h b/src/commons/DBWriter.h
index 190015cb3..21e5377bb 100644
--- a/src/commons/DBWriter.h
+++ b/src/commons/DBWriter.h
@@ -66,6 +66,7 @@ class DBWriter : public MemoryTracker  {
     }
 
     static void sortIndex(const char *inFileNameIndex, const char *outFileNameIndex, const bool lexicographicOrder);
+    
 private:
     size_t addToThreadBuffer(const void *data, size_t itmesize, size_t nitems, int threadIdx);
     void writeThreadBuffer(unsigned int idx, size_t dataSize);
diff --git a/src/commons/IndexReader.h b/src/commons/IndexReader.h
index 17d637f12..298c2bb8c 100644
--- a/src/commons/IndexReader.h
+++ b/src/commons/IndexReader.h
@@ -95,6 +95,8 @@ class IndexReader {
                     }else{
                         failSuffix = "_aln";
                     }
+                } else if (databaseType & SOURCE) {
+                    failSuffix = "";
                 }
             }
             sequenceReader = new DBReader<unsigned int>(
@@ -115,6 +117,7 @@ class IndexReader {
     static const unsigned int SRC_HEADERS = 4;
     static const unsigned int SRC_SEQUENCES =  8;
     static const unsigned int ALIGNMENTS = 16;
+    static const unsigned int SOURCE = 32;
     static const unsigned int USER_SELECT = 1 << 31;
 
     static unsigned int makeUserDatabaseType(unsigned int baseKey) {
diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp
index 34e429811..fe35ab4c2 100644
--- a/src/commons/Parameters.cpp
+++ b/src/commons/Parameters.cpp
@@ -86,6 +86,8 @@ Parameters::Parameters():
         PARAM_ALT_ALIGNMENT(PARAM_ALT_ALIGNMENT_ID, "--alt-ali", "Alternative alignments", "Show up to this many alternative alignments", typeid(int), (void *) &altAlignment, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN),
         PARAM_GAP_OPEN(PARAM_GAP_OPEN_ID, "--gap-open", "Gap open cost", "Gap open cost", typeid(MultiParam<NuclAA<int>>), (void *) &gapOpen, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
         PARAM_GAP_EXTEND(PARAM_GAP_EXTEND_ID, "--gap-extend", "Gap extension cost", "Gap extension cost", typeid(MultiParam<NuclAA<int>>), (void *) &gapExtend, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_PROTEOME_SIMILARITY(PARAM_PROTEOME_SIMILARITY_ID, "--proteome-similarity", "Proteome similarity", "Proteome similarity threshold", typeid(float), (void *) &proteomeSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN),
+        PARAM_PROTEOME_RELATIVE_SIMILARITY(PARAM_PROTEOME_RELATIVE_SIMILARITY_ID, "--proteome-relative-similarity", "Proteome relative similarity", "Proteome relative similarity threshold normalized by proteome size", typeid(float), (void *) &proteomeRelativeSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN),
 #ifdef GAP_POS_SCORING
         PARAM_GAP_PSEUDOCOUNT(PARAM_GAP_PSEUDOCOUNT_ID, "--gap-pc", "Gap pseudo count", "Pseudo count for calculating position-specific gap opening penalties", typeid(int), &gapPseudoCount, "^[0-9]+$", MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_EXPERT),
 #endif
@@ -338,6 +340,7 @@ Parameters::Parameters():
     alignall.push_back(&PARAM_SUB_MAT);
     alignall.push_back(&PARAM_ADD_BACKTRACE);
     alignall.push_back(&PARAM_ALIGNMENT_MODE);
+    alignall.push_back(&PARAM_REALIGN_SCORE_BIAS);
 //    alignall.push_back(&PARAM_WRAPPED_SCORING);
     alignall.push_back(&PARAM_E);
     alignall.push_back(&PARAM_MIN_SEQ_ID);
@@ -365,6 +368,41 @@ Parameters::Parameters():
     alignall.push_back(&PARAM_COMPRESSED);
     alignall.push_back(&PARAM_V);
 
+    //proteomecluster
+    proteomecluster.push_back(&PARAM_SUB_MAT);
+    proteomecluster.push_back(&PARAM_ADD_BACKTRACE);
+    proteomecluster.push_back(&PARAM_ALIGNMENT_MODE);
+    proteomecluster.push_back(&PARAM_REALIGN_SCORE_BIAS); 
+//    proteomecluster.push_back(&PARAM_WRAPPED_SCORING);
+    proteomecluster.push_back(&PARAM_E);
+    proteomecluster.push_back(&PARAM_MIN_SEQ_ID);
+    proteomecluster.push_back(&PARAM_MIN_ALN_LEN);
+    proteomecluster.push_back(&PARAM_SEQ_ID_MODE);
+//    proteomecluster.push_back(&PARAM_ALT_ALIGNMENT);
+    proteomecluster.push_back(&PARAM_C);
+    proteomecluster.push_back(&PARAM_COV_MODE);
+    proteomecluster.push_back(&PARAM_MAX_SEQ_LEN);
+    proteomecluster.push_back(&PARAM_NO_COMP_BIAS_CORR);
+    proteomecluster.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
+    proteomecluster.push_back(&PARAM_PROTEOME_SIMILARITY);
+    proteomecluster.push_back(&PARAM_PROTEOME_RELATIVE_SIMILARITY);
+
+    proteomecluster.push_back(&PARAM_REALIGN);
+//    proteomecluster.push_back(&PARAM_MAX_REJECTED);
+//    proteomecluster.push_back(&PARAM_MAX_ACCEPT);
+    proteomecluster.push_back(&PARAM_INCLUDE_IDENTITY);
+    proteomecluster.push_back(&PARAM_PRELOAD_MODE);
+    proteomecluster.push_back(&PARAM_PCA);
+    proteomecluster.push_back(&PARAM_PCB);
+    proteomecluster.push_back(&PARAM_SCORE_BIAS);
+    proteomecluster.push_back(&PARAM_GAP_OPEN);
+    proteomecluster.push_back(&PARAM_GAP_EXTEND);
+    proteomecluster.push_back(&PARAM_ZDROP);
+    proteomecluster.push_back(&PARAM_THREADS);
+    proteomecluster.push_back(&PARAM_COMPRESSED);
+    proteomecluster.push_back(&PARAM_V);
+    
+
     // alignment
     align.push_back(&PARAM_SUB_MAT);
     align.push_back(&PARAM_ADD_BACKTRACE);
@@ -1311,6 +1349,9 @@ Parameters::Parameters():
     // easylinclustworkflow
     easylinclustworkflow = combineList(linclustworkflow, createdb);
 
+    // easyproteomeclusterworkflow
+    easyproteomeclusterworkflow = combineList(easylinclustworkflow, proteomecluster);
+
     // clustering workflow
     clusterworkflow = combineList(prefilter, align);
     clusterworkflow = combineList(clusterworkflow, rescorediagonal);
@@ -2315,6 +2356,8 @@ void Parameters::setDefaults() {
     maxRejected = INT_MAX;
     maxAccept   = INT_MAX;
     seqIdThr = 0.0;
+    proteomeSimThr = 0.9;
+    proteomeRelativeSimThr = 0.9;
     alnLenThr = 0;
     altAlignment = 0;
     gapOpen = MultiParam<NuclAA<int>>(NuclAA<int>(11, 5));
diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h
index 745d2f809..0d4b3eee3 100644
--- a/src/commons/Parameters.h
+++ b/src/commons/Parameters.h
@@ -90,6 +90,7 @@ class Parameters {
     static const unsigned int DBTYPE_EXTENDED_COMPRESSED = 1;
     static const unsigned int DBTYPE_EXTENDED_INDEX_NEED_SRC = 2;
     static const unsigned int DBTYPE_EXTENDED_CONTEXT_PSEUDO_COUNTS = 4;
+    static const unsigned int DBTYPE_EXTENDED_SRC_IDENTIFIER = 8;
 
     // don't forget to add new database types to DBReader::getDbTypeName and Parameters::PARAM_OUTPUT_DBTYPE
 
@@ -434,6 +435,8 @@ class Parameters {
     int    maxAccept;                    // after n accepted sequences stop
     int    altAlignment;                 // show up to this many alternative alignments
     float  seqIdThr;                     // sequence identity threshold for acceptance
+    float  proteomeSimThr;              // proteome similarity threshold for acceptance
+    float proteomeRelativeSimThr;     // proteome similarity threshold normalized by proteome size for acceptance
     int    alnLenThr;                    // min. alignment length
     bool   addBacktrace;                 // store backtrace string (M=Match, D=deletion, I=insertion)
     bool   realign;                      // realign hit with more conservative score
@@ -787,6 +790,8 @@ class Parameters {
     PARAMETER(PARAM_ALT_ALIGNMENT)
     PARAMETER(PARAM_GAP_OPEN)
     PARAMETER(PARAM_GAP_EXTEND)
+    PARAMETER(PARAM_PROTEOME_SIMILARITY)
+    PARAMETER(PARAM_PROTEOME_RELATIVE_SIMILARITY)
 #ifdef GAP_POS_SCORING
     PARAMETER(PARAM_GAP_PSEUDOCOUNT)
 #endif
@@ -1078,6 +1083,7 @@ class Parameters {
     std::vector<MMseqsParameter*> threadsandcompression;
 
     std::vector<MMseqsParameter*> alignall;
+    std::vector<MMseqsParameter*> proteomecluster;
     std::vector<MMseqsParameter*> align;
     std::vector<MMseqsParameter*> rescorediagonal;
     std::vector<MMseqsParameter*> alignbykmer;
@@ -1115,6 +1121,7 @@ class Parameters {
     std::vector<MMseqsParameter*> kmersearch;
     std::vector<MMseqsParameter*> countkmer;
     std::vector<MMseqsParameter*> easylinclustworkflow;
+    std::vector<MMseqsParameter*> easyproteomeclusterworkflow;
     std::vector<MMseqsParameter*> linclustworkflow;
     std::vector<MMseqsParameter*> easysearchworkflow;
     std::vector<MMseqsParameter*> searchworkflow;
diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
index c43c3356d..6a5ee5a36 100644
--- a/src/util/CMakeLists.txt
+++ b/src/util/CMakeLists.txt
@@ -80,5 +80,6 @@ set(util_source_files
         util/proteinaln2nucl.cpp
         util/versionstring.cpp
         util/diskspaceavail.cpp
+        util/proteomecluster.cpp
         PARENT_SCOPE
         )
diff --git a/src/util/alignall.cpp b/src/util/alignall.cpp
index 3a07596f6..39f2e199c 100644
--- a/src/util/alignall.cpp
+++ b/src/util/alignall.cpp
@@ -27,7 +27,8 @@ int alignall(int argc, const char **argv, const Command &command) {
     }
     unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr);
 
-    DBReader<unsigned int> tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
+    DBReader<unsigned int> tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_LOOKUP);
+
     tdbr.open(DBReader<unsigned int>::NOSORT);
     if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) {
         tdbr.readMmapedDataInMemory();
@@ -80,7 +81,6 @@ int alignall(int argc, const char **argv, const Command &command) {
 #pragma omp for schedule(dynamic, 1)
             for (size_t id = start; id < (start + bucketSize); id++) {
                 progress.updateProgress();
-
                 const unsigned int key = dbr_res.getDbKey(id);
                 char *data = dbr_res.getData(id, thread_idx);
 
diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp
index 18d2fc7d0..e1d4ead3d 100644
--- a/src/util/createdb.cpp
+++ b/src/util/createdb.cpp
@@ -19,6 +19,28 @@ int createdb(int argc, const char **argv, const Command& command) {
     std::vector<std::string> filenames(par.filenames);
     std::string dataFile = filenames.back();
     filenames.pop_back();
+    if (Util::endsWith(".tsv", filenames[0])) {
+	    if (filenames.size() > 1) {
+		    Debug(Debug::ERROR) << "Only one tsv file can be given\n";
+		    EXIT(EXIT_FAILURE);
+	    }
+	    std::string tsv = filenames.back();
+	    filenames.pop_back();
+
+	    FILE* file = FileUtil::openFileOrDie(tsv.c_str(), "r", true);
+	    char* line = NULL;
+	    size_t len = 0;
+	    ssize_t read;
+	    while ((read = getline(&line, &len, file)) != -1) {
+		    if (line[read - 1] == '\n') {
+			    line[read - 1] = '\0';
+			    read--;
+		    }
+		    filenames.push_back(line);
+	    }
+	    free(line);
+	    fclose(file);
+    }
 
     for (size_t i = 0; i < filenames.size(); i++) {
         if (FileUtil::directoryExists(filenames[i].c_str()) == true) {
diff --git a/src/util/createtsv.cpp b/src/util/createtsv.cpp
index 7b75c0f9a..e8e87089c 100644
--- a/src/util/createtsv.cpp
+++ b/src/util/createtsv.cpp
@@ -18,18 +18,34 @@
 int createtsv(int argc, const char **argv, const Command &command) {
     Parameters &par = Parameters::getInstance();
     par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
+    const bool hasTargetDB = par.filenames.size() > 3;
+    DBReader<unsigned int> *reader;
+    if (hasTargetDB) {
+        reader = new DBReader<unsigned int>(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
+    } else {
+        reader = new DBReader<unsigned int>(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
+    }
+    reader->open(DBReader<unsigned int>::LINEAR_ACCCESS);
+    const bool useSourceIdentifier = DBReader<unsigned int>::getExtendedDbtype(reader->getDbtype()) & Parameters::DBTYPE_EXTENDED_SRC_IDENTIFIER;
 
     bool queryNucs = Parameters::isEqualDbtype(FileUtil::parseDbType(par.db1.c_str()), Parameters::DBTYPE_NUCLEOTIDES);
     bool targetNucs = Parameters::isEqualDbtype(FileUtil::parseDbType(par.db2.c_str()), Parameters::DBTYPE_NUCLEOTIDES);
     const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP);
-    int queryHeaderType = (queryNucs) ? IndexReader::SRC_HEADERS : IndexReader::HEADERS;
+    int queryHeaderType = (useSourceIdentifier) ? IndexReader::SOURCE : 
+                        (queryNucs) ? IndexReader::SRC_HEADERS : IndexReader::HEADERS;
     queryHeaderType = (par.idxSeqSrc == 0) ? queryHeaderType :  (par.idxSeqSrc == 1) ?  IndexReader::HEADERS : IndexReader::SRC_HEADERS;
-    IndexReader qDbrHeader(par.db1, par.threads, queryHeaderType, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0);
+    unsigned int preloadMode = (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0;
+    unsigned int dataMode = (useSourceIdentifier)
+        ? (DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_SOURCE)
+        : (DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
+    IndexReader qDbrHeader(par.db1, par.threads, queryHeaderType, preloadMode, dataMode);
+
+    // IndexReader qDbrHeader(par.db1, par.threads, queryHeaderType, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0);
     IndexReader * tDbrHeader=NULL;
     DBReader<unsigned int> * queryDB = qDbrHeader.sequenceReader;
     DBReader<unsigned int> * targetDB = NULL;
     bool sameDB = (par.db2.compare(par.db1) == 0);
-    const bool hasTargetDB = par.filenames.size() > 3;
+    
     DBReader<unsigned int>::Index * qHeaderIndex = qDbrHeader.sequenceReader->getIndex();
     DBReader<unsigned int>::Index * tHeaderIndex = NULL;
 
@@ -49,23 +65,13 @@ int createtsv(int argc, const char **argv, const Command &command) {
         }
     }
 
-    DBReader<unsigned int> *reader;
-    if (hasTargetDB) {
-
-        reader = new DBReader<unsigned int>(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
-    } else {
-
-        reader = new DBReader<unsigned int>(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
-    }
-    reader->open(DBReader<unsigned int>::LINEAR_ACCCESS);
-
     const std::string& dataFile = hasTargetDB ? par.db4 : par.db3;
     const std::string& indexFile = hasTargetDB ? par.db4Index : par.db3Index;
     const bool shouldCompress = par.dbOut == true && par.compressed == true;
     const int dbType = par.dbOut == true ? Parameters::DBTYPE_GENERIC_DB : Parameters::DBTYPE_OMIT_FILE;
     DBWriter writer(dataFile.c_str(), indexFile.c_str(), par.threads, shouldCompress, dbType);
     writer.open();
-
+    
     const size_t targetColumn = (par.targetTsvColumn == 0) ? SIZE_T_MAX :  par.targetTsvColumn - 1;
 #pragma omp parallel
     {
@@ -84,6 +90,10 @@ int createtsv(int argc, const char **argv, const Command &command) {
         for (size_t i = 0; i < reader->getSize(); ++i) {
             unsigned int queryKey = reader->getDbKey(i);
             size_t queryIndex = queryDB->getId(queryKey);
+            size_t querySourceIndex = SIZE_T_MAX;
+            if (useSourceIdentifier){
+                querySourceIndex = queryDB->getSourceKey(queryKey);
+            }
 
             char *headerData = queryDB->getData(queryIndex, thread_idx);
             if (headerData == NULL) {
@@ -92,7 +102,10 @@ int createtsv(int argc, const char **argv, const Command &command) {
             }
 
             std::string queryHeader;
-            if (par.fullHeader) {
+            if (useSourceIdentifier){
+                queryHeader = queryDB->getSourceFileName(querySourceIndex);
+            }
+            else if (par.fullHeader) {
                 queryHeader = "\"";
                 queryHeader.append(headerData, qHeaderIndex[queryIndex].length - 2);
                 queryHeader.append("\"");
@@ -118,12 +131,19 @@ int createtsv(int argc, const char **argv, const Command &command) {
                 } else if (hasTargetDB) {
                     unsigned int targetKey = (unsigned int) strtoul(dbKey, NULL, 10);
                     size_t targetIndex = targetDB->getId(targetKey);
+                    size_t targetSourceIdex = SIZE_T_MAX;
+                    if (useSourceIdentifier){
+                        targetSourceIdex = targetDB->getSourceKey(targetKey);
+                    }
                     char *targetData = targetDB->getData(targetIndex, thread_idx);
                     if (targetData == NULL) {
                         Debug(Debug::WARNING) << "Invalid header entry in query " << queryKey << " and target " << targetKey << "!\n";
                         continue;
                     }
-                    if (par.fullHeader) {
+                    if (useSourceIdentifier){
+                        targetAccession = targetDB->getSourceFileName(targetSourceIdex);
+                    }
+                    else if (par.fullHeader) {
                         targetAccession = "\"";
                         targetAccession.append(targetData, tHeaderIndex[targetIndex].length - 2);
                         targetAccession.append("\"");
diff --git a/src/util/proteomecluster.cpp b/src/util/proteomecluster.cpp
new file mode 100644
index 000000000..40fb104ec
--- /dev/null
+++ b/src/util/proteomecluster.cpp
@@ -0,0 +1,557 @@
+#include "Util.h"
+#include "Parameters.h"
+#include "Matcher.h"
+#include "Debug.h"
+#include "DBReader.h"
+#include "DBWriter.h"
+#include "NucleotideMatrix.h"
+#include "SubstitutionMatrix.h"
+#include "Alignment.h"
+#include "itoa.h"
+#include "Timer.h"
+
+#ifdef OPENMP
+#include <omp.h>
+#endif
+
+struct ProteomeEntry{
+    int proteomeKey;
+    unsigned int proteomeAALen;
+    int repProtKey;
+    float protSimScore;
+    unsigned int clusterCount;
+    float totalSeqId;
+    unsigned int AAMatchCount;
+    float relativeSimScore;
+
+    ProteomeEntry(unsigned int proteomeKey = -1, unsigned int pLength = 0, int repKey = -1, float simScore = 0.0f, unsigned int clusterCount = 0, float seqId = 0.0f, unsigned int matchCount = 0, float relSimScore = 0.0f)
+        : proteomeKey(proteomeKey), proteomeAALen(pLength), repProtKey(repKey), protSimScore(simScore), clusterCount(clusterCount), totalSeqId(seqId), AAMatchCount(matchCount), relativeSimScore(relSimScore) {}
+
+    int getRepProtKey() {
+        return repProtKey;
+    }
+    int getProteomeKey() {
+        return proteomeKey;
+    }
+    bool isCovered(){
+        if (repProtKey == -1) {
+            return false;
+        }
+        return true;
+    }
+    void computeRedundancy(unsigned int repProteomeSize) {
+        protSimScore = totalSeqId / proteomeAALen;
+        relativeSimScore = static_cast<float> (AAMatchCount * 2) / static_cast<float> (repProteomeSize + proteomeAALen);
+        if (relativeSimScore >= 1.0) {
+            relativeSimScore = 1.0;
+        }
+    }
+    void addSeqId(float seqId) {
+        totalSeqId += seqId;
+    }
+    void addSeqLen(unsigned int seqLength) {
+        proteomeAALen += seqLength;
+    }
+    void resetProteomeInfo(){
+        clusterCount = 0;
+        totalSeqId = 0.0f;
+        relativeSimScore = 0.0f;
+        protSimScore = 0.0f;
+        AAMatchCount = 0;
+    }
+    float getProtSimScore() {
+        return protSimScore;
+    }
+    float getrelativeSimScore() {
+        return relativeSimScore;
+    }
+    static bool compareByKey(const ProteomeEntry& a, const ProteomeEntry& b) {
+        if (a.repProtKey < b.repProtKey){
+            return true;
+        }
+        if (a.repProtKey > b.repProtKey){
+            return false;
+        }
+        if (a.proteomeKey == a.repProtKey && b.proteomeKey != b.repProtKey){
+            return true;
+        }
+        if (a.proteomeKey != a.repProtKey && b.proteomeKey == b.repProtKey){
+            return false;
+        }
+        if (a.proteomeKey != a.repProtKey && b.proteomeKey != b.repProtKey) {
+            return a.protSimScore > b.protSimScore;
+        }
+        return false;
+    }
+};
+
+struct __attribute__((__packed__)) MemberProteinEntry{
+    unsigned int proteomeKey;
+    unsigned int proteinKey;
+};
+
+struct ClusterEntry {
+    bool isAvailable;
+    std::vector<MemberProteinEntry> memberProteins;
+
+    ClusterEntry() : isAvailable(false) {}
+
+    ClusterEntry(size_t memberProteinSize) : isAvailable(true) {
+        memberProteins.reserve(memberProteinSize);
+    }
+
+    void resetClusterInfo(){
+        memberProteins.clear();
+        isAvailable = false;
+    }
+};
+
+void calculateProteomeLength(std::vector<ProteomeEntry>& proteomeList, DBReader<unsigned int>::LookupEntry* const& lookup, size_t lookupSize, DBReader<unsigned int>& tProteinDB) {
+    for (size_t i = 0; i < lookupSize; i++) {
+        const unsigned int ProteomeId = lookup[i].fileNumber;
+        const unsigned int ProteinId = lookup[i].id;
+        proteomeList[ProteomeId].addSeqLen(tProteinDB.getSeqLen(ProteinId));
+        if (proteomeList[ProteomeId].proteomeKey == -1) {
+            proteomeList[ProteomeId].proteomeKey = ProteomeId;
+        }
+    }
+}
+
+void initLocalClusterReps(size_t& id, std::vector<ClusterEntry>& localClusterReps, std::vector<unsigned int>& localProteomeCount, DBReader<unsigned int>& tProteinDB, DBReader<unsigned int>& linResDB, unsigned int thread_idx) {
+    std::vector<unsigned int> memberKeys;
+    memberKeys.reserve(50); // store key for every protein in a cluster
+
+    std::vector<uint8_t> isProteomeInCluster(localProteomeCount.size(), 0);
+    char buffer[1024 + 32768*4];
+    char *clustData = linResDB.getData(id, thread_idx);
+    
+    
+    while (*clustData != '\0') {
+        Util::parseKey(clustData, buffer);
+        const unsigned int key = (unsigned int) strtoul(buffer, NULL, 10);
+        memberKeys.push_back(key);
+        clustData = Util::skipLine(clustData);
+    }
+
+    if (memberKeys.size() <= 1) {
+        return; //singleton protein member cluster (1 member only)
+    }
+
+    ClusterEntry eachClusterRep(memberKeys.size());
+    int uniqueProteomeCount = 0;
+    //init MemberProteinEntry and add it to memberProteins vector
+    for (auto& eachMemberKey : memberKeys) {
+        const unsigned int proteinId = tProteinDB.getId(eachMemberKey);
+        const unsigned int proteomeKey = tProteinDB.getLookupFileNumber(proteinId);
+        MemberProteinEntry mem;
+        mem.proteomeKey = proteomeKey;
+        mem.proteinKey = proteinId;
+        eachClusterRep.memberProteins.push_back(mem);
+        if (isProteomeInCluster[proteomeKey] == 0) { //If gene is from new proteome(source)
+            isProteomeInCluster[proteomeKey] = 1;
+            uniqueProteomeCount++;
+        }
+    }
+
+    if (uniqueProteomeCount <= 1) {
+        return; //singleton proteome cluster (geneDuplication_1 proteome only with multiple members, exclude cloud genome)
+    }
+
+    localClusterReps.push_back(eachClusterRep);
+    for (size_t i = 0; i < localProteomeCount.size(); i++) {
+        if (isProteomeInCluster[i]) {
+            localProteomeCount[i]++;
+        }
+    }
+}
+
+bool FindNextRepProteome(std::vector<ProteomeEntry>& proteomeList, unsigned int& RepProteomeId) {
+    bool isAllProteomeRedundancyCovered = true;
+    
+    unsigned int maxMemberCount = 0;
+    unsigned int remainingProteomeCount = 0;
+    unsigned int lastProteomeKey = -1;
+
+    for (size_t idx = 0; idx < proteomeList.size(); idx++) {
+        if (proteomeList[idx].isCovered()) {
+            continue;
+        }else{
+            isAllProteomeRedundancyCovered = false;
+            remainingProteomeCount++;
+            lastProteomeKey = idx;
+            if (proteomeList[idx].clusterCount > maxMemberCount) {
+                maxMemberCount = proteomeList[idx].clusterCount;
+                RepProteomeId = idx;
+            }
+        }
+    }
+
+    if (isAllProteomeRedundancyCovered){
+        return false;
+    }else if (remainingProteomeCount == 1) {
+        //Only proteome is left. It isn't redundant with any other proteome
+        proteomeList[lastProteomeKey].repProtKey = proteomeList[lastProteomeKey].proteomeKey;
+        proteomeList[lastProteomeKey].protSimScore = 1.0;
+        proteomeList[lastProteomeKey].relativeSimScore = 1.0;
+        return false;
+    }else{
+        return true;
+    }
+}
+
+void runAlignmentForCluster(const ClusterEntry& clusterRep, unsigned int RepProteomeId, DBReader<unsigned int>& tProteinDB, Matcher& matcher, Sequence& query, Sequence& target, std::vector<ProteomeEntry>& proteomeList, Parameters& par, unsigned int thread_idx, int swMode, std::vector<float>& localSeqIds, std::vector<unsigned int>& localMatchCount, DBWriter& proteinClustWriter) {
+    char buffer[1024]; 
+    bool isRepFound = false;
+    unsigned int lastqLen = 0;
+    unsigned int qproteinKey = 0;
+    unsigned int qproteomeKey =0;
+    //find representative query protein which has the longest sequence length
+    for (auto& eachMember : clusterRep.memberProteins){
+        if (eachMember.proteomeKey == RepProteomeId) {
+            isRepFound = true;
+            const unsigned int queryId = eachMember.proteinKey;
+            if (lastqLen < tProteinDB.getSeqLen(queryId)) {
+                lastqLen = tProteinDB.getSeqLen(queryId);
+                qproteinKey = eachMember.proteinKey;
+                qproteomeKey = eachMember.proteomeKey;
+            }
+        }
+    }
+    if (isRepFound){
+        proteinClustWriter.writeStart(thread_idx);
+        const unsigned int queryId = qproteinKey;
+        const unsigned int queryKey = tProteinDB.getDbKey(queryId);
+        char* querySeq = tProteinDB.getData(queryId, thread_idx); 
+        query.mapSequence(queryId, queryKey, querySeq, tProteinDB.getSeqLen(queryId));
+        matcher.initQuery(&query);
+        const unsigned int queryProteomeLength = proteomeList[qproteomeKey].proteomeAALen;
+
+        // Alignment by representative protein itself : same query and target (for createtsv)
+        Matcher::result_t rep_result = matcher.getSWResult(&query, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, true);
+        size_t rep_len = Matcher::resultToBuffer(buffer, rep_result, par.addBacktrace);
+        proteinClustWriter.writeAdd(buffer, rep_len, thread_idx);
+        localSeqIds[qproteomeKey] += rep_result.getSeqId()*query.L;
+        localMatchCount[qproteomeKey] += static_cast<unsigned int> (rep_result.getSeqId() * static_cast<float> (rep_result.getAlnLength()) + 0.5);
+
+
+        //Alignment by representative protein and other proteins in the cluster
+        for (auto& eachTargetMember : clusterRep.memberProteins){
+            if (eachTargetMember.proteomeKey == RepProteomeId) {
+                continue;
+            }
+            // if query Proteome's length < target Proteome's length * proteomeSimThr, skip
+            const unsigned int targetProteomeLength = proteomeList[eachTargetMember.proteomeKey].proteomeAALen;
+            if (queryProteomeLength < targetProteomeLength * par.proteomeSimThr) {
+                continue;
+            }
+            const unsigned int targetId = eachTargetMember.proteinKey;
+            const unsigned int targetKey = tProteinDB.getDbKey(targetId);
+            unsigned int tproteomeKey = eachTargetMember.proteomeKey;
+
+            char* targetSeq = tProteinDB.getData(targetId, thread_idx);
+            target.mapSequence(targetId, targetKey, targetSeq, tProteinDB.getSeqLen(targetId));
+
+            if (Util::canBeCovered(par.covThr, par.covMode, query.L, target.L) == false) {
+                continue;
+            }
+
+            const bool isIdentity = (queryId == targetId && par.includeIdentity) ? true : false;
+            Matcher::result_t result = matcher.getSWResult(&target, INT_MAX, false, par.covMode, par.covThr, par.evalThr, swMode, par.seqIdMode, isIdentity);
+
+            if (Alignment::checkCriteria(result, isIdentity, par.evalThr, par.seqIdThr, par.alnLenThr, par.covMode, par.covThr)) {
+                if (query.L >= target.L*0.9) { // -s2 lenght difference parameter in cd-hit-2d
+                    size_t len = Matcher::resultToBuffer(buffer, result, par.addBacktrace);
+                    proteinClustWriter.writeAdd(buffer, len, thread_idx);
+                    localSeqIds[tproteomeKey] += result.getSeqId() * target.L;
+                    unsigned int matchCount = static_cast<unsigned int> ((result.getSeqId() * static_cast<float> (result.getAlnLength())) + 0.5);
+                    localMatchCount[tproteomeKey] += matchCount;
+                }
+            }
+        }
+        proteinClustWriter.writeEnd(queryKey, thread_idx);
+    }
+}
+
+void updateProteomeRedundancyInfo(std::vector<ProteomeEntry>& proteomeList, const unsigned int& RepProteomeId, Parameters& par) {
+    const unsigned int& repProteomeAASize = proteomeList[RepProteomeId].proteomeAALen;
+    for (size_t i = 0; i < proteomeList.size(); i++) {
+        if (proteomeList[i].isCovered() == false) {
+            if (i == RepProteomeId){
+                proteomeList[i].repProtKey = RepProteomeId;
+                proteomeList[i].protSimScore = 1.0;
+                proteomeList[i].relativeSimScore = 1.0;
+            }else{
+                proteomeList[i].computeRedundancy(repProteomeAASize);
+                if (proteomeList[i].getProtSimScore() >= par.proteomeSimThr && proteomeList[i].getrelativeSimScore() >= par.proteomeRelativeSimThr){
+                    proteomeList[i].repProtKey = RepProteomeId;
+                }else{
+                    proteomeList[i].resetProteomeInfo();
+                }
+            }
+        }
+    }
+    
+}
+
+void updateClusterReps(ClusterEntry& clusterRep, std::vector<ProteomeEntry>& proteomeList, std::vector<unsigned int>& localProteomeCount){
+    std::vector<uint8_t> isProteomeInCluster(localProteomeCount.size(), 0);
+    if (clusterRep.isAvailable) {
+        unsigned int notCoveredMemberCount = 0;
+        unsigned int uniqueProteomeCount = 0;
+        //update cluster member info
+        for (auto& eachMember : clusterRep.memberProteins) {
+            if (proteomeList[eachMember.proteomeKey].isCovered() == false) {
+                notCoveredMemberCount++; 
+                if (isProteomeInCluster[eachMember.proteomeKey] == 0) {
+                    isProteomeInCluster[eachMember.proteomeKey] = 1;
+                    uniqueProteomeCount++;
+                }
+            }
+        }
+        if (notCoveredMemberCount <= 1 || uniqueProteomeCount <= 1 ) { 
+            //notCoveredMemberCount : All members(genes)' proteome are covere(0) or only one member(gene) is not covered(1)
+            //uniqueProteomeCount : Remaining members(genes) are from same(one) proteome
+            clusterRep.resetClusterInfo(); //set clusterRep.isAvailable = false;
+            return;
+        }
+        
+    }
+
+    //update localProteomeCount -> update clusterCount for next repProteome finding
+    if (clusterRep.isAvailable) {
+        for (size_t i=0; i < localProteomeCount.size(); i++) {
+            if (isProteomeInCluster[i]) {
+                localProteomeCount[i]++;
+            }
+        }
+    }
+
+}
+
+
+void writeProteomeClusters(DBWriter &proteomeClustWriter, std::vector<ProteomeEntry> &proteomeList) {
+    std::vector<size_t> proteomeClusterIdx;
+    char proteomeBuffer[1024];
+    int repProtIdCluster = proteomeList[0].getRepProtKey();
+
+    for (size_t idx = 0; idx < proteomeList.size(); idx++) {
+        int repProtId = proteomeList[idx].getRepProtKey();
+        if (repProtIdCluster != repProtId) {
+            proteomeClustWriter.writeStart();
+            for (auto &eachIdx : proteomeClusterIdx) {
+                char *basePos = proteomeBuffer;
+                char *tmpProteomeBuffer = Itoa::i32toa_sse2(proteomeList[eachIdx].getProteomeKey(), proteomeBuffer);
+                *(tmpProteomeBuffer - 1) = '\t';
+                tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer);
+                *(tmpProteomeBuffer - 1) = '\t';
+                tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getrelativeSimScore(), tmpProteomeBuffer);
+                *(tmpProteomeBuffer - 1) = '\n';
+                proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos);
+            }
+            proteomeClustWriter.writeEnd(repProtIdCluster);
+            // Reset
+            repProtIdCluster = repProtId;
+            proteomeClusterIdx.clear();
+            proteomeClusterIdx.push_back(idx);
+        } else {
+            proteomeClusterIdx.push_back(idx);
+        }
+
+        if (idx == proteomeList.size() - 1) {
+            proteomeClustWriter.writeStart();
+            for (auto &eachIdx : proteomeClusterIdx) {
+                char *basePos = proteomeBuffer;
+                char *tmpProteomeBuffer = Itoa::i32toa_sse2(proteomeList[eachIdx].getProteomeKey(), proteomeBuffer);
+                *(tmpProteomeBuffer - 1) = '\t';
+                tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getProtSimScore(), tmpProteomeBuffer);
+                *(tmpProteomeBuffer - 1) = '\t';
+                tmpProteomeBuffer = Util::fastSeqIdToBuffer(proteomeList[eachIdx].getrelativeSimScore(), tmpProteomeBuffer);
+                *(tmpProteomeBuffer - 1) = '\n';
+                proteomeClustWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos);
+            }
+            proteomeClustWriter.writeEnd(repProtIdCluster);
+            proteomeClusterIdx.clear();
+        }
+    }
+}
+
+static char* fastfloatToBuffer(float value, char* buffer) {
+    value *= 100;
+    int integerPart = (int)value;  
+    buffer = Itoa::i32toa_sse2(integerPart, buffer);  
+    *(buffer-1) = '.';
+    float fractionalPart = (value - integerPart) * 100;
+    int fractionalInt = (int)fractionalPart;  
+    if (fractionalInt < 10) {
+        *(buffer) = '0'; 
+        buffer++;
+    }
+    buffer = Itoa::i32toa_sse2(fractionalInt, buffer);  
+
+    *(buffer-1) = '%';
+    buffer++;
+    return buffer;
+}
+
+int proteomecluster(int argc, const char **argv, const Command &command){
+    //Initialize parameters
+    Parameters &par = Parameters::getInstance();
+    par.overrideParameterDescription(par.PARAM_ALIGNMENT_MODE, "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id", NULL, 0);
+    par.parseParameters(argc, argv, command, true, 0, 0);
+
+    if (par.alignmentMode == Parameters::ALIGNMENT_MODE_UNGAPPED) {
+        Debug(Debug::ERROR) << "Use rescorediagonal for ungapped alignment mode.\n";
+        EXIT(EXIT_FAILURE);
+    }
+    if (par.addBacktrace == true) {
+        par.alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
+    }
+    
+    unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr);
+
+    //Open the target protein database
+    DBReader<unsigned int> tProteinDB(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_LOOKUP|DBReader<unsigned int>::USE_SOURCE);
+    tProteinDB.open(DBReader<unsigned int>::NOSORT);
+    const int tProteinSeqType = tProteinDB.getDbtype();
+
+    if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) {
+        tProteinDB.readMmapedDataInMemory();
+    }
+    //Open lookup table to get the source of the protein and the protein length
+    DBReader<unsigned int>::LookupEntry* tLookup = tProteinDB.getLookup();
+    const size_t tLookupSize = tProteinDB.getLookupSize();
+    unsigned int totalProteomeNumber = tLookup[tLookupSize - 1].fileNumber;
+    std::vector<ProteomeEntry> proteomeList(totalProteomeNumber + 1);
+    calculateProteomeLength(proteomeList, tLookup, tLookupSize, tProteinDB);
+
+    //Open the linclust result
+    DBReader<unsigned int> linResDB(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
+    linResDB.open(DBReader<unsigned int>::LINEAR_ACCCESS);
+
+    //Open the DBWriter
+    DBWriter proteinClustWriter(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_GENERIC_DB);
+    proteinClustWriter.open();
+    int proteomeDBType = DBReader<unsigned int>::setExtendedDbtype(Parameters::DBTYPE_GENERIC_DB, Parameters::DBTYPE_EXTENDED_SRC_IDENTIFIER);
+    DBWriter proteomeClustWriter(par.db4.c_str(), par.db4Index.c_str(), 1, par.compressed, proteomeDBType);
+    proteomeClustWriter.open();
+    DBWriter clusterCountWriter(par.db5.c_str(), par.db5Index.c_str(), 1, par.compressed, proteomeDBType);
+    clusterCountWriter.open();
+
+    std::vector<ClusterEntry> clusterReps; 
+
+    int gapOpen, gapExtend;
+    // BaseMatrix *subMat;
+    SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias);
+    gapOpen = par.gapOpen.values.aminoacid();
+    gapExtend = par.gapExtend.values.aminoacid();
+    EvalueComputation evaluer(tProteinDB.getAminoAcidDBSize(), &subMat, gapOpen, gapExtend);
+    Debug(Debug::INFO) << "Initialization ";
+    Timer timer;
+
+    #pragma omp parallel
+    {
+        unsigned int thread_idx = 0;
+    #ifdef OPENMP
+        thread_idx = (unsigned int) omp_get_thread_num();
+    #endif    
+        std::vector<ClusterEntry> localClusterReps;
+        std::vector<unsigned int> localProteomeCount(proteomeList.size(), 0);
+        #pragma omp for schedule(dynamic, 1)
+        for (size_t id = 0; id < linResDB.getSize(); id++) {
+            initLocalClusterReps(id, localClusterReps, localProteomeCount, tProteinDB, linResDB, thread_idx);
+        }
+
+        for (size_t idx = 0; idx < localProteomeCount.size(); idx++) {
+        __sync_fetch_and_add(&proteomeList[idx].clusterCount, localProteomeCount[idx]);
+        }
+
+        #pragma omp critical
+        {
+            clusterReps.insert(clusterReps.end(),
+                               std::make_move_iterator(localClusterReps.begin()),
+                               std::make_move_iterator(localClusterReps.end()));
+        }
+        
+    }
+    Debug(Debug::INFO) << timer.lap() << "\n";
+
+    unsigned int totalClusterCount = clusterReps.size();
+    Debug(Debug::INFO) << "Total Cluster Count : " << totalClusterCount << "\n";
+    for (size_t idx=0; idx < proteomeList.size(); idx++){
+        float clusterCountRatio = static_cast<float> (proteomeList[idx].clusterCount) / static_cast<float> (totalClusterCount);
+
+        char proteomeBuffer[1024];
+        clusterCountWriter.writeStart();
+        char *basePos = proteomeBuffer;
+        char *tmpProteomeBuffer = Itoa::i32toa_sse2(proteomeList[idx].clusterCount, proteomeBuffer);
+        *(tmpProteomeBuffer - 1) = '\t';
+        tmpProteomeBuffer = fastfloatToBuffer(clusterCountRatio, tmpProteomeBuffer);
+        *(tmpProteomeBuffer - 1) = '\n';
+        clusterCountWriter.writeAdd(proteomeBuffer, tmpProteomeBuffer - basePos);
+        clusterCountWriter.writeEnd(proteomeList[idx].proteomeKey);
+    }
+
+    unsigned int RepProteomeId = -1;
+    Debug(Debug::INFO) << "Proteome Clustering ";
+    timer.reset();
+    while (FindNextRepProteome(proteomeList, RepProteomeId)) {
+        #pragma omp parallel
+        {
+            unsigned int thread_idx = 0;
+        #ifdef OPENMP
+            thread_idx = (unsigned int) omp_get_thread_num();
+        #endif  
+            Matcher matcher(tProteinSeqType, tProteinSeqType, par.maxSeqLen, &subMat, &evaluer, par.compBiasCorrection, par.compBiasCorrectionScale, gapOpen, gapExtend, 0.0, par.zdrop);
+            Sequence query(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection);
+            Sequence target(par.maxSeqLen, tProteinSeqType, &subMat, 0, false, par.compBiasCorrection);
+            std::vector <float> localSeqIds(proteomeList.size(), 0.0f);
+            std::vector <unsigned int> localMatchCount(proteomeList.size(), 0);
+            #pragma omp for schedule(dynamic, 1)
+            for (size_t i = 0; i < clusterReps.size(); i++) {
+                if (clusterReps[i].isAvailable) {
+                    runAlignmentForCluster(clusterReps[i], RepProteomeId, tProteinDB, matcher, query, target, proteomeList, par, thread_idx, swMode, localSeqIds, localMatchCount, proteinClustWriter);
+                }
+                
+            }
+            #pragma omp critical
+            {
+                for (size_t idx = 0; idx < proteomeList.size(); idx++) {
+                    if (proteomeList[idx].isCovered() == false) {
+                        proteomeList[idx].addSeqId(localSeqIds[idx]);
+                        proteomeList[idx].AAMatchCount += localMatchCount[idx];
+                    }
+                }
+            }
+        }
+
+        // Debug(Debug::INFO) << "2. Update ProteomeDB. Calculate the similarity score and check redundancy | Rep Proteome id : " << RepProteomeId << "\n";
+        updateProteomeRedundancyInfo(proteomeList, RepProteomeId, par);
+
+        // Debug(Debug::INFO) << "3. Re-Setup Proteome and ClusterReps\n";
+        #pragma omp parallel
+        {
+            std::vector<unsigned int> localProteomeCount(proteomeList.size(), 0);
+            #pragma omp for schedule(dynamic, 1)
+            for (size_t i = 0; i < clusterReps.size(); i++) {
+                updateClusterReps(clusterReps[i], proteomeList, localProteomeCount);
+            }
+
+            for (size_t i = 0; i < proteomeList.size(); i++) {
+                __sync_fetch_and_add(&proteomeList[i].clusterCount, localProteomeCount[i]);
+            }
+        }
+    }
+    Debug(Debug::INFO) << timer.lap() << "\n";
+
+    //sort proteomeList by repProtKey
+    SORT_PARALLEL(proteomeList.begin(), proteomeList.end(), ProteomeEntry::compareByKey);
+    //write _proteome_cluster
+    writeProteomeClusters(proteomeClustWriter, proteomeList);
+
+    proteomeClustWriter.close();
+    proteinClustWriter.close();
+    clusterCountWriter.close();
+    tProteinDB.close();
+    linResDB.close();
+    return EXIT_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/workflow/CMakeLists.txt b/src/workflow/CMakeLists.txt
index ce266e9c7..5aa4e0bd9 100644
--- a/src/workflow/CMakeLists.txt
+++ b/src/workflow/CMakeLists.txt
@@ -14,6 +14,7 @@ set(workflow_source_files
         workflow/Search.cpp
         workflow/Taxonomy.cpp
         workflow/EasyTaxonomy.cpp
+        workflow/EasyProteomecluster.cpp
         workflow/CreateIndex.cpp
         PARENT_SCOPE
         )
diff --git a/src/workflow/EasyLinclust.cpp b/src/workflow/EasyLinclust.cpp
index 375ff1610..181968a8b 100644
--- a/src/workflow/EasyLinclust.cpp
+++ b/src/workflow/EasyLinclust.cpp
@@ -28,7 +28,6 @@ void setEasyLinclustMustPassAlong(Parameters *p) {
     p->PARAM_REMOVE_TMP_FILES.wasSet = true;
     p->PARAM_C.wasSet = true;
     p->PARAM_E.wasSet = true;
-    //p->PARAM_ALIGNMENT_MODE.wasSet = true;
     p->PARAM_ORF_START_MODE.wasSet = true;
     p->PARAM_ORF_MIN_LENGTH.wasSet = true;
     p->PARAM_ORF_MAX_LENGTH.wasSet = true;
diff --git a/src/workflow/EasyProteomecluster.cpp b/src/workflow/EasyProteomecluster.cpp
new file mode 100644
index 000000000..c8e60e18e
--- /dev/null
+++ b/src/workflow/EasyProteomecluster.cpp
@@ -0,0 +1,55 @@
+#include <cassert>
+
+#include "FileUtil.h"
+#include "CommandCaller.h"
+#include "Util.h"
+#include "Debug.h"
+#include "Parameters.h"
+
+#include "easyproteomecluster.sh.h"
+
+void setEasyproteomeclusterDefaults(Parameters *p) {
+    p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
+    p->proteomeSimThr = 0.9;
+    p->proteomeRelativeSimThr = 0.0;
+}
+
+void setEasyproteomeclusterMustPassAlong(Parameters *p){
+    p->PARAM_REMOVE_TMP_FILES.wasSet = true;
+    p->PARAM_ALIGNMENT_MODE.wasSet = true;
+
+}
+
+int easyproteomecluster(int argc, const char **argv, const Command &command) {
+    Parameters &par = Parameters::getInstance();
+    setEasyproteomeclusterDefaults(&par);
+    par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
+    setEasyproteomeclusterMustPassAlong(&par);
+    std::string tmpDir = par.filenames.back();
+    std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params));
+    if (par.reuseLatest) {
+        hash = FileUtil::getHashFromSymLink(tmpDir + "/latest");
+    }
+    tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash);
+    par.filenames.pop_back();
+
+    CommandCaller cmd;
+    cmd.addVariable("TMP_PATH", tmpDir.c_str());
+    cmd.addVariable("RESULTS", par.filenames.back().c_str());
+    par.filenames.pop_back();
+    cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
+
+    cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.createdb).c_str());
+    cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.clusterworkflow, true).c_str());
+    cmd.addVariable("PROTEOMECLUSTER_PAR", par.createParameterString(par.proteomecluster).c_str());
+    cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str());
+    cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str());
+
+    std::string program = tmpDir + "/easyproteomecluster.sh";
+    FileUtil::writeFile(program, easyproteomecluster_sh, easyproteomecluster_sh_len);
+    cmd.execProgram(program.c_str(), par.filenames);
+    
+    // Should never get here
+    assert(false);
+    return 0;
+}