alignproteome added #875

Open · wants to merge 33 commits into base: master
Changes from all commits (33 commits)
1cb2074
alignproteome added
Gyuuul2 Aug 16, 2024
12c0c81
add workflow, revise based on feedback
Gyuuul2 Aug 19, 2024
df80f07
Change createdb.cpp so that it takes in ".txt" file containing paths …
elpis51613 Aug 25, 2024
1073ec9
Merge remote-tracking branch 'refs/remotes/origin/master'
elpis51613 Aug 25, 2024
3617ea7
Change filepath format from txt to tsv. Also add explanations of its …
elpis51613 Aug 26, 2024
3fc7fcd
createtsv - handle source identifier
Gyuuul2 Aug 27, 2024
0487a1f
proteomcluster - change writer / way to select repProteome
Gyuuul2 Aug 27, 2024
3f4fc91
easy proteomecluster workflow
Gyuuul2 Aug 27, 2024
eeabad3
indexreader - handle source identifier
Gyuuul2 Aug 27, 2024
9996821
update mmseqsbase and change module name into proteomecluster
Gyuuul2 Aug 27, 2024
27d9f68
set alignment mode default
Gyuuul2 Aug 28, 2024
e2ef229
latest
Gyuuul2 Aug 28, 2024
fd1f5d6
original index
Gyuuul2 Aug 28, 2024
0d7db0f
use sync_fetch_and_add when increment memcount
Gyuuul2 Aug 28, 2024
1fe489e
change submat
Gyuuul2 Aug 28, 2024
86f6b15
add timer
Gyuuul2 Aug 28, 2024
8344e5b
Merge branch 'pr-879'
Gyuuul2 Aug 28, 2024
be5a53d
Remove __packed__ to resolve sync_fetch_and_add error
Gyuuul2 Aug 28, 2024
8d3a38f
change chmod in createdb
Gyuuul2 Aug 29, 2024
bfa1911
update proteome-similarity threshold parameter
Gyuuul2 Aug 29, 2024
7e912d2
Sort redundant proteomes by similarity score in proteome_cluster.tsv
Gyuuul2 Sep 8, 2024
618e2bf
delete redundant code - checking proteome is singleton(no redudant me…
Gyuuul2 Sep 9, 2024
e990773
stop
Gyuuul2 Sep 9, 2024
e9ffe22
jaccardindexTrial
Gyuuul2 Sep 8, 2024
d14217f
add normalized scoring
Gyuuul2 Sep 9, 2024
5e6ee6d
add proteome relative(normalized) similarity threshold parameter
Gyuuul2 Sep 9, 2024
743370d
relative similarity threshold
Gyuuul2 Sep 9, 2024
44a6fca
latest version-bugfix,relativeProteomeSimilarity added, singleton pro…
Gyuuul2 Sep 9, 2024
910bb7f
Change Relative Score formula
Gyuuul2 Sep 10, 2024
e24b9e3
Remove Debug printout
Gyuuul2 Sep 10, 2024
ad1acdf
style change
Gyuuul2 Sep 10, 2024
e026260
Latest Version
Gyuuul2 Sep 10, 2024
d9213ef
Add cluster count report
Gyuuul2 Sep 19, 2024
1 change: 1 addition & 0 deletions data/workflow/CMakeLists.txt
@@ -3,6 +3,7 @@ set(GENERATED_WORKFLOWS
workflow/easycluster.sh
workflow/easytaxonomy.sh
workflow/easyrbh.sh
workflow/easyproteomecluster.sh
workflow/blastp.sh
workflow/blastpgp.sh
workflow/map.sh
70 changes: 70 additions & 0 deletions data/workflow/easyproteomecluster.sh
@@ -0,0 +1,70 @@
#!/bin/sh -e
fail() {
echo "Error: $1"
exit 1
}

notExists() {
[ ! -f "$1" ]
}


if notExists "${TMP_PATH}/input.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" createdb "$@" "${TMP_PATH}/input" ${CREATEDB_PAR} \
|| fail "query createdb died"
fi

if notExists "${TMP_PATH}/clu.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" linclust "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/clu_tmp" ${CLUSTER_PAR} \
|| fail "Search died"
fi

if notExists "${RESULTS}_protein_cluster.tsv"; then
# shellcheck disable=SC2086
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/clu" "${RESULTS}_protein_cluster.tsv" ${THREADS_PAR} \
|| fail "createtsv protein cluster died"
fi

if notExists "${TMP_PATH}/aln.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" "${TMP_PATH}/cluster_count" ${PROTEOMECLUSTER_PAR} \
|| fail "Convert Alignments died"
fi

if notExists "${RESULTS}_protein_align.tsv"; then
# shellcheck disable=SC2086
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_protein" "${RESULTS}_protein_align.tsv" ${THREADS_PAR} \
|| fail "createtsv protein align died"
fi

if notExists "${RESULTS}_proteome_cluster.tsv"; then
# shellcheck disable=SC2086
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome" "${RESULTS}_proteome_cluster.tsv" ${THREADS_PAR} \
|| fail "createtsv proteome cluster died"
fi

if notExists "${RESULTS}_cluster_count.tsv"; then
# shellcheck disable=SC2086
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/cluster_count" "${RESULTS}_cluster_count.tsv" ${THREADS_PAR} \
|| fail "createtsv proteome cluster count report died"
fi


if [ -n "${REMOVE_TMP}" ]; then
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR}
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR}
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/clu" ${VERBOSITY_PAR}
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY_PAR}
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/aln_protein" ${VERBOSITY_PAR}
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/aln_proteome" ${VERBOSITY_PAR}
rm -rf "${TMP_PATH}/clu_tmp"
rm -f "${TMP_PATH}/easyproteomecluster.sh"
fi
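For reference, a minimal run of the workflow above through its easy-proteomecluster entry point might look like this; paths and thresholds are illustrative, taken from the help text added in MMseqsBase.cpp below:

# cluster proteomes from one or more FASTA inputs, writing results under the "result" prefix
mmseqs easy-proteomecluster examples/DB.fasta result tmp --proteome-similarity 0.9 --proteome-relative-similarity 0.9
# outputs produced by the script above:
#   result_protein_cluster.tsv    protein-level linclust assignments
#   result_protein_align.tsv      protein alignment list
#   result_proteome_cluster.tsv   proteome redundancy clusters, redundant members sorted by similarity
#   result_cluster_count.tsv      per-proteome cluster count report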
2 changes: 2 additions & 0 deletions src/CommandDeclarations.h
@@ -4,6 +4,7 @@

extern int align(int argc, const char **argv, const Command& command);
extern int alignall(int argc, const char **argv, const Command& command);
extern int proteomecluster(int argc, const char **argv, const Command& command);
extern int alignbykmer(int argc, const char **argv, const Command& command);
extern int appenddbtoindex(int argc, const char **argv, const Command& command);
extern int apply(int argc, const char **argv, const Command& command);
@@ -43,6 +44,7 @@ extern int diffseqdbs(int argc, const char **argv, const Command& command);
extern int easycluster(int argc, const char **argv, const Command& command);
extern int easyrbh(int argc, const char **argv, const Command& command);
extern int easylinclust(int argc, const char **argv, const Command& command);
extern int easyproteomecluster(int argc, const char **argv, const Command& command);
extern int easysearch(int argc, const char **argv, const Command& command);
extern int easylinsearch(int argc, const char **argv, const Command& command);
extern int tsv2exprofiledb(int argc, const char **argv, const Command& command);
36 changes: 34 additions & 2 deletions src/MMseqsBase.cpp
@@ -11,6 +11,10 @@ std::vector<Command> baseCommands = {
"Sensitive homology search",
"# Search multiple FASTA against FASTA (like BLASTP, TBLASTN, BLASTX, BLASTN --search-type 3, TBLASTX --search-type 2)\n"
"mmseqs easy-search examples/QUERY.fasta examples/QUERY.fasta examples/DB.fasta result.m8 tmp\n\n"
"# Search multiple query fasta files against target fasta files using a tsv file containing filepaths\n"
"echo -e \"dir1/QUERY1.fasta\\ndir2/QUERY2.fasta\" > examples/queries.tsv\n"
"echo -e \"dir3/TARGET1.fasta\\ndir4/TARGET2.fasta\" > examples/targets.tsv\n"
"mmseqs easy-search examples/queries.tsv examples/targets.tsv result.m8 tmp\n\n"
"# Iterative profile search from stdin (like PSI-BLAST)\n"
"cat examples/QUERY.fasta | mmseqs easy-search stdin examples/DB.fasta result.m8 tmp --num-iterations 2\n\n"
"# Profile search against small databases (e.g. PFAM, eggNOG)\n"
@@ -79,6 +83,19 @@ std::vector<Command> baseCommands = {
CITATION_MMSEQS2|CITATION_LINCLUST, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
{"clusterPrefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
{"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
{"easy-proteomecluster", easyproteomecluster, &par.easyproteomeclusterworkflow, COMMAND_EASY,
"Proteome clustering and alignment",
"mmseqs easy-proteomecluster examples/DB.fasta result tmp\n\n"
"# ProteomeCluster output\n"
"# - protein_cluster.tsv: Protein linclust result\n"
"# - proteome_cluster.tsv: Proteome redundancy clustering result\n"
"# - protein_align.tsv: Protein alignment list\n"
"mmseqs easy-proteomecluster examples/DB.fasta result tmp --proteome-similarity 0.9 --proteome-relative-similarity 0.9\n",
"Martin Steinegger <[email protected]> & Gyuri Kim <[email protected]>",
"<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]> <o:clusterPrefix> <tmpDir>",
CITATION_MMSEQS2, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
{"outputReports", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
{"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
{"easy-taxonomy", easytaxonomy, &par.easytaxonomy, COMMAND_EASY,
"Taxonomic classification",
"# Assign taxonomic labels to FASTA sequences\n"
@@ -125,7 +142,10 @@ std::vector<Command> baseCommands = {
"# Create a seqDB from stdin\n"
"cat seq.fasta | mmseqs createdb stdin sequenceDB\n\n"
"# Create a seqDB by indexing existing FASTA/Q (for single line fasta entries only)\n"
"mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n",
"mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n\n"
"# Create a seqDB from a tsv file containing filepaths of multiple FASTA files in each line\n"
"echo -e \"dir1/bacteria.fasta\\ndir2/archea.fasta.gz\" > filepaths.tsv\n"
"mmseqs createdb filepaths.tsv sequenceDB\n",
"Martin Steinegger <[email protected]>",
"<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]>|<i:stdin> <o:sequenceDB>",
CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric },
@@ -632,6 +652,16 @@ std::vector<Command> baseCommands = {
{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
{"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
{"proteomecluster", proteomecluster, &par.proteomecluster, COMMAND_ALIGNMENT,
"Proteome clustering and alignment",
NULL,
"Martin Steinegger <[email protected]> & Gyuri Kim <[email protected]>",
"<i:sequenceDB> <i:clustresultDB> <o:proteinAlignmentDB> <o:proteomeAlignmentDB> <o:proteomeClusterCountReport",
CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"clustresultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
{"proteinAlignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
{"proteomeAlignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
{"proteomeClusterCountReport", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
{"alignall", alignall, &par.alignall, COMMAND_ALIGNMENT,
"Within-result all-vs-all gapped local alignment",
NULL,
@@ -640,6 +670,7 @@
CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
{"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},

{"transitivealign", transitivealign, &par.align, COMMAND_ALIGNMENT,
"Transfer alignments via transitivity",
//"Infer the alignment A->C via B, B being the center sequence and A,C each pairwise aligned against B",
@@ -1298,5 +1329,6 @@ std::vector<Command> baseCommands = {
NULL,
"",
"",
CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}}
CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},

};
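A sketch of the underlying module call registered above, following its usage string; the database names are placeholders and assume a sequence DB plus an existing protein clustering result:

# sequenceDB: created with createdb (optionally from a filepaths TSV)
# cluDB:      protein clustering of sequenceDB (e.g. from linclust)
mmseqs proteomecluster sequenceDB cluDB alnProteinDB alnProteomeDB clusterCountDB --proteome-similarity 0.9
# the output DBs can be flattened to TSV with createtsv, as done in easyproteomecluster.sh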
8 changes: 7 additions & 1 deletion src/alignment/Matcher.h
@@ -140,6 +140,13 @@ class Matcher{
}
}
}

float getSeqId(){
return seqId;
}
unsigned int getAlnLength(){
return alnLength;
}
};

Matcher(int querySeqType, int targetSeqType, int maxSeqLen, BaseMatrix *m,
@@ -217,7 +224,6 @@ class Matcher{


static size_t resultToBuffer(char * buffer, const result_t &result, bool addBacktrace, bool compress = true, bool addOrfPosition = false);

static int computeAlnLength(int anEnd, int start, int dbEnd, int dbStart);

static void updateResultByRescoringBacktrace(const char *querySeq, const char *targetSeq, const char **subMat, EvalueComputation &evaluer,
65 changes: 65 additions & 0 deletions src/commons/DBReader.cpp
@@ -112,6 +112,7 @@ template <typename T> bool DBReader<T>::open(int accessType){
}
totalDataSize = 0;
dataFileCnt = dataFileNames.size();

dataSizeOffset = new size_t[dataFileNames.size() + 1];
dataFiles = new char*[dataFileNames.size()];
for(size_t fileIdx = 0; fileIdx < dataFileNames.size(); fileIdx++){
@@ -155,6 +156,22 @@ template <typename T> bool DBReader<T>::open(int accessType){
}
lookupData.close();
}
if (dataMode & USE_SOURCE){
std::string sourceFilename = (std::string(dataFileName) + ".source");
MemoryMapped sourceData(sourceFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
if (sourceData.isValid() == false) {
Debug(Debug::ERROR) << "Cannot open source file " << sourceFilename << "!\n";
EXIT(EXIT_FAILURE);
}
char* sourceDataChar = (char *) sourceData.getData();
size_t sourceDataSize = sourceData.size();
sourceSize = Util::ompCountLines(sourceDataChar, sourceDataSize, threads);
source = new(std::nothrow) SourceEntry[this->sourceSize];
incrementMemory(sizeof(SourceEntry) * this->sourceSize);
readSource(sourceDataChar, sourceDataSize, source);

}

bool isSortedById = false;
if (externalData == false) {
MemoryMapped indexData(indexFileName, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
@@ -707,6 +724,24 @@ template <typename T> unsigned int DBReader<T>::getLookupFileNumber(size_t id){
return lookup[id].fileNumber;
}

template <typename T> std::string DBReader<T>::getSourceFileName (size_t id){
if (id >= sourceSize){
Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n";
Debug(Debug::ERROR) << "getSourceFileName: local id (" << id << ") >= db size (" << sourceSize << ")\n";
EXIT(EXIT_FAILURE);
}
return source[id].fileName;
}

template <typename T> T DBReader<T>::getSourceKey(size_t id){
if (id >= sourceSize){
Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n";
Debug(Debug::ERROR) << "getSource id: local id (" << id << ") >= db size (" << sourceSize << ")\n";
EXIT(EXIT_FAILURE);
}
return source[id].id;
}

template<>
void DBReader<unsigned int>::lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry) {
buffer.append(SSTR(entry.id));
@@ -1048,6 +1083,36 @@
currPos = lookupData - (char *) data;

i++;


}
}

template <typename T>
void DBReader<T>::readSource(char *data, size_t dataSize, DBReader::SourceEntry *source) {
size_t i=0;
size_t currPos = 0;
char* sourceData = (char *) data;
const char * cols[3];
while (currPos < dataSize){
if (i >= this->sourceSize) {
Debug(Debug::ERROR) << "Corrupt memory, too many entries!\n";
EXIT(EXIT_FAILURE);
}
Util::getFieldsOfLine(sourceData, cols, 3);
source[i].id = Util::fast_atoi<size_t>(cols[0]);
std::string fileName = std::string(cols[1], (cols[2] - cols[1]));
size_t lastDotPosition = fileName.rfind('.');

if (lastDotPosition != std::string::npos) {
fileName = fileName.substr(0, lastDotPosition);
}
source[i].fileName = fileName;
sourceData = Util::skipLine(sourceData);

currPos = sourceData - (char *) data;

i++;
}
}

12 changes: 11 additions & 1 deletion src/commons/DBReader.h
@@ -98,7 +98,10 @@ class DBReader : public MemoryTracker {
return false;
}
};

struct SourceEntry{
T id;
std::string fileName;
};
struct LookupEntry {
T id;
std::string entryName;
@@ -248,6 +251,8 @@ class DBReader : public MemoryTracker {
void lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry);
LookupEntry* getLookup() { return lookup; };

std::string getSourceFileName(size_t id);
T getSourceKey(size_t id);
static const int NOSORT = 0;
static const int SORT_BY_LENGTH = 1;
static const int LINEAR_ACCCESS = 2;
@@ -266,6 +271,7 @@ class DBReader : public MemoryTracker {
static const unsigned int USE_FREAD = 4;
static const unsigned int USE_LOOKUP = 8;
static const unsigned int USE_LOOKUP_REV = 16;
static const unsigned int USE_SOURCE = 32;


// compressed
@@ -309,6 +315,8 @@ class DBReader : public MemoryTracker {

void readLookup(char *data, size_t dataSize, LookupEntry *lookup);

void readSource(char *data, size_t dataSize, SourceEntry *source);

void readIndexId(T* id, char * line, const char** cols);

unsigned int indexIdToNum(T* id);
@@ -485,7 +493,9 @@ class DBReader : public MemoryTracker {

Index * index;
size_t lookupSize;
size_t sourceSize;
LookupEntry * lookup;
SourceEntry * source;
bool sortedByOffset;

unsigned int * id2local;
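For context, the DBReader changes above make USE_SOURCE load an optional "<dataFileName>.source" file (one line per input file, apparently "<numeric key><tab><file name>", with readSource stripping the file extension) into SourceEntry records. A minimal sketch of how a caller might use it; the constructor arguments and open/close pattern follow the existing DBReader API from memory and are assumptions, not part of this diff:

#include "DBReader.h"

// open a sequence DB together with its .source mapping (1 thread)
DBReader<unsigned int> reader("inputDB", "inputDB.index", 1,
    DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_SOURCE);
reader.open(DBReader<unsigned int>::NOSORT);
std::string name = reader.getSourceFileName(0); // source file name of entry 0, extension removed
unsigned int key = reader.getSourceKey(0);      // numeric key from the first column of .source
reader.close();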
1 change: 1 addition & 0 deletions src/commons/DBWriter.h
@@ -66,6 +66,7 @@ class DBWriter : public MemoryTracker {
}

static void sortIndex(const char *inFileNameIndex, const char *outFileNameIndex, const bool lexicographicOrder);

private:
size_t addToThreadBuffer(const void *data, size_t itmesize, size_t nitems, int threadIdx);
void writeThreadBuffer(unsigned int idx, size_t dataSize);
3 changes: 3 additions & 0 deletions src/commons/IndexReader.h
@@ -95,6 +95,8 @@ class IndexReader {
}else{
failSuffix = "_aln";
}
} else if (databaseType & SOURCE) {
failSuffix = "";
}
}
sequenceReader = new DBReader<unsigned int>(
@@ -115,6 +117,7 @@ class IndexReader {
static const unsigned int SRC_HEADERS = 4;
static const unsigned int SRC_SEQUENCES = 8;
static const unsigned int ALIGNMENTS = 16;
static const unsigned int SOURCE = 32;
static const unsigned int USER_SELECT = 1 << 31;

static unsigned int makeUserDatabaseType(unsigned int baseKey) {