Skip to content

Commit

Permalink
Changes needed for cluster search
Browse files Browse the repository at this point in the history
  • Loading branch information
martin-steinegger committed Aug 7, 2023
1 parent eb01b5b commit ad6dfc6
Show file tree
Hide file tree
Showing 9 changed files with 129 additions and 21 deletions.
1 change: 1 addition & 0 deletions src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ extern int search(int argc, const char **argv, const Command& command);
extern int linsearch(int argc, const char **argv, const Command& command);
extern int sortresult(int argc, const char **argv, const Command& command);
extern int splitdb(int argc, const char **argv, const Command& command);
extern int setextendeddbtype(int argc, const char **argv, const Command& command);
extern int splitsequence(int argc, const char **argv, const Command& command);
extern int subtractdbs(int argc, const char **argv, const Command& command);
extern int suffixid(int argc, const char **argv, const Command& command);
Expand Down
9 changes: 7 additions & 2 deletions src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -793,8 +793,13 @@ std::vector<Command> baseCommands = {
CITATION_MMSEQS2, {{"resultDBLeft", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
{"resultDBRight", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
{"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},


{"setextendeddbtype", setextendeddbtype, &par.extendeddbtype, COMMAND_DB,
"Write an extended DB ",
"# Print entries with keys 1, 2 and 3 from a sequence DB to stdout\n"
"mmseqs setextendedbtype db --extended-dbtype 2\n",
"Martin Steinegger <[email protected]>",
"<i:DB>",
CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb }}},

{"view", view, &par.view, COMMAND_DB,
"Print DB entries given in --id-list to stdout",
Expand Down
24 changes: 22 additions & 2 deletions src/commons/IndexReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,28 @@ class IndexReader {
}

if (sequenceReader == NULL) {
if ((databaseType & USER_SELECT) == false && databaseType & (HEADERS | SRC_HEADERS)) {
failSuffix = "_h";
if((databaseType & USER_SELECT) == false && failSuffix == "" ){
if (databaseType & HEADERS) {
failSuffix = "_h";
} else if (databaseType & SRC_HEADERS) {
failSuffix = "_seq_h";
if(FileUtil::fileExists((dataName + "_seq_h.dbtype").c_str())==false){
failSuffix = "_h";
}
} else if (databaseType & SRC_SEQUENCES) {
failSuffix = "_seq";
if(FileUtil::fileExists((dataName + "_seq.dbtype").c_str())==false){
failSuffix = "";
}
} else if (databaseType & SEQUENCES) {
failSuffix = "";
} else if (databaseType & ALIGNMENTS) {
if(FileUtil::fileExists((dataName + "_clu.dbtype").c_str())){
failSuffix = "_clu";
}else{
failSuffix = "_aln";
}
}
}
sequenceReader = new DBReader<unsigned int>(
(dataName + failSuffix).c_str(), (dataName + failSuffix + ".index").c_str(),
Expand Down
11 changes: 10 additions & 1 deletion src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,8 @@ Parameters::Parameters():
// indexdb
PARAM_CHECK_COMPATIBLE(PARAM_CHECK_COMPATIBLE_ID, "--check-compatible", "Check compatible", "0: Always recreate index, 1: Check if recreating index is needed, 2: Fail if index is incompatible", typeid(int), (void *) &checkCompatible, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC),
PARAM_SEARCH_TYPE(PARAM_SEARCH_TYPE_ID, "--search-type", "Search type", "Search type 0: auto 1: amino acid, 2: translated, 3: nucleotide, 4: translated nucleotide alignment", typeid(int), (void *) &searchType, "^[0-4]{1}"),
PARAM_INDEX_SUBSET(PARAM_INDEX_SUBSET_ID, "--index-subset", "Index subset", "Create specialized index with subset of entries\n0: normal index\n1: index without headers\n2: index without prefiltering data\nFlags can be combined bit wise", typeid(int), (void *) &indexSubset, "^[0-3]{1}", MMseqsParameter::COMMAND_EXPERT),
PARAM_INDEX_SUBSET(PARAM_INDEX_SUBSET_ID, "--index-subset", "Index subset", "Create specialized index with subset of entries\n0: normal index\n1: index without headers\n2: index without prefiltering data\n4: index without aln (for cluster db)\nFlags can be combined bit wise", typeid(int), (void *) &indexSubset, "^[0-7]{1}", MMseqsParameter::COMMAND_EXPERT),
PARAM_INDEX_DBSUFFIX(PARAM_INDEX_DBSUFFIX_ID, "--index-dbsuffix", "Index dbsuffix", "A suffix of the db (used for cluster dbs)", typeid(std::string), (void *) &indexDbsuffix, "", MMseqsParameter::COMMAND_HIDDEN),
// createdb
PARAM_USE_HEADER(PARAM_USE_HEADER_ID, "--use-fasta-header", "Use fasta header", "Use the id parsed from the fasta header as the index key instead of using incrementing numeric identifiers", typeid(bool), (void *) &useHeader, ""),
PARAM_ID_OFFSET(PARAM_ID_OFFSET_ID, "--id-offset", "Offset of numeric ids", "Numeric ids in index file are offset by this value", typeid(int), (void *) &identifierOffset, "^(0|[1-9]{1}[0-9]*)$"),
Expand All @@ -197,6 +198,8 @@ Parameters::Parameters():
PARAM_SHUFFLE(PARAM_SHUFFLE_ID, "--shuffle", "Shuffle input database", "Shuffle input database", typeid(bool), (void *) &shuffleDatabase, ""),
PARAM_WRITE_LOOKUP(PARAM_WRITE_LOOKUP_ID, "--write-lookup", "Write lookup file", "write .lookup file containing mapping from internal id, fasta id and file number", typeid(int), (void *) &writeLookup, "^[0-1]{1}", MMseqsParameter::COMMAND_EXPERT),
PARAM_USE_HEADER_FILE(PARAM_USE_HEADER_FILE_ID, "--use-header-file", "Use header DB", "use the sequence header DB instead of the body to map the entry keys", typeid(bool), (void *) &useHeaderFile, ""),
// setextendeddbtype
PARAM_EXTENDED_DBTYPE(PARAM_EXTENDED_DBTYPE_ID, "--extended-dbtype", "Extended dbtype", "Set extended dbtype 1: compressed, 2: need src, 4: context pseudoe cnts", typeid(int), (void *) &extendedDbtype, "^[0-4]{1}"),
// splitsequence
PARAM_SEQUENCE_OVERLAP(PARAM_SEQUENCE_OVERLAP_ID, "--sequence-overlap", "Overlap between sequences", "Overlap between sequences", typeid(int), (void *) &sequenceOverlap, "^(0|[1-9]{1}[0-9]*)$"),
PARAM_SEQUENCE_SPLIT_MODE(PARAM_SEQUENCE_SPLIT_MODE_ID, "--sequence-split-mode", "Sequence split mode", "Sequence split mode 0: copy data, 1: soft link data and write new index,", typeid(int), (void *) &sequenceSplitMode, "^[0-1]{1}$"),
Expand Down Expand Up @@ -752,6 +755,7 @@ Parameters::Parameters():
indexdb.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
indexdb.push_back(&PARAM_MAX_SEQ_LEN);
indexdb.push_back(&PARAM_MAX_SEQS);
indexdb.push_back(&PARAM_INDEX_DBSUFFIX);
indexdb.push_back(&PARAM_MASK_RESIDUES);
indexdb.push_back(&PARAM_MASK_PROBABILTY);
indexdb.push_back(&PARAM_MASK_LOWER_CASE);
Expand Down Expand Up @@ -918,6 +922,10 @@ Parameters::Parameters():
subtractdbs.push_back(&PARAM_COMPRESSED);
subtractdbs.push_back(&PARAM_V);
// setextendeddbtype
extendeddbtype.push_back(&PARAM_EXTENDED_DBTYPE);
extendeddbtype.push_back(&PARAM_V);
// clusthash
clusthash.push_back(&PARAM_SUB_MAT);
clusthash.push_back(&PARAM_ALPH_SIZE);
Expand Down Expand Up @@ -2324,6 +2332,7 @@ void Parameters::setDefaults() {
checkCompatible = 0;
searchType = SEARCH_TYPE_AUTO;
indexSubset = INDEX_SUBSET_NORMAL;
indexDbsuffix = "";

// createdb
createdbMode = SEQUENCE_SPLIT_MODE_HARD;
Expand Down
11 changes: 11 additions & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,8 @@ class Parameters {
static const int INDEX_SUBSET_NORMAL = 0;
static const int INDEX_SUBSET_NO_HEADERS = 1;
static const int INDEX_SUBSET_NO_PREFILTER = 2;
static const int INDEX_SUBSET_NO_ALIGNMENT = 4;


static std::vector<int> getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders,
bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy);
Expand Down Expand Up @@ -546,6 +548,7 @@ class Parameters {
int checkCompatible;
int searchType;
int indexSubset;
std::string indexDbsuffix;

// createdb
int identifierOffset;
Expand Down Expand Up @@ -610,6 +613,9 @@ class Parameters {
float overlap;
int msaType;

// setextendeddbtype
int extendedDbtype;

// extractalignedregion
int extractMode;

Expand Down Expand Up @@ -892,6 +898,7 @@ class Parameters {
PARAMETER(PARAM_CHECK_COMPATIBLE)
PARAMETER(PARAM_SEARCH_TYPE)
PARAMETER(PARAM_INDEX_SUBSET)
PARAMETER(PARAM_INDEX_DBSUFFIX)

// createdb
PARAMETER(PARAM_USE_HEADER) // also used by extractorfs
Expand All @@ -904,6 +911,9 @@ class Parameters {
// convert2fasta
PARAMETER(PARAM_USE_HEADER_FILE)

// setextendedbtype
PARAMETER(PARAM_EXTENDED_DBTYPE)

// split sequence
PARAMETER(PARAM_SEQUENCE_OVERLAP)
PARAMETER(PARAM_SEQUENCE_SPLIT_MODE)
Expand Down Expand Up @@ -1116,6 +1126,7 @@ class Parameters {
std::vector<MMseqsParameter*> offsetalignment;
std::vector<MMseqsParameter*> proteinaln2nucl;
std::vector<MMseqsParameter*> subtractdbs;
std::vector<MMseqsParameter*> extendeddbtype;
std::vector<MMseqsParameter*> diff;
std::vector<MMseqsParameter*> concatdbs;
std::vector<MMseqsParameter*> mergedbs;
Expand Down
2 changes: 2 additions & 0 deletions src/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,10 @@ set(util_source_files
util/cpmvrmlndb.cpp
util/extractframes.cpp
util/sequence2profile.cpp
util/setextendeddbtype.cpp
util/sortresult.cpp
util/splitdb.cpp
util/setextendeddbtype.cpp
util/splitsequence.cpp
util/subtractdbs.cpp
util/summarizealis.cpp
Expand Down
43 changes: 33 additions & 10 deletions src/util/indexdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,18 +51,39 @@ int indexdb(int argc, const char **argv, const Command &command) {

const bool sameDB = (par.db1 == par.db2);
std::string alnDbtypeFile = par.db1 + "_aln.dbtype";
std::string seqDbtypeFile = par.db1 + "_seq.dbtype";
const bool ppDB = FileUtil::fileExists(alnDbtypeFile.c_str()) && FileUtil::fileExists(seqDbtypeFile.c_str());

std::string db2 = ppDB ? par.db1 + "_seq" : par.db2;
std::string db2Index = ppDB ? par.db1 + "_seq.index" : par.db2Index;
std::string alnFile = par.db1 + "_aln";
std::string alnIndexFile = par.db1 + "_aln.index";
if(FileUtil::fileExists(alnDbtypeFile.c_str()) == false){
alnDbtypeFile = par.db1 + "_clu.dbtype";
alnFile = par.db1 + "_clu";
alnIndexFile = par.db1 + "_clu.index";
}

std::string hdr1 = ppDB ? par.db1 + "_seq_h" : par.hdr1;
std::string hdr1Index = ppDB ? par.db1 + "_seq_h.index" : par.hdr1Index;

DBReader<unsigned int> dbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
dbr.open(DBReader<unsigned int>::NOSORT);

// remove par.indexDbsuffix from db1
std::string seqDb = par.db1 + "_seq";
std::string seqDbIndex = par.db1 + "_seq.index";
std::string seqDbtypeFile = par.db1 + "_seq.dbtype";
if (par.indexDbsuffix != "") {
std::string::size_type pos = par.db1.find(par.indexDbsuffix);
if (pos != std::string::npos) {
par.db1 = par.db1.substr(0, pos);
}
seqDb = par.db1 + "_seq" + par.indexDbsuffix;
seqDbIndex = par.db1 + "_seq" + par.indexDbsuffix + ".index";
seqDbtypeFile = par.db1 + "_seq" + par.indexDbsuffix + ".dbtype";
}
const bool ppDB = FileUtil::fileExists(alnDbtypeFile.c_str()) && FileUtil::fileExists(seqDbtypeFile.c_str());

std::string db2 = ppDB ? seqDb : par.db2;
std::string db2Index = ppDB ? seqDbIndex : par.db2Index;

std::string hdr1 = ppDB ? seqDb + "_h" : par.hdr1;
std::string hdr1Index = ppDB ? seqDb + "_h.index" : par.hdr1Index;

DBReader<unsigned int> *dbr2 = NULL;
if ((sameDB == false) || ppDB) {
dbr2 = new DBReader<unsigned int>(db2.c_str(), db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
Expand Down Expand Up @@ -102,7 +123,7 @@ int indexdb(int argc, const char **argv, const Command &command) {
// query seq type is actually unknown here, but if we pass DBTYPE_HMM_PROFILE then its +20 k-score
int kmerScore = Prefiltering::getKmerThreshold(par.sensitivity, isProfileSearch, contextPseudoCnts, par.kmerScore.values, par.kmerSize);

const std::string& baseDB = ppDB ? par.db1 : par.db2;
const std::string& baseDB = ppDB ? par.db1 + par.indexDbsuffix : par.db2;
std::string indexDB = PrefilteringIndexReader::indexName(baseDB);

int status = EXIT_SUCCESS;
Expand Down Expand Up @@ -150,8 +171,10 @@ int indexdb(int argc, const char **argv, const Command &command) {
}

DBReader<unsigned int> *alndbr = NULL;
if (ppDB == true) {
alndbr = new DBReader<unsigned int>((par.db1 + "_aln").c_str(), (par.db1 + "_aln.index").c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
const bool noAlignment = (par.indexSubset & Parameters::INDEX_SUBSET_NO_ALIGNMENT) != 0;
if (ppDB == true && noAlignment == false) {
alndbr = new DBReader<unsigned int>(alnFile.c_str(), alnIndexFile.c_str(),
par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
alndbr->open(DBReader<unsigned int>::NOSORT);
}

Expand Down
28 changes: 22 additions & 6 deletions src/util/mergeresultsbyset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "DBReader.h"
#include "DBWriter.h"
#include "Util.h"
#include "IndexReader.h"

#ifdef OPENMP
#include <omp.h>
Expand All @@ -14,10 +15,26 @@ int mergeresultsbyset(int argc, const char **argv, const Command &command) {
DBReader<unsigned int> setReader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
setReader.open(DBReader<unsigned int>::LINEAR_ACCCESS);

DBReader<unsigned int> resultReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
resultReader.open(DBReader<unsigned int>::NOSORT);
// DBReader<unsigned int> resultReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
// resultReader.open(DBReader<unsigned int>::NOSORT);

DBWriter dbw(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, resultReader.getDbtype());

const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP);
unsigned int databaseType = IndexReader::USER_SELECT;
int dbtype = FileUtil::parseDbType(par.db2.c_str());
if(Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_INDEX_DB)||
Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_AMINO_ACIDS)||
Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_NUCLEOTIDES)||
Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_HMM_PROFILE)){
databaseType = IndexReader::ALIGNMENTS;
}
IndexReader resultReader(par.db2, par.threads,
databaseType,
(touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0);

int dbType = resultReader.sequenceReader->getDbtype();
dbType = DBReader<unsigned int>::setExtendedDbtype(dbType, Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC);
DBWriter dbw(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, dbType);
dbw.open();
#pragma omp parallel
{
Expand All @@ -35,20 +52,19 @@ int mergeresultsbyset(int argc, const char **argv, const Command &command) {
while (*data != '\0'){
Util::parseKey(data, dbKey);
unsigned int key = Util::fast_atoi<unsigned int>(dbKey);
size_t id = resultReader.getId(key);
size_t id = resultReader.sequenceReader->getId(key);
if (id == UINT_MAX) {
Debug(Debug::ERROR) << "Invalid key " << key << " in entry " << i << ".\n";
EXIT(EXIT_FAILURE);
}
buffer.append(resultReader.getData(id, thread_idx));
buffer.append(resultReader.sequenceReader->getData(id, thread_idx));
data = Util::skipLine(data);
}
dbw.writeData(buffer.c_str(), buffer.length(), setReader.getDbKey(i), thread_idx);
buffer.clear();
}
}
dbw.close();
resultReader.close();
setReader.close();

return EXIT_SUCCESS;
Expand Down
21 changes: 21 additions & 0 deletions src/util/setextendeddbtype.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#include "Parameters.h"
#include "Util.h"
#include "Debug.h"
#include "DBReader.h"
#include "DBWriter.h"
#include "HeaderSummarizer.h"

#ifdef OPENMP
#include <omp.h>
#endif

int setextendeddbtype(int argc, const char **argv, const Command& command) {
Parameters& par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, 0, 0);
int dbtype = FileUtil::parseDbType(par.db1.c_str());
// check if dbtype uses isCompressed flag
bool isCompressed = (dbtype & (1 << 31));
dbtype = DBReader<unsigned int>::setExtendedDbtype(dbtype, par.extendedDbtype);
DBWriter::writeDbtypeFile(par.db1.c_str(), dbtype, isCompressed);
return EXIT_SUCCESS;
}

0 comments on commit ad6dfc6

Please sign in to comment.