Skip to content

Commit

Permalink
Add createclusearchdb, rm mkrepseqdb
Browse files Browse the repository at this point in the history
  • Loading branch information
martin-steinegger committed Jul 31, 2023
1 parent 80f8b0b commit 9ae4458
Show file tree
Hide file tree
Showing 7 changed files with 167 additions and 143 deletions.
2 changes: 1 addition & 1 deletion src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ extern int mergedbs(int argc, const char **argv, const Command& command);
extern int mergeresultsbyset(int argc, const char **argv, const Command &command);
extern int msa2profile(int argc, const char **argv, const Command& command);
extern int sequence2profile(int argc, const char **argv, const Command& command);
extern int mkrepseqdb(int argc, const char **argv, const Command& command);
extern int createclusearchdb(int argc, const char **argv, const Command& command);
extern int msa2result(int argc, const char **argv, const Command& command);
extern int multihitdb(int argc, const char **argv, const Command& command);
extern int multihitsearch(int argc, const char **argv, const Command& command);
Expand Down
4 changes: 2 additions & 2 deletions src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1245,8 +1245,8 @@ std::vector<Command> baseCommands = {
"Martin Steinegger <[email protected]>",
"<i:sequenceDB> ",
CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
{"mkrepseqdb", mkrepseqdb, &par.threadsandcompression, COMMAND_HIDDEN,
"Seperates a sequence DB into a representative and a non-representative DB",
{"createclusearchdb", createclusearchdb, &par.createclusearchdb, COMMAND_HIDDEN,
"Separates a sequence DB into a representative and a non-representative DB",
NULL,
"Martin Steinegger <[email protected]>",
"<i:sequenceDB> <i:resultDB> <o:sequenceDB>",
Expand Down
8 changes: 8 additions & 0 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ Parameters::Parameters():
PARAM_SEQUENCE_OVERLAP(PARAM_SEQUENCE_OVERLAP_ID, "--sequence-overlap", "Overlap between sequences", "Overlap between sequences", typeid(int), (void *) &sequenceOverlap, "^(0|[1-9]{1}[0-9]*)$"),
PARAM_SEQUENCE_SPLIT_MODE(PARAM_SEQUENCE_SPLIT_MODE_ID, "--sequence-split-mode", "Sequence split mode", "Sequence split mode 0: copy data, 1: soft link data and write new index,", typeid(int), (void *) &sequenceSplitMode, "^[0-1]{1}$"),
PARAM_HEADER_SPLIT_MODE(PARAM_HEADER_SPLIT_MODE_ID, "--headers-split-mode", "Header split mode", "Header split mode: 0: split position, 1: original header", typeid(int), (void *) &headerSplitMode, "^[0-1]{1}$"),
// createclusearchdb
PARAM_DB_SUFFIX_LIST(PARAM_DB_SUFFIX_LIST_ID, "--db-suffix-list", "Database suffixes", "Suffixes for database to be split in rep/seq", typeid(std::string), (void *) &dbSuffixList, ""),
// gff2db
PARAM_GFF_TYPE(PARAM_GFF_TYPE_ID, "--gff-type", "GFF type", "Comma separated list of feature types in the GFF file to select", typeid(std::string), (void *) &gffType, ""),
// translatenucs
Expand Down Expand Up @@ -322,6 +324,12 @@ Parameters::Parameters():
threadsandcompression.push_back(&PARAM_COMPRESSED);
threadsandcompression.push_back(&PARAM_V);
// createclusearchdb
createclusearchdb.push_back(&PARAM_THREADS);
createclusearchdb.push_back(&PARAM_COMPRESSED);
createclusearchdb.push_back(&PARAM_V);
createclusearchdb.push_back(&PARAM_DB_SUFFIX_LIST);
// alignall
alignall.push_back(&PARAM_SUB_MAT);
alignall.push_back(&PARAM_ADD_BACKTRACE);
Expand Down
7 changes: 7 additions & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -565,6 +565,9 @@ class Parameters {
// result2flat
bool useHeader;

// createclusearchdb
std::string dbSuffixList;

// gff2db
std::string gffType;

Expand Down Expand Up @@ -906,6 +909,9 @@ class Parameters {
PARAMETER(PARAM_SEQUENCE_SPLIT_MODE)
PARAMETER(PARAM_HEADER_SPLIT_MODE)

// createclusearchdb
PARAMETER(PARAM_DB_SUFFIX_LIST)

// gff2db
PARAMETER(PARAM_GFF_TYPE)

Expand Down Expand Up @@ -1118,6 +1124,7 @@ class Parameters {
std::vector<MMseqsParameter*> summarizeresult;
std::vector<MMseqsParameter*> summarizetabs;
std::vector<MMseqsParameter*> extractdomains;
std::vector<MMseqsParameter*> createclusearchdb;
std::vector<MMseqsParameter*> extractalignedregion;
std::vector<MMseqsParameter*> convertkb;
std::vector<MMseqsParameter*> tsv2db;
Expand Down
2 changes: 1 addition & 1 deletion src/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ set(util_source_files
util/mergeclusters.cpp
util/mergeresultsbyset.cpp
util/mergedbs.cpp
util/mkrepseqdb.cpp
util/createclusterdb.cpp
util/msa2profile.cpp
util/msa2result.cpp
util/nrtotaxmapping.cpp
Expand Down
148 changes: 148 additions & 0 deletions src/util/createclusterdb.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#include "DBReader.h"
#include "DBWriter.h"
#include "Debug.h"
#include "Util.h"
#include "FastSort.h"
#include "Parameters.h"

#include <climits>

#ifdef OPENMP
#include <omp.h>
#endif

int createclusearchdb(int argc, const char **argv, const Command& command) {
Parameters &par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_ALIGN);
DBReader<unsigned int> clusterReader(par.db2.c_str(), par.db2Index.c_str(), par.threads,
DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX);
clusterReader.open(DBReader<unsigned int>::NOSORT);
std::vector<std::string> suffixes = Util::split(par.dbSuffixList, ",");
suffixes.insert(suffixes.begin(), "");
for(size_t prefix = 0; prefix < suffixes.size(); prefix++) {
std::string db1 = par.db1 + suffixes[prefix];
std::string db1Index = par.db1 + suffixes[prefix] + ".index";
DBReader<unsigned int> reader(db1.c_str(), db1Index.c_str(), par.threads,
DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX);
reader.open(DBReader<unsigned int>::NOSORT);
reader.readMmapedDataInMemory();

std::string repDbSeq = par.db3 + suffixes[prefix];
std::string repDbSeqIdx = par.db3 + suffixes[prefix] + ".index";

DBWriter dbwRep(repDbSeq.c_str(), repDbSeqIdx.c_str(), static_cast<unsigned int>(par.threads), par.compressed,
reader.getDbtype());
dbwRep.open();
std::string seqsDbSeq = par.db3 + "_seq" + suffixes[prefix];
std::string seqsDbSeqIdx = par.db3 + "_seq" + suffixes[prefix] + ".index";
DBWriter dbwClu(seqsDbSeq.c_str(), seqsDbSeqIdx.c_str(), static_cast<unsigned int>(par.threads), par.compressed,
reader.getDbtype());
dbwClu.open();
Debug::Progress progress(clusterReader.getSize());
#pragma omp parallel
{
unsigned int thread_idx = 0;
#ifdef OPENMP
thread_idx = static_cast<unsigned int>(omp_get_thread_num());
#endif
std::string resultBuffer;
// write output file
#pragma omp for schedule(dynamic, 1)
for (size_t id = 0; id < clusterReader.getSize(); id++) {
progress.updateProgress();
char *data = clusterReader.getData(id, thread_idx);
size_t repKey = clusterReader.getDbKey(id);
size_t repDataId = reader.getId(repKey);
size_t repEntryLen = reader.getEntryLen(repDataId);
dbwRep.writeData(reader.getData(repDataId, thread_idx), repEntryLen - 1, repKey, thread_idx);
while (*data != '\0') {
// parse dbkey
size_t dbKey = Util::fast_atoi<unsigned int>(data);
if (dbKey == repKey) {
data = Util::skipLine(data);
continue;
}
size_t readerId = reader.getId(dbKey);
dbwClu.writeData(reader.getData(readerId, thread_idx),
reader.getEntryLen(readerId) - 1, dbKey, thread_idx);
data = Util::skipLine(data);
}
resultBuffer.clear();
}
}
dbwRep.close(true);
dbwClu.close(true);
reader.close();

// merge index
DBReader<unsigned int> dbrRep(repDbSeq.c_str(), repDbSeqIdx.c_str(), par.threads,
DBReader<unsigned int>::USE_INDEX);
dbrRep.open(DBReader<unsigned int>::NOSORT);
DBReader<unsigned int> dbrClu(seqsDbSeq.c_str(), seqsDbSeqIdx.c_str(), par.threads,
DBReader<unsigned int>::USE_INDEX);
dbrClu.open(DBReader<unsigned int>::NOSORT);
std::string seqsDbSeqIdxTmp = seqsDbSeqIdx + "_tmp";

FILE *sIndex = FileUtil::openAndDelete(seqsDbSeqIdxTmp.c_str(), "w");
std::vector<DBReader<unsigned int>::Index> allIndex(dbrClu.getSize() + dbrRep.getSize());
size_t dataSize = 0;
for (size_t i = 0; i < dbrRep.getSize(); i++) {
allIndex[i] = *dbrRep.getIndex(i);
dataSize += allIndex[i].length;
}
for (size_t i = 0; i < dbrClu.getSize(); i++) {
DBReader<unsigned int>::Index *index = dbrClu.getIndex(i);
index->offset += dataSize;
allIndex[dbrRep.getSize() + i] = *index;
}
SORT_PARALLEL(allIndex.begin(), allIndex.end(), DBReader<unsigned int>::Index::compareById);
char buffer[1024];
for (size_t i = 0; i < allIndex.size(); i++) {
size_t len = DBWriter::indexToBuffer(buffer, allIndex[i].id, allIndex[i].offset, allIndex[i].length);
size_t written = fwrite(buffer, sizeof(char), len, sIndex);
if (written != len) {
Debug(Debug::ERROR) << "Cannot write index file " << seqsDbSeqIdxTmp << "\n";
EXIT(EXIT_FAILURE);
}
}
if (fclose(sIndex) != 0) {
Debug(Debug::ERROR) << "Cannot close index file " << seqsDbSeqIdxTmp << "\n";
EXIT(EXIT_FAILURE);
}
FileUtil::move(seqsDbSeqIdxTmp.c_str(), seqsDbSeqIdx.c_str());
FileUtil::symlinkAlias(repDbSeq, seqsDbSeq + ".0");
FileUtil::move(seqsDbSeq.c_str(), (seqsDbSeq + ".1").c_str());
}
clusterReader.close();
DBReader<unsigned int>::copyDb(par.db2, par.db3 + "_clu");

struct DBSuffix {
DBFiles::Files flag;
const char *suffix;
};

const DBSuffix suffices[] = {
{DBFiles::HEADER, "_h"},
{DBFiles::HEADER_INDEX, "_h.index"},
{DBFiles::HEADER_DBTYPE, "_h.dbtype"},
{DBFiles::LOOKUP, ".lookup"},
{DBFiles::SOURCE, ".source"},
{DBFiles::TAX_MAPPING, "_mapping"},
{DBFiles::TAX_NAMES, "_names.dmp"},
{DBFiles::TAX_NODES, "_nodes.dmp"},
{DBFiles::TAX_MERGED, "_merged.dmp"},
{DBFiles::TAX_MERGED, "_taxonomy"},
};

for (size_t i = 0; i < ARRAY_SIZE(suffices); ++i) {
std::string file = par.db1 + suffices[i].suffix;
if (suffices[i].flag && FileUtil::fileExists(file.c_str())) {
DBReader<unsigned int>::copyDb(file, par.db3 + suffices[i].suffix);
}
}
for (size_t i = 0; i < ARRAY_SIZE(suffices); ++i) {
std::string file = par.db3 + suffices[i].suffix;
if (suffices[i].flag && FileUtil::fileExists(file.c_str())) {
DBReader<unsigned int>::aliasDb(file, par.db3 + "_seq" + suffices[i].suffix);
}
}
return EXIT_SUCCESS;
}
139 changes: 0 additions & 139 deletions src/util/mkrepseqdb.cpp

This file was deleted.

0 comments on commit 9ae4458

Please sign in to comment.