diff --git a/src/global/Constants.h b/src/global/Constants.h index c4d825f00a..5a79575d82 100644 --- a/src/global/Constants.h +++ b/src/global/Constants.h @@ -1,7 +1,6 @@ -// Copyright 2023, University of Freiburg, +// Copyright 2023 - 2025, University of Freiburg, // Chair of Algorithms and Data Structures. -// -// Authors: Björn Buchhold +// Authors: Björn Buchhold [2014 - 2017] // Johannes Kalmbach // Hannah Bast @@ -22,6 +21,7 @@ using namespace ad_utility::memory_literals; constexpr inline ad_utility::MemorySize DEFAULT_MEMORY_LIMIT_INDEX_BUILDING = 5_GB; constexpr inline ad_utility::MemorySize STXXL_DISK_SIZE_INDEX_BUILDER = 1_GB; +constexpr inline ad_utility::MemorySize DEFAULT_PARSER_BUFFER_SIZE = 10_MB; constexpr inline ad_utility::MemorySize DEFAULT_MEM_FOR_QUERIES = 4_GB; diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h index 71d4897878..d7c1802969 100644 --- a/src/index/ConstantsIndexBuilding.h +++ b/src/index/ConstantsIndexBuilding.h @@ -29,10 +29,6 @@ constexpr inline size_t PARSER_BATCH_SIZE = 1'000'000; // streams faster. constexpr inline size_t PARSER_MIN_TRIPLES_AT_ONCE = 10'000; -// When reading from a file, Chunks of this size will -// be fed to the parser at once (10 MiB). -constinit inline std::atomic FILE_BUFFER_SIZE = 10 * (1ul << 20); - constinit inline std::atomic BUFFER_SIZE_JOIN_PATTERNS_WITH_OSP = 50'000; diff --git a/src/index/Index.cpp b/src/index/Index.cpp index c70706b341..f66914bfca 100644 --- a/src/index/Index.cpp +++ b/src/index/Index.cpp @@ -181,13 +181,23 @@ ad_utility::MemorySize& Index::memoryLimitIndexBuilding() { } // ____________________________________________________________________________ -ad_utility::MemorySize& Index::blocksizePermutationsPerColumn() { - return pimpl_->blocksizePermutationPerColumn(); +const ad_utility::MemorySize& Index::memoryLimitIndexBuilding() const { + return std::as_const(*pimpl_).memoryLimitIndexBuilding(); } // ____________________________________________________________________________ -const ad_utility::MemorySize& Index::memoryLimitIndexBuilding() const { - return std::as_const(*pimpl_).memoryLimitIndexBuilding(); +ad_utility::MemorySize& Index::parserBufferSize() { + return pimpl_->parserBufferSize(); +} + +// ____________________________________________________________________________ +const ad_utility::MemorySize& Index::parserBufferSize() const { + return std::as_const(*pimpl_).parserBufferSize(); +} + +// ____________________________________________________________________________ +ad_utility::MemorySize& Index::blocksizePermutationsPerColumn() { + return pimpl_->blocksizePermutationPerColumn(); } // ____________________________________________________________________________ diff --git a/src/index/Index.h b/src/index/Index.h index e815b2a5bf..8c6dd1cd40 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -195,6 +195,9 @@ class Index { ad_utility::MemorySize& memoryLimitIndexBuilding(); const ad_utility::MemorySize& memoryLimitIndexBuilding() const; + ad_utility::MemorySize& parserBufferSize(); + const ad_utility::MemorySize& parserBufferSize() const; + ad_utility::MemorySize& blocksizePermutationsPerColumn(); void setOnDiskBase(const std::string& onDiskBase); diff --git a/src/index/IndexBuilderMain.cpp b/src/index/IndexBuilderMain.cpp index 1b500c9dde..cfc121a2d1 100644 --- a/src/index/IndexBuilderMain.cpp +++ b/src/index/IndexBuilderMain.cpp @@ -1,8 +1,8 @@ -// Copyright 2014, University of Freiburg, +// Copyright 2014 - 2025 University of Freiburg // Chair of Algorithms and Data Structures. -// Author: -// 2014-2017 Björn Buchhold (buchhold@informatik.uni-freiburg.de) -// 2018- Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de) +// Authors: Björn Buchhold [2014 - 2017] +// Johannes Kalmbach +// Hannah Bast #include #include @@ -165,6 +165,7 @@ int main(int argc, char** argv) { bool onlyPsoAndPos = false; bool addWordsFromLiterals = false; std::optional stxxlMemory; + std::optional parserBufferSize; optind = 1; Index index{ad_utility::makeUnlimitedAllocator()}; @@ -228,6 +229,9 @@ int main(int argc, char** argv) { add("stxxl-memory,m", po::value(&stxxlMemory), "The amount of memory in to use for sorting during the index build. " "Decrease if the index builder runs out of memory."); + add("parser-buffer-size,b", po::value(&parserBufferSize), + "The size of the buffer used for parsing the input files. This must be " + "large enough to hold a single input triple. Default: 10 MB."); add("keep-temporary-files,k", po::bool_switch(&keepTemporaryFiles), "Do not delete temporary files from index creation for debugging."); @@ -249,6 +253,9 @@ int main(int argc, char** argv) { if (stxxlMemory.has_value()) { index.memoryLimitIndexBuilding() = stxxlMemory.value(); } + if (parserBufferSize.has_value()) { + index.parserBufferSize() = parserBufferSize.value(); + } // If no text index name was specified, take the part of the wordsfile after // the last slash. diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 4f5ce915fe..1f7279cd3c 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -71,10 +71,11 @@ IndexBuilderDataAsFirstPermutationSorter IndexImpl::createIdTriplesAndVocab( std::unique_ptr IndexImpl::makeRdfParser( const std::vector& files) const { auto makeRdfParserImpl = - [&files]() -> std::unique_ptr { + [this, &files]() -> std::unique_ptr { using TokenizerT = std::conditional_t; - return std::make_unique>(files); + return std::make_unique>( + files, this->parserBufferSize()); }; // `callFixedSize` litfts runtime integers to compile time integers. We use it diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index b98d0d5788..d9ec19eb14 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -128,6 +128,7 @@ class IndexImpl { bool keepTempFiles_ = false; ad_utility::MemorySize memoryLimitIndexBuilding_ = DEFAULT_MEMORY_LIMIT_INDEX_BUILDING; + ad_utility::MemorySize parserBufferSize_ = DEFAULT_PARSER_BUFFER_SIZE; ad_utility::MemorySize blocksizePermutationPerColumn_ = UNCOMPRESSED_BLOCKSIZE_COMPRESSED_METADATA_PER_COLUMN; json configurationJson_; @@ -406,6 +407,11 @@ class IndexImpl { return memoryLimitIndexBuilding_; } + ad_utility::MemorySize& parserBufferSize() { return parserBufferSize_; } + const ad_utility::MemorySize& parserBufferSize() const { + return parserBufferSize_; + } + ad_utility::MemorySize& blocksizePermutationPerColumn() { return blocksizePermutationPerColumn_; } diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index 3ae8a09da7..6775a13217 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -105,14 +105,14 @@ class Vocabulary { static constexpr bool isCompressed_ = std::is_same_v; - // If a word uses one of these language tags it will be internalized. - vector internalizedLangs_{"en"}; - - // If a word starts with one of those prefixes, it will be externalized When - // a word matched both `externalizedPrefixes_` and `internalizedLangs_`, it - // will be externalized. Qlever-internal prefixes are currently not - // externalized. - vector externalizedPrefixes_; + // If a literal uses one of these language tags or starts with one of these + // prefixes, it will be externalized. By default, everything is externalized. + // Both of these settings can be overridden using the `settings.json` file. + // + // NOTE: Qlever-internal prefixes are currently always internalized, no matter + // how `internalizedLangs_` and `externalizedPrefixes_` are set. + vector internalizedLangs_; + vector externalizedPrefixes_{""}; using UnderlyingVocabulary = std::conditional_tsize()), + ','), + "; possible fixes are: " + "use `--parser-buffer-size` to increase the buffer size or " + "use `--parse-parallel false` to disable parallel parsing")); } endPosition = rawInput->size(); exhausted_ = true; diff --git a/src/parser/ParallelBuffer.h b/src/parser/ParallelBuffer.h index 81bfac84c6..5a1b5d82a2 100644 --- a/src/parser/ParallelBuffer.h +++ b/src/parser/ParallelBuffer.h @@ -47,6 +47,9 @@ class ParallelBuffer { */ virtual std::optional getNextBlock() = 0; + // Get the blocksize of this buffer. + size_t getBlocksize() const { return blocksize_; } + protected: size_t blocksize_ = 100 * (2 << 20); }; diff --git a/src/parser/RdfParser.cpp b/src/parser/RdfParser.cpp index 8b3bf0681e..9cf4bffe93 100644 --- a/src/parser/RdfParser.cpp +++ b/src/parser/RdfParser.cpp @@ -785,6 +785,7 @@ template bool RdfStreamParser::resetStateAndRead( RdfStreamParser::TurtleParserBackupState* bPtr) { auto& b = *bPtr; + AD_CORRECTNESS_CHECK(fileBuffer_); auto nextBytesOpt = fileBuffer_->getNextBlock(); if (!nextBytesOpt || nextBytesOpt.value().empty()) { // there are no more decompressed bytes, just continue with what we've got @@ -821,7 +822,8 @@ bool RdfStreamParser::resetStateAndRead( } template -void RdfStreamParser::initialize(const string& filename) { +void RdfStreamParser::initialize(const string& filename, + ad_utility::MemorySize bufferSize) { this->clear(); // Make sure that a block of data ends with a newline. This is important for // two reasons: @@ -834,10 +836,10 @@ void RdfStreamParser::initialize(const string& filename) { // The reason is that with a `.` at the end, we cannot decide whether we are // in the middle of a `PN_LOCAL` (that continues in the next buffer) or at the // end of a statement. - fileBuffer_ = - std::make_unique(bufferSize_, "([\\r\\n]+)"); + fileBuffer_ = std::make_unique( + bufferSize.getBytes(), "([\\r\\n]+)"); fileBuffer_->open(filename); - byteVec_.resize(bufferSize_); + byteVec_.resize(bufferSize.getBytes()); // decompress the first block and initialize Tokenizer if (auto res = fileBuffer_->getNextBlock(); res) { byteVec_ = std::move(res.value()); @@ -998,7 +1000,7 @@ void RdfParallelParser::feedBatchesToParser( inputBatch = std::move(remainingBatchFromInitialization); first = false; } else { - auto nextOptional = fileBuffer_.getNextBlock(); + auto nextOptional = fileBuffer_->getNextBlock(); if (!nextOptional) { return; } @@ -1026,10 +1028,13 @@ void RdfParallelParser::feedBatchesToParser( // _______________________________________________________________________ template -void RdfParallelParser::initialize(const string& filename) { +void RdfParallelParser::initialize( + const string& filename, ad_utility::MemorySize bufferSize) { + fileBuffer_ = std::make_unique( + bufferSize.getBytes(), "\\.[\\t ]*([\\r\\n]+)"); ParallelBuffer::BufferType remainingBatchFromInitialization; - fileBuffer_.open(filename); - if (auto batch = fileBuffer_.getNextBlock(); !batch) { + fileBuffer_->open(filename); + if (auto batch = fileBuffer_->getNextBlock(); !batch) { LOG(WARN) << "Empty input to the TURTLE parser, is this what you intended?" << std::endl; } else { @@ -1109,7 +1114,8 @@ RdfParallelParser::~RdfParallelParser() { // file is to be parsed in parallel. template static std::unique_ptr makeSingleRdfParser( - const Index::InputFileSpecification& file) { + const Index::InputFileSpecification& file, + ad_utility::MemorySize bufferSize) { auto graph = [file]() -> TripleComponent { if (file.defaultGraph_.has_value()) { return TripleComponent::Iri::fromIrirefWithoutBrackets( @@ -1118,7 +1124,7 @@ static std::unique_ptr makeSingleRdfParser( return qlever::specialIds().at(DEFAULT_GRAPH_IRI); } }; - auto makeRdfParserImpl = [&filename = file.filename_, + auto makeRdfParserImpl = [&filename = file.filename_, &bufferSize, &graph]() -> std::unique_ptr { using InnerParser = @@ -1127,7 +1133,7 @@ static std::unique_ptr makeSingleRdfParser( using Parser = std::conditional_t, RdfStreamParser>; - return std::make_unique(filename, graph()); + return std::make_unique(filename, bufferSize, graph()); }; // The call to `callFixedSize` lifts runtime integers to compile time @@ -1142,13 +1148,15 @@ static std::unique_ptr makeSingleRdfParser( // ______________________________________________________________ template RdfMultifileParser::RdfMultifileParser( - const std::vector& files) { + const std::vector& files, + ad_utility::MemorySize bufferSize) { using namespace qlever; // This lambda parses a single file and pushes the results and all occurring // exceptions to the `finishedBatchQueue_`. - auto parseFile = [this](const InputFileSpecification& file) { + auto parseFile = [this](const InputFileSpecification& file, + ad_utility::MemorySize bufferSize) { try { - auto parser = makeSingleRdfParser(file); + auto parser = makeSingleRdfParser(file, bufferSize); while (auto batch = parser->getBatch()) { bool active = finishedBatchQueue_.push(std::move(batch.value())); if (!active) { @@ -1169,10 +1177,11 @@ RdfMultifileParser::RdfMultifileParser( }; // Feed all the input files to the `parsingQueue_`. - auto makeParsers = [files, this, parseFile]() { + auto makeParsers = [files, bufferSize, this, parseFile]() { for (const auto& file : files) { numActiveParsers_++; - bool active = parsingQueue_.push(std::bind_front(parseFile, file)); + bool active = + parsingQueue_.push(std::bind_front(parseFile, file, bufferSize)); if (!active) { // The queue was finished prematurely, stop this thread. This is // important to avoid deadlocks. diff --git a/src/parser/RdfParser.h b/src/parser/RdfParser.h index 76929a44bb..d65c05934d 100644 --- a/src/parser/RdfParser.h +++ b/src/parser/RdfParser.h @@ -477,8 +477,9 @@ class RdfStringParser : public Parser { return positionOffset_ + tmpToParse_.size() - this->tok_.data().size(); } - void initialize(const string& filename) { + void initialize(const string& filename, ad_utility::MemorySize bufferSize) { (void)filename; + (void)bufferSize; throw std::runtime_error( "RdfStringParser doesn't support calls to initialize. Only use " "parseUtf8String() for unit tests\n"); @@ -586,18 +587,20 @@ class RdfStreamParser : public Parser { public: // Default construction needed for tests RdfStreamParser() = default; - explicit RdfStreamParser(const string& filename, - TripleComponent defaultGraphIri = - qlever::specialIds().at(DEFAULT_GRAPH_IRI)) + explicit RdfStreamParser( + const string& filename, + ad_utility::MemorySize bufferSize = DEFAULT_PARSER_BUFFER_SIZE, + TripleComponent defaultGraphIri = + qlever::specialIds().at(DEFAULT_GRAPH_IRI)) : Parser{std::move(defaultGraphIri)} { LOG(DEBUG) << "Initialize RDF parsing from uncompressed file or stream " << filename << std::endl; - initialize(filename); + initialize(filename, bufferSize); } bool getLineImpl(TurtleTriple* triple) override; - void initialize(const string& filename); + void initialize(const string& filename, ad_utility::MemorySize bufferSize); size_t getParsePosition() const override { return numBytesBeforeCurrentBatch_ + (tok_.data().data() - byteVec_.data()); @@ -624,10 +627,7 @@ class RdfStreamParser : public Parser { // that's why we need the backupState() and resetStateAndRead() methods ParallelBuffer::BufferType byteVec_; - size_t bufferSize_ = FILE_BUFFER_SIZE; std::unique_ptr fileBuffer_; - // this many characters will be buffered at once, - // defaults to a global constant // that many bytes were already parsed before dealing with the current batch // in member byteVec_ @@ -649,22 +649,24 @@ class RdfParallelParser : public Parser { // If the `sleepTimeForTesting` is set, then after the initialization the // parser will sleep for the specified time before parsing each batch s.t. // certain corner cases can be tested. - explicit RdfParallelParser(const string& filename, - std::chrono::milliseconds sleepTimeForTesting = - std::chrono::milliseconds{0}) + explicit RdfParallelParser( + const string& filename, + ad_utility::MemorySize bufferSize = DEFAULT_PARSER_BUFFER_SIZE, + std::chrono::milliseconds sleepTimeForTesting = + std::chrono::milliseconds{0}) : sleepTimeForTesting_(sleepTimeForTesting) { LOG(DEBUG) << "Initialize parallel Turtle Parsing from uncompressed file or " "stream " << filename << std::endl; - initialize(filename); + initialize(filename, bufferSize); } // Construct a parser from a file and a given default graph iri. - RdfParallelParser(const string& filename, + RdfParallelParser(const string& filename, ad_utility::MemorySize bufferSize, const TripleComponent& defaultGraphIri) : Parser{defaultGraphIri}, defaultGraphIri_{defaultGraphIri} { - initialize(filename); + initialize(filename, bufferSize); } // inherit the wrapper overload @@ -679,7 +681,7 @@ class RdfParallelParser : public Parser { parallelParser_.resetTimers(); } - void initialize(const string& filename); + void initialize(const string& filename, ad_utility::MemorySize bufferSize); size_t getParsePosition() const override { // TODO: can we really define this position here? @@ -707,11 +709,8 @@ class RdfParallelParser : public Parser { using Parser::tok_; using Parser::triples_; - // this many characters will be buffered at once, - // defaults to a global constant - size_t bufferSize_ = FILE_BUFFER_SIZE; - - ParallelBufferWithEndRegex fileBuffer_{bufferSize_, "\\.[\\t ]*([\\r\\n]+)"}; + // Initialized in the call to `initialize`. + std::unique_ptr fileBuffer_; ad_utility::data_structures::ThreadSafeQueue> tripleCollector_{QUEUE_SIZE_AFTER_PARALLEL_PARSING}; @@ -741,7 +740,8 @@ class RdfMultifileParser : public RdfParserBase { // Construct the parser from a vector of file specifications and eagerly start // parsing them on background threads. explicit RdfMultifileParser( - const std::vector& files); + const std::vector& files, + ad_utility::MemorySize bufferSize = DEFAULT_PARSER_BUFFER_SIZE); // This function is needed for the interface, but always throws an exception. // `getBatch` (below) has to be used instead. diff --git a/test/GroupByTest.cpp b/test/GroupByTest.cpp index b3a6725909..a23fef2475 100644 --- a/test/GroupByTest.cpp +++ b/test/GroupByTest.cpp @@ -51,7 +51,6 @@ auto optionalHasTable = [](const VectorTable& table) { class GroupByTest : public ::testing::Test { public: GroupByTest() { - FILE_BUFFER_SIZE = 1000; // Create the index. The full index creation is run here to allow for // loading a docsDb file, which is not otherwise accessible std::string docsFileContent = "0\tExert 1\n1\tExert 2\n2\tExert3"; @@ -81,6 +80,7 @@ class GroupByTest : public ::testing::Test { _index.buildDocsDB("group_by_test.documents"); _index.addTextFromOnDiskIndex(); + _index.parserBufferSize() = 1_kB; } virtual ~GroupByTest() { diff --git a/test/IndexTest.cpp b/test/IndexTest.cpp index 5bb641b1ef..68cee86b02 100644 --- a/test/IndexTest.cpp +++ b/test/IndexTest.cpp @@ -406,11 +406,11 @@ TEST(IndexTest, TripleToInternalRepresentation) { index.tripleToInternalRepresentation(std::move(turtleTriple)); EXPECT_TRUE(res.langtag_.empty()); EXPECT_THAT(res.triple_[0], - IsPossiblyExternalString(iri(""), false)); + IsPossiblyExternalString(iri(""), true)); EXPECT_THAT(res.triple_[1], - IsPossiblyExternalString(iri(""), false)); + IsPossiblyExternalString(iri(""), true)); EXPECT_THAT(res.triple_[2], - IsPossiblyExternalString(lit("\"literal\""), false)); + IsPossiblyExternalString(lit("\"literal\""), true)); } { IndexImpl index{ad_utility::makeUnlimitedAllocator()}; @@ -525,6 +525,9 @@ TEST(IndexTest, trivialGettersAndSetters) { index.memoryLimitIndexBuilding() = 7_kB; EXPECT_EQ(index.memoryLimitIndexBuilding(), 7_kB); EXPECT_EQ(std::as_const(index).memoryLimitIndexBuilding(), 7_kB); + index.parserBufferSize() = 8_kB; + EXPECT_EQ(index.parserBufferSize(), 8_kB); + EXPECT_EQ(std::as_const(index).parserBufferSize(), 8_kB); } TEST(IndexTest, updateInputFileSpecificationsAndLog) { diff --git a/test/RdfParserTest.cpp b/test/RdfParserTest.cpp index bbde3375a5..6eba30f45d 100644 --- a/test/RdfParserTest.cpp +++ b/test/RdfParserTest.cpp @@ -1,6 +1,7 @@ -// Copyright 2018, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach(joka921) +// Copyright 2018 - 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Johannes Kalmbach +// Hannah Bast #include #include @@ -14,6 +15,7 @@ #include "parser/RdfParser.h" #include "parser/TripleComponent.h" #include "util/Conversions.h" +#include "util/MemorySize/MemorySize.h" using std::string; using namespace std::literals; @@ -794,15 +796,19 @@ TEST(RdfParserTest, iriref) { // Parse the file at `filename` using a parser of type `Parser` and return the // sorted result. Iff `useBatchInterface` then the `getBatch()` function is used -// for parsing, else `getLine()` is used. +// for parsing, else `getLine()` is used. The default size for the parse buffer +// in the following tests is 1 kB (which is much less than the default value +// `DEFAULT_PARSER_BUFFER_SIZE` defined in `src/global/Constants.h`). template -std::vector parseFromFile(const std::string& filename, - bool useBatchInterface) { +std::vector parseFromFile( + const std::string& filename, bool useBatchInterface, + ad_utility::MemorySize bufferSize = 1_kB) { auto parserChild = [&]() { if constexpr (ad_utility::isInstantiation) { - return Parser{{{filename, qlever::Filetype::Turtle, std::nullopt}}}; + return Parser{{{filename, qlever::Filetype::Turtle, std::nullopt}}, + bufferSize}; } else { - return Parser{filename}; + return Parser{filename, bufferSize}; } }(); RdfParserBase& parser = parserChild; @@ -869,7 +875,6 @@ TEST(RdfParserTest, TurtleStreamAndParallelParser) { } } - FILE_BUFFER_SIZE = 1000; auto testWithParser = [&](bool useBatchInterface) { auto result = parseFromFile(filename, useBatchInterface); EXPECT_THAT(result, ::testing::UnorderedElementsAreArray(expectedTriples)); @@ -883,7 +888,6 @@ TEST(RdfParserTest, TurtleStreamAndParallelParser) { // _______________________________________________________________________ TEST(RdfParserTest, emptyInput) { std::string filename{"turtleParserEmptyInput.dat"}; - FILE_BUFFER_SIZE = 1000; auto testWithParser = [&](bool useBatchInterface, std::string_view input = "") { { @@ -904,7 +908,6 @@ TEST(RdfParserTest, emptyInput) { // ________________________________________________________________________ TEST(RdfParserTest, multilineComments) { std::string filename{"turtleParserMultilineComments.dat"}; - FILE_BUFFER_SIZE = 1000; auto testWithParser = [&](bool useBatchInterface, std::string_view input, const auto& expectedTriples) { @@ -957,7 +960,6 @@ TEST(RdfParserTest, multilineComments) { // actual parsing happens on background threads. TEST(RdfParserTest, exceptionPropagation) { std::string filename{"turtleParserExceptionPropagation.dat"}; - FILE_BUFFER_SIZE = 1000; auto testWithParser = [&](bool useBatchInterface, std::string_view input) { { @@ -977,22 +979,27 @@ TEST(RdfParserTest, exceptionPropagation) { TEST(RdfParserTest, exceptionPropagationFileBufferReading) { std::string filename{"turtleParserExceptionPropagationFileBufferReading.dat"}; auto testWithParser = [&](bool useBatchInterface, + ad_utility::MemorySize bufferSize, std::string_view input) { { auto of = ad_utility::makeOfstream(filename); of << input; } AD_EXPECT_THROW_WITH_MESSAGE( - (parseFromFile(filename, useBatchInterface)), - ::testing::ContainsRegex("Please increase the FILE_BUFFER_SIZE")); + (parseFromFile(filename, useBatchInterface, bufferSize)), + ::testing::AllOf( + ::testing::HasSubstr("end of a statement was not found"), + ::testing::HasSubstr("use `--parser-buffer-size`"), + ::testing::HasSubstr("use `--parse-parallel false`"))); ad_utility::deleteFile(filename); }; - // Deliberately chosen s.t. the first triple fits in a block, but the second - // one doesn't. - FILE_BUFFER_SIZE = 40; - forAllParallelParsers(testWithParser, - " . \n " - " ."); + // Input, where the first triple fits into a 40_B buffer, but the second + // one does not. + std::string inputWithLongTriple = + " . \n " + " " + " ."; + forAllParallelParsers(testWithParser, 40_B, inputWithLongTriple); } // Test that the parallel parser's destructor can be run quickly and without @@ -1014,9 +1021,10 @@ TEST(RdfParserTest, stopParsingOnOutsideFailure) { { [[maybe_unused]] Parser parserChild = [&]() { if constexpr (ad_utility::isInstantiation) { - return Parser{{{filename, qlever::Filetype::Turtle, std::nullopt}}}; + return Parser{{{filename, qlever::Filetype::Turtle, std::nullopt}}, + 40_B}; } else { - return Parser{filename, 10ms}; + return Parser{filename, 40_B, 10ms}; } }(); t.cont(); @@ -1032,7 +1040,6 @@ TEST(RdfParserTest, stopParsingOnOutsideFailure) { } return longBlock; }(); - FILE_BUFFER_SIZE = 40; forAllParallelParsers(testWithParser, input); forAllMultifileParsers(testWithParser, input); } diff --git a/test/parser/ParallelBufferTest.cpp b/test/parser/ParallelBufferTest.cpp index f37af96aa1..062ac46f07 100644 --- a/test/parser/ParallelBufferTest.cpp +++ b/test/parser/ParallelBufferTest.cpp @@ -17,6 +17,7 @@ TEST(ParallelBuffer, ParallelFileBuffer) { size_t blocksize = 4; ParallelFileBuffer buf(blocksize); + EXPECT_EQ(buf.getBlocksize(), blocksize); buf.open(filename); std::vector expected{ {'a', 'b', 'c', 'd'}, {'e', 'f', 'g', 'h'}, {'i', 'j'}}; diff --git a/test/util/IndexTestHelpers.cpp b/test/util/IndexTestHelpers.cpp index 0dcfd334a6..8e1a693209 100644 --- a/test/util/IndexTestHelpers.cpp +++ b/test/util/IndexTestHelpers.cpp @@ -23,6 +23,7 @@ Index makeIndexWithTestSettings() { BATCH_SIZE_VOCABULARY_MERGE = 2; DEFAULT_PROGRESS_BAR_BATCH_SIZE = 2; index.memoryLimitIndexBuilding() = 50_MB; + index.parserBufferSize() = 1_kB; return index; } @@ -155,7 +156,6 @@ Index makeTestIndex(const std::string& indexBasename, "\"zz\"@en .