From 2953f1620837d97e1c79144d366f012ec70076a3 Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <joka921@users.noreply.github.com>
Date: Sat, 4 Jan 2025 04:55:05 +0100
Subject: [PATCH 1/2] Hash-map based GROUP BY can now process lazy inputs
 (#1651)

Since #1229, QLever supports a hash-map based GROUP BY, which can be activated via the runtime parameter `group-by-hash-map-enabled` (which is `false` by default). So far, this required a fully materialized input. With this change, the input can be processed lazily. The output is still fully materialized and never lazy (because, when the input is not sorted, we cannot output any group until we have seen the last input row).
---
 src/engine/GroupBy.cpp         | 170 ++++++++++++++++++++-------------
 src/engine/GroupBy.h           |  14 ++-
 test/GroupByTest.cpp           |  46 +++++++++
 test/engine/ValuesForTesting.h |   2 +
 4 files changed, 155 insertions(+), 77 deletions(-)
diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp
index 9dde7353a3..6fdeca1833 100644
--- a/src/engine/GroupBy.cpp
+++ b/src/engine/GroupBy.cpp
@@ -330,7 +330,7 @@ ProtoResult GroupBy::computeResult(bool requestLaziness) {
   if (useHashMapOptimization) {
     const auto* child = _subtree->getRootOperation()->getChildren().at(0);
     // Skip sorting
-    subresult = child->getResult();
+    subresult = child->getResult(true);
     // Update runtime information
     auto runTimeInfoChildren =
         child->getRootOperation()->getRuntimeInfoPointer();
@@ -366,13 +366,28 @@ ProtoResult GroupBy::computeResult(bool requestLaziness) {
   }
 
   if (useHashMapOptimization) {
-    auto localVocab = subresult->getCopyOfLocalVocab();
-    IdTable idTable = CALL_FIXED_SIZE(
-        groupByCols.size(), &GroupBy::computeGroupByForHashMapOptimization,
-        this, metadataForUnsequentialData->aggregateAliases_,
-        subresult->idTable(), groupByCols, &localVocab);
+    // Helper lambda that calls `computeGroupByForHashMapOptimization` for the
+    // given `subresults`.
+    auto computeWithHashMap = [this, &metadataForUnsequentialData,
+                               &groupByCols](auto&& subresults) {
+      auto doCompute = [&]<int NumCols> {
+        return computeGroupByForHashMapOptimization<NumCols>(
+            metadataForUnsequentialData->aggregateAliases_, AD_FWD(subresults),
+            groupByCols);
+      };
+      return ad_utility::callFixedSize(groupByCols.size(), doCompute);
+    };
 
-    return {std::move(idTable), resultSortedOn(), std::move(localVocab)};
+    // Now call `computeWithHashMap` and return the result. It expects a range
+    // of results, so if the result is fully materialized, we create an array
+    // with a single element.
+    if (subresult->isFullyMaterialized()) {
+      return computeWithHashMap(
+          std::array{std::pair{std::cref(subresult->idTable()),
+                               std::cref(subresult->localVocab())}});
+    } else {
+      return computeWithHashMap(std::move(subresult->idTables()));
+    }
   }
 
   size_t inWidth = _subtree->getResultWidth();
@@ -846,7 +861,7 @@ std::optional<IdTable> GroupBy::computeGroupByForJoinWithFullScan() const {
   const auto& index = getExecutionContext()->getIndex();
 
   // TODO<joka921, C++23> Simplify the following pattern by using
-  // `ql::views::chunkd_by` and implement a lazy version of this view for
+  // `ql::views::chunk_by` and implement a lazy version of this view for
   // input iterators.
 
   // Take care of duplicate values in the input.
@@ -1487,78 +1502,95 @@ static constexpr auto makeProcessGroupsVisitor =
 
 // _____________________________________________________________________________
 template <size_t NUM_GROUP_COLUMNS>
-IdTable GroupBy::computeGroupByForHashMapOptimization(
-    std::vector<HashMapAliasInformation>& aggregateAliases,
-    const IdTable& subresult, const std::vector<size_t>& columnIndices,
-    LocalVocab* localVocab) const {
-  AD_CONTRACT_CHECK(columnIndices.size() == NUM_GROUP_COLUMNS ||
-                    NUM_GROUP_COLUMNS == 0);
-
-  // Initialize aggregation data
+Result GroupBy::computeGroupByForHashMapOptimization(
+    std::vector<HashMapAliasInformation>& aggregateAliases, auto subresults,
+    const std::vector<size_t>& columnIndices) const {
+  AD_CORRECTNESS_CHECK(columnIndices.size() == NUM_GROUP_COLUMNS ||
+                       NUM_GROUP_COLUMNS == 0);
+  LocalVocab localVocab;
+
+  // Initialize the data for the aggregates of the GROUP BY operation.
   HashMapAggregationData<NUM_GROUP_COLUMNS> aggregationData(
       getExecutionContext()->getAllocator(), aggregateAliases,
       columnIndices.size());
 
-  // Initialize evaluation context
-  sparqlExpression::EvaluationContext evaluationContext(
-      *getExecutionContext(), _subtree->getVariableColumns(), subresult,
-      getExecutionContext()->getAllocator(), *localVocab, cancellationHandle_,
-      deadline_);
-
-  evaluationContext._groupedVariables = ad_utility::HashSet<Variable>{
-      _groupByVariables.begin(), _groupByVariables.end()};
-  evaluationContext._isPartOfGroupBy = true;
-
+  // Process the input blocks (pairs of `IdTable` and `LocalVocab`) one after
+  // the other.
   ad_utility::Timer lookupTimer{ad_utility::Timer::Stopped};
   ad_utility::Timer aggregationTimer{ad_utility::Timer::Stopped};
-  for (size_t i = 0; i < subresult.size(); i += GROUP_BY_HASH_MAP_BLOCK_SIZE) {
-    checkCancellation();
-
-    evaluationContext._beginIndex = i;
-    evaluationContext._endIndex =
-        std::min(i + GROUP_BY_HASH_MAP_BLOCK_SIZE, subresult.size());
-
-    auto currentBlockSize = evaluationContext.size();
-
-    // Perform HashMap lookup once for all groups in current block
-    using U = HashMapAggregationData<NUM_GROUP_COLUMNS>::template ArrayOrVector<
-        std::span<const Id>>;
-    U groupValues;
-    resizeIfVector(groupValues, columnIndices.size());
-
-    // TODO<C++23> use views::enumerate
-    size_t j = 0;
-    for (auto& idx : columnIndices) {
-      groupValues[j] = subresult.getColumn(idx).subspan(
-          evaluationContext._beginIndex, currentBlockSize);
-      ++j;
-    }
-    lookupTimer.cont();
-    auto hashEntries = aggregationData.getHashEntries(groupValues);
-    lookupTimer.stop();
-
-    aggregationTimer.cont();
-    for (auto& aggregateAlias : aggregateAliases) {
-      for (auto& aggregate : aggregateAlias.aggregateInfo_) {
-        sparqlExpression::ExpressionResult expressionResult =
-            GroupBy::evaluateChildExpressionOfAggregateFunction(
-                aggregate, evaluationContext);
-
-        auto& aggregationDataVariant =
-            aggregationData.getAggregationDataVariant(
-                aggregate.aggregateDataIndex_);
-
-        std::visit(makeProcessGroupsVisitor(currentBlockSize,
-                                            &evaluationContext, hashEntries),
-                   std::move(expressionResult), aggregationDataVariant);
+  for (const auto& [inputTableRef, inputLocalVocabRef] : subresults) {
+    const IdTable& inputTable = inputTableRef;
+    const LocalVocab& inputLocalVocab = inputLocalVocabRef;
+
+    // Merge the local vocab of each input block.
+    //
+    // NOTE: If the input blocks have very similar or even identical non-empty
+    // local vocabs, no deduplication is performed.
+    localVocab.mergeWith(std::span{&inputLocalVocab, 1});
+
+    // Setup the `EvaluationContext` for this input block.
+    sparqlExpression::EvaluationContext evaluationContext(
+        *getExecutionContext(), _subtree->getVariableColumns(), inputTable,
+        getExecutionContext()->getAllocator(), localVocab, cancellationHandle_,
+        deadline_);
+    evaluationContext._groupedVariables = ad_utility::HashSet<Variable>{
+        _groupByVariables.begin(), _groupByVariables.end()};
+    evaluationContext._isPartOfGroupBy = true;
+
+    // Iterate of the rows of this input block. Process (up to)
+    // `GROUP_BY_HASH_MAP_BLOCK_SIZE` rows at a time.
+    for (size_t i = 0; i < inputTable.size();
+         i += GROUP_BY_HASH_MAP_BLOCK_SIZE) {
+      checkCancellation();
+
+      evaluationContext._beginIndex = i;
+      evaluationContext._endIndex =
+          std::min(i + GROUP_BY_HASH_MAP_BLOCK_SIZE, inputTable.size());
+
+      auto currentBlockSize = evaluationContext.size();
+
+      // Perform HashMap lookup once for all groups in current block
+      using U = HashMapAggregationData<
+          NUM_GROUP_COLUMNS>::template ArrayOrVector<std::span<const Id>>;
+      U groupValues;
+      resizeIfVector(groupValues, columnIndices.size());
+
+      // TODO<C++23> use views::enumerate
+      size_t j = 0;
+      for (auto& idx : columnIndices) {
+        groupValues[j] = inputTable.getColumn(idx).subspan(
+            evaluationContext._beginIndex, currentBlockSize);
+        ++j;
+      }
+      lookupTimer.cont();
+      auto hashEntries = aggregationData.getHashEntries(groupValues);
+      lookupTimer.stop();
+
+      aggregationTimer.cont();
+      for (auto& aggregateAlias : aggregateAliases) {
+        for (auto& aggregate : aggregateAlias.aggregateInfo_) {
+          sparqlExpression::ExpressionResult expressionResult =
+              GroupBy::evaluateChildExpressionOfAggregateFunction(
+                  aggregate, evaluationContext);
+
+          auto& aggregationDataVariant =
+              aggregationData.getAggregationDataVariant(
+                  aggregate.aggregateDataIndex_);
+
+          std::visit(makeProcessGroupsVisitor(currentBlockSize,
+                                              &evaluationContext, hashEntries),
+                     std::move(expressionResult), aggregationDataVariant);
+        }
       }
+      aggregationTimer.stop();
     }
-    aggregationTimer.stop();
   }
+
   runtimeInfo().addDetail("timeMapLookup", lookupTimer.msecs());
   runtimeInfo().addDetail("timeAggregation", aggregationTimer.msecs());
-
-  return createResultFromHashMap(aggregationData, aggregateAliases, localVocab);
+  IdTable resultTable =
+      createResultFromHashMap(aggregationData, aggregateAliases, &localVocab);
+  return {std::move(resultTable), resultSortedOn(), std::move(localVocab)};
 }
 
 // _____________________________________________________________________________
diff --git a/src/engine/GroupBy.h b/src/engine/GroupBy.h
index afe824d492..8232f381ab 100644
--- a/src/engine/GroupBy.h
+++ b/src/engine/GroupBy.h
@@ -1,8 +1,7 @@
-// Copyright 2018, University of Freiburg,
+// Copyright 2018 - 2024, University of Freiburg
 // Chair of Algorithms and Data Structures.
-// Author:
-//   2018      Florian Kramer (florian.kramer@mail.uni-freiburg.de)
-//   2020-     Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de)
+// Authors: Florian Kramer [2018]
+//          Johannes Kalmbach <kalmbach@cs.uni-freiburg.de>
 
 #pragma once
 
@@ -316,10 +315,9 @@ class GroupBy : public Operation {
   // Create result IdTable by using a HashMap mapping groups to aggregation data
   // and subsequently calling `createResultFromHashMap`.
   template <size_t NUM_GROUP_COLUMNS>
-  IdTable computeGroupByForHashMapOptimization(
-      std::vector<HashMapAliasInformation>& aggregateAliases,
-      const IdTable& subresult, const std::vector<size_t>& columnIndices,
-      LocalVocab* localVocab) const;
+  Result computeGroupByForHashMapOptimization(
+      std::vector<HashMapAliasInformation>& aggregateAliases, auto subresults,
+      const std::vector<size_t>& columnIndices) const;
 
   using AggregationData =
       std::variant<AvgAggregationData, CountAggregationData, MinAggregationData,
diff --git a/test/GroupByTest.cpp b/test/GroupByTest.cpp
index 0c2e314e46..b3a6725909 100644
--- a/test/GroupByTest.cpp
+++ b/test/GroupByTest.cpp
@@ -3,6 +3,7 @@
 // Authors: Florian Kramer (florian.kramer@mail.uni-freiburg.de)
 //          Johannes Kalmbach (kalmbach@cs.uni-freiburg.de)
 
+#include <engine/SpatialJoinAlgorithms.h>
 #include <gmock/gmock.h>
 
 #include <cstdio>
@@ -36,6 +37,7 @@ using ::testing::Optional;
 
 namespace {
 auto I = IntId;
+auto D = DoubleId;
 
 // Return a matcher that checks, whether a given `std::optional<IdTable` has a
 // value and that value is equal to `makeIdTableFromVector(table)`.
@@ -747,6 +749,50 @@ TEST_F(GroupByOptimizations, correctResultForHashMapOptimization) {
             resultWithoutOptimization->asDebugString());
 }
 
+// _____________________________________________________________________________
+TEST_F(GroupByOptimizations, hashMapOptimizationLazyAndMaterializedInputs) {
+  /* Setup query:
+  SELECT ?x (AVG(?y) as ?avg) WHERE {
+    # explicitly defined subresult.
+  } GROUP BY ?x
+ */
+  // Setup three unsorted input blocks. The first column will be the grouped
+  // `?x`, and the second column the variable `?y` of which we compute the
+  // average.
+  auto runTest = [this](bool inputIsLazy) {
+    std::vector<IdTable> tables;
+    tables.push_back(makeIdTableFromVector({{3, 6}, {8, 27}, {5, 7}}, I));
+    tables.push_back(makeIdTableFromVector({{8, 27}, {5, 9}}, I));
+    tables.push_back(makeIdTableFromVector({{5, 2}, {3, 4}}, I));
+    // The expected averages are as follows: (3 -> 5.0), (5 -> 6.0), (8
+    // -> 27.0).
+    auto subtree = ad_utility::makeExecutionTree<ValuesForTesting>(
+        qec, std::move(tables),
+        std::vector<std::optional<Variable>>{Variable{"?x"}, Variable{"?y"}});
+    auto& values =
+        dynamic_cast<ValuesForTesting&>(*subtree->getRootOperation());
+    values.forceFullyMaterialized() = !inputIsLazy;
+
+    SparqlExpressionPimpl avgYPimpl = makeAvgPimpl(varY);
+    std::vector<Alias> aliasesAvgY{Alias{avgYPimpl, Variable{"?avg"}}};
+
+    // Calculate result with optimization
+    qec->getQueryTreeCache().clearAll();
+    RuntimeParameters().set<"group-by-hash-map-enabled">(true);
+    GroupBy groupBy{qec, variablesOnlyX, aliasesAvgY, std::move(subtree)};
+    auto result = groupBy.computeResultOnlyForTesting();
+    ASSERT_TRUE(result.isFullyMaterialized());
+    EXPECT_THAT(
+        result.idTable(),
+        matchesIdTableFromVector({{I(3), D(5)}, {I(5), D(6)}, {I(8), D(27)}}));
+  };
+  runTest(true);
+  runTest(false);
+
+  // Disable optimization for following tests
+  RuntimeParameters().set<"group-by-hash-map-enabled">(false);
+}
+
 // _____________________________________________________________________________
 TEST_F(GroupByOptimizations, correctResultForHashMapOptimizationForCountStar) {
   /* Setup query:
diff --git a/test/engine/ValuesForTesting.h b/test/engine/ValuesForTesting.h
index c02a9826bc..097ccd9c78 100644
--- a/test/engine/ValuesForTesting.h
+++ b/test/engine/ValuesForTesting.h
@@ -120,6 +120,8 @@ class ValuesForTesting : public Operation {
   }
   bool supportsLimit() const override { return supportsLimit_; }
 
+  bool& forceFullyMaterialized() { return forceFullyMaterialized_; }
+
  private:
   // ___________________________________________________________________________
   string getCacheKeyImpl() const override {

From c5e6c80f9cb9370db436c94f059afee2e302eec7 Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <joka921@users.noreply.github.com>
Date: Sun, 5 Jan 2025 22:18:23 +0100
Subject: [PATCH 2/2] Don't use Abseil for `Hash(Set|Map)WithMemoryLimit`
 (#1689)

So far, `HashSetWithMemoryLimit` and `HashMapWithMemoryLimit` were implemented as `absl::flat_hash_set` and `absl::flat_hash_map`, respectively. However, the Abseil data structures are not exception safe, which potentially leads to unexpected or erroneous behavior in Qlever. The two data structures are now implemented using `std::unordered_set`.

These two classes are currently used in the following operations: `GroupBy`, `TransitivePath`, and `Describe`. A quick performance comparison (of the current master and this PR on 20 example queries, and in a small standalone program), shows no significant performance difference.
---
 src/util/HashMap.h   | 12 +++++++++---
 src/util/HashSet.h   | 10 +++++-----
 test/CMakeLists.txt  |  2 +-
 test/HashMapTest.cpp |  2 +-
 4 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/util/HashMap.h b/src/util/HashMap.h
index 2a6b94e2e9..39c8a7a4ca 100644
--- a/src/util/HashMap.h
+++ b/src/util/HashMap.h
@@ -6,7 +6,10 @@
 #pragma once
 
 #include <absl/container/flat_hash_map.h>
-#include <util/AllocatorWithLimit.h>
+
+#include <unordered_map>
+
+#include "util/AllocatorWithLimit.h"
 
 namespace ad_utility {
 // Wrapper for HashMaps to be used everywhere throughout code for the semantic
@@ -15,10 +18,13 @@ namespace ad_utility {
 template <typename... Ts>
 using HashMap = absl::flat_hash_map<Ts...>;
 
-// A HashMap with a memory limit.
+// A HashMap with a memory limit. Note: We cannot use `absl::flat_hash_map`
+// here, because it is inherently not exception safe, and the
+// `AllocatorWithLimit` uses exceptions.
 template <class K, class V,
           class HashFct = absl::container_internal::hash_default_hash<K>,
           class EqualElem = absl::container_internal::hash_default_eq<K>,
           class Alloc = ad_utility::AllocatorWithLimit<std::pair<const K, V>>>
-using HashMapWithMemoryLimit = HashMap<K, V, HashFct, EqualElem, Alloc>;
+using HashMapWithMemoryLimit =
+    std::unordered_map<K, V, HashFct, EqualElem, Alloc>;
 }  // namespace ad_utility
diff --git a/src/util/HashSet.h b/src/util/HashSet.h
index 4cf9bd56f5..a851de6fd7 100644
--- a/src/util/HashSet.h
+++ b/src/util/HashSet.h
@@ -6,13 +6,11 @@
 
 #pragma once
 
-#include <string>
+#include <unordered_set>
 
 #include "absl/container/flat_hash_set.h"
 #include "util/AllocatorWithLimit.h"
 
-using std::string;
-
 namespace ad_utility {
 // Wrapper for HashSets (with elements of type T) to be used everywhere
 // throughout code for the semantic search. This wrapper interface is not
@@ -25,11 +23,13 @@ template <class T,
 using HashSet = absl::flat_hash_set<T, HashFct, EqualElem, Alloc>;
 
 // A hash set (with elements of type T) with a memory Limit.
+// Note: We cannot use `absl::flat_hash_set`
+// here, because it is inherently not exception safe, and the
+// `AllocatorWithLimit` uses exceptions.
 template <class T,
           class HashFct = absl::container_internal::hash_default_hash<T>,
           class EqualElem = absl::container_internal::hash_default_eq<T>,
           class Alloc = ad_utility::AllocatorWithLimit<T>>
-using HashSetWithMemoryLimit =
-    absl::flat_hash_set<T, HashFct, EqualElem, Alloc>;
+using HashSetWithMemoryLimit = std::unordered_set<T, HashFct, EqualElem, Alloc>;
 
 }  // namespace ad_utility
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 46a83d5b7a..bd375f4826 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -160,7 +160,7 @@ addLinkAndDiscoverTest(TextLimitOperationTest engine)
 
 addLinkAndDiscoverTestSerial(QueryPlannerTest engine)
 
-addLinkAndDiscoverTest(HashMapTest)
+addLinkAndDiscoverTestNoLibs(HashMapTest)
 
 addLinkAndDiscoverTest(HashSetTest)
 
diff --git a/test/HashMapTest.cpp b/test/HashMapTest.cpp
index 7ce7934f9c..cb1d670fe6 100644
--- a/test/HashMapTest.cpp
+++ b/test/HashMapTest.cpp
@@ -8,7 +8,7 @@
 #include <utility>
 #include <vector>
 
-#include "../src/util/HashMap.h"
+#include "util/HashMap.h"
 
 // Note: Since the HashMap class is a wrapper for a well tested hash map
 // implementation the following tests only check the API for functionality and