Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correct datatypes for string expressions #1636

Open
wants to merge 34 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
47feb71
STRLEN für UTF 8 angepasst
DuDaAG Oct 24, 2024
4877e8f
Test
DuDaAG Oct 24, 2024
aefd889
test rückgäning
DuDaAG Oct 24, 2024
b2cb8c6
find pull-request
DuDaAG Oct 25, 2024
96b1959
Fix test
DuDaAG Oct 25, 2024
b7806b8
Update src/engine/sparqlExpressions/StringExpressions.cpp
DuDaAG Oct 31, 2024
a83d74b
Update src/engine/sparqlExpressions/StringExpressions.cpp
DuDaAG Oct 31, 2024
e73d1ab
Format
Nov 1, 2024
0ed79df
new LiteralOrIriValueGetter
Nov 13, 2024
2666b78
Merge branch 'my-branch' into master
DuDaAG Nov 13, 2024
27fff04
Merge pull request #3 from DuDaAG/master
DuDaAG Nov 13, 2024
7078f60
idToLiteralAndIri with specifiactions
Nov 22, 2024
2581f4c
some fixes
Nov 22, 2024
6d7a2b2
Add Test IdToLiteralOrIri and some formatting
Nov 24, 2024
5948dcb
formatting
Nov 24, 2024
52ef1f5
Correction for sonar
Nov 24, 2024
f15bf94
SubStr improvements
Nov 29, 2024
774d52b
fix
Nov 29, 2024
d4b49c0
little changes
Nov 30, 2024
72aaa00
Feedback implemented
Dec 7, 2024
25000a9
format
Dec 7, 2024
617c3b7
New position codespell-ignore
Dec 7, 2024
f631ec2
delete codespell-ignore
Dec 7, 2024
889e9dd
UTF8 handling in subStr
Dec 7, 2024
0c41603
format
Dec 7, 2024
1b7e1b4
Add runtime error
Dec 12, 2024
344560a
syntax
Dec 12, 2024
67c747a
fix
Dec 12, 2024
313bba4
T
Dec 12, 2024
be80b09
add exceptions
Dec 12, 2024
2adaa30
nix
Dec 14, 2024
39ca3cb
Merge branch 'master' into Correct-Datatypes-for-StringExpressions
joka921 Dec 18, 2024
7455f29
idToLiteral without Iri
Jan 6, 2025
ac95531
Merge branch 'ad-freiburg:master' into Correct-Datatypes-for-StringEx…
DuDaAG Jan 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 94 additions & 1 deletion src/engine/ExportQueryExecutionTrees.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
#include "util/ConstexprUtils.h"
#include "util/http/MediaTypes.h"

using LiteralOrIri = ad_utility::triple_component::LiteralOrIri;

// Return true iff the `result` is nonempty.
bool getResultForAsk(const std::shared_ptr<const Result>& result) {
if (result->isFullyMaterialized()) {
Expand Down Expand Up @@ -347,11 +349,57 @@
}
}

// _____________________________________________________________________________
// Turn an `id` whose value is stored directly inside the ID into a literal
// without datatype. The result is `std::nullopt` if only `xsd:string` literals
// were requested, or if `idToStringAndTypeForEncodedValue` yields no value for
// this `id`.
std::optional<LiteralOrIri>
ExportQueryExecutionTrees::idToLiteralOrIriForEncodedValue(
    Id id, bool onlyReturnLiteralsWithXsdString) {
  // This check deliberately comes first: when the caller only wants
  // `xsd:string` literals, the conversion below is never attempted.
  if (onlyReturnLiteralsWithXsdString) {
    return std::nullopt;
  }

  if (auto stringAndType = idToStringAndTypeForEncodedValue(id);
      stringAndType.has_value()) {
    // Only the string part is used, the type information is dropped.
    return LiteralOrIri::literalWithoutQuotes(stringAndType->first);
  }
  return std::nullopt;
}

// _____________________________________________________________________________
// Return true iff `word` has no datatype at all, or its datatype is equal to
// `XSD_STRING`.
bool ExportQueryExecutionTrees::isPlainLiteralOrLiteralWithXsdString(
    const LiteralOrIri& word) {
  // Without a datatype there is nothing to compare, and `getDatatype()` is not
  // called.
  if (!word.hasDatatype()) {
    return true;
  }
  return asStringViewUnsafe(word.getDatatype()) == XSD_STRING;
}

// _____________________________________________________________________________
// Post-process a `word` according to the two flags:
// - A `word` that is not a literal is rejected via `AD_THROW` if either flag
//   is set, and returned unchanged otherwise.
// - With `onlyReturnLiteralsWithXsdString`, a literal is returned unchanged if
//   it is plain or has the datatype `xsd:string`, otherwise the result is
//   `std::nullopt`.
// - Otherwise a literal is returned after any datatype other than
//   `xsd:string` has been removed from it.
std::optional<LiteralOrIri> ExportQueryExecutionTrees::handleIriOrLiteral(
LiteralOrIri word, bool onlyReturnLiterals,
bool onlyReturnLiteralsWithXsdString) {
// Case 1: `word` is not a literal (the error message below calls this an IRI).
if (!word.isLiteral()) {
// NOTE(review): if `AD_THROW` always throws, the `return std::nullopt;` inside
// this branch is unreachable. Confirm whether throwing or returning
// `std::nullopt` is the intended behavior.
if (onlyReturnLiterals || onlyReturnLiteralsWithXsdString) {
AD_THROW("The input is an IRI, but only literals are allowed.");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this only for debugging? Or what is wrong with the nullopt return below?

return std::nullopt;
}

Check warning on line 382 in src/engine/ExportQueryExecutionTrees.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/ExportQueryExecutionTrees.cpp#L380-L382

Added lines #L380 - L382 were not covered by tests
return word;
}

// Case 2: literals, filtered so that only plain and `xsd:string` literals
// pass.
if (onlyReturnLiteralsWithXsdString) {
if (isPlainLiteralOrLiteralWithXsdString(word)) {
return word;
}
return std::nullopt;
}

// Case 3: literals without filtering. A datatype is kept only if it is
// `xsd:string`, every other datatype is removed.
if (word.hasDatatype() && !isPlainLiteralOrLiteralWithXsdString(word)) {
word.getLiteral().removeDatatype();
}
return word;
}

// _____________________________________________________________________________
ad_utility::triple_component::LiteralOrIri
ExportQueryExecutionTrees::getLiteralOrIriFromVocabIndex(
const Index& index, Id id, const LocalVocab& localVocab) {
using LiteralOrIri = ad_utility::triple_component::LiteralOrIri;
switch (id.getDatatype()) {
case Datatype::LocalVocabIndex:
return localVocab.getWord(id.getLocalVocabIndex()).asLiteralOrIri();
Expand Down Expand Up @@ -412,6 +460,39 @@
return idToStringAndTypeForEncodedValue(id);
}
}

// _____________________________________________________________________________
// Convert `id` to a `LiteralOrIri` by dispatching on the datatype of the ID.
// `index` and `localVocab` are used to look up the strings of IDs that refer
// to a vocabulary. The flags `onlyReturnLiterals` (compile time) and
// `onlyReturnLiteralsWithXsdString` (runtime) restrict which values are
// returned; see the individual cases below.
template <bool onlyReturnLiterals>
std::optional<LiteralOrIri> ExportQueryExecutionTrees::idToLiteralOrIri(
const Index& index, Id id, const LocalVocab& localVocab,
bool onlyReturnLiteralsWithXsdString) {
using enum Datatype;
auto datatype = id.getDatatype();

// With `onlyReturnLiterals`, every ID that is neither a `VocabIndex` nor a
// `LocalVocabIndex` is rejected up front, so for this instantiation only the
// `VocabIndex`/`LocalVocabIndex` case of the `switch` below can be reached.
if constexpr (onlyReturnLiterals) {
if (!(datatype == VocabIndex || datatype == LocalVocabIndex)) {
return std::nullopt;
}
}

switch (datatype) {
// The word is turned into a literal without quotes. Note that
// `onlyReturnLiteralsWithXsdString` is not consulted in this case.
case WordVocabIndex:
return LiteralOrIri::literalWithoutQuotes(
index.indexToString(id.getWordVocabIndex()));

Check warning on line 481 in src/engine/ExportQueryExecutionTrees.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/ExportQueryExecutionTrees.cpp#L480-L481

Added lines #L480 - L481 were not covered by tests
// Entries of the vocabulary or the local vocabulary are filtered and
// normalized by `handleIriOrLiteral`.
case VocabIndex:
case LocalVocabIndex:
return handleIriOrLiteral(
getLiteralOrIriFromVocabIndex(index, id, localVocab),
onlyReturnLiterals, onlyReturnLiteralsWithXsdString);
// NOTE(review): if `AD_THROW` always throws, the `return std::nullopt;` in
// this case is unreachable. Confirm which behavior is intended.
case TextRecordIndex:
AD_THROW("TextRecordIndex case is not implemented.");
return std::nullopt;

Check warning on line 489 in src/engine/ExportQueryExecutionTrees.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/ExportQueryExecutionTrees.cpp#L488-L489

Added lines #L488 - L489 were not covered by tests
// All remaining datatypes are delegated to the conversion for values that are
// encoded directly in the ID.
default:
return idToLiteralOrIriForEncodedValue(id,
onlyReturnLiteralsWithXsdString);
}
}

// ___________________________________________________________________________
template std::optional<std::pair<std::string, const char*>>
ExportQueryExecutionTrees::idToStringAndType<true, false, std::identity>(
Expand All @@ -433,6 +514,18 @@
const LocalVocab& localVocab,
std::identity&& escapeFunction);

// ___________________________________________________________________________
// Explicit instantiation of `idToLiteralOrIri` for `onlyReturnLiterals ==
// false`. NOTE(review): presumably needed because the template is defined in
// this .cpp file and used from other translation units; confirm.
template std::optional<LiteralOrIri>
ExportQueryExecutionTrees::idToLiteralOrIri<false>(
const Index& index, Id id, const LocalVocab& localVocab,
bool onlyReturnLiteralsWithXsdString);

// ___________________________________________________________________________
// Explicit instantiation of `idToLiteralOrIri` for `onlyReturnLiterals ==
// true`.
template std::optional<LiteralOrIri>
ExportQueryExecutionTrees::idToLiteralOrIri<true>(
const Index& index, Id id, const LocalVocab& localVocab,
bool onlyReturnLiteralsWithXsdString);

// Convert a stringvalue and optional type to JSON binding.
static nlohmann::json stringAndTypeToBinding(std::string_view entitystr,
const char* xsdType) {
Expand Down
32 changes: 32 additions & 0 deletions src/engine/ExportQueryExecutionTrees.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class ExportQueryExecutionTrees {
public:
using MediaType = ad_utility::MediaType;
using CancellationHandle = ad_utility::SharedCancellationHandle;
using LiteralOrIri = ad_utility::triple_component::LiteralOrIri;

// Compute the result of the given `parsedQuery` (created by the
// `SparqlParser`) for which the `QueryExecutionTree` has been previously
Expand Down Expand Up @@ -69,6 +70,37 @@ class ExportQueryExecutionTrees {
static std::optional<std::pair<std::string, const char*>>
idToStringAndTypeForEncodedValue(Id id);

// Convert the `id` to a `LiteralOrIri`. Datatypes are always stripped unless
// they are `xsd:string`, so for literals with non-`xsd:string` datatypes
// (this includes IDs that directly store their value, like Doubles) the
// datatype is always empty. If `onlyReturnLiteralsWithXsdString` is true,
// literals with non-`xsd:string` datatypes and IDs that directly store their
// value yield `std::nullopt`. If `returnOnlyLiterals` is true, every ID that
// is neither a `VocabIndex` nor a `LocalVocabIndex` yields `std::nullopt`.
// These semantics are useful for the string expressions in
// StringExpressions.cpp.
// NOTE(review): for IRIs the implementation in ExportQueryExecutionTrees.cpp
// currently raises an error via `AD_THROW` when one of the two flags is set,
// instead of returning `std::nullopt`. Confirm which behavior is intended.
// NOTE(review): the definition names the template parameter
// `onlyReturnLiterals`, this declaration `returnOnlyLiterals`.
template <bool returnOnlyLiterals = false>
static std::optional<LiteralOrIri> idToLiteralOrIri(
const Index& index, Id id, const LocalVocab& localVocab,
bool onlyReturnLiteralsWithXsdString = false);

// Same as `idToLiteralOrIri`, but only handles the datatypes for which the
// value is encoded directly in the ID. For other datatypes an exception is
// thrown.
// If `onlyReturnLiteralsWithXsdString` is `true`, returns `std::nullopt`
// without converting the `id` at all.
// If `onlyReturnLiteralsWithXsdString` is `false`, removes datatypes from
// literals (e.g. the integer `42` is converted to the plain literal `"42"`).
static std::optional<LiteralOrIri> idToLiteralOrIriForEncodedValue(
Id id, bool onlyReturnLiteralsWithXsdString = false);

// A helper function for the `idToLiteralOrIri` function that filters and
// normalizes a `LiteralOrIri`:
// - If `word` is not a literal, it is returned unchanged unless one of the
//   two flags is set (see the implementation for how that case is reported).
// - If `onlyReturnLiteralsWithXsdString` is true, literals that are neither
//   plain nor of datatype `xsd:string` yield `std::nullopt`.
// - Otherwise every datatype except `xsd:string` is removed from the literal
//   before it is returned.
static std::optional<LiteralOrIri> handleIriOrLiteral(
LiteralOrIri word, bool onlyReturnLiterals,
bool onlyReturnLiteralsWithXsdString);

// Return true iff `word` is either a plain literal (without datatype)
// or a literal with the `xsd:string` datatype.
static bool isPlainLiteralOrLiteralWithXsdString(const LiteralOrIri& word);

// Acts as a helper to retrieve an LiteralOrIri object
// from an Id, where the Id is of type `VocabIndex` or `LocalVocabIndex`.
// This function should only be called with suitable `Datatype` Id's,
Expand Down
26 changes: 26 additions & 0 deletions src/engine/sparqlExpressions/SparqlExpressionValueGetters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,32 @@
}
}

// ____________________________________________________________________________
// Convert `id` to a `LiteralOrIri` using the index and the local vocabulary of
// the given `context`. The remaining arguments of `idToLiteralOrIri` keep
// their defaults.
std::optional<LiteralOrIri> LiteralOrIriValueGetter::operator()(
    Id id, const EvaluationContext* context) const {
  const auto& index = context->_qec.getIndex();
  const auto& localVocab = context->_localVocab;
  return ExportQueryExecutionTrees::idToLiteralOrIri(index, id, localVocab);
}

// ____________________________________________________________________________
// Convert `id` to a `LiteralOrIri` using the index and the local vocabulary of
// the given `context`, with `onlyReturnLiteralsWithXsdString` switched on.
std::optional<LiteralOrIri>
LiteralOrIriValueGetterWithXsdStringFilter::operator()(
    Id id, const EvaluationContext* context) const {
  constexpr bool onlyXsdStringLiterals = true;
  const auto& index = context->_qec.getIndex();
  const auto& localVocab = context->_localVocab;
  return ExportQueryExecutionTrees::idToLiteralOrIri(index, id, localVocab,
                                                     onlyXsdStringLiterals);
}

Check warning on line 106 in src/engine/sparqlExpressions/SparqlExpressionValueGetters.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/sparqlExpressions/SparqlExpressionValueGetters.cpp#L103-L106

Added lines #L103 - L106 were not covered by tests

// ____________________________________________________________________________
// Overload for inputs that already are a `LiteralOrIri`: `s` is returned
// unchanged if it is a plain literal or a literal with datatype `xsd:string`,
// otherwise the result is `std::nullopt`.
std::optional<LiteralOrIri>
LiteralOrIriValueGetterWithXsdStringFilter::operator()(
    const LiteralOrIri& s, const EvaluationContext*) const {
  if (ExportQueryExecutionTrees::isPlainLiteralOrLiteralWithXsdString(s)) {
    return s;
  }
  // A value that does not pass the filter is reported as "no value" instead of
  // aborting the whole evaluation with an exception. This is also how
  // `ExportQueryExecutionTrees::handleIriOrLiteral` treats literals with other
  // datatypes when the `xsd:string` filter is active.
  return std::nullopt;
}

Check warning on line 117 in src/engine/sparqlExpressions/SparqlExpressionValueGetters.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/sparqlExpressions/SparqlExpressionValueGetters.cpp#L113-L117

Added lines #L113 - L117 were not covered by tests

// ____________________________________________________________________________
template <auto isSomethingFunction, auto prefix>
Id IsSomethingValueGetter<isSomethingFunction, prefix>::operator()(
Expand Down
39 changes: 39 additions & 0 deletions src/engine/sparqlExpressions/SparqlExpressionValueGetters.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,45 @@
}
};

// This class can be used as the `ValueGetter` argument of Expression
// templates. It produces a LiteralOrIri.
// A `ValueGetter` (to be passed as the corresponding argument of the
// Expression templates) whose result type is `std::optional<LiteralOrIri>`.
struct LiteralOrIriValueGetter : Mixin<LiteralOrIriValueGetter> {
  using Mixin<LiteralOrIriValueGetter>::operator();

  // An input that already is a `LiteralOrIri` is passed through as a copy.
  std::optional<LiteralOrIri> operator()(const LiteralOrIri& literalOrIri,
                                         const EvaluationContext*) const {
    return literalOrIri;
  }

  // A `ValueId` is converted by the out-of-line definition of this overload.
  std::optional<LiteralOrIri> operator()(ValueId,
                                         const EvaluationContext*) const;
};
joka921 marked this conversation as resolved.
Show resolved Hide resolved

// Same as `LiteralOrIriValueGetter`, but only literals with `xsd:string`
// datatype or no datatype are returned.
struct LiteralOrIriValueGetterWithXsdStringFilter
: Mixin<LiteralOrIriValueGetterWithXsdStringFilter> {
using Mixin<LiteralOrIriValueGetterWithXsdStringFilter>::operator();

// Overload for a `ValueId`; defined out of line.
std::optional<LiteralOrIri> operator()(ValueId,
const EvaluationContext*) const;

// Overload for an input that already is a `LiteralOrIri`; defined out of
// line. The input is returned if it is a plain literal or an `xsd:string`
// literal.
std::optional<LiteralOrIri> operator()(const LiteralOrIri& s,
const EvaluationContext*) const;
};

// Value getter for `isBlank`. The result is a boolean `Id`.
struct IsBlankNodeValueGetter : Mixin<IsBlankNodeValueGetter> {
using Mixin<IsBlankNodeValueGetter>::operator();
// A `ValueId` is a blank node iff its datatype is `BlankNodeIndex`.
Id operator()(ValueId id, const EvaluationContext*) const {
return Id::makeFromBool(id.getDatatype() == Datatype::BlankNodeIndex);
}

Check warning on line 176 in src/engine/sparqlExpressions/SparqlExpressionValueGetters.h

View check run for this annotation

Codecov / codecov/patch

src/engine/sparqlExpressions/SparqlExpressionValueGetters.h#L174-L176

Added lines #L174 - L176 were not covered by tests
DuDaAG marked this conversation as resolved.
Show resolved Hide resolved

// A `LiteralOrIri` is never a blank node, so the result is always `false`.
Id operator()(const LiteralOrIri&, const EvaluationContext*) const {
return Id::makeFromBool(false);
}

Check warning on line 180 in src/engine/sparqlExpressions/SparqlExpressionValueGetters.h

View check run for this annotation

Codecov / codecov/patch

src/engine/sparqlExpressions/SparqlExpressionValueGetters.h#L178-L180

Added lines #L178 - L180 were not covered by tests
};

// Boolean value getter that checks whether the given `Id` is a `ValueId` of the
// given `datatype`.
template <Datatype datatype>
Expand Down
104 changes: 88 additions & 16 deletions src/engine/sparqlExpressions/StringExpressions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,29 @@
asNormalizedStringViewUnsafe(normalizedContent))};
};

// Return the number of UTF-8 encoded characters in `s`. Every byte whose two
// most significant bits are not `10` (i.e. every byte that is not a
// continuation byte) starts a new character, so counting those bytes suffices.
inline std::size_t utf8Length(std::string_view s) {
  auto startsCharacter = [](char c) {
    const auto byte = static_cast<unsigned char>(c);
    return (byte & 0xC0) != 0x80;
  };
  return ql::ranges::count_if(s, startsCharacter);
}

// Return the byte offset at which the UTF-8 character with the (zero-based)
// index `utf8Pos` starts in `str`. If `str` has no character with that index
// (this includes negative values of `utf8Pos`), `str.size()` is returned.
inline std::size_t utf8ToByteOffset(std::string_view str, int64_t utf8Pos) {
  int64_t charactersSeen = 0;
  for (std::size_t i = 0; i < str.size(); ++i) {
    // Continuation bytes (bit pattern `10xxxxxx`) never start a character.
    const bool isContinuationByte =
        (static_cast<unsigned char>(str[i]) & 0xC0) == 0x80;
    if (isContinuationByte) {
      continue;
    }
    if (charactersSeen == utf8Pos) {
      return i;
    }
    ++charactersSeen;
  }
  return str.size();
}

// String functions.
[[maybe_unused]] auto strImpl =
[](std::optional<std::string> s) -> IdOrLiteralOrIri {
Expand Down Expand Up @@ -126,11 +149,7 @@

// STRLEN: the number of UTF-8 characters (not bytes) of `s` as an integer
// `Id`. The counting itself is done by the shared helper `utf8Length`.
[[maybe_unused]] auto strlen = [](std::string_view s) {
  return Id::makeFromInt(utf8Length(s));
};
using StrlenExpression =
StringExpressionImpl<1, LiftStringFunction<decltype(strlen)>>;
Expand Down Expand Up @@ -182,7 +201,7 @@
};

public:
IdOrLiteralOrIri operator()(std::optional<std::string> s, NumericValue start,
IdOrLiteralOrIri operator()(std::optional<LiteralOrIri> s, NumericValue start,
NumericValue length) const {
if (!s.has_value() || std::holds_alternative<NotNumeric>(start) ||
std::holds_alternative<NotNumeric>(length)) {
Expand All @@ -202,29 +221,82 @@
if (startInt < 0) {
lengthInt += startInt;
}

const auto& str = s.value();
const auto& str = asStringViewUnsafe(s.value().getContent());
std::size_t utf8len = utf8Length(str);
// Clamp the number such that it is in `[0, str.size()]`. That way we end up
// with valid arguments for the `getUTF8Substring` method below for both
// with valid arguments for the `setSubstr` method below for both
// starting position and length since all the other corner cases have been
// dealt with above.
auto clamp = [sz = str.size()](int64_t n) -> std::size_t {
auto clamp = [utf8len](int64_t n) -> std::size_t {
if (n < 0) {
return 0;
}
if (static_cast<size_t>(n) > sz) {
return sz;
if (static_cast<size_t>(n) > utf8len) {
return utf8len;
}
return static_cast<size_t>(n);
};

return toLiteral(
ad_utility::getUTF8Substring(str, clamp(startInt), clamp(lengthInt)));
startInt = clamp(startInt);
lengthInt = clamp(lengthInt);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you have to use a clamped version of start + length, otherwise this doesn't quite work.

std::size_t startByteOffset = utf8ToByteOffset(str, startInt);
std::size_t endByteOffset = utf8ToByteOffset(str, startInt + lengthInt);
std::size_t byteLength = endByteOffset - startByteOffset;

s.value().getLiteral().setSubstr(startByteOffset, byteLength);
return std::move(s.value());
}
};

// Implementation of the `SUBSTR` SPARQL function. At construction time it
// selects the value getter for the first argument: if that argument is a
// `STR()` expression, the `STR()` is removed and its child is evaluated with
// the `LiteralOrIriValueGetterWithXsdStringFilter`; otherwise the argument is
// evaluated with the `LiteralOrIriValueGetter`. All other member functions
// forward to the expression that was selected.
class SubstrExpressionImpl : public SparqlExpression {
private:
// The two possible implementations. They only differ in the value getter that
// is used for the first argument.
using ExpressionWithStr =
NARY<3, FV<SubstrImpl, LiteralOrIriValueGetterWithXsdStringFilter,
NumericValueGetter, NumericValueGetter>>;
using ExpressionWithoutStr =
NARY<3, FV<SubstrImpl, LiteralOrIriValueGetter, NumericValueGetter,
NumericValueGetter>>;

// The implementation that was selected in the constructor.
SparqlExpression::Ptr impl_;

public:
// Construct from exactly three child expressions (enforced by the `requires`
// clause). `child` is the first argument of `SUBSTR` and must not be null.
explicit SubstrExpressionImpl(
SparqlExpression::Ptr child,
std::same_as<SparqlExpression::Ptr> auto... children)
requires(sizeof...(children) + 1 == 3) {
AD_CORRECTNESS_CHECK(child != nullptr);

// For `SUBSTR(STR(x), ...)` the child `x` of the `STR()` expression becomes
// the first argument.
if (child->isStrExpression()) {
auto childrenOfStr = std::move(*child).moveChildrenOut();
AD_CORRECTNESS_CHECK(childrenOfStr.size() == 1);
impl_ = std::make_unique<ExpressionWithStr>(
std::move(childrenOfStr.at(0)), std::move(children)...);

Check warning on line 278 in src/engine/sparqlExpressions/StringExpressions.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/sparqlExpressions/StringExpressions.cpp#L275-L278

Added lines #L275 - L278 were not covered by tests
} else {
impl_ = std::make_unique<ExpressionWithoutStr>(std::move(child),
std::move(children)...);
}
}

// Forward the evaluation to the selected implementation.
ExpressionResult evaluate(EvaluationContext* context) const override {
return impl_->evaluate(context);
}

// Forward the computation of the cache key to the selected implementation.
std::string getCacheKey(const VariableToColumnMap& varColMap) const override {
return impl_->getCacheKey(varColMap);
}

Check warning on line 291 in src/engine/sparqlExpressions/StringExpressions.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/sparqlExpressions/StringExpressions.cpp#L289-L291

Added lines #L289 - L291 were not covered by tests

private:
// The children of this expression are the children of the selected
// implementation.
std::span<SparqlExpression::Ptr> childrenImpl() override {
return impl_->children();
}
};

// `SUBSTR` is implemented by `SubstrExpressionImpl`. The alias must be
// declared only once: a second declaration that names a different type is
// ill-formed.
using SubstrExpression = SubstrExpressionImpl;

// STRSTARTS
[[maybe_unused]] auto strStartsImpl = [](std::string_view text,
Expand Down
Loading
Loading