-
Notifications
You must be signed in to change notification settings - Fork 56
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Correct datatypes for string expressions #1636
base: master
Are you sure you want to change the base?
Changes from 32 commits
47feb71
4877e8f
aefd889
b2cb8c6
96b1959
b7806b8
a83d74b
e73d1ab
0ed79df
2666b78
27fff04
7078f60
2581f4c
6d7a2b2
5948dcb
52ef1f5
f15bf94
774d52b
d4b49c0
72aaa00
25000a9
617c3b7
f631ec2
889e9dd
0c41603
1b7e1b4
344560a
67c747a
313bba4
be80b09
2adaa30
39ca3cb
7455f29
ac95531
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -90,6 +90,32 @@ | |
} | ||
} | ||
|
||
// ____________________________________________________________________________ | ||
// Convert `id` to a `LiteralOrIri` by delegating to
// `ExportQueryExecutionTrees::idToLiteralOrIri`, using the index and the local
// vocabulary of the given evaluation `context`. The optional returned by that
// helper is passed through unchanged.
std::optional<LiteralOrIri> LiteralOrIriValueGetter::operator()(
    Id id, const EvaluationContext* context) const {
  const auto& index = context->_qec.getIndex();
  return ExportQueryExecutionTrees::idToLiteralOrIri(index, id,
                                                     context->_localVocab);
}
|
||
// ____________________________________________________________________________ | ||
std::optional<LiteralOrIri> | ||
LiteralOrIriValueGetterWithXsdStringFilter::operator()( | ||
Id id, const EvaluationContext* context) const { | ||
return ExportQueryExecutionTrees::idToLiteralOrIri( | ||
context->_qec.getIndex(), id, context->_localVocab, true); | ||
} | ||
|
||
// ____________________________________________________________________________ | ||
std::optional<LiteralOrIri> | ||
LiteralOrIriValueGetterWithXsdStringFilter::operator()( | ||
const LiteralOrIri& s, const EvaluationContext*) const { | ||
if (ExportQueryExecutionTrees::isPlainLiteralOrLiteralWithXsdString(s)) { | ||
return s; | ||
} | ||
AD_THROW("Input is not a plain string or xsd:string."); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Seems like a debug output. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It should definitely be removed. |
||
return std::nullopt; | ||
} | ||
|
||
// ____________________________________________________________________________ | ||
template <auto isSomethingFunction, auto prefix> | ||
Id IsSomethingValueGetter<isSomethingFunction, prefix>::operator()( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,29 @@ | |
asNormalizedStringViewUnsafe(normalizedContent))}; | ||
}; | ||
|
||
// Count UTF-8 characters by skipping continuation bytes (those starting with | ||
// "10"). | ||
inline std::size_t utf8Length(std::string_view s) { | ||
return ql::ranges::count_if( | ||
s, [](char c) { return (static_cast<unsigned char>(c) & 0xC0) != 0x80; }); | ||
} | ||
|
||
// Convert a zero-based UTF-8 character position into the byte offset at which
// that character starts inside `str`.
//
// * `str`     - the UTF-8 encoded text to scan.
// * `utf8Pos` - index of the character (not byte) whose offset is requested.
//
// Returns the index of the first byte of the character at `utf8Pos`. If
// `utf8Pos` is greater than or equal to the number of characters in `str`,
// `str.size()` is returned. A negative `utf8Pos` lies before the start of the
// string and therefore maps to offset `0` (previously it silently yielded
// `str.size()`, because the character counter could never match it).
inline std::size_t utf8ToByteOffset(std::string_view str, int64_t utf8Pos) {
  if (utf8Pos < 0) {
    return 0;
  }

  int64_t charsSeen = 0;
  for (std::size_t byteOffset = 0; byteOffset < str.size(); ++byteOffset) {
    // Continuation bytes have the bit pattern `10xxxxxx`; every other byte
    // starts a new character.
    const bool startsCharacter =
        (static_cast<unsigned char>(str[byteOffset]) & 0xC0) != 0x80;
    if (startsCharacter && charsSeen++ == utf8Pos) {
      return byteOffset;
    }
  }
  // The requested position is at or behind the end of the string.
  return str.size();
}
|
||
// String functions. | ||
[[maybe_unused]] auto strImpl = | ||
[](std::optional<std::string> s) -> IdOrLiteralOrIri { | ||
|
@@ -126,11 +149,7 @@ | |
|
||
// STRLEN | ||
[[maybe_unused]] auto strlen = [](std::string_view s) { | ||
// Count UTF-8 characters by skipping continuation bytes (those starting with | ||
// "10"). | ||
auto utf8Len = ql::ranges::count_if( | ||
s, [](char c) { return (static_cast<unsigned char>(c) & 0xC0) != 0x80; }); | ||
return Id::makeFromInt(utf8Len); | ||
return Id::makeFromInt(utf8Length(s)); | ||
}; | ||
using StrlenExpression = | ||
StringExpressionImpl<1, LiftStringFunction<decltype(strlen)>>; | ||
|
@@ -182,7 +201,7 @@ | |
}; | ||
|
||
public: | ||
IdOrLiteralOrIri operator()(std::optional<std::string> s, NumericValue start, | ||
IdOrLiteralOrIri operator()(std::optional<LiteralOrIri> s, NumericValue start, | ||
NumericValue length) const { | ||
if (!s.has_value() || std::holds_alternative<NotNumeric>(start) || | ||
std::holds_alternative<NotNumeric>(length)) { | ||
|
@@ -202,29 +221,82 @@ | |
if (startInt < 0) { | ||
lengthInt += startInt; | ||
} | ||
|
||
const auto& str = s.value(); | ||
const auto& str = asStringViewUnsafe(s.value().getContent()); | ||
std::size_t utf8len = utf8Length(str); | ||
// Clamp the number such that it is in `[0, str.size()]`. That way we end up | ||
// with valid arguments for the `getUTF8Substring` method below for both | ||
// with valid arguments for the `setSubstr` method below for both | ||
// starting position and length since all the other corner cases have been | ||
// dealt with above. | ||
auto clamp = [sz = str.size()](int64_t n) -> std::size_t { | ||
auto clamp = [utf8len](int64_t n) -> std::size_t { | ||
if (n < 0) { | ||
return 0; | ||
} | ||
if (static_cast<size_t>(n) > sz) { | ||
return sz; | ||
if (static_cast<size_t>(n) > utf8len) { | ||
return utf8len; | ||
} | ||
return static_cast<size_t>(n); | ||
}; | ||
|
||
return toLiteral( | ||
ad_utility::getUTF8Substring(str, clamp(startInt), clamp(lengthInt))); | ||
startInt = clamp(startInt); | ||
lengthInt = clamp(lengthInt); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think you have to use a clamped version of |
||
std::size_t startByteOffset = utf8ToByteOffset(str, startInt); | ||
std::size_t endByteOffset = utf8ToByteOffset(str, startInt + lengthInt); | ||
std::size_t byteLength = endByteOffset - startByteOffset; | ||
|
||
s.value().getLiteral().setSubstr(startByteOffset, byteLength); | ||
return std::move(s.value()); | ||
} | ||
}; | ||
|
||
// Implementation of the `SUBSTR` SPARQL function. It dynamically | ||
// selects the appropriate value getter for the first argument based on whether | ||
// it is a `STR()` expression (using | ||
// `LiteralOrIriValueGetterWithXsdStringFilter`) or another type (using | ||
// `LiteralOrIriValueGetter`). | ||
class SubstrExpressionImpl : public SparqlExpression { | ||
private: | ||
using ExpressionWithStr = | ||
NARY<3, FV<SubstrImpl, LiteralOrIriValueGetterWithXsdStringFilter, | ||
NumericValueGetter, NumericValueGetter>>; | ||
using ExpressionWithoutStr = | ||
NARY<3, FV<SubstrImpl, LiteralOrIriValueGetter, NumericValueGetter, | ||
NumericValueGetter>>; | ||
|
||
SparqlExpression::Ptr impl_; | ||
|
||
public: | ||
explicit SubstrExpressionImpl( | ||
SparqlExpression::Ptr child, | ||
std::same_as<SparqlExpression::Ptr> auto... children) | ||
requires(sizeof...(children) + 1 == 3) { | ||
AD_CORRECTNESS_CHECK(child != nullptr); | ||
|
||
if (child->isStrExpression()) { | ||
auto childrenOfStr = std::move(*child).moveChildrenOut(); | ||
AD_CORRECTNESS_CHECK(childrenOfStr.size() == 1); | ||
impl_ = std::make_unique<ExpressionWithStr>( | ||
std::move(childrenOfStr.at(0)), std::move(children)...); | ||
} else { | ||
impl_ = std::make_unique<ExpressionWithoutStr>(std::move(child), | ||
std::move(children)...); | ||
} | ||
} | ||
|
||
ExpressionResult evaluate(EvaluationContext* context) const override { | ||
return impl_->evaluate(context); | ||
} | ||
|
||
std::string getCacheKey(const VariableToColumnMap& varColMap) const override { | ||
return impl_->getCacheKey(varColMap); | ||
} | ||
|
||
private: | ||
std::span<SparqlExpression::Ptr> childrenImpl() override { | ||
return impl_->children(); | ||
} | ||
}; | ||
|
||
using SubstrExpression = | ||
StringExpressionImpl<3, SubstrImpl, NumericValueGetter, NumericValueGetter>; | ||
using SubstrExpression = SubstrExpressionImpl; | ||
|
||
// STRSTARTS | ||
[[maybe_unused]] auto strStartsImpl = [](std::string_view text, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this only for debugging? Or what is wrong with the
nullopt
return below?