Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: hybrid bench sort issues #209

Open
wants to merge 3 commits into
base: main-dev
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 80 additions & 16 deletions scripts/bench_sort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,44 +66,108 @@ void populate_from_file(std::string path, strings_t &strings,
while (strings.size() < limit && std::getline(f, s, ' ')) strings.push_back(s);
}

constexpr size_t offset_in_word = 0;
constexpr size_t offset_in_word = 4;

static idx_t hybrid_sort_cpp(strings_t const &strings, sz_u64_t *order) {

// What if we take up-to 4 first characters and the index
for (size_t i = 0; i != strings.size(); ++i)
std::memcpy((char *)&order[i] + offset_in_word, strings[order[i]].c_str(),
std::min<std::size_t>(strings[order[i]].size(), 4ul));
for (size_t i = 0; i != strings.size(); ++i) {
size_t index = order[i];

for (size_t j = 0; j < std::min<std::size_t>(strings[(sz_size_t)index].size(), 4ul); ++j) {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just a memcpy and byte order reversal, right? Should be faster without loops.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, do you have an example on how to do it without loops?

std::memcpy((char *)&order[i] + offset_in_word + 3 - j, strings[(sz_size_t)index].c_str() + j, 1ul);
}
}

std::sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) {
char *i_bytes = (char *)&i;
char *j_bytes = (char *)&j;
return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word);
});

for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul);
const auto extract_bytes = [](sz_u64_t v) -> uint32_t {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’m not sure if I understand the purpose of the following part. Can you please clarify?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once I had fixed the byte order issues, these functions were performing similar to their alternatives.

It turns out the final sort call was the slowest part, but since we are already partially sorting the strings, we only really need to sort each unsorted sub group (all strings with equal first 4 chars).

But to do this we need to keep the bytes in the orders whilst we find the start and end of each group to sort. The extract_bytes lambda just made it easier to get those first 4 string bytes.

I know it's not the prettiest/simplest code.

char *bytes = (char *)&v;
return *(uint32_t *)(bytes + offset_in_word);
};

if (strings.size() >= 2) {
size_t prev_index = 0;
uint64_t prev_bytes = extract_bytes(order[0]);

for (size_t i = 1; i < strings.size(); ++i) {
uint32_t bytes = extract_bytes(order[i]);
if (bytes != prev_bytes) {
std::sort(order + prev_index, order + i, [&](sz_u64_t i, sz_u64_t j) {
// Assumes: offset_in_word==4
sz_size_t i_index = i & 0xFFFF'FFFF;
sz_size_t j_index = j & 0xFFFF'FFFF;
return strings[i_index] < strings[j_index];
});
prev_index = i;
prev_bytes = bytes;
}
}

std::sort(order + prev_index, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) {
sz_size_t i_index = i & 0xFFFF'FFFF;
sz_size_t j_index = j & 0xFFFF'FFFF;
return strings[i_index] < strings[j_index];
});
}

std::sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { return strings[i] < strings[j]; });
for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul);

return strings.size();
}

static idx_t hybrid_stable_sort_cpp(strings_t const &strings, sz_u64_t *order) {

// What if we take up-to 4 first characters and the index
for (size_t i = 0; i != strings.size(); ++i)
std::memcpy((char *)&order[i] + offset_in_word, strings[order[i]].c_str(),
std::min<std::size_t>(strings[order[i]].size(), 4ull));
for (size_t i = 0; i != strings.size(); ++i) {
size_t index = order[i];

for (size_t j = 0; j < std::min<std::size_t>(strings[(sz_size_t)index].size(), 4ul); ++j) {
std::memcpy((char *)&order[i] + offset_in_word + 3 - j, strings[(sz_size_t)index].c_str() + j, 1ul);
}
}

std::stable_sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) {
char *i_bytes = (char *)&i;
char *j_bytes = (char *)&j;
return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word);
});

for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul);
const auto extract_bytes = [](sz_u64_t v) -> uint32_t {
char *bytes = (char *)&v;
return *(uint32_t *)(bytes + offset_in_word);
};

if (strings.size() >= 2) {
size_t prev_index = 0;
uint64_t prev_bytes = extract_bytes(order[0]);

for (size_t i = 1; i < strings.size(); ++i) {
uint32_t bytes = extract_bytes(order[i]);
if (bytes != prev_bytes) {
std::stable_sort(order + prev_index, order + i, [&](sz_u64_t i, sz_u64_t j) {
// Assumes: offset_in_word==4
sz_size_t i_index = i & 0xFFFF'FFFF;
sz_size_t j_index = j & 0xFFFF'FFFF;
return strings[i_index] < strings[j_index];
});
prev_index = i;
prev_bytes = bytes;
}
}

std::stable_sort(order + prev_index, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) {
sz_size_t i_index = i & 0xFFFF'FFFF;
sz_size_t j_index = j & 0xFFFF'FFFF;
return strings[i_index] < strings[j_index];
});
}

std::stable_sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { return strings[i] < strings[j]; });
for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul);

return strings.size();
}
Expand Down Expand Up @@ -196,7 +260,7 @@ int main(int argc, char const **argv) {
});
expect_sorted(strings, permute_new);

#if __linux__ && defined(_GNU_SOURCE)
#if __linux__ && defined(_GNU_SOURCE) && !defined(__BIONIC__)
bench_permute("qsort_r", strings, permute_new, [](strings_t const &strings, permute_t &permute) {
sz_sequence_t array;
array.order = permute.data();
Expand Down Expand Up @@ -232,12 +296,12 @@ int main(int argc, char const **argv) {
});
expect_sorted(strings, permute_base);

bench_permute(
"hybrid_stable_sort_cpp", strings, permute_base,
[](strings_t const &strings, permute_t &permute) { hybrid_stable_sort_cpp(strings, permute.data()); });
bench_permute("hybrid_stable_sort_cpp", strings, permute_new, [](strings_t const &strings, permute_t &permute) {
hybrid_stable_sort_cpp(strings, permute.data());
});
expect_sorted(strings, permute_new);
expect_same(permute_base, permute_new);
}

return 0;
}
}
Loading