Skip to content

Commit

Permalink
updated counting sort
Browse files Browse the repository at this point in the history
  • Loading branch information
gblelloch committed Feb 27, 2024
1 parent 0661148 commit ed00a4e
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 38 deletions.
9 changes: 5 additions & 4 deletions examples/counting_sort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ int main(int argc, char* argv[]) {
catch (...) { std::cout << usage << std::endl; return 1; }

long num_buckets = 256;
long num_partitions = std::min<long>(1000l, n / (256 * 16) + 1);

parlay::random_generator gen;
std::uniform_int_distribution<long> dis(0, num_buckets - 1);
Expand All @@ -32,11 +31,13 @@ int main(int argc, char* argv[]) {
return dis(r);});

parlay::internal::timer t("Time");
parlay::sequence<long> result;
parlay::sequence<long> result(n);
for (int i=0; i < 5; i++) {
result = data;
t.start();
result = counting_sort(data, data, num_buckets, num_partitions);
counting_sort(data.begin(), data.end(),
result.begin(),
data.begin(),
num_buckets);
t.next("counting_sort");
}

Expand Down
64 changes: 30 additions & 34 deletions examples/counting_sort.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,57 +5,53 @@
#include <parlay/sequence.h>
#include <parlay/primitives.h>

// **************************************************************
// A parallel counting sort
// Works well for a smallish (e.g. up to 256 or perhaps 1000) buckets.
// Input is a sequence of values, and a range of keys (of equal length).
// They could be the same, or the keys could be a field from the values.
// Must also specify the number of buckets and the number of paritions.
// **************************************************************

template <typename T, typename Keys>
parlay::sequence<T> counting_sort(const parlay::sequence<T>& in, const Keys& keys,
long num_buckets, long num_parts) {
long n = in.size();
template <typename InIt, typename OutIt, typename KeyIt>
parlay::sequence<int>
counting_sort(const InIt& begin, const InIt& end,
OutIt out, const KeyIt& keys,
long num_buckets) {
long n = end - begin;
long num_parts = n / (num_buckets * 64) + 1;
long part_size = (n - 1)/num_parts + 1;

// For each partition count number of each of the key values
auto all_counts = parlay::tabulate(num_parts, [&] (long i) {
// first count buckets within each partition
auto counts = parlay::sequence<int>::uninitialized(num_buckets * num_parts);
parlay::parallel_for(0, num_parts, [&] (long i) {
long start = i * part_size;
long end = std::min<long>(start + part_size, n);
parlay::sequence<int> local_counts(num_buckets, 0);
for (size_t j = start; j < end; j++) local_counts[keys[j]]++;
return local_counts;}, 1);

// need to transpose the counts for the scan
auto counts = parlay::sequence<int>::uninitialized(num_buckets * num_parts);
parlay::parallel_for(0, num_buckets, [&] (long i) {
for (size_t j = 0; j < num_parts; j++)
counts[i* num_parts + j] = all_counts[j][i];}, 1);
all_counts.clear();
for (int j = 0; j < num_buckets; j++) counts[i*num_buckets + j] = 0;
for (size_t j = start; j < end; j++) counts[i*num_buckets + keys[j]]++;
}, 1);

// transpose the counts if more than one part
parlay::sequence<int> trans_counts;
if (num_parts > 1) {
trans_counts = parlay::sequence<int>::uninitialized(num_buckets * num_parts);
parlay::parallel_for(0, num_buckets, [&] (long i) {
for (size_t j = 0; j < num_parts; j++)
trans_counts[i* num_parts + j] = counts[j * num_buckets + i];}, 1);
} else trans_counts = std::move(counts);

// scan for offsets for all buckets
parlay::scan_inplace(counts);
parlay::scan_inplace(trans_counts);

// the ouput sequence
auto out = parlay::sequence<T>::uninitialized(n);

// go back over partitions to place the input in final location
// go back over partitions to place in final location
parlay::parallel_for(0, num_parts, [&] (long i) {
long start = i * part_size;
long end = std::min<long>(start + part_size, n);
parlay::sequence<int> local_offsets(num_buckets, 0);
int local_offsets[num_buckets];

// transpose back
for (int j = 0; j < num_buckets; j++)
local_offsets[j] = counts[num_parts * j + i];
local_offsets[j] = trans_counts[num_parts * j + i];

// copy to output
for (size_t j = start; j < end; j++) {
int k = local_offsets[keys[j]]++;
// the following line helps performance
__builtin_prefetch (((char*) &out[k]) + 64);
out[k] = in[j];
out[k] = begin[j];
}}, 1);
return out;

return parlay::tabulate(num_buckets, [&] (long i) {
return trans_counts[i * num_parts];});
}

0 comments on commit ed00a4e

Please sign in to comment.