单机版的双调排序可以参考 http://blog.csdn.net/sunmenggmail/article/details/42869235
还是这张图片
基于cuda的双调排序的思路是:
为每一个元素提供一个线程,如果大于1024个元素,还是提供1024个线程,这是因为__syncthreads只能作为block内的线程同步,而一个block最多有1024个线程,如果元素个数大于1024则每个线程可能就要负责一个以上的元素的比较
就上图而言,一个矩形代表一次多线程的比较,那么此图仅需要6次比较,就可以有右边的输出。
#include <vector>
#include <algorithm>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
using namespace std;
#define CHECK_EQ1(a,b) do { \
if ((a) != (b)) { \
cout <<__FILE__<<" : "<< __LINE__<<" : check failed because "<<a<<"!="<<b<<endl;\
cout << cudaGetErrorString(a) <<endl;\
exit(1);\
}\
} while(0)
#define CUDA_CHECK(condition)\
do {\
cudaError_t error = condition;\
CHECK_EQ1(error, cudaSuccess);\
} while(0)
static __device__ __forceinline__ unsigned int __btflo(unsigned int word)
{
unsigned int ret;
asm volatile("bfind.u32 %0, %1;" : "=r"(ret) : "r"(word));
//return the index of highest non-zero bit in a word; for example, 00000110, return 2
return ret;
}
//for > 1024
__global__ void bigBinoticSort(unsigned int *arr, int len, unsigned int *buf) {
unsigned len2 = 1 << (__btflo(len-1u) + 1);//
unsigned int MAX = 0xffffffffu;
unsigned id = threadIdx.x;
if (id >= len2) return;
unsigned iter = blockDim.x;
for (unsigned i = id; i < len2; i += iter) {
if (i >= len) {
buf[i-len] = MAX;
}
}
__syncthreads();
int count = 0;
for (unsigned k = 2; k <= len2; k*=2) {
for (unsigned j = k >> 1; j > 0; j >>= 1) {
for (unsigned i = id; i < len2; i += iter) {
unsigned swapIdx = i ^ j;
if (swapIdx > i) {
unsigned myelem, other;
if (i < len) myelem = arr[i];
else myelem = buf[i-len];
if (swapIdx < len) other = arr[swapIdx];
else other = buf[swapIdx-len];
bool swap = false;
if ((i & k)==0 && myelem > other) swap = true;
if ((i & k) == k && myelem < other) swap = true;
if (swap) {
if (swapIdx < len) arr[swapIdx] = myelem;
else buf[swapIdx-len] = myelem;
if (i < len) arr[i] = other;
else buf[i-len] = other;
}
}
}
__syncthreads();
}
}
}
//for <= 1024
__global__ void binoticSort(unsigned int *arr, int len) {
__shared__ unsigned int buf[1024];
buf[threadIdx.x] = (threadIdx.x < len ? arr[threadIdx.x] : 0xffffffffu);
__syncthreads();
for (unsigned k = 2; k <= blockDim.x; k*=2) {//buid k elements ascend or descend
for (unsigned j = k >> 1; j > 0; j >>= 1) {//merge longer binotic into shorter binotic
unsigned swapIdx = threadIdx.x ^ j;
unsigned myelem = buf[threadIdx.x];
unsigned other = buf[swapIdx];
__syncthreads();
unsigned ascend = k * (swapIdx < threadIdx.x);
unsigned descend = k * (swapIdx > threadIdx.x);
//if I is front, swap is back; ascend = 0, descend = k
//if I is back, swap is front; ascend = k, descend = 0;
bool swap = false;
if ((threadIdx.x & k) == ascend) {
if (myelem > other) swap = true;
}
if ((threadIdx.x & k) == descend) {
if (myelem < other) swap = true;
}
if (swap) buf[swapIdx] = myelem;
__syncthreads();
}
}
if (threadIdx.x < len) arr[threadIdx.x] = buf[threadIdx.x];
}
template<class T>
inline void printVec(T *vec, int len) {
for (int i = 0; i < len; ++i) cout <<vec[i] << "\t";
cout << endl;
}
template<class T>
inline void printVecg(T *gvec, int len) {
T *vec = (T*)malloc(sizeof(T)*len);
CUDA_CHECK(cudaMemcpy(vec,gvec,sizeof(T)*len,cudaMemcpyDeviceToHost));
printVec(vec,len);
free(vec);
}
void lineSize(int N, int &nblocks, int &nthreads) {
if (N <= 1024) {
nthreads = (N + 32 - 1)/32*32;//
}
else {
nblocks = (N + 1024 -1)/1024;
}
}
bool validate(unsigned *gvec, int len) {
unsigned *vec = (unsigned*)malloc(sizeof(unsigned)*len);
CUDA_CHECK(cudaMemcpy(vec,gvec,sizeof(unsigned)*len,cudaMemcpyDeviceToHost));
for(int i = 1; i < len; ++i) {
if (vec[i] <= vec[i-1]) return false;
}
return true;
}
inline int roundUpPower2(int v) {
v--;
v |= v >> 1;
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
v++;
return v;
}
int main(int argc, char *argv[]) {
if (argc != 2) {
cout << "len \n";
return;
}
int len = atoi(argv[1]);
unsigned int *arr = (unsigned int*)malloc(sizeof(unsigned int)*len);
for (int i = 0; i < len; ++i) arr[i] = i;
srand((unsigned int)time(NULL));
for (int i = len; i >= 2; --i) {
int j = rand() % i;
swap(arr[i-1], arr[j]);
}
unsigned* debug;
CUDA_CHECK(cudaMalloc((void**)&debug, sizeof(unsigned)*1000));
unsigned int* darr, *buf;
CUDA_CHECK(cudaMalloc((void**)&darr, sizeof(unsigned int)*len));
CUDA_CHECK(cudaMalloc((void**)&buf, sizeof(unsigned int)*len));
CUDA_CHECK(cudaMemcpy(darr, arr, sizeof(unsigned int)*len, cudaMemcpyHostToDevice));
bigBinoticSort<<<1,1024>>>(darr,len, buf);
CUDA_CHECK(cudaPeekAtLastError());
CUDA_CHECK(cudaDeviceSynchronize());
if (validate(darr, len))
cout << "yes\n";
else
cout << "no\n";
return 1;
}
算法有两个双调排序实现,一个用于小于1024个元素,用到了共享内存加快访问速度,但是如果真要排序1024以下的元素,建议还是用cpu版本的快排吧,gpu的在速度上并没有明显的优势,甚至还比cpu慢
如果大于1024元素,就采用另一种方法。这种方法的缺点也是很明显的,就是不管再多的元素,只能用一个block进行计算,而一个block最多只能用1024个线程,估计在一万个元素以内的话,这个方法是gpu上最快的。
经过本人测试,包括thrust的sort(基数排序), 只有元素数量超过5000个,gpu上的排序算法才有明显的优势。10万左右的元素,gpu上的排序算法比cpu有一百倍的提速。
下面会介绍在gpu上进行快速排序。gpu快速排序可以处理非常大的数据,但是会有递归深度的限制,当超过递归深度时,就可以调用上面所讲的双调排序进行处理。测试表明,速度比thrust还是快一点
gpu上的快排主要参考样例 NVIDIA_CUDA-6.5_Samples/6_Advanced/cdpAdvancedQuicksort
快排只处理大于1024个元素的数组,然后将其分隔为左右两个子数组,如果子数组长度大于1024则继续动态递归调用快排,如果小于1024则动态调用双调排序。如果快排的递归深度已经超过最大递归深度(cuda最大嵌套深度64,但是还受限于每一级所使用的内存大小),则直接调用双调排序。
这段程序的最精彩的地方在于分隔函数
将数组按照warp大小进行分隔,每个warp处理32个元素,通过全局的atomicAdd函数,分别获得warp内的小于和大于pivot数在数组的偏移地址,注意在同一个warp内,这个偏移地址是一样的,然后每个线程将自己的元素放到偏移地址,这样就完成了分割
需要注意的是,这个快排不是in-place的,又涉及到递归调用,所以还得处理原数组和缓冲区的调换
由于cuda没有显式的锁,此方法采用了一种特殊的循环队列,本人认为在极端情况下,可能会出现问题
(这里的代码有错,没有处理原数组和缓冲区的调换,只是帮助理解。正确的代码请参考Samples里的)
#define QSORT_BLOCKSIZE_SHIFT 9
#define QSORT_BLOCKSIZE (1 << QSORT_BLOCKSIZE_SHIFT)
#define BITONICSORT_LEN 1024 // Must be power of 2!
#define QSORT_MAXDEPTH 16 // Will force final bitonic stage at depth QSORT_MAXDEPTH+1
#define QSORT_STACK_ELEMS 1*1024*1024 // One million stack elements is a HUGE number.
typedef struct __align__(128) qsortAtomicData_t
{
volatile unsigned int lt_offset; // Current output offset for <pivot
volatile unsigned int gt_offset; // Current output offset for >pivot
volatile unsigned int sorted_count; // Total count sorted, for deciding when to launch next wave
volatile unsigned int index; // Ringbuf tracking index. Can be ignored if not using ringbuf.
} qsortAtomicData;
////////////////////////////////////////////////////////////////////////////////
// A ring-buffer for rapid stack allocation
////////////////////////////////////////////////////////////////////////////////
typedef struct qsortRingbuf_t
{
volatile unsigned int head; //1 // Head pointer - we allocate from here
volatile unsigned int tail; //0 // Tail pointer - indicates last still-in-use element
volatile unsigned int count;//0 // Total count allocated
volatile unsigned int max; //0 // Max index allocated
unsigned int stacksize; // // Wrap-around size of buffer (must be power of 2)
volatile void *stackbase; // Pointer to the stack we're allocating from
} qsortRingbuf;
/*
for cuda has no lock, so we have to do like this:
if alloc , ++head
if free , ++tail
so [tail, head) contains alloced chunks; head point to the next free chunk
count record the number of chunks had free
we have n chunks, but the index of a chunk is increase when re-alloc
max record the maximum index of the free chunks
only if the chunks before max are all free, aka, max == count, we can alter tail value
*/
template<class T>
static __device__ void ringbufFree(qsortRingbuf *ringbuf, T *data) {
unsigned index = data->index;
unsigned count = atomicAdd((unsigned*)&(ringbuf->count), 1) + 1;
unsigned max = atomicMax((unsigned*)&(ringbuf->max), index + 1);
if (max < (index + 1)) max = index + 1;
if (max == count) {
atomicMax((unsigned*)&(ringbuf->tail), count);
}
}
template<class T>
static __device__ T* ringbufAlloc(qsortRingbuf *ringbuf) {
unsigned int loop = 10000;
while (((ringbuf->head - ringbuf->tail) >= ringbuf->stacksize) && (loop-- > 0));
if (loop == 0) return NULL;
unsigned index = atomicAdd((unsigned*)&ringbuf->head, 1);
T *ret = (T*)(ringbuf->stackbase) + (index & (ringbuf->stacksize - 1));
ret->index = index;
return ret;
}
__global__ void qsort_warp(unsigned *indata,
unsigned *outdata,
unsigned int offset,//0
unsigned int len,//
qsortAtomicData *atomicData,//stack
qsortRingbuf *atomicDataStack,//ringbuf
unsigned int source_is_indata,//true
unsigned int depth)
{
//printf("depth = %d", depth);
// Find my data offset, based on warp ID
unsigned int thread_id = threadIdx.x + (blockIdx.x << QSORT_BLOCKSIZE_SHIFT);
//unsigned int warp_id = threadIdx.x >> 5; // Used for debug only
unsigned int lane_id = threadIdx.x & (warpSize-1);// %32
// Exit if I'm outside the range of sort to be done
if (thread_id >= len)
return;
//
// First part of the algorithm. Each warp counts the number of elements that are
// greater/less than the pivot.
//
// When a warp knows its count, it updates an atomic counter.
//
// Read in the data and the pivot. Arbitrary pivot selection for now.
unsigned pivot = indata[offset + len/2];
unsigned data = indata[offset + thread_id];
// Count how many are <= and how many are > pivot.
// If all are <= pivot then we adjust the comparison
// because otherwise the sort will move nothing and
// we'll iterate forever.
unsigned int greater = (data > pivot);
unsigned int gt_mask = __ballot(greater);//Evaluate predicate for all active threads of the warp and return an integer whose Nth bit is set if and only if predicate evaluates to non-zero for the Nth thread of the warp and the Nth thread is active.
if (gt_mask == 0) {
greater = (data >= pivot);
gt_mask = __ballot(greater);
}
unsigned lt_mask = __ballot(!greater);
unsigned gt_count = __popc(gt_mask);//count number of 1 in a warp;
unsigned lt_count = __popc(lt_mask);
//only thread 0 in warp calc
//find 2 new positions for this warp
unsigned lt_oft, gt_oft;
if (lane_id == 0) {
if (lt_count > 0)
lt_oft = atomicAdd((unsigned*)&atomicData->lt_offset, lt_count);//atomicAdd return old value, not the newer//all the warps will syn call this
if (gt_count > 0)
gt_oft = len - (atomicAdd((unsigned*) &atomicData->gt_offset, gt_count) + gt_count);
//printf("depth = %d\n", depth);
//printf("pivot = %u\n", pivot);
//printf("lt_count %u lt_oft %u gt_count %u gt_oft %u atomicDataGtOffset %u\n", lt_count,lt_oft, gt_count,gt_oft, atomicData->gt_offset);
}
lt_oft = __shfl((int)lt_oft, 0);
gt_oft = __shfl((int)gt_oft, 0);//Everyone pulls the offsets from lane 0
__syncthreads();
// Now compute my own personal offset within this. I need to know how many
// threads with a lane ID less than mine are going to write to the same buffer
// as me. We can use popc to implement a single-operation warp scan in this case.
unsigned lane_mask_lt;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt));//bits set in positions less than the thread's lane number the warp
unsigned my_mask = greater ? gt_mask : lt_mask;
unsigned my_oft = __popc(my_mask & lane_mask_lt);//
//move data
my_oft += greater ? gt_oft : lt_oft;
outdata[offset + my_oft] = data;
__syncthreads();
//if (lane_id == 0) printf("pivot = %d", pivot);
if (lane_id == 0) {
/*if (blockIdx.x == 0) {
printf("depth = %d\n", depth);
for (int i = 0; i < len; ++i)
printf("%u ", outdata[offset+i]);
printf("\n");
}*/
unsigned mycount = lt_count + gt_count;
//we are the last warp
if (atomicAdd((unsigned*)&atomicData->sorted_count, mycount) + mycount == len) {
unsigned lt_len = atomicData->lt_offset;
unsigned gt_len = atomicData->gt_offset;
cudaStream_t lstream, rstream;
cudaStreamCreateWithFlags(&lstream, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&rstream, cudaStreamNonBlocking);
ringbufFree<qsortAtomicData>(atomicDataStack, atomicData);
if (lt_len == 0) return;
//////////////////////////// left
if (lt_len > BITONICSORT_LEN) {
if (depth >= QSORT_MAXDEPTH) {
bigBinoticSort<<<1, BITONICSORT_LEN,0, rstream>>>(outdata + offset, lt_len, indata + offset);
}
else {
if ((atomicData = ringbufAlloc<qsortAtomicData>(atomicDataStack)) == NULL)
printf("Stack-allocation error. Failing left child launch.\n");
else {
atomicData->lt_offset = atomicData->gt_offset = atomicData->sorted_count = 0;
unsigned int numblocks = (unsigned int)(lt_len+(QSORT_BLOCKSIZE-1)) >> QSORT_BLOCKSIZE_SHIFT;
qsort_warp<<< numblocks, QSORT_BLOCKSIZE, 0, lstream >>>(outdata, indata, offset, lt_len, atomicData, atomicDataStack, true, depth+1);
}
}
}
else if (lt_len > 1) {
unsigned int bitonic_len = 1 << (__btflo(lt_len-1U)+1);
binoticSort<<< 1, bitonic_len, 0, lstream >>>(outdata + offset,lt_len);
}
////////////////////////// right
if (gt_len > BITONICSORT_LEN) {
if (depth >= QSORT_MAXDEPTH)
bigBinoticSort<<<1, BITONICSORT_LEN,0, rstream>>>(outdata + offset + lt_len, gt_len, indata + offset + lt_len);
else {
if ((atomicData = ringbufAlloc<qsortAtomicData>(atomicDataStack)) == NULL)
printf("Stack allocation error! Failing right-side launch.\n");
else {
atomicData->lt_offset = atomicData->gt_offset = atomicData->sorted_count = 0;
unsigned int numblocks = (unsigned int)(gt_len+(QSORT_BLOCKSIZE-1)) >> QSORT_BLOCKSIZE_SHIFT;
qsort_warp<<< numblocks, QSORT_BLOCKSIZE, 0, rstream >>>(outdata, indata, offset+lt_len, gt_len, atomicData, atomicDataStack, true, depth+1);
}
}
}
else if (gt_len > 1) {
unsigned int bitonic_len = 1 << (__btflo(gt_len-1U)+1);
binoticSort<<< 1, bitonic_len, 0, rstream >>>(outdata + offset + lt_len,gt_len);
}
}
}
}
void runqsort(unsigned *gpudata, unsigned *scratchdata, unsigned int count, cudaStream_t stream) {
unsigned int stacksize = QSORT_STACK_ELEMS;//1*1024*1024
// This is the stack, for atomic tracking of each sort's status
qsortAtomicData *gpustack;
CUDA_CHECK(cudaMalloc((void **)&gpustack, stacksize * sizeof(qsortAtomicData)));
CUDA_CHECK(cudaMemset(gpustack, 0, sizeof(qsortAtomicData))); // Only need set first entry to 0
// Create the memory ringbuffer used for handling the stack.
// Initialise everything to where it needs to be.
qsortRingbuf buf;
qsortRingbuf *ringbuf;
CUDA_CHECK(cudaMalloc((void **)&ringbuf, sizeof(qsortRingbuf)));
buf.head = 1; // We start with one allocation
buf.tail = 0;
buf.count = 0;
buf.max = 0;
buf.stacksize = stacksize;
buf.stackbase = gpustack;
CUDA_CHECK(cudaMemcpy(ringbuf, &buf, sizeof(buf), cudaMemcpyHostToDevice));
if (count > BITONICSORT_LEN)//1024
{//QSORT_BLOCKSIZE = 2^9 = 512
unsigned int numblocks = (unsigned int)(count+(QSORT_BLOCKSIZE-1)) >> QSORT_BLOCKSIZE_SHIFT;
qsort_warp<<< numblocks, QSORT_BLOCKSIZE, 0, stream >>>(gpudata, scratchdata, 0U, count, gpustack, ringbuf, true, 0);
}
else
{
binoticSort<<< 1, BITONICSORT_LEN >>>(gpudata, count);
CUDA_CHECK(cudaMemcpy(scratchdata, gpudata, sizeof(unsigned)*count, cudaMemcpyDeviceToDevice));
}
cudaDeviceSynchronize();
}

本文介绍了在GPU上实现双调排序和快速排序的方法。对于小于1024个元素,使用共享内存加速的双调排序,但速度并不占优;大于1024个元素时,采用一种限制于单个block的排序方法。GPU快排在元素数量超过5000时表现出优势,递归深度过深则转用双调排序。程序重点在于分隔函数,使用warp处理元素并利用atomicAdd进行偏移地址计算。由于不是原地排序且涉及递归,需要特殊处理数组和缓冲区的交换。代码示例参考NVIDIA CUDA Samples。

1881

被折叠的 条评论
为什么被折叠?



