gpu排序

最新推荐文章于 2026-04-16 09:51:11 发布

原创最新推荐文章于 2026-04-16 09:51:11 发布 · 8k 阅读

2 ·

本内容遵循CC 4.0 BY-SA版权协议

算法同时被 2 个专栏收录

376 篇文章

订阅专栏

gpu

10 篇文章

订阅专栏

本文介绍了在GPU上实现双调排序和快速排序的方法。对于小于1024个元素，使用共享内存加速的双调排序，但速度并不占优；大于1024个元素时，采用一种限制于单个block的排序方法。GPU快排在元素数量超过5000时表现出优势，递归深度过深则转用双调排序。程序重点在于分隔函数，使用warp处理元素并利用atomicAdd进行偏移地址计算。由于不是原地排序且涉及递归，需要特殊处理数组和缓冲区的交换。代码示例参考NVIDIA CUDA Samples。

单机版的双调排序可以参考 http://blog.csdn.net/sunmenggmail/article/details/42869235

还是这张图片

基于cuda的双调排序的思路是：

为每一个元素提供一个线程，如果大于1024个元素，还是提供1024个线程，这是因为__syncthreads只能作为block内的线程同步，而一个block最多有1024个线程，如果元素个数大于1024则每个线程可能就要负责一个以上的元素的比较

就上图而言，一个矩形代表一次多线程的比较，那么此图仅需要6次比较，就可以有右边的输出。

#include <vector>
#include <algorithm>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>

using namespace std;

#define CHECK_EQ1(a,b) do { \
	if ((a) != (b)) { \ 
		cout <<__FILE__<<" : "<< __LINE__<<" : check failed because "<<a<<"!="<<b<<endl;\
		cout << cudaGetErrorString(a) <<endl;\
		exit(1);\
	}\
} while(0)

#define CUDA_CHECK(condition)\
do {\
	cudaError_t error = condition;\
	CHECK_EQ1(error, cudaSuccess);\
} while(0)

static __device__ __forceinline__ unsigned int __btflo(unsigned int word)
{
    unsigned int ret;
    asm volatile("bfind.u32 %0, %1;" : "=r"(ret) : "r"(word));
	//return the index of highest non-zero bit in a word; for example, 00000110, return 2
    return ret;
}

//for > 1024
__global__ void bigBinoticSort(unsigned int *arr, int len, unsigned int *buf) {
	unsigned len2 = 1 << (__btflo(len-1u) + 1);//
	
	unsigned int MAX = 0xffffffffu;
	unsigned id = threadIdx.x;
	if (id >= len2) return;
	unsigned iter = blockDim.x;
	for (unsigned i = id; i < len2; i += iter) {
		if (i >= len) {
			buf[i-len] = MAX;
		}
	}
	
	__syncthreads();
	
	int count = 0;
	for (unsigned k = 2; k <= len2; k*=2) {
		for (unsigned j = k >> 1; j > 0; j >>= 1) {
			for (unsigned i = id; i < len2; i += iter) {
				unsigned swapIdx = i ^ j;
				
				if (swapIdx > i) {
					unsigned myelem, other;
					if (i < len) myelem = arr[i];
					else myelem = buf[i-len];
					
					if (swapIdx < len) other = arr[swapIdx];
					else other = buf[swapIdx-len];
					
					bool swap = false;
					
					if ((i & k)==0 && myelem > other) swap = true;
					if ((i & k) == k && myelem < other) swap = true;
					
					if (swap) {
						if (swapIdx < len) arr[swapIdx] = myelem; 
						else buf[swapIdx-len] = myelem;
						
						if (i < len) arr[i] = other;
						else buf[i-len] = other;
					}
				}
			}
			__syncthreads();
			
			
		}
	}
}

//for <= 1024
__global__ void binoticSort(unsigned int *arr, int len) {
	__shared__  unsigned int buf[1024];
	buf[threadIdx.x] = (threadIdx.x < len ? arr[threadIdx.x] : 0xffffffffu);
	__syncthreads();
	
	for (unsigned k = 2; k <= blockDim.x; k*=2) {//buid k elements ascend or descend
		for (unsigned j = k >> 1; j > 0; j >>= 1) {//merge longer binotic into shorter binotic
			unsigned swapIdx = threadIdx.x ^ j;
			unsigned myelem = buf[threadIdx.x];
			unsigned other = buf[swapIdx];
			
			__syncthreads();
			
			unsigned ascend = k * (swapIdx < threadIdx.x);
			unsigned descend = k * (swapIdx > threadIdx.x);
			//if I is front, swap is back; ascend = 0, descend = k
			//if I is back, swap is front; ascend = k, descend = 0;
			bool swap = false;
			if ((threadIdx.x & k) == ascend) {
				if (myelem > other) swap = true;
			}
			if ((threadIdx.x & k) == descend) {
				if (myelem < other) swap = true;
			}
			
			if (swap) buf[swapIdx] = myelem;
			__syncthreads();
		}
	}
	
	if (threadIdx.x < len) arr[threadIdx.x] = buf[threadIdx.x];
}

template<class T>
inline void printVec(T *vec, int len) {
	for (int i = 0; i < len; ++i) cout <<vec[i] << "\t";
	cout << endl;
}

template<class T>
inline void printVecg(T *gvec, int len) {
	T *vec = (T*)malloc(sizeof(T)*len);
	CUDA_CHECK(cudaMemcpy(vec,gvec,sizeof(T)*len,cudaMemcpyDeviceToHost));
	printVec(vec,len);
	free(vec);
}

void lineSize(int N, int &nblocks, int &nthreads) {
	if (N <= 1024) {
		nthreads = (N + 32 - 1)/32*32;//
	}
	else {
		nblocks = (N + 1024 -1)/1024;
	}
}

bool validate(unsigned *gvec, int len) {
	unsigned *vec = (unsigned*)malloc(sizeof(unsigned)*len);
	CUDA_CHECK(cudaMemcpy(vec,gvec,sizeof(unsigned)*len,cudaMemcpyDeviceToHost));
	for(int i = 1; i < len; ++i) {
		if (vec[i] <= vec[i-1]) return false;
	}
	return true;
}

inline int roundUpPower2(int v) {
	v--;
	v |= v >> 1;
	v |= v >> 2;
	v |= v >> 4;
	v |= v >> 8;
	v |= v >> 16;

	v++;
	return v;
}



int main(int argc, char *argv[]) {
	if (argc != 2) {
		cout << "len \n";
		return;
	}
	int len = atoi(argv[1]);
	unsigned int *arr = (unsigned int*)malloc(sizeof(unsigned int)*len);
	for (int i = 0; i < len; ++i) arr[i] = i;
	srand((unsigned int)time(NULL));
	for (int i = len; i >= 2; --i) {
		int j = rand() % i;
		swap(arr[i-1], arr[j]);
	}
	
	
	
	unsigned* debug;
	CUDA_CHECK(cudaMalloc((void**)&debug, sizeof(unsigned)*1000));
	
	unsigned int* darr, *buf;
	CUDA_CHECK(cudaMalloc((void**)&darr, sizeof(unsigned int)*len));
	CUDA_CHECK(cudaMalloc((void**)&buf, sizeof(unsigned int)*len));
	CUDA_CHECK(cudaMemcpy(darr, arr, sizeof(unsigned int)*len, cudaMemcpyHostToDevice));
	
	
	bigBinoticSort<<<1,1024>>>(darr,len, buf);
	CUDA_CHECK(cudaPeekAtLastError());
	CUDA_CHECK(cudaDeviceSynchronize());
	
	if (validate(darr, len))
		cout << "yes\n";
	else
		cout << "no\n";
	
	
	return 1;
}

算法有两个双调排序实现，一个用于小于1024个元素，用到了共享内存加快访问速度，但是如果真要排序1024以下的元素，建议还是用cpu版本的快排吧，gpu的在速度上并没有明显的优势，甚至还比cpu慢

如果大于1024元素，就采用另一种方法。这种方法的缺点也是很明显的，就是不管再多的元素，只能用一个block进行计算，而一个block最多只能用1024个线程，估计在一万个元素以内的话，这个方法是gpu上最快的。

经过本人测试，包括thrust的sort(基数排序)，只有元素数量超过5000个，gpu上的排序算法才有明显的优势。10万左右的元素，gpu上的排序算法比cpu有一百倍的提速。

下面会介绍在gpu上进行快速排序。gpu快速排序可以处理非常大的数据，但是会有递归深度的限制，当超过递归深度时，就可以调用上面所讲的双调排序进行处理。测试表明，速度比thrust还是快一点

gpu上的快排主要参考样例 NVIDIA_CUDA-6.5_Samples/6_Advanced/cdpAdvancedQuicksort

快排只处理大于1024个元素的数组，然后将其分隔为左右两个子数组，如果子数组长度大于1024则继续动态递归调用快排，如果小于1024则动态调用双调排序。如果快排的递归深度已经超过最大递归深度（cuda最大嵌套深度64，但是还受限于每一级所使用的内存大小），则直接调用双调排序。

这段程序的最精彩的地方在于分隔函数

将数组按照warp大小进行分隔，每个warp处理32个元素，通过全局的atomicAdd函数，分别获得warp内的小于和大于pivot数在数组的偏移地址，注意在同一个warp内，这个偏移地址是一样的，然后每个线程将自己的元素放到偏移地址，这样就完成了分割

需要注意的是，这个快排不是in-place的，又涉及到递归调用，所以还得处理原数组和缓冲区的调换

由于cuda没有显式的锁，此方法采用了一种特殊的循环队列，本人认为在极端情况下，可能会出现问题

（这里的代码有错，没有处理原数组和缓冲区的调换，只是帮助理解。正确的代码请参考Samples里的）

#define QSORT_BLOCKSIZE_SHIFT   9
#define QSORT_BLOCKSIZE         (1 << QSORT_BLOCKSIZE_SHIFT)
#define BITONICSORT_LEN         1024            // Must be power of 2!
#define QSORT_MAXDEPTH          16              // Will force final bitonic stage at depth QSORT_MAXDEPTH+1
#define QSORT_STACK_ELEMS   1*1024*1024 // One million stack elements is a HUGE number.

typedef struct __align__(128) qsortAtomicData_t
{
    volatile unsigned int lt_offset;    // Current output offset for <pivot
    volatile unsigned int gt_offset;    // Current output offset for >pivot
    volatile unsigned int sorted_count; // Total count sorted, for deciding when to launch next wave
    volatile unsigned int index;        // Ringbuf tracking index. Can be ignored if not using ringbuf.
} qsortAtomicData;

////////////////////////////////////////////////////////////////////////////////
// A ring-buffer for rapid stack allocation
////////////////////////////////////////////////////////////////////////////////
typedef struct qsortRingbuf_t
{
    volatile unsigned int head; //1        // Head pointer - we allocate from here
    volatile unsigned int tail; //0        // Tail pointer - indicates last still-in-use element
    volatile unsigned int count;//0        // Total count allocated
    volatile unsigned int max;  //0        // Max index allocated
    unsigned int stacksize;  //           // Wrap-around size of buffer (must be power of 2)
    volatile void *stackbase;           // Pointer to the stack we're allocating from
} qsortRingbuf;


/*
for cuda has no lock, so we have to do like this:
if alloc , ++head
if free , ++tail
so [tail, head) contains alloced chunks; head point to the next free chunk
count record the number of chunks had free
we have n chunks, but the index of a chunk is increase when re-alloc
max record the maximum index of the free chunks
only if the chunks before max are all free, aka, max == count, we can alter tail value 
*/
template<class T>
static __device__ void ringbufFree(qsortRingbuf *ringbuf, T *data) {
	unsigned index = data->index;
	unsigned count = atomicAdd((unsigned*)&(ringbuf->count), 1) + 1;
	unsigned max = atomicMax((unsigned*)&(ringbuf->max), index + 1);
	
	if (max < (index + 1)) max = index + 1;
	if (max == count) {
		atomicMax((unsigned*)&(ringbuf->tail), count);
	}
}

template<class T>
static __device__ T* ringbufAlloc(qsortRingbuf *ringbuf) {
	unsigned int loop = 10000;
	while (((ringbuf->head - ringbuf->tail) >= ringbuf->stacksize) && (loop-- > 0));
	if (loop == 0) return NULL;
	
	unsigned index = atomicAdd((unsigned*)&ringbuf->head, 1);
	T *ret = (T*)(ringbuf->stackbase) + (index & (ringbuf->stacksize - 1));
	ret->index = index;
	
	return ret;
}



__global__ void qsort_warp(unsigned *indata,
                           unsigned *outdata,
                           unsigned int offset,//0
                           unsigned int len,//
                           qsortAtomicData *atomicData,//stack
                           qsortRingbuf *atomicDataStack,//ringbuf
                           unsigned int source_is_indata,//true
                           unsigned int depth) 
{
	//printf("depth = %d", depth);
	    // Find my data offset, based on warp ID
    unsigned int thread_id = threadIdx.x + (blockIdx.x << QSORT_BLOCKSIZE_SHIFT);
    //unsigned int warp_id = threadIdx.x >> 5;   // Used for debug only
    unsigned int lane_id = threadIdx.x & (warpSize-1);// %32

    // Exit if I'm outside the range of sort to be done
    if (thread_id >= len)
        return;

    //
    // First part of the algorithm. Each warp counts the number of elements that are
    // greater/less than the pivot.
    //
    // When a warp knows its count, it updates an atomic counter.
    //

    // Read in the data and the pivot. Arbitrary pivot selection for now.
    unsigned pivot = indata[offset + len/2];
    unsigned data  = indata[offset + thread_id];

    // Count how many are <= and how many are > pivot.
    // If all are <= pivot then we adjust the comparison
    // because otherwise the sort will move nothing and
    // we'll iterate forever.
    unsigned int greater = (data > pivot);
    unsigned int gt_mask = __ballot(greater);//Evaluate predicate for all active threads of the warp and return an integer whose Nth bit is set if and only if predicate evaluates to non-zero for the Nth thread of the warp and the Nth thread is active.
	
	if (gt_mask == 0) {
		greater = (data >= pivot);
		gt_mask = __ballot(greater);
	}
	
	unsigned lt_mask = __ballot(!greater);
	unsigned gt_count = __popc(gt_mask);//count number of 1 in a warp;
	unsigned lt_count = __popc(lt_mask);
	
	//only thread 0 in warp calc
	//find 2 new positions for this warp
	unsigned lt_oft, gt_oft;
	if (lane_id == 0) {
		if (lt_count > 0)
			lt_oft = atomicAdd((unsigned*)&atomicData->lt_offset, lt_count);//atomicAdd return old value, not the newer//all the warps will syn call this
		if (gt_count > 0)
			gt_oft = len - (atomicAdd((unsigned*) &atomicData->gt_offset, gt_count) + gt_count);
		
		//printf("depth = %d\n", depth);
		//printf("pivot = %u\n", pivot);
		//printf("lt_count %u lt_oft %u gt_count %u gt_oft %u  atomicDataGtOffset %u\n", lt_count,lt_oft, gt_count,gt_oft, atomicData->gt_offset);
	}
	
	lt_oft = __shfl((int)lt_oft, 0);
	gt_oft = __shfl((int)gt_oft, 0);//Everyone pulls the offsets from lane 0
	
	__syncthreads();
	 // Now compute my own personal offset within this. I need to know how many
    // threads with a lane ID less than mine are going to write to the same buffer
    // as me. We can use popc to implement a single-operation warp scan in this case.
    unsigned lane_mask_lt;
    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt));//bits set in positions less than the thread's lane number the warp
	
	unsigned my_mask = greater ? gt_mask : lt_mask;
	unsigned my_oft = __popc(my_mask & lane_mask_lt);//
	
	//move data
	my_oft += greater ? gt_oft : lt_oft;
	outdata[offset + my_oft] = data;
	__syncthreads();
	
	//if (lane_id == 0) printf("pivot = %d", pivot);
	if (lane_id == 0) {
		/*if (blockIdx.x == 0) {
		printf("depth = %d\n", depth);
		for (int i = 0; i < len; ++i)
			printf("%u ", outdata[offset+i]);
		printf("\n");
		}*/
		
		
		unsigned mycount = lt_count + gt_count;
		//we are the last warp 
		if (atomicAdd((unsigned*)&atomicData->sorted_count, mycount) + mycount == len) {
			unsigned lt_len = atomicData->lt_offset;
			unsigned gt_len = atomicData->gt_offset;
				
			cudaStream_t lstream, rstream;
            cudaStreamCreateWithFlags(&lstream, cudaStreamNonBlocking);
            cudaStreamCreateWithFlags(&rstream, cudaStreamNonBlocking);
			
			ringbufFree<qsortAtomicData>(atomicDataStack, atomicData);
			
			if (lt_len == 0) return;
			//////////////////////////// left
			if (lt_len > BITONICSORT_LEN) {
				if (depth >= QSORT_MAXDEPTH) {
					bigBinoticSort<<<1, BITONICSORT_LEN,0, rstream>>>(outdata + offset, lt_len, indata + offset);
				}
				else {
					
					
					if ((atomicData = ringbufAlloc<qsortAtomicData>(atomicDataStack)) == NULL)
                        printf("Stack-allocation error. Failing left child launch.\n");
					else {
						atomicData->lt_offset = atomicData->gt_offset = atomicData->sorted_count = 0;
                        unsigned int numblocks = (unsigned int)(lt_len+(QSORT_BLOCKSIZE-1)) >> QSORT_BLOCKSIZE_SHIFT;
                        qsort_warp<<< numblocks, QSORT_BLOCKSIZE, 0, lstream >>>(outdata, indata, offset, lt_len, atomicData, atomicDataStack, true, depth+1);
					}
					
					
				}
			}
			else if (lt_len > 1) {
				unsigned int bitonic_len = 1 << (__btflo(lt_len-1U)+1);
                binoticSort<<< 1, bitonic_len, 0, lstream >>>(outdata + offset,lt_len);
			}
			////////////////////////// right
			if (gt_len > BITONICSORT_LEN) {
				if (depth >= QSORT_MAXDEPTH)
					bigBinoticSort<<<1, BITONICSORT_LEN,0, rstream>>>(outdata + offset + lt_len, gt_len, indata + offset + lt_len);
				else {
					
					if ((atomicData = ringbufAlloc<qsortAtomicData>(atomicDataStack)) == NULL)
                        printf("Stack allocation error! Failing right-side launch.\n");
					else {
						atomicData->lt_offset = atomicData->gt_offset = atomicData->sorted_count = 0;
                        unsigned int numblocks = (unsigned int)(gt_len+(QSORT_BLOCKSIZE-1)) >> QSORT_BLOCKSIZE_SHIFT;
                        qsort_warp<<< numblocks, QSORT_BLOCKSIZE, 0, rstream >>>(outdata, indata, offset+lt_len, gt_len, atomicData, atomicDataStack, true, depth+1);
					}
					
				}
			}
			else if (gt_len > 1) {
				unsigned int bitonic_len = 1 << (__btflo(gt_len-1U)+1);
                binoticSort<<< 1, bitonic_len, 0, rstream >>>(outdata + offset + lt_len,gt_len);
			}
			
		}
	}
}

void runqsort(unsigned *gpudata, unsigned *scratchdata, unsigned int count, cudaStream_t stream) {
	unsigned int stacksize = QSORT_STACK_ELEMS;//1*1024*1024

    // This is the stack, for atomic tracking of each sort's status
    qsortAtomicData *gpustack;
    CUDA_CHECK(cudaMalloc((void **)&gpustack, stacksize * sizeof(qsortAtomicData)));
    CUDA_CHECK(cudaMemset(gpustack, 0, sizeof(qsortAtomicData)));     // Only need set first entry to 0

    // Create the memory ringbuffer used for handling the stack.
    // Initialise everything to where it needs to be.
    qsortRingbuf buf;
    qsortRingbuf *ringbuf;
    CUDA_CHECK(cudaMalloc((void **)&ringbuf, sizeof(qsortRingbuf)));
    buf.head = 1;           // We start with one allocation
    buf.tail = 0;
    buf.count = 0;
    buf.max = 0;
    buf.stacksize = stacksize;
    buf.stackbase = gpustack;
    CUDA_CHECK(cudaMemcpy(ringbuf, &buf, sizeof(buf), cudaMemcpyHostToDevice));
	
	if (count > BITONICSORT_LEN)//1024
    {//QSORT_BLOCKSIZE = 2^9 = 512
		
        unsigned int numblocks = (unsigned int)(count+(QSORT_BLOCKSIZE-1)) >> QSORT_BLOCKSIZE_SHIFT;
        qsort_warp<<< numblocks, QSORT_BLOCKSIZE, 0, stream >>>(gpudata, scratchdata, 0U, count, gpustack, ringbuf, true, 0);
		
   }
    else
    {
        binoticSort<<< 1, BITONICSORT_LEN >>>(gpudata, count);
		CUDA_CHECK(cudaMemcpy(scratchdata, gpudata, sizeof(unsigned)*count, cudaMemcpyDeviceToDevice));
    }
	
	cudaDeviceSynchronize();
}