CUDA大津阈值—原子加统计直方图&规约求最大值索引

最新推荐文章于 2025-06-20 09:50:14 发布

原创最新推荐文章于 2025-06-20 09:50:14 发布 · 2k 阅读

7 ·

本内容遵循CC 4.0 BY-SA版权协议

CUDA编程专栏收录该内容

9 篇文章

订阅专栏

本文介绍了一种利用CUDA并行计算实现的大津阈值法图像分割算法。该算法通过原子操作、共享内存规约求和及最大值等技术提高了图像处理效率。

在大津阈值的CUDA实现中有三个需要注意的地方：

（1）使用atomicAdd操作实现直方图的统计；

atomicAdd简介：原子操作一般要实现读取-修改-写入三个步骤，并且在执行过程中不会被打断。即除非已经完成了这三个操作，否则其他线程都不能读取或写入x的值。

在计算直方图时，存在一个问题，多个线程可能对输出直方图的同一元素进行递增，这时候我们需要使用原子操作完成。

（2）借助sharedmemory规约求和

（3）使用二维shared memory规约求最大值，下图为二维shared memory的示意图。

下图为大津阈值的前后对比图：

详细介绍见附录代码及注释。

GPU代码如下：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include "Windows.h"
#include <math.h>
#include<iostream>
using namespace std;

#define BLOCKDIM_X		16
#define BLOCKDIM_Y		16

#define GRIDDIM_X		256
#define GRIDDIM_Y		256
#define MASK_WIDTH		5

unsigned char *readBmp(char *bmpName, int *width, int *height, int *byteCount);
bool saveBmp(char *bmpName, unsigned char *imgBuf, int width, int height, int byteCount);
static __global__ void kernel_color2gray(int width,int height,int byteCount,unsigned char *d_src_imgbuf,unsigned char *d_gray_imgbuf);
static __global__ void kernel_histogram(int width,int height,unsigned char* d_gray_imgbuf,int* histogram);
static __global__ void kernel_ReductionSum(int *d_a, int* d_sum);
static __global__ void kernel_Variance(int *histogram, float* devi, int sum);
static __global__ void kernel_ReductionMax(float* devi, int* thresh);

void main()
{
	//查看显卡配置
	struct cudaDeviceProp pror;
	cudaGetDeviceProperties(&pror,0);
	cout<<"maxThreadsPerBlock="<<pror.maxThreadsPerBlock<<endl;

	long start, end;
	long time = 0; 

	//CUDA计时函数
	start = GetTickCount();
	cudaEvent_t startt,stop; //CUDA计时机制
	cudaEventCreate(&startt);
	cudaEventCreate(&stop);
	cudaEventRecord(startt,0);

	unsigned char *h_src_imgbuf;  //图像指针
	int width, height, byteCount;
	char rootPath1[]="C:\\Users\\a404\\Desktop\\测试图片\\";
	char readPath[1024];
	int frame=300;
	for (int k=1;k<=frame;k++)
	{
		sprintf(readPath, "%s%d.bmp", rootPath1, k);
		h_src_imgbuf=readBmp(readPath, &width, &height, &byteCount);
		//printf("宽=%d，高=%d，字节=%d\n",width, height, byteCount);
	
		int size1=width*height *byteCount*sizeof(unsigned char);
		int size2=width*height *sizeof(unsigned char);
		//输出图像内存-host端	
		unsigned char *h_guassian_imgbuf=new unsigned char[width*height*byteCount];
		unsigned char *h_gray_imgbuf=new unsigned char[width*height];
		unsigned char *h_Otus_imgbuf=new unsigned char[width*height];
		
		//分配显存空间
		unsigned char *d_src_imgbuf;
		unsigned char *d_gray_imgbuf;
		unsigned char *d_Otus_imgbuf;
		int *d_histogram;
		float *d_devi;
		int *d_sum;
		int *d_thresh;
		float *d_maxdevi;

		cudaMalloc((void**)&d_src_imgbuf, size1);

		cudaMalloc((void**)&d_gray_imgbuf, size2);
		cudaMalloc((void**)&d_Otus_imgbuf, size2);

		cudaMalloc((void**)&d_histogram, 256*sizeof(int));
		cudaMalloc((void**)&d_devi, 256*sizeof(float));
		cudaMalloc((void**)&d_sum, sizeof(int));
		cudaMalloc((void**)&d_thresh, sizeof(int));
		//cudaMalloc((void**)&d_maxdevi, sizeof(float));
		
		//把数据从Host传到Device
		cudaMemcpy(d_src_imgbuf, h_src_imgbuf, size1, cudaMemcpyHostToDevice);

		int bx = ceil((double)width/BLOCKDIM_X); //网格和块的分配
		int by = ceil((double)height/BLOCKDIM_Y);

		if(bx > GRIDDIM_X) bx = GRIDDIM_X;
		if(by > GRIDDIM_Y) by = GRIDDIM_Y;

		dim3 grid(bx, by);//网格的结构
		dim3 block(BLOCKDIM_X, BLOCKDIM_Y);//块的结构

		//kernel--灰度变换
		kernel_color2gray<<<grid, block>>>(width,height,byteCount,d_src_imgbuf,d_gray_imgbuf);
		cudaMemcpy(h_gray_imgbuf, d_gray_imgbuf,size2, cudaMemcpyDeviceToHost);//数据传回主机端
	
		//统计直方图
		int h_histogram[256]={0};

		//使用原子操作进行直方图统计
		cudaMemcpy(d_histogram, h_histogram, 256*sizeof(int), cudaMemcpyHostToDevice);
		kernel_histogram<<<grid, block>>>(width,height,d_gray_imgbuf,d_histogram);
		cudaMemcpy(h_histogram, d_histogram, 256*sizeof(int), cudaMemcpyDeviceToHost);
  
		//方差及最大方差
		float h_devi[256]={0}, maxDevi=0;  
		//像素总数
		int h_sum[1]={0};

		//规约求和--借助shared memory
		kernel_ReductionSum<<<1, 256>>>(d_histogram,d_sum);
		cudaMemcpy(h_sum, d_sum, sizeof(int), cudaMemcpyDeviceToHost);
		
		//遍历求方差--启用256个线程
		cudaMemcpy(d_devi, h_devi, size1, cudaMemcpyHostToDevice);
		kernel_Variance<<<1, 256>>>(d_histogram,d_devi,h_sum[0]);
		cudaMemcpy(h_devi, d_devi, 256*sizeof(float), cudaMemcpyDeviceToHost);

		//最佳阈值thresh
		int h_thresh[1]={0};

		//规约求最大值，并记录最大值位置--启用256*2个线程
		kernel_ReductionMax<<<1, 512>>>(d_devi,d_thresh);
		cudaMemcpy(h_thresh, d_thresh, sizeof(int), cudaMemcpyDeviceToHost);

		for(int i=0;i<height;i++)
			for(int j=0;j<width;j++)
			{
				int value=*(h_gray_imgbuf+i*width+j);
				if(value<=h_thresh[0])
					*(h_Otus_imgbuf+i*width+j)=0;
				else
					*(h_Otus_imgbuf+i*width+j)=255;
			}

		char rootPath2[]="C:\\Users\\a404\\Desktop\\测试结果\\";
		char writePath[1024];
		sprintf(writePath, "%s%d.bmp", rootPath2, k);
		//saveBmp(writePath, h_guassian_imgbuf, width, height, byteCount);
		saveBmp(writePath, h_Otus_imgbuf, width, height, 1);
		cout<<k<<"  "<<((float)k/frame)*100<<"%"<<endl;
	
		//释放内存
		cudaFree(d_src_imgbuf); 
		cudaFree(d_gray_imgbuf);
		cudaFree(d_Otus_imgbuf);
		cudaFree(d_histogram);
		cudaFree(d_sum);
		cudaFree(d_thresh);	
		
		delete []h_src_imgbuf;
		delete []h_gray_imgbuf;
		delete []h_Otus_imgbuf;
	}
	end = GetTickCount();
	InterlockedExchangeAdd(&time, end - start);
	cout << "Total time GPU:";
	cout << time << endl;
	int x;
	cin>>x;
}

static __global__ void kernel_color2gray(int width,int height,int byteCount,unsigned char *d_src_imgbuf,unsigned char *d_gray_imgbuf)
{
	const int tix = blockDim.x * blockIdx.x + threadIdx.x;
	const int tiy = blockDim.y * blockIdx.y + threadIdx.y;

	const int threadTotalX = blockDim.x * gridDim.x;
	const int threadTotalY = blockDim.y * gridDim.y;

	for(int ix = tix; ix < height; ix += threadTotalX)
		for(int iy = tiy; iy < width; iy += threadTotalY)
			*(d_gray_imgbuf+ix*width+iy)=0.11**(d_src_imgbuf+ix*width*byteCount+iy*byteCount+0)
			+0.59**(d_src_imgbuf+ix*width*byteCount+iy*byteCount+1)
			+0.30**(d_src_imgbuf+ix*width*byteCount+iy*byteCount+2)+0.5;
}

static __global__ void kernel_histogram(int width,int height,unsigned char* d_gray_imgbuf,int* histogram)
{
	__shared__ int temp[256];
	//直方图初始化为0
	temp[threadIdx.x*16+threadIdx.y] = 0;
	__syncthreads();

	const int tix = blockDim.x * blockIdx.x + threadIdx.x;
	const int tiy = blockDim.y * blockIdx.y + threadIdx.y;

	const int threadTotalX = blockDim.x * gridDim.x;
	const int threadTotalY = blockDim.y * gridDim.y;

	for(int ix = tix; ix < height; ix += threadTotalX)
	{
		for(int iy = tiy; iy < width; iy += threadTotalY)
		{
			atomicAdd(&(temp[d_gray_imgbuf[ix*width+iy]]), 1);
		}
	}
	__syncthreads();
	atomicAdd(&(histogram[threadIdx.x*16+threadIdx.y]), temp[threadIdx.x*16+threadIdx.y]);
}

static __global__ void kernel_ReductionSum(int *d_a, int* d_sum)  
{  
    //申请共享内存，存在于每个block中   
    __shared__ int partialSum[256];  
  
    //确定索引   
    int tid = threadIdx.x;  
  
    //传global memory数据到shared memory 
	partialSum[tid]=d_a[tid];	
  
    //传输同步  
    __syncthreads();  
      
    //在共享存储器中进行规约  
    for(int stride = 128; stride > 0; stride/=2)  
    {  
        if(tid<stride) partialSum[tid]+=partialSum[tid+stride];  
        __syncthreads();  
    }  
    //将当前block的计算结果写回输出数组  
    if(tid==0)    
        d_sum[0] = partialSum[0];  
}  

static __global__ void kernel_Variance(int *histogram, float* devi, int sum)
{
	int tid = threadIdx.x; 
	//c0和c1组的均值
	float u0,u1;  
	//c0和c1组产生的概率
	float w0,w1; 
	//c0组的像素数
	int count0; 
	//阈值t
	//int t;  
	u0=0;
	count0=0;
	for(int i=0; i<=tid;i++)
	{
		u0 += i*histogram[i];
		count0 += histogram[i];
	}
	u0=u0/count0;
	w0=(float)count0/sum;

	//计算阈值为t时，c1组的均值和产生的概率
	u1=0;
	for(int i=tid+1; i<256;i++)
		u1+=i*histogram[i];
	//C0组像素数与C1组像素数之和为图像总像素数。
	u1=u1/(sum-count0);
	w1=1-w0;

	//两组间的方差
	devi[tid]=w0*w1*(u1-u0)*(u1-u0);
}

static __global__ void kernel_ReductionMax(float* devi,int* thresh)
{
	//申请共享内存，存在于每个block中   
    __shared__ float Max[256*2];
	//__shared__ int index[256];
  
    //确定索引   
    int tid = threadIdx.x;  
   // partialindex[tid]=0;
    //传global memory数据到shared memory 
	if(tid<256)
		Max[tid]=devi[tid];	
	else
		Max[tid]=tid-256;
	//初始化索引
	//index[256]=tid;
    //传输同步  
    __syncthreads();  
      
    //在共享存储器中进行规约，记录索引变化
    for(int stride = 128; stride > 0; stride/=2)  
    {  
        if(tid<stride) 
		{
			if(Max[tid]<Max[tid+stride])
			{
				Max[tid]=Max[tid+stride];
				Max[tid+256]=Max[tid+stride+256];
				//index[tid]=index[tid+stride];
			}
		}
        __syncthreads();  
    }  
    //将当前block的计算结果写回输出数组  
	if(tid==256)
		thresh[0]=Max[256]; 
}

unsigned char *readBmp(char *bmpName, int *width, int *height, int *byteCount)
{
	//打开文件
	FILE *fp=fopen(bmpName,"rb");
	if(fp==0) return 0;
	//跳过文件头
	fseek(fp, sizeof(BITMAPFILEHEADER),0);

	//读入信息头
	int w, h, b;
	BITMAPINFOHEADER head;
	fread(&head, sizeof(BITMAPINFOHEADER), 1,fp); 
	w = head.biWidth;
	h = head.biHeight;
	b = head.biBitCount/8;
	int lineByte=(w * b+3)/4*4; //每行的字节数为4的倍数

	//跳过颜色表 （颜色表的大小为1024）（彩色图像并没有颜色表，不需要这一步）
	if(b==1)
		fseek(fp, 1024,1);

	//图像数据
	unsigned char *imgBuf=new unsigned char[w * h * b];
	for(int i=0;i<h;i++)
	{
		fread(imgBuf+i*w*b,w*b, 1,fp);
		fseek(fp, lineByte-w*b, 1);
	}
	fclose(fp);

	*width=w,  *height=h, *byteCount=b;

	return imgBuf;
}


bool saveBmp(char *bmpName, unsigned char *imgBuf, int width, int height, int byteCount)
{
	if(!imgBuf)
		return 0;

	//灰度图像颜色表空间1024，彩色图像没有颜色表
	int palettesize=0;
	if(byteCount==1) palettesize=1024;

	//一行象素字节数为4的倍数
	int lineByte=(width * byteCount+3)/4*4;

	FILE *fp=fopen(bmpName,"wb");
	if(fp==0) return 0;

	//填写文件头
	BITMAPFILEHEADER fileHead;
	fileHead.bfType = 0x4D42;
	fileHead.bfSize= 
		sizeof(BITMAPFILEHEADER)+sizeof(BITMAPINFOHEADER)+ palettesize + lineByte*height;
	fileHead.bfReserved1 = 0;
	fileHead.bfReserved2 = 0;
	fileHead.bfOffBits=54+palettesize;
	fwrite(&fileHead, sizeof(BITMAPFILEHEADER),1, fp);

	// 填写信息头
	BITMAPINFOHEADER head; 
	head.biBitCount=byteCount*8;
	head.biClrImportant=0;
	head.biClrUsed=0;
	head.biCompression=0;
	head.biHeight=height;
	head.biPlanes=1;
	head.biSize=40;
	head.biSizeImage=lineByte*height;
	head.biWidth=width;
	head.biXPelsPerMeter=0;
	head.biYPelsPerMeter=0;
	fwrite(&head, sizeof(BITMAPINFOHEADER),1, fp);

	//颜色表拷贝  
	if(palettesize==1024)
	{
		unsigned char palette[1024];
		for(int i=0;i<256;i++)
		{
			*(palette+i*4+0)=i;
			*(palette+i*4+1)=i;
			*(palette+i*4+2)=i;
			*(palette+i*4+3)=0;     
		}
		fwrite(palette, 1024,1, fp);
	}

	//准备数据并写文件
	unsigned char *buf=new unsigned char[height*lineByte];
	for(int i=0;i<height;i++)
	{
		for(int j=0;j<width*byteCount; j++)
			*(buf+i*lineByte+j)=*(imgBuf+i*width*byteCount+j);
	}
	fwrite(buf, height*lineByte, 1, fp);

	delete []buf;

	fclose(fp);

	return 1;
}