在大津阈值的CUDA实现中有三个需要注意的地方:
(1)使用atomicAdd操作实现直方图的统计;
atomicAdd简介:原子操作一般要实现读取-修改-写入三个步骤,并且在执行过程中不会被打断。即除非已经完成了这三个操作,否则其他线程都不能读取或写入x的值。
在计算直方图时,存在一个问题,多个线程可能对输出直方图的同一元素进行递增,这时候我们需要使用原子操作完成。
(2)借助sharedmemory规约求和
(3)使用二维shared memory规约求最大值,下图为二维shared memory的示意图。
下图为大津阈值的前后对比图:
详细介绍见附录代码及注释。
GPU代码如下:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include "Windows.h"
#include <math.h>
#include<iostream>
using namespace std;
#define BLOCKDIM_X 16
#define BLOCKDIM_Y 16
#define GRIDDIM_X 256
#define GRIDDIM_Y 256
#define MASK_WIDTH 5
unsigned char *readBmp(char *bmpName, int *width, int *height, int *byteCount);
bool saveBmp(char *bmpName, unsigned char *imgBuf, int width, int height, int byteCount);
static __global__ void kernel_color2gray(int width,int height,int byteCount,unsigned char *d_src_imgbuf,unsigned char *d_gray_imgbuf);
static __global__ void kernel_histogram(int width,int height,unsigned char* d_gray_imgbuf,int* histogram);
static __global__ void kernel_ReductionSum(int *d_a, int* d_sum);
static __global__ void kernel_Variance(int *histogram, float* devi, int sum);
static __global__ void kernel_ReductionMax(float* devi, int* thresh);
void main()
{
//查看显卡配置
struct cudaDeviceProp pror;
cudaGetDeviceProperties(&pror,0);
cout<<"maxThreadsPerBlock="<<pror.maxThreadsPerBlock<<endl;
long start, end;
long time = 0;
//CUDA计时函数
start = GetTickCount();
cudaEvent_t startt,stop; //CUDA计时机制
cudaEventCreate(&startt);
cudaEventCreate(&stop);
cudaEventRecord(startt,0);
unsigned char *h_src_imgbuf; //图像指针
int width, height, byteCount;
char rootPath1[]="C:\\Users\\a404\\Desktop\\测试图片\\";
char readPath[1024];
int frame=300;
for (int k=1;k<=frame;k++)
{
sprintf(readPath, "%s%d.bmp", rootPath1, k);
h_src_imgbuf=readBmp(readPath, &width, &height, &byteCount);
//printf("宽=%d,高=%d,字节=%d\n",width, height, byteCount);
int size1=width*height *byteCount*sizeof(unsigned char);
int size2=width*height *sizeof(unsigned char);
//输出图像内存-host端
unsigned char *h_guassian_imgbuf=new unsigned char[width*height*byteCount];
unsigned char *h_gray_imgbuf=new unsigned char[width*height];
unsigned char *h_Otus_imgbuf=new unsigned char[width*height];
//分配显存空间
unsigned char *d_src_imgbuf;
unsigned char *d_gray_imgbuf;
unsigned char *d_Otus_imgbuf;
int *d_histogram;
float *d_devi;
int *d_sum;
int *d_thresh;
float *d_maxdevi;
cudaMalloc((void**)&d_src_imgbuf, size1);
cudaMalloc((void**)&d_gray_imgbuf, size2);
cudaMalloc((void**)&d_Otus_imgbuf, size2);
cudaMalloc((void**)&d_histogram, 256*sizeof(int));
cudaMalloc((void**)&d_devi, 256*sizeof(float));
cudaMalloc((void**)&d_sum, sizeof(int));
cudaMalloc((void**)&d_thresh, sizeof(int));
//cudaMalloc((void**)&d_maxdevi, sizeof(float));
//把数据从Host传到Device
cudaMemcpy(d_src_imgbuf, h_src_imgbuf, size1, cudaMemcpyHostToDevice);
int bx = ceil((double)width/BLOCKDIM_X); //网格和块的分配
int by = ceil((double)height/BLOCKDIM_Y);
if(bx > GRIDDIM_X) bx = GRIDDIM_X;
if(by > GRIDDIM_Y) by = GRIDDIM_Y;
dim3 grid(bx, by);//网格的结构
dim3 block(BLOCKDIM_X, BLOCKDIM_Y);//块的结构
//kernel--灰度变换
kernel_color2gray<<<grid, block>>>(width,height,byteCount,d_src_imgbuf,d_gray_imgbuf);
cudaMemcpy(h_gray_imgbuf, d_gray_imgbuf,size2, cudaMemcpyDeviceToHost);//数据传回主机端
//统计直方图
int h_histogram[256]={0};
//使用原子操作进行直方图统计
cudaMemcpy(d_histogram, h_histogram, 256*sizeof(int), cudaMemcpyHostToDevice);
kernel_histogram<<<grid, block>>>(width,height,d_gray_imgbuf,d_histogram);
cudaMemcpy(h_histogram, d_histogram, 256*sizeof(int), cudaMemcpyDeviceToHost);
//方差及最大方差
float h_devi[256]={0}, maxDevi=0;
//像素总数
int h_sum[1]={0};
//规约求和--借助shared memory
kernel_ReductionSum<<<1, 256>>>(d_histogram,d_sum);
cudaMemcpy(h_sum, d_sum, sizeof(int), cudaMemcpyDeviceToHost);
//遍历求方差--启用256个线程
cudaMemcpy(d_devi, h_devi, size1, cudaMemcpyHostToDevice);
kernel_Variance<<<1, 256>>>(d_histogram,d_devi,h_sum[0]);
cudaMemcpy(h_devi, d_devi, 256*sizeof(float), cudaMemcpyDeviceToHost);
//最佳阈值thresh
int h_thresh[1]={0};
//规约求最大值,并记录最大值位置--启用256*2个线程
kernel_ReductionMax<<<1, 512>>>(d_devi,d_thresh);
cudaMemcpy(h_thresh, d_thresh, sizeof(int), cudaMemcpyDeviceToHost);
for(int i=0;i<height;i++)
for(int j=0;j<width;j++)
{
int value=*(h_gray_imgbuf+i*width+j);
if(value<=h_thresh[0])
*(h_Otus_imgbuf+i*width+j)=0;
else
*(h_Otus_imgbuf+i*width+j)=255;
}
char rootPath2[]="C:\\Users\\a404\\Desktop\\测试结果\\";
char writePath[1024];
sprintf(writePath, "%s%d.bmp", rootPath2, k);
//saveBmp(writePath, h_guassian_imgbuf, width, height, byteCount);
saveBmp(writePath, h_Otus_imgbuf, width, height, 1);
cout<<k<<" "<<((float)k/frame)*100<<"%"<<endl;
//释放内存
cudaFree(d_src_imgbuf);
cudaFree(d_gray_imgbuf);
cudaFree(d_Otus_imgbuf);
cudaFree(d_histogram);
cudaFree(d_sum);
cudaFree(d_thresh);
delete []h_src_imgbuf;
delete []h_gray_imgbuf;
delete []h_Otus_imgbuf;
}
end = GetTickCount();
InterlockedExchangeAdd(&time, end - start);
cout << "Total time GPU:";
cout << time << endl;
int x;
cin>>x;
}
static __global__ void kernel_color2gray(int width,int height,int byteCount,unsigned char *d_src_imgbuf,unsigned char *d_gray_imgbuf)
{
const int tix = blockDim.x * blockIdx.x + threadIdx.x;
const int tiy = blockDim.y * blockIdx.y + threadIdx.y;
const int threadTotalX = blockDim.x * gridDim.x;
const int threadTotalY = blockDim.y * gridDim.y;
for(int ix = tix; ix < height; ix += threadTotalX)
for(int iy = tiy; iy < width; iy += threadTotalY)
*(d_gray_imgbuf+ix*width+iy)=0.11**(d_src_imgbuf+ix*width*byteCount+iy*byteCount+0)
+0.59**(d_src_imgbuf+ix*width*byteCount+iy*byteCount+1)
+0.30**(d_src_imgbuf+ix*width*byteCount+iy*byteCount+2)+0.5;
}
static __global__ void kernel_histogram(int width,int height,unsigned char* d_gray_imgbuf,int* histogram)
{
__shared__ int temp[256];
//直方图初始化为0
temp[threadIdx.x*16+threadIdx.y] = 0;
__syncthreads();
const int tix = blockDim.x * blockIdx.x + threadIdx.x;
const int tiy = blockDim.y * blockIdx.y + threadIdx.y;
const int threadTotalX = blockDim.x * gridDim.x;
const int threadTotalY = blockDim.y * gridDim.y;
for(int ix = tix; ix < height; ix += threadTotalX)
{
for(int iy = tiy; iy < width; iy += threadTotalY)
{
atomicAdd(&(temp[d_gray_imgbuf[ix*width+iy]]), 1);
}
}
__syncthreads();
atomicAdd(&(histogram[threadIdx.x*16+threadIdx.y]), temp[threadIdx.x*16+threadIdx.y]);
}
static __global__ void kernel_ReductionSum(int *d_a, int* d_sum)
{
//申请共享内存,存在于每个block中
__shared__ int partialSum[256];
//确定索引
int tid = threadIdx.x;
//传global memory数据到shared memory
partialSum[tid]=d_a[tid];
//传输同步
__syncthreads();
//在共享存储器中进行规约
for(int stride = 128; stride > 0; stride/=2)
{
if(tid<stride) partialSum[tid]+=partialSum[tid+stride];
__syncthreads();
}
//将当前block的计算结果写回输出数组
if(tid==0)
d_sum[0] = partialSum[0];
}
static __global__ void kernel_Variance(int *histogram, float* devi, int sum)
{
int tid = threadIdx.x;
//c0和c1组的均值
float u0,u1;
//c0和c1组产生的概率
float w0,w1;
//c0组的像素数
int count0;
//阈值t
//int t;
u0=0;
count0=0;
for(int i=0; i<=tid;i++)
{
u0 += i*histogram[i];
count0 += histogram[i];
}
u0=u0/count0;
w0=(float)count0/sum;
//计算阈值为t时,c1组的均值和产生的概率
u1=0;
for(int i=tid+1; i<256;i++)
u1+=i*histogram[i];
//C0组像素数与C1组像素数之和为图像总像素数。
u1=u1/(sum-count0);
w1=1-w0;
//两组间的方差
devi[tid]=w0*w1*(u1-u0)*(u1-u0);
}
static __global__ void kernel_ReductionMax(float* devi,int* thresh)
{
//申请共享内存,存在于每个block中
__shared__ float Max[256*2];
//__shared__ int index[256];
//确定索引
int tid = threadIdx.x;
// partialindex[tid]=0;
//传global memory数据到shared memory
if(tid<256)
Max[tid]=devi[tid];
else
Max[tid]=tid-256;
//初始化索引
//index[256]=tid;
//传输同步
__syncthreads();
//在共享存储器中进行规约,记录索引变化
for(int stride = 128; stride > 0; stride/=2)
{
if(tid<stride)
{
if(Max[tid]<Max[tid+stride])
{
Max[tid]=Max[tid+stride];
Max[tid+256]=Max[tid+stride+256];
//index[tid]=index[tid+stride];
}
}
__syncthreads();
}
//将当前block的计算结果写回输出数组
if(tid==256)
thresh[0]=Max[256];
}
unsigned char *readBmp(char *bmpName, int *width, int *height, int *byteCount)
{
//打开文件
FILE *fp=fopen(bmpName,"rb");
if(fp==0) return 0;
//跳过文件头
fseek(fp, sizeof(BITMAPFILEHEADER),0);
//读入信息头
int w, h, b;
BITMAPINFOHEADER head;
fread(&head, sizeof(BITMAPINFOHEADER), 1,fp);
w = head.biWidth;
h = head.biHeight;
b = head.biBitCount/8;
int lineByte=(w * b+3)/4*4; //每行的字节数为4的倍数
//跳过颜色表 (颜色表的大小为1024)(彩色图像并没有颜色表,不需要这一步)
if(b==1)
fseek(fp, 1024,1);
//图像数据
unsigned char *imgBuf=new unsigned char[w * h * b];
for(int i=0;i<h;i++)
{
fread(imgBuf+i*w*b,w*b, 1,fp);
fseek(fp, lineByte-w*b, 1);
}
fclose(fp);
*width=w, *height=h, *byteCount=b;
return imgBuf;
}
bool saveBmp(char *bmpName, unsigned char *imgBuf, int width, int height, int byteCount)
{
if(!imgBuf)
return 0;
//灰度图像颜色表空间1024,彩色图像没有颜色表
int palettesize=0;
if(byteCount==1) palettesize=1024;
//一行象素字节数为4的倍数
int lineByte=(width * byteCount+3)/4*4;
FILE *fp=fopen(bmpName,"wb");
if(fp==0) return 0;
//填写文件头
BITMAPFILEHEADER fileHead;
fileHead.bfType = 0x4D42;
fileHead.bfSize=
sizeof(BITMAPFILEHEADER)+sizeof(BITMAPINFOHEADER)+ palettesize + lineByte*height;
fileHead.bfReserved1 = 0;
fileHead.bfReserved2 = 0;
fileHead.bfOffBits=54+palettesize;
fwrite(&fileHead, sizeof(BITMAPFILEHEADER),1, fp);
// 填写信息头
BITMAPINFOHEADER head;
head.biBitCount=byteCount*8;
head.biClrImportant=0;
head.biClrUsed=0;
head.biCompression=0;
head.biHeight=height;
head.biPlanes=1;
head.biSize=40;
head.biSizeImage=lineByte*height;
head.biWidth=width;
head.biXPelsPerMeter=0;
head.biYPelsPerMeter=0;
fwrite(&head, sizeof(BITMAPINFOHEADER),1, fp);
//颜色表拷贝
if(palettesize==1024)
{
unsigned char palette[1024];
for(int i=0;i<256;i++)
{
*(palette+i*4+0)=i;
*(palette+i*4+1)=i;
*(palette+i*4+2)=i;
*(palette+i*4+3)=0;
}
fwrite(palette, 1024,1, fp);
}
//准备数据并写文件
unsigned char *buf=new unsigned char[height*lineByte];
for(int i=0;i<height;i++)
{
for(int j=0;j<width*byteCount; j++)
*(buf+i*lineByte+j)=*(imgBuf+i*width*byteCount+j);
}
fwrite(buf, height*lineByte, 1, fp);
delete []buf;
fclose(fp);
return 1;
}
本文介绍了一种利用CUDA并行计算实现的大津阈值法图像分割算法。该算法通过原子操作、共享内存规约求和及最大值等技术提高了图像处理效率。

1757

被折叠的 条评论
为什么被折叠?



