Commit 3c5a930b authored by Eric Wait

Added a median reduction kernel

parent d973ba83
@@ -30,7 +30,8 @@ __global__ void cudaMeanFilter( CudaImageContainer imageIn, CudaImageContainer i
}
}
imageOut[coordinate] = val/kernelVolume;
//imageOut[coordinate] = val/kernelVolume;
imageOut[coordinate] = coordinate.y;
}
}
@@ -165,26 +166,18 @@ __global__ void cudaMedianFilter( CudaImageContainer imageIn, CudaImageContainer
if (coordinate<imageIn.getDeviceDims())
{
int kernelVolume = 0;
DeviceVec<size_t> kernelMidIdx;
DeviceVec<size_t> curCoordIm;
DeviceVec<size_t> curCoordKrn;
kernelMidIdx.x = kernelDims.x/2;
kernelMidIdx.y = kernelDims.y/2;
kernelMidIdx.z = kernelDims.z/2;
DeviceVec<size_t> kernelDims = hostKernelDims;
DeviceVec<size_t> halfKernal = kernelDims/2;
//find if the kernel will go off the edge of the image
curCoordIm.z = (size_t) max(0,(int)coordinate.z-(int)kernelMidIdx.z);
curCoordKrn.z = ((int)coordinate.z-(int)kernelMidIdx.z>=0) ? (0) : (kernelMidIdx.z-coordinate.z);
for (; curCoordIm.z<imageIn.getDepth() && curCoordKrn.z<kernelDims.z; ++curCoordIm.z, ++curCoordKrn.z)
DeviceVec<size_t> curCoordIm = coordinate - halfKernal;
curCoordIm.z = (coordinate.z<halfKernal.z) ? 0 : coordinate.z-halfKernal.z;
for (; curCoordIm.z<coordinate.z+halfKernal.z && curCoordIm.z<imageIn.getDeviceDims().z; ++curCoordIm.z)
{
curCoordIm.y = (size_t)max(0,(int)coordinate.y-(int)kernelMidIdx.y);
curCoordKrn.y = ((int)coordinate.y-(int)kernelMidIdx.y>=0) ? (0) : (kernelMidIdx.y-coordinate.y);
for (; curCoordIm.y<imageIn.getHeight() && curCoordKrn.y<kernelDims.y; ++curCoordIm.y, ++curCoordKrn.y)
curCoordIm.y = (coordinate.y<halfKernal.y) ? 0 : coordinate.y-halfKernal.y/2;
for (; curCoordIm.y<coordinate.y+halfKernal.y && curCoordIm.y<imageIn.getDeviceDims().y; ++curCoordIm.y)
{
curCoordIm.x = (size_t)max(0,(int)coordinate.x-(int)kernelMidIdx.x);
curCoordKrn.x = ((int)coordinate.x-(int)kernelMidIdx.x>=0) ? (0) : (kernelMidIdx.x-coordinate.x);
for (; curCoordIm.x<imageIn.getWidth() && curCoordKrn.x<kernelDims.x; ++curCoordIm.x, ++curCoordKrn.x)
curCoordIm.x = (coordinate.x<halfKernal.x) ? 0 : coordinate.x-halfKernal.x/2;
for (; curCoordIm.x<coordinate.x+halfKernal.x && curCoordIm.x<imageIn.getDeviceDims().x; ++curCoordIm.x)
{
vals[kernelVolume+offset] = imageIn[curCoordIm];
++kernelVolume;
@@ -193,8 +186,9 @@ __global__ void cudaMedianFilter( CudaImageContainer imageIn, CudaImageContainer
}
imageOut[coordinate] = cudaFindMedian(vals+offset,kernelVolume);
__syncthreads();
//imageOut[coordinate] = coordinate.y;
}
__syncthreads();
}
__global__ void cudaMultAddFilter( CudaImageContainer* imageIn, CudaImageContainer* imageOut, Vec<size_t> hostKernelDims, size_t kernelOffset/*=0*/ )
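Note: cudaFindMedian is called by both median kernels in this commit, but its body lies outside the visible hunks. Purely as an illustration of what such a helper does, a minimal device-side sketch follows; the name exampleFindMedian, the standalone typedef, and the insertion-sort choice are assumptions, not the repository's implementation.
// Assumed pixel type so the sketch stands alone; the project defines its own
// DevicePixelType elsewhere.
typedef unsigned char DevicePixelTypeSketch;
// Sort the n gathered neighborhood values in place and return the middle one.
// Insertion sort is adequate for the small windows a median filter gathers
// (e.g. 3x3x3 = 27 values per thread).
__device__ DevicePixelTypeSketch exampleFindMedian(DevicePixelTypeSketch* vals, int n)
{
	for (int i = 1; i < n; ++i)
	{
		DevicePixelTypeSketch cur = vals[i];
		int j = i - 1;
		while (j >= 0 && vals[j] > cur)
		{
			vals[j + 1] = vals[j];   // shift larger values toward the end
			--j;
		}
		vals[j + 1] = cur;
	}
	return vals[n / 2];              // median of the sorted run
}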
@@ -644,9 +638,9 @@ __global__ void cudaSumArray(CudaImageContainer arrayIn, double* arrayOut, size_
arrayOut[blockIdx.x] = sdata[0];
}
__global__ void cudaRuduceImage( CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions )
__global__ void cudaMedianImageReduction( CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions)
{
//extern __shared__ DevicePixelType vals[];
extern __shared__ DevicePixelType vals[];
DeviceVec<size_t> reductions = hostReductions;
DeviceVec<size_t> coordinateOut;
coordinateOut.x = threadIdx.x + blockIdx.x * blockDim.x;
@@ -658,28 +652,24 @@ __global__ void cudaRuduceImage( CudaImageContainer imageIn, CudaImageContainer
if (coordinateOut<imageOut.getDeviceDims())
{
int kernelVolume = 0;
double val = 0.0;
DeviceVec<size_t> mins(coordinateOut*DeviceVec<size_t>(reductions));
DeviceVec<size_t> maxs = DeviceVec<size_t>::min(mins+reductions, imageIn.getDeviceDims());
DeviceVec<size_t> currCorrdIn(mins);
for (currCorrdIn.z=mins.z; currCorrdIn.z<maxs.z; ++currCorrdIn.z)
{
for (currCorrdIn.y=mins.y; currCorrdIn.y<maxs.y; ++currCorrdIn.y)
{
for (currCorrdIn.x=mins.x; currCorrdIn.x<maxs.x; ++currCorrdIn.x)
{
//vals[offset+kernelVolume] += imageIn[currCorrdIn];
val += (double)imageIn[currCorrdIn];
++kernelVolume;
}
}
}
imageOut[coordinateOut] = val/kernelVolume;
//imageOut[coordinateOut] = cudaFindMedian(vals+offset,kernelVolume);
//__syncthreads();
for (currCorrdIn.z=mins.z; currCorrdIn.z<maxs.z; ++currCorrdIn.z)
{
for (currCorrdIn.y=mins.y; currCorrdIn.y<maxs.y; ++currCorrdIn.y)
{
for (currCorrdIn.x=mins.x; currCorrdIn.x<maxs.x; ++currCorrdIn.x)
{
vals[offset+kernelVolume] = imageIn[currCorrdIn];
++kernelVolume;
}
}
}
imageOut[coordinateOut] = cudaFindMedian(vals+offset,kernelVolume);
}
__syncthreads();
}
__global__ void cudaMaximumIntensityProjection( CudaImageContainer imageIn, CudaImageContainer imageOut )
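Note: both median kernels index the dynamic shared array as vals[offset + k], but the computation of offset sits in context lines this diff does not show. A typical derivation, given here only as an assumption that mirrors that usage, is the linear thread index within the block times the per-thread window size.
// Assumed helper: each thread gets a private slice of the extern __shared__
// array, sized to hold one full neighborhood/reduction window.
__device__ size_t exampleSharedOffset(size_t windowVolume)
{
	size_t linearThread = threadIdx.x
		+ threadIdx.y * blockDim.x
		+ threadIdx.z * blockDim.x * blockDim.y;
	return linearThread * windowVolume;   // windowVolume slots per thread
}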
@@ -803,4 +793,36 @@ __global__ void cudaMask( const CudaImageContainer imageIn1, const CudaImageCont
imageOut[coordinate] = val;
}
}
\ No newline at end of file
}
__global__ void cudaMeanImageReduction(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions)
{
DeviceVec<size_t> reductions = hostReductions;
DeviceVec<size_t> coordinateOut;
coordinateOut.x = threadIdx.x + blockIdx.x * blockDim.x;
coordinateOut.y = threadIdx.y + blockIdx.y * blockDim.y;
coordinateOut.z = threadIdx.z + blockIdx.z * blockDim.z;
if (coordinateOut<imageOut.getDeviceDims())
{
int kernelVolume = 0;
double val = 0;
DeviceVec<size_t> mins(coordinateOut*reductions);
DeviceVec<size_t> maxs = DeviceVec<size_t>::min(mins+reductions, imageIn.getDeviceDims());
DeviceVec<size_t> currCorrdIn(mins);
for (currCorrdIn.z=mins.z; currCorrdIn.z<maxs.z; ++currCorrdIn.z)
{
for (currCorrdIn.y=mins.y; currCorrdIn.y<maxs.y; ++currCorrdIn.y)
{
for (currCorrdIn.x=mins.x; currCorrdIn.x<maxs.x; ++currCorrdIn.x)
{
val += (double)imageIn[currCorrdIn];
++kernelVolume;
}
}
}
imageOut[coordinateOut] = val/kernelVolume;
}
}
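Note: both reduction kernels map one output voxel to a window of input voxels via mins = coordinateOut*reductions and maxs = min(mins+reductions, inputDims); with reductions of (2,2,2), output voxel (3,2,1) reads the input block [6,8) x [4,6) x [2,4). A host-side sketch of the same mean reduction over a flat row-major array follows; the function name, the double element type, and the ceiling-division output size are illustrative assumptions.
#include <algorithm>
#include <cstddef>
#include <vector>
// Host illustration of the kernel's indexing: output voxel (ox,oy,oz) averages
// the input window starting at (ox*rx, oy*ry, oz*rz), clamped to the input
// dimensions exactly as mins/maxs are clamped above.
std::vector<double> meanReduceHost(const std::vector<double>& in,
                                   size_t w, size_t h, size_t d,    // input dims
                                   size_t rx, size_t ry, size_t rz) // reductions
{
	size_t ow = (w + rx - 1) / rx, oh = (h + ry - 1) / ry, od = (d + rz - 1) / rz;
	std::vector<double> out(ow * oh * od, 0.0);
	for (size_t oz = 0; oz < od; ++oz)
	for (size_t oy = 0; oy < oh; ++oy)
	for (size_t ox = 0; ox < ow; ++ox)
	{
		size_t zEnd = std::min((oz + 1) * rz, d);
		size_t yEnd = std::min((oy + 1) * ry, h);
		size_t xEnd = std::min((ox + 1) * rx, w);
		double sum = 0.0;
		size_t count = 0;
		for (size_t z = oz * rz; z < zEnd; ++z)
		for (size_t y = oy * ry; y < yEnd; ++y)
		for (size_t x = ox * rx; x < xEnd; ++x)
		{
			sum += in[(z * h + y) * w + x];   // x varies fastest
			++count;
		}
		out[(oz * oh + oy) * ow + ox] = sum / count;
	}
	return out;
}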
@@ -52,7 +52,9 @@ __global__ void cudaPolyTransferFuncImage(CudaImageContainer imageIn, CudaImageC
__global__ void cudaSumArray(CudaImageContainer arrayIn, double* arrayOut, size_t n);
__global__ void cudaRuduceImage(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions);
__global__ void cudaMeanImageReduction(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions);
__global__ void cudaMedianImageReduction(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions);
__global__ void cudaMaximumIntensityProjection(CudaImageContainer imageIn, CudaImageContainer imageOut);
@@ -7,9 +7,6 @@ const double MAX_MEM_AVAIL = 0.95;
std::vector<ImageChunk> calculateBuffers(Vec<size_t> imageDims, int numBuffersNeeded, size_t memAvailable, const cudaDeviceProp& prop, Vec<size_t> kernalDims/*=Vec<size_t>(0,0,0)*/)
{
// clearDeviceBuffers();
// deviceImageBuffers.resize(numBuffersNeeded);
size_t numVoxels = (size_t)(memAvailable / (sizeof(HostPixelType)*numBuffersNeeded));
Vec<size_t> overlapVolume;
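Note: numVoxels above splits the usable device memory evenly across the requested buffers. As a worked example (the 1-byte HostPixelType is an assumed size), 0.95 of a 4 GiB card leaves roughly 4.08e9 bytes, and two 1-byte-per-voxel buffers give about 2.04e9 voxels per buffer, which then drives the chunk sizing below.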
@@ -182,7 +179,6 @@ std::vector<ImageChunk> calculateChunking(Vec<size_t> orgImageDims, Vec<size_t>
return localChunks;
}
CudaProcessBuffer::CudaProcessBuffer(int device/*=0*/)
{
defaults();
@@ -376,9 +372,50 @@ DevicePixelType* CudaProcessBuffer::meanFilter(const DevicePixelType* imageIn, V
return meanImage;
}
void CudaProcessBuffer::medianFilter(Vec<size_t> neighborhood)
DevicePixelType* CudaProcessBuffer::medianFilter(const DevicePixelType* imageIn, Vec<size_t> dims, Vec<size_t> neighborhood, DevicePixelType** imageOut/*=NULL*/)
{
throw std::logic_error("The method or operation is not implemented.");
orgImageDims = dims;
DevicePixelType* medianImage;
if (imageOut==NULL)
medianImage = new DevicePixelType[orgImageDims.product()];
else
medianImage = *imageOut;
std::vector<ImageChunk> chunks = calculateBuffers(dims,2,(size_t)(deviceProp.totalGlobalMem*MAX_MEM_AVAIL),deviceProp,neighborhood);
CudaImageContainerClean* deviceImageIn = new CudaImageContainerClean(chunks[0].getFullChunkSize(),device);
CudaImageContainerClean* deviceImageOut = new CudaImageContainerClean(chunks[0].getFullChunkSize(),device);
for (std::vector<ImageChunk>::iterator curChunk=chunks.begin(); curChunk!=chunks.end(); ++curChunk)
{
curChunk->sendROI(imageIn,dims,deviceImageIn);
deviceImageOut->setDims(curChunk->getFullChunkSize());
dim3 blocks(curChunk->blocks);
dim3 threads(curChunk->threads);
double threadVolume = threads.x * threads.y * threads.z;
double newThreadVolume = (double)deviceProp.sharedMemPerBlock/(sizeof(DevicePixelType)*neighborhood.product());
double alpha = pow(threadVolume/newThreadVolume,1.0/3.0);
threads.x = threads.x / alpha;
threads.y = threads.y / alpha;
threads.z = threads.z / alpha;
blocks.x = ceil((double)curChunk->getFullChunkSize().x / threads.x);
blocks.y = ceil((double)curChunk->getFullChunkSize().y / threads.y);
blocks.z = ceil((double)curChunk->getFullChunkSize().z / threads.z);
size_t sharedMemorysize = neighborhood.product() * threads.x * threads.y * threads.z;
cudaMedianFilter<<<blocks,threads,sharedMemorysize>>>(*deviceImageIn,*deviceImageOut,neighborhood);
curChunk->retriveROI(medianImage,dims,deviceImageOut);
}
delete deviceImageIn;
delete deviceImageOut;
return medianImage;
}
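Note: the launch sizing above shrinks each block dimension by the cube root of the ratio between the current thread volume and the largest volume whose per-thread neighborhood buffer still fits in shared memory. With 48 KB of shared memory per block, a 2-byte DevicePixelType (an assumed size), and a 3x3x3 neighborhood, newThreadVolume = 49152/(2*27) ≈ 910, so a 1024-thread block shrinks by alpha = cbrt(1024/910) ≈ 1.04 per dimension. A standalone host-side sketch of that calculation follows; the struct and function names are illustrative, and it reports the shared-memory request in bytes.
#include <cuda_runtime.h>
#include <algorithm>
#include <cmath>
struct LaunchShape { dim3 blocks; dim3 threads; size_t sharedBytes; };
// Illustrative sizing helper: scale the candidate block so one neighborhood's
// worth of pixels per thread fits in shared memory, then rebuild the grid to
// cover the chunk.
LaunchShape sizeMedianLaunch(dim3 threads, size_t chunkX, size_t chunkY, size_t chunkZ,
                             size_t neighborhoodVol, size_t sharedMemPerBlock,
                             size_t pixelBytes)
{
	double threadVolume = (double)threads.x * threads.y * threads.z;
	double maxVolume = (double)sharedMemPerBlock / (pixelBytes * neighborhoodVol);
	if (threadVolume > maxVolume)
	{
		double alpha = std::pow(threadVolume / maxVolume, 1.0 / 3.0);
		threads.x = std::max(1u, (unsigned int)(threads.x / alpha));
		threads.y = std::max(1u, (unsigned int)(threads.y / alpha));
		threads.z = std::max(1u, (unsigned int)(threads.z / alpha));
	}
	LaunchShape out;
	out.threads = threads;
	out.blocks = dim3((unsigned int)std::ceil((double)chunkX / threads.x),
	                  (unsigned int)std::ceil((double)chunkY / threads.y),
	                  (unsigned int)std::ceil((double)chunkZ / threads.z));
	out.sharedBytes = neighborhoodVol * threads.x * threads.y * threads.z * pixelBytes;
	return out;
}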
void CudaProcessBuffer::minFilter(Vec<size_t> neighborhood, double* kernel/*=NULL*/)
@@ -445,17 +482,23 @@ DevicePixelType* CudaProcessBuffer::reduceImage(const DevicePixelType* imageIn,
double ratio = (double)reducedDims.product() / dims.product();
if (ratio==1.0)
{
memcpy(reducedImage,imageIn,sizeof(DevicePixelType)*reducedDims.product());
return reducedImage;
}
std::vector<ImageChunk> orgChunks = calculateBuffers(dims,1,(size_t)(deviceProp.totalGlobalMem*MAX_MEM_AVAIL*(1-ratio)),deviceProp,reductions);
std::vector<ImageChunk> reducedChunks = orgChunks;
for (std::vector<ImageChunk>::iterator it=reducedChunks.begin(); it!=reducedChunks.end(); ++it)
{
it->imageStart = it->imageStart/reductions;
it->chunkROIstart = it->chunkROIstart/reductions;
it->imageStart = it->imageROIstart/reductions;
it->chunkROIstart = Vec<size_t>(0,0,0);
it->imageROIstart = it->imageROIstart/reductions;
it->imageEnd = it->imageEnd/reductions;
it->chunkROIend = it->chunkROIend/reductions;
it->imageEnd = it->imageROIend/reductions;
it->imageROIend = it->imageROIend/reductions;
it->chunkROIend = it->imageEnd-it->imageStart;
calcBlockThread(it->getFullChunkSize(),deviceProp,it->blocks,it->threads);
}
@@ -471,9 +514,33 @@ DevicePixelType* CudaProcessBuffer::reduceImage(const DevicePixelType* imageIn,
orgIt->sendROI(imageIn,dims,deviceImageIn);
deviceImageOut->setDims(reducedIt->getFullChunkSize());
sharedMemorysize = reductions.product() * reducedIt->threads.x * reducedIt->threads.y * reducedIt->threads.z;
//cudaRuduceImage<<<reducedIt->blocks,reducedIt->threads,sharedMemorysize>>>(*deviceImageIn,*deviceImageOut,reductions);
cudaRuduceImage<<<reducedIt->blocks,reducedIt->threads>>>(*deviceImageIn,*deviceImageOut,reductions);
dim3 blocks(reducedIt->blocks);
dim3 threads(reducedIt->threads);
double threadVolume = threads.x * threads.y * threads.z;
double newThreadVolume = (double)deviceProp.sharedMemPerBlock/(sizeof(DevicePixelType)*reductions.product());
double alpha = pow(threadVolume/newThreadVolume,1.0/3.0);
threads.x = threads.x / alpha;
threads.y = threads.y / alpha;
threads.z = threads.z / alpha;
if (threads.x*threads.y*threads.z>deviceProp.maxThreadsPerBlock)
{
unsigned int maxThreads = pow(deviceProp.maxThreadsPerBlock,1.0/3.0);
threads.x = maxThreads;
threads.y = maxThreads;
threads.z = maxThreads;
}
blocks.x = ceil((double)reducedIt->getFullChunkSize().x / threads.x);
blocks.y = ceil((double)reducedIt->getFullChunkSize().y / threads.y);
blocks.z = ceil((double)reducedIt->getFullChunkSize().z / threads.z);
size_t sharedMemorysize = reductions.product() * threads.x * threads.y * threads.z;
cudaMedianImageReduction<<<blocks,threads,sharedMemorysize>>>(*deviceImageIn, *deviceImageOut, reductions);
//cudaMeanImageReduction<<<blocks,threads>>>(*deviceImageIn,*deviceImageOut,reductions);
reducedIt->retriveROI(reducedImage,reducedDims,deviceImageOut);
@@ -484,6 +551,8 @@ DevicePixelType* CudaProcessBuffer::reduceImage(const DevicePixelType* imageIn,
delete deviceImageIn;
delete deviceImageOut;
cudaThreadExit();
return reducedImage;
}
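Note: when the rescaled block in reduceImage still exceeds deviceProp.maxThreadsPerBlock, the clamp falls back to a cube with cbrt(maxThreadsPerBlock) threads per dimension. For the common limit of 1024 threads, cbrt(1024) ≈ 10.08 truncates to 10, giving a 10x10x10 = 1000-thread block just under the limit, after which the grid and the shared-memory request are recomputed from the clamped shape.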
@@ -211,34 +211,8 @@ public:
// /*
// * Filters image where each pixel is the median of its neighborhood
// */
void medianFilter(Vec<size_t> neighborhood);
// {
// static dim3 localBlocks = blocks;
// static dim3 localThreads = threads;
// size_t sharedMemorySize = neighborhood.product()*localThreads.x*localThreads.y*localThreads.z;
// if (sizeof(ImagePixelType)*sharedMemorySize>deviceProp.sharedMemPerBlock)
// {
// float maxThreads = (float)deviceProp.sharedMemPerBlock/(sizeof(ImagePixelType)*neighborhood.product());
// size_t threadDim = (size_t)pow(maxThreads,1/3.0f);
// localThreads.x = (unsigned int)threadDim;
// localThreads.y = (unsigned int)threadDim;
// localThreads.z = (unsigned int)threadDim;
//
// localBlocks.x = (size_t)ceil((float)imageDims.x/localThreads.x);
// localBlocks.y = (size_t)ceil((float)imageDims.y/localThreads.y);
// localBlocks.z = (size_t)ceil((float)imageDims.z/localThreads.z);
//
// sharedMemorySize = neighborhood.product()*localThreads.x*localThreads.y*localThreads.z;
// }
//
// #if CUDA_CALLS_ON
// cudaMedianFilter<<<localBlocks,localThreads,sizeof(ImagePixelType)*sharedMemorySize>>>(*getCurrentBuffer(),*getNextBuffer(),
// neighborhood);
// #endif
//
// incrementBufferNumber();
// }
//
DevicePixelType* medianFilter(const DevicePixelType* imageIn, Vec<size_t> dims, Vec<size_t> neighborhood, DevicePixelType** imageOut=NULL);
// /*
// * Sets each pixel to the min value of its neighborhood
// * Erodes structures