Commit 3c5a930b authored by Eric Wait

Added a median reduction kernel

parent d973ba83
@@ -30,7 +30,8 @@ __global__ void cudaMeanFilter( CudaImageContainer imageIn, CudaImageContainer i
}
}
imageOut[coordinate] = val/kernelVolume;
//imageOut[coordinate] = val/kernelVolume;
imageOut[coordinate] = coordinate.y;
}
}
@@ -165,26 +166,18 @@ __global__ void cudaMedianFilter( CudaImageContainer imageIn, CudaImageContainer
if (coordinate<imageIn.getDeviceDims())
{
int kernelVolume = 0;
DeviceVec<size_t> kernelMidIdx;
DeviceVec<size_t> curCoordIm;
DeviceVec<size_t> curCoordKrn;
kernelMidIdx.x = kernelDims.x/2;
kernelMidIdx.y = kernelDims.y/2;
kernelMidIdx.z = kernelDims.z/2;
DeviceVec<size_t> kernelDims = hostKernelDims;
DeviceVec<size_t> halfKernal = kernelDims/2;
//find if the kernel will go off the edge of the image
curCoordIm.z = (size_t) max(0,(int)coordinate.z-(int)kernelMidIdx.z);
curCoordKrn.z = ((int)coordinate.z-(int)kernelMidIdx.z>=0) ? (0) : (kernelMidIdx.z-coordinate.z);
for (; curCoordIm.z<imageIn.getDepth() && curCoordKrn.z<kernelDims.z; ++curCoordIm.z, ++curCoordKrn.z)
DeviceVec<size_t> curCoordIm = coordinate - halfKernal;
curCoordIm.z = (coordinate.z<halfKernal.z) ? 0 : coordinate.z-halfKernal.z;
for (; curCoordIm.z<coordinate.z+halfKernal.z && curCoordIm.z<imageIn.getDeviceDims().z; ++curCoordIm.z)
{
curCoordIm.y = (size_t)max(0,(int)coordinate.y-(int)kernelMidIdx.y);
curCoordKrn.y = ((int)coordinate.y-(int)kernelMidIdx.y>=0) ? (0) : (kernelMidIdx.y-coordinate.y);
for (; curCoordIm.y<imageIn.getHeight() && curCoordKrn.y<kernelDims.y; ++curCoordIm.y, ++curCoordKrn.y)
curCoordIm.y = (coordinate.y<halfKernal.y) ? 0 : coordinate.y-halfKernal.y/2;
for (; curCoordIm.y<coordinate.y+halfKernal.y && curCoordIm.y<imageIn.getDeviceDims().y; ++curCoordIm.y)
{
curCoordIm.x = (size_t)max(0,(int)coordinate.x-(int)kernelMidIdx.x);
curCoordKrn.x = ((int)coordinate.x-(int)kernelMidIdx.x>=0) ? (0) : (kernelMidIdx.x-coordinate.x);
for (; curCoordIm.x<imageIn.getWidth() && curCoordKrn.x<kernelDims.x; ++curCoordIm.x, ++curCoordKrn.x)
curCoordIm.x = (coordinate.x<halfKernal.x) ? 0 : coordinate.x-halfKernal.x/2;
for (; curCoordIm.x<coordinate.x+halfKernal.x && curCoordIm.x<imageIn.getDeviceDims().x; ++curCoordIm.x)
{
vals[kernelVolume+offset] = imageIn[curCoordIm];
++kernelVolume;
@@ -193,8 +186,9 @@ __global__ void cudaMedianFilter( CudaImageContainer imageIn, CudaImageContainer
}
imageOut[coordinate] = cudaFindMedian(vals+offset,kernelVolume);
__syncthreads();
//imageOut[coordinate] = coordinate.y;
}
__syncthreads();
}
__global__ void cudaMultAddFilter( CudaImageContainer* imageIn, CudaImageContainer* imageOut, Vec<size_t> hostKernelDims, size_t kernelOffset/*=0*/ )
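Note: cudaFindMedian is called by both median kernels in this commit, but its body lies outside the visible hunks. Purely as an illustration of what such a helper does, a minimal device-side sketch follows; the name exampleFindMedian, the standalone typedef, and the insertion-sort choice are assumptions, not the repository's implementation.
// Assumed pixel type so the sketch stands alone; the project defines its own
// DevicePixelType elsewhere.
typedef unsigned char DevicePixelTypeSketch;
// Sort the n gathered neighborhood values in place and return the middle one.
// Insertion sort is adequate for the small windows a median filter gathers
// (e.g. 3x3x3 = 27 values per thread).
__device__ DevicePixelTypeSketch exampleFindMedian(DevicePixelTypeSketch* vals, int n)
{
	for (int i = 1; i < n; ++i)
	{
		DevicePixelTypeSketch cur = vals[i];
		int j = i - 1;
		while (j >= 0 && vals[j] > cur)
		{
			vals[j + 1] = vals[j];   // shift larger values toward the end
			--j;
		}
		vals[j + 1] = cur;
	}
	return vals[n / 2];              // median of the sorted run
}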
@@ -644,9 +638,9 @@ __global__ void cudaSumArray(CudaImageContainer arrayIn, double* arrayOut, size_
arrayOut[blockIdx.x] = sdata[0];
}
__global__ void cudaRuduceImage( CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions )
__global__ void cudaMedianImageReduction( CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions)
{
//extern __shared__ DevicePixelType vals[];
extern __shared__ DevicePixelType vals[];
DeviceVec<size_t> reductions = hostReductions;
DeviceVec<size_t> coordinateOut;
coordinateOut.x = threadIdx.x + blockIdx.x * blockDim.x;
@@ -658,28 +652,24 @@ __global__ void cudaRuduceImage( CudaImageContainer imageIn, CudaImageContainer
if (coordinateOut<imageOut.getDeviceDims())
{
int kernelVolume = 0;
double val = 0.0;
DeviceVec<size_t> mins(coordinateOut*DeviceVec<size_t>(reductions));
DeviceVec<size_t> maxs = DeviceVec<size_t>::min(mins+reductions, imageIn.getDeviceDims());
DeviceVec<size_t> currCorrdIn(mins);
for (currCorrdIn.z=mins.z; currCorrdIn.z<maxs.z; ++currCorrdIn.z)
{
for (currCorrdIn.y=mins.y; currCorrdIn.y<maxs.y; ++currCorrdIn.y)
{
for (currCorrdIn.x=mins.x; currCorrdIn.x<maxs.x; ++currCorrdIn.x)
{
//vals[offset+kernelVolume] += imageIn[currCorrdIn];
val += (double)imageIn[currCorrdIn];
++kernelVolume;
}
}
}
imageOut[coordinateOut] = val/kernelVolume;
//imageOut[coordinateOut] = cudaFindMedian(vals+offset,kernelVolume);
//__syncthreads();
for (currCorrdIn.z=mins.z; currCorrdIn.z<maxs.z; ++currCorrdIn.z)
{
for (currCorrdIn.y=mins.y; currCorrdIn.y<maxs.y; ++currCorrdIn.y)
{
for (currCorrdIn.x=mins.x; currCorrdIn.x<maxs.x; ++currCorrdIn.x)
{
vals[offset+kernelVolume] = imageIn[currCorrdIn];
++kernelVolume;
}
}
}
imageOut[coordinateOut] = cudaFindMedian(vals+offset,kernelVolume);
}
__syncthreads();
}
__global__ void cudaMaximumIntensityProjection( CudaImageContainer imageIn, CudaImageContainer imageOut )
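Note: both median kernels index the dynamic shared array as vals[offset + k], but the computation of offset sits in context lines this diff does not show. A typical derivation, given here only as an assumption that mirrors that usage, is the linear thread index within the block times the per-thread window size.
// Assumed helper: each thread gets a private slice of the extern __shared__
// array, sized to hold one full neighborhood/reduction window.
__device__ size_t exampleSharedOffset(size_t windowVolume)
{
	size_t linearThread = threadIdx.x
		+ threadIdx.y * blockDim.x
		+ threadIdx.z * blockDim.x * blockDim.y;
	return linearThread * windowVolume;   // windowVolume slots per thread
}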
@@ -803,4 +793,36 @@ __global__ void cudaMask( const CudaImageContainer imageIn1, const CudaImageCont
imageOut[coordinate] = val;
}
}
\ No newline at end of file
}
__global__ void cudaMeanImageReduction(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions)
{
DeviceVec<size_t> reductions = hostReductions;
DeviceVec<size_t> coordinateOut;
coordinateOut.x = threadIdx.x + blockIdx.x * blockDim.x;
coordinateOut.y = threadIdx.y + blockIdx.y * blockDim.y;
coordinateOut.z = threadIdx.z + blockIdx.z * blockDim.z;
if (coordinateOut<imageOut.getDeviceDims())
{
int kernelVolume = 0;
double val = 0;
DeviceVec<size_t> mins(coordinateOut*reductions);
DeviceVec<size_t> maxs = DeviceVec<size_t>::min(mins+reductions, imageIn.getDeviceDims());
DeviceVec<size_t> currCorrdIn(mins);
for (currCorrdIn.z=mins.z; currCorrdIn.z<maxs.z; ++currCorrdIn.z)
{
for (currCorrdIn.y=mins.y; currCorrdIn.y<maxs.y; ++currCorrdIn.y)
{
for (currCorrdIn.x=mins.x; currCorrdIn.x<maxs.x; ++currCorrdIn.x)
{
val += (double)imageIn[currCorrdIn];
++kernelVolume;
}
}
}
imageOut[coordinateOut] = val/kernelVolume;
}
}
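Note: both reduction kernels map one output voxel to a window of input voxels via mins = coordinateOut*reductions and maxs = min(mins+reductions, inputDims); with reductions of (2,2,2), output voxel (3,2,1) reads the input block [6,8) x [4,6) x [2,4). A host-side sketch of the same mean reduction over a flat row-major array follows; the function name, the double element type, and the ceiling-division output size are illustrative assumptions.
#include <algorithm>
#include <cstddef>
#include <vector>
// Host illustration of the kernel's indexing: output voxel (ox,oy,oz) averages
// the input window starting at (ox*rx, oy*ry, oz*rz), clamped to the input
// dimensions exactly as mins/maxs are clamped above.
std::vector<double> meanReduceHost(const std::vector<double>& in,
                                   size_t w, size_t h, size_t d,    // input dims
                                   size_t rx, size_t ry, size_t rz) // reductions
{
	size_t ow = (w + rx - 1) / rx, oh = (h + ry - 1) / ry, od = (d + rz - 1) / rz;
	std::vector<double> out(ow * oh * od, 0.0);
	for (size_t oz = 0; oz < od; ++oz)
	for (size_t oy = 0; oy < oh; ++oy)
	for (size_t ox = 0; ox < ow; ++ox)
	{
		size_t zEnd = std::min((oz + 1) * rz, d);
		size_t yEnd = std::min((oy + 1) * ry, h);
		size_t xEnd = std::min((ox + 1) * rx, w);
		double sum = 0.0;
		size_t count = 0;
		for (size_t z = oz * rz; z < zEnd; ++z)
		for (size_t y = oy * ry; y < yEnd; ++y)
		for (size_t x = ox * rx; x < xEnd; ++x)
		{
			sum += in[(z * h + y) * w + x];   // x varies fastest
			++count;
		}
		out[(oz * oh + oy) * ow + ox] = sum / count;
	}
	return out;
}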
@@ -52,7 +52,9 @@ __global__ void cudaPolyTransferFuncImage(CudaImageContainer imageIn, CudaImageC
__global__ void cudaSumArray(CudaImageContainer arrayIn, double* arrayOut, size_t n);
__global__ void cudaRuduceImage(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions);
__global__ void cudaMeanImageReduction(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions);
__global__ void cudaMedianImageReduction(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions);
__global__ void cudaMaximumIntensityProjection(CudaImageContainer imageIn, CudaImageContainer imageOut);
@@ -7,9 +7,6 @@ const double MAX_MEM_AVAIL = 0.95;
std::vector<ImageChunk> calculateBuffers(Vec<size_t> imageDims, int numBuffersNeeded, size_t memAvailable, const cudaDeviceProp& prop, Vec<size_t> kernalDims/*=Vec<size_t>(0,0,0)*/)
{
// clearDeviceBuffers();
// deviceImageBuffers.resize(numBuffersNeeded);
size_t numVoxels = (size_t)(memAvailable / (sizeof(HostPixelType)*numBuffersNeeded));
Vec<size_t> overlapVolume;
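Note: numVoxels above splits the usable device memory evenly across the requested buffers. As a worked example (the 1-byte HostPixelType is an assumed size), 0.95 of a 4 GiB card leaves roughly 4.08e9 bytes, and two 1-byte-per-voxel buffers give about 2.04e9 voxels per buffer, which then drives the chunk sizing below.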
@@ -182,7 +179,6 @@ std::vector<ImageChunk> calculateChunking(Vec<size_t> orgImageDims, Vec<size_t>
return localChunks;
}
CudaProcessBuffer::CudaProcessBuffer(int device/*=0*/)
{
defaults();
@@ -376,9 +372,50 @@ DevicePixelType* CudaProcessBuffer::meanFilter(const DevicePixelType* imageIn, V
return meanImage;
}
void CudaProcessBuffer::medianFilter(Vec<size_t> neighborhood)
DevicePixelType* CudaProcessBuffer::medianFilter(const DevicePixelType* imageIn, Vec<size_t> dims, Vec<size_t> neighborhood, DevicePixelType** imageOut/*=NULL*/)
{
throw std::logic_error("The method or operation is not implemented.");
orgImageDims = dims;
DevicePixelType* medianImage;
if (imageOut==NULL)
medianImage = new DevicePixelType[orgImageDims.product()];
else
medianImage = *imageOut;
std::vector<ImageChunk> chunks = calculateBuffers(dims,2,(size_t)(deviceProp.totalGlobalMem*MAX_MEM_AVAIL),deviceProp,neighborhood);
CudaImageContainerClean* deviceImageIn = new CudaImageContainerClean(chunks[0].getFullChunkSize(),device);
CudaImageContainerClean* deviceImageOut = new CudaImageContainerClean(chunks[0].getFullChunkSize(),device);
for (std::vector<ImageChunk>::iterator curChunk=chunks.begin(); curChunk!=chunks.end(); ++curChunk)
{
curChunk->sendROI(imageIn,dims,deviceImageIn);
deviceImageOut->setDims(curChunk->getFullChunkSize());
dim3 blocks(curChunk->blocks);
dim3 threads(curChunk->threads);
double threadVolume = threads.x * threads.y * threads.z;
double newThreadVolume = (double)deviceProp.sharedMemPerBlock/(sizeof(DevicePixelType)*neighborhood.product());
double alpha = pow(threadVolume/newThreadVolume,1.0/3.0);
threads.x = threads.x / alpha;
threads.y = threads.y / alpha;
threads.z = threads.z / alpha;
blocks.x = ceil((double)curChunk->getFullChunkSize().x / threads.x);
blocks.y = ceil((double)curChunk->getFullChunkSize().y / threads.y);
blocks.z = ceil((double)curChunk->getFullChunkSize().z / threads.z);
size_t sharedMemorysize = neighborhood.product() * threads.x * threads.y * threads.z;
cudaMedianFilter<<<blocks,threads,sharedMemorysize>>>(*deviceImageIn,*deviceImageOut,neighborhood);
curChunk->retriveROI(medianImage,dims,deviceImageOut);
}
delete deviceImageIn;
delete deviceImageOut;
return medianImage;
}
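Note: the launch sizing above shrinks each block dimension by the cube root of the ratio between the current thread volume and the largest volume whose per-thread neighborhood buffer still fits in shared memory. With 48 KB of shared memory per block, a 2-byte DevicePixelType (an assumed size), and a 3x3x3 neighborhood, newThreadVolume = 49152/(2*27) ≈ 910, so a 1024-thread block shrinks by alpha = cbrt(1024/910) ≈ 1.04 per dimension. A standalone host-side sketch of that calculation follows; the struct and function names are illustrative, and it reports the shared-memory request in bytes.
#include <cuda_runtime.h>
#include <algorithm>
#include <cmath>
struct LaunchShape { dim3 blocks; dim3 threads; size_t sharedBytes; };
// Illustrative sizing helper: scale the candidate block so one neighborhood's
// worth of pixels per thread fits in shared memory, then rebuild the grid to
// cover the chunk.
LaunchShape sizeMedianLaunch(dim3 threads, size_t chunkX, size_t chunkY, size_t chunkZ,
                             size_t neighborhoodVol, size_t sharedMemPerBlock,
                             size_t pixelBytes)
{
	double threadVolume = (double)threads.x * threads.y * threads.z;
	double maxVolume = (double)sharedMemPerBlock / (pixelBytes * neighborhoodVol);
	if (threadVolume > maxVolume)
	{
		double alpha = std::pow(threadVolume / maxVolume, 1.0 / 3.0);
		threads.x = std::max(1u, (unsigned int)(threads.x / alpha));
		threads.y = std::max(1u, (unsigned int)(threads.y / alpha));
		threads.z = std::max(1u, (unsigned int)(threads.z / alpha));
	}
	LaunchShape out;
	out.threads = threads;
	out.blocks = dim3((unsigned int)std::ceil((double)chunkX / threads.x),
	                  (unsigned int)std::ceil((double)chunkY / threads.y),
	                  (unsigned int)std::ceil((double)chunkZ / threads.z));
	out.sharedBytes = neighborhoodVol * threads.x * threads.y * threads.z * pixelBytes;
	return out;
}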
void CudaProcessBuffer::minFilter(Vec<size_t> neighborhood, double* kernel/*=NULL*/)
@@ -445,17 +482,23 @@ DevicePixelType* CudaProcessBuffer::reduceImage(const DevicePixelType* imageIn,
double ratio = (double)reducedDims.product() / dims.product();
if (ratio==1.0)
{
memcpy(reducedImage,imageIn,sizeof(DevicePixelType)*reducedDims.product());
return reducedImage;
}
std::vector<ImageChunk> orgChunks = calculateBuffers(dims,1,(size_t)(deviceProp.totalGlobalMem*MAX_MEM_AVAIL*(1-ratio)),deviceProp,reductions);
std::vector<ImageChunk> reducedChunks = orgChunks;
for (std::vector<ImageChunk>::iterator it=reducedChunks.begin(); it!=reducedChunks.end(); ++it)
{
it->imageStart = it->imageStart/reductions;
it->chunkROIstart = it->chunkROIstart/reductions;
it->imageStart = it->imageROIstart/reductions;
it->chunkROIstart = Vec<size_t>(0,0,0);
it->imageROIstart = it->imageROIstart/reductions;
it->imageEnd = it->imageEnd/reductions;
it->chunkROIend = it->chunkROIend/reductions;
it->imageEnd = it->imageROIend/reductions;
it->imageROIend = it->imageROIend/reductions;
it->chunkROIend = it->imageEnd-it->imageStart;
calcBlockThread(it->getFullChunkSize(),deviceProp,it->blocks,it->threads);
}
@@ -471,9 +514,33 @@ DevicePixelType* CudaProcessBuffer::reduceImage(const DevicePixelType* imageIn,
orgIt->sendROI(imageIn,dims,deviceImageIn);
deviceImageOut->setDims(reducedIt->getFullChunkSize());
sharedMemorysize = reductions.product() * reducedIt->threads.x * reducedIt->threads.y * reducedIt->threads.z;
//cudaRuduceImage<<<reducedIt->blocks,reducedIt->threads,sharedMemorysize>>>(*deviceImageIn,*deviceImageOut,reductions);
cudaRuduceImage<<<reducedIt->blocks,reducedIt->threads>>>(*deviceImageIn,*deviceImageOut,reductions);
dim3 blocks(reducedIt->blocks);
dim3 threads(reducedIt->threads);
double threadVolume = threads.x * threads.y * threads.z;
double newThreadVolume = (double)deviceProp.sharedMemPerBlock/(sizeof(DevicePixelType)*reductions.product());
double alpha = pow(threadVolume/newThreadVolume,1.0/3.0);
threads.x = threads.x / alpha;
threads.y = threads.y / alpha;
threads.z = threads.z / alpha;
if (threads.x*threads.y*threads.z>deviceProp.maxThreadsPerBlock)
{
unsigned int maxThreads = pow(deviceProp.maxThreadsPerBlock,1.0/3.0);
threads.x = maxThreads;
threads.y = maxThreads;
threads.z = maxThreads;
}
blocks.x = ceil((double)reducedIt->getFullChunkSize().x / threads.x);
blocks.y = ceil((double)reducedIt->getFullChunkSize().y / threads.y);
blocks.z = ceil((double)reducedIt->getFullChunkSize().z / threads.z);
size_t sharedMemorysize = reductions.product() * threads.x * threads.y * threads.z;
cudaMedianImageReduction<<<blocks,threads,sharedMemorysize>>>(*deviceImageIn, *deviceImageOut, reductions);
//cudaMeanImageReduction<<<blocks,threads>>>(*deviceImageIn,*deviceImageOut,reductions);
reducedIt->retriveROI(reducedImage,reducedDims,deviceImageOut);
@@ -484,6 +551,8 @@ DevicePixelType* CudaProcessBuffer::reduceImage(const DevicePixelType* imageIn,
delete deviceImageIn;
delete deviceImageOut;
cudaThreadExit();
return reducedImage;
}
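Note: when the rescaled block in reduceImage still exceeds deviceProp.maxThreadsPerBlock, the clamp falls back to a cube with cbrt(maxThreadsPerBlock) threads per dimension. For the common limit of 1024 threads, cbrt(1024) ≈ 10.08 truncates to 10, giving a 10x10x10 = 1000-thread block just under the limit, after which the grid and the shared-memory request are recomputed from the clamped shape.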
@@ -211,34 +211,8 @@ public:
// /*
// * Filters image where each pixel is the median of its neighborhood
// */
void medianFilter(Vec<size_t> neighborhood);
// {
// static dim3 localBlocks = blocks;
// static dim3 localThreads = threads;
// size_t sharedMemorySize = neighborhood.product()*localThreads.x*localThreads.y*localThreads.z;
// if (sizeof(ImagePixelType)*sharedMemorySize>deviceProp.sharedMemPerBlock)
// {
// float maxThreads = (float)deviceProp.sharedMemPerBlock/(sizeof(ImagePixelType)*neighborhood.product());
// size_t threadDim = (size_t)pow(maxThreads,1/3.0f);
// localThreads.x = (unsigned int)threadDim;
// localThreads.y = (unsigned int)threadDim;
// localThreads.z = (unsigned int)threadDim;
//
// localBlocks.x = (size_t)ceil((float)imageDims.x/localThreads.x);
// localBlocks.y = (size_t)ceil((float)imageDims.y/localThreads.y);
// localBlocks.z = (size_t)ceil((float)imageDims.z/localThreads.z);
//
// sharedMemorySize = neighborhood.product()*localThreads.x*localThreads.y*localThreads.z;
// }
//
// #if CUDA_CALLS_ON
// cudaMedianFilter<<<localBlocks,localThreads,sizeof(ImagePixelType)*sharedMemorySize>>>(*getCurrentBuffer(),*getNextBuffer(),
// neighborhood);
// #endif
//
// incrementBufferNumber();
// }
//
DevicePixelType* medianFilter(const DevicePixelType* imageIn, Vec<size_t> dims, Vec<size_t> neighborhood, DevicePixelType** imageOut=NULL);
// /*
// * Sets each pixel to the min value of its neighborhood
// * Erodes structures