Skip to content
Snippets Groups Projects
Commit b16ce675 authored by Eric Wait's avatar Eric Wait
Browse files

Rearranged cuda kernels

parent bcf19b3a
No related branches found
No related tags found
No related merge requests found
Showing
with 693 additions and 862 deletions
#include "CudaKernels.cuh"

#include <float.h>
// Adds the constant `factor` to every voxel of imageIn1, clamps the result
// to [minValue, maxValue], and writes it to imageOut.  Expects a 3D launch
// covering imageIn1's dimensions; threads outside the image exit early.
__global__ void cudaAddFactor( CudaImageContainer imageIn1, CudaImageContainer imageOut, double factor, DevicePixelType minValue,
							  DevicePixelType maxValue )
{
	DeviceVec<size_t> voxel;
	voxel.x = blockDim.x * blockIdx.x + threadIdx.x;
	voxel.y = blockDim.y * blockIdx.y + threadIdx.y;
	voxel.z = blockDim.z * blockIdx.z + threadIdx.z;

	if (!(voxel<imageIn1.getDeviceDims()))
		return;

	// Work in double, clamp, then store (implicit conversion to pixel type).
	double shifted = imageIn1[voxel] + factor;
	double clamped = max((double)minValue, shifted);
	clamped = min((double)maxValue, clamped);
	imageOut[voxel] = clamped;
}
#include "CudaKernels.cuh"
// Computes imageOut = clamp(imageIn1 + factor*imageIn2, minValue, maxValue).
// Expects a 3D launch covering imageIn1's dimensions; out-of-range threads
// do nothing.
// FIX: accumulate in double and clamp BEFORE converting back to the pixel
// type.  The original stored the sum into a DevicePixelType first, which
// could wrap/truncate and defeat the subsequent min/max clamp (contrast
// cudaAddFactor, which clamps in double).
__global__ void cudaAddTwoImagesWithFactor( CudaImageContainer imageIn1, CudaImageContainer imageIn2, CudaImageContainer imageOut, double factor,
										   DevicePixelType minValue, DevicePixelType maxValue )
{
	DeviceVec<size_t> coordinate;
	coordinate.x = threadIdx.x + blockIdx.x * blockDim.x;
	coordinate.y = threadIdx.y + blockIdx.y * blockDim.y;
	coordinate.z = threadIdx.z + blockIdx.z * blockDim.z;

	if (coordinate<imageIn1.getDeviceDims())
	{
		// "addend" (was misleadingly named "subtractor"): factor may be
		// negative, in which case this behaves as a scaled subtraction.
		double addend = factor*(double)imageIn2[coordinate];
		double outValue = (double)imageIn1[coordinate] + addend;
		imageOut[coordinate] = min((double)maxValue,max((double)minValue,outValue));
	}
}
#include "CudaKernels.cuh"
// Partition helper for the device quickselect below.  Partitions [pB, pE)
// in place around the LAST element as pivot: on return, elements strictly
// greater than the pivot sit to its right, the rest to its left.  Returns a
// pointer to the pivot's final resting position.
__device__ DevicePixelType* SubDivide(DevicePixelType* pB, DevicePixelType* pE)
{
// Use the last element (at pE-1) as the pivot value.
DevicePixelType* pPivot = --pE;
const DevicePixelType pivot = *pPivot;
while (pB < pE)
{
if (*pB > pivot)
{
// *pB belongs right of the pivot: swap it toward the tail and shrink
// the unexamined region from the right.
--pE;
DevicePixelType temp = *pB;
*pB = *pE;
*pE = temp;
} else
++pB;
}
// Drop the pivot into the gap between the two partitions.
DevicePixelType temp = *pPivot;
*pPivot = *pE;
*pE = temp;
return pE;
}
// Device quickselect: after the call, the k-th smallest element of [pB, pE)
// sits at pB[k].  The range is partially reordered in place.
__device__ void SelectElement(DevicePixelType* pB, DevicePixelType* pE, size_t k)
{
while (true)
{
// Partition; n is the pivot's rank within the current [pB, pE) window.
DevicePixelType* pPivot = SubDivide(pB, pE);
size_t n = pPivot - pB;
if (n == k)
break;
if (n > k)
pE = pPivot; // target is in the left partition
else
{
// Target is in the right partition; skip the pivot and the left side,
// and re-base k accordingly.
pB = pPivot + 1;
k -= (n + 1);
}
}
}
// Returns the median of vals[0..numVals).  The buffer is partially
// reordered by the underlying quickselect.
__device__ DevicePixelType cudaFindMedian(DevicePixelType* vals, int numVals)
{
	const size_t medianIdx = numVals/2;
	SelectElement(vals, vals+numVals, medianIdx);
	return vals[medianIdx];
}
#include "CudaKernels.cuh"
// Per-block min/max reduction over arrayIn[0..n).  Block b writes its partial
// results to minArrayOut[b] / maxArrayOut[b]; the host (or a follow-up
// launch) reduces those partials.
//
// Dynamic shared memory required: 2*blockDim.x*sizeof(double).  The first
// blockDim.x doubles hold the running maxima, the next blockDim.x the minima.
// blockDim.x must be a power of two (the original's unrolled tree assumed
// the same).
//
// FIXES vs. original:
//  * Two `extern __shared__` declarations alias the SAME dynamic allocation
//    in CUDA, so minData and maxData were clobbering each other; one buffer
//    is now split explicitly.
//  * The grid-stride loop overwrote each thread's shared slot on every pass
//    instead of folding in earlier values; accumulation now uses registers.
//  * Threads that saw no element (i >= n from the start) left shared memory
//    uninitialized; DBL_MAX/-DBL_MAX sentinels make them neutral.
//  * The blockDim>=64 tail step read [tid+64] instead of [tid+32], and
//    __syncthreads() was called inside the divergent tid<32 branch; the
//    reduction is now a uniform tree with a full barrier at every step.
__global__ void cudaFindMinMax( CudaImageContainer arrayIn, double* minArrayOut, double* maxArrayOut, size_t n )
{
	extern __shared__ double sharedMinMax[];
	double* maxData = sharedMinMax;               // [0, blockDim.x)
	double* minData = sharedMinMax + blockDim.x;  // [blockDim.x, 2*blockDim.x)

	size_t tid = threadIdx.x;
	size_t i = blockIdx.x*blockDim.x*2 + tid;
	size_t gridSize = blockDim.x*2*gridDim.x;

	// Sentinels: neutral elements for max and min respectively.
	double localMax = -DBL_MAX;
	double localMin = DBL_MAX;

	// Grid-stride pass; each thread folds two elements per iteration.
	while (i<n)
	{
		double val = arrayIn[i];
		if (val>localMax) localMax = val;
		if (val<localMin) localMin = val;

		if (i+blockDim.x<n)
		{
			double val2 = arrayIn[i+blockDim.x];
			if (val2>localMax) localMax = val2;
			if (val2<localMin) localMin = val2;
		}
		i += gridSize;
	}

	maxData[tid] = localMax;
	minData[tid] = localMin;
	__syncthreads();

	// Uniform shared-memory tree reduction; every thread reaches every
	// barrier, so no divergent __syncthreads().
	for (size_t s = blockDim.x/2; s>0; s >>= 1)
	{
		if (tid<s)
		{
			if (maxData[tid]<maxData[tid+s]) maxData[tid] = maxData[tid+s];
			if (minData[tid]>minData[tid+s]) minData[tid] = minData[tid+s];
		}
		__syncthreads();
	}

	if (tid==0)
	{
		minArrayOut[blockIdx.x] = minData[0];
		maxArrayOut[blockIdx.x] = maxData[0];
	}
}
#include "CudaKernels.cuh"
// Copies the sub-volume starting at hostStartPos with extent hostNewSize out
// of imageIn into imageOut, re-origined at zero.  Threads outside the ROI or
// outside imageIn do nothing.
__global__ void cudaGetROI( CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostStartPos, Vec<size_t> hostNewSize )
{
	DeviceVec<size_t> roiSize = hostNewSize;
	DeviceVec<size_t> roiStart = hostStartPos;

	DeviceVec<size_t> src;
	src.x = blockDim.x*blockIdx.x + threadIdx.x;
	src.y = blockDim.y*blockIdx.y + threadIdx.y;
	src.z = blockDim.z*blockIdx.z + threadIdx.z;

	bool insideRoi = src>=roiStart && src<roiStart+roiSize;
	if (insideRoi && src<imageIn.getDeviceDims())
		imageOut[src-roiStart] = imageIn[src];
}
#include "CudaKernels.cuh"
// Accumulates an intensity histogram of imageIn into `histogram`
// (NUM_BINS bins).
// PRECONDITION (review): blockDim.x must equal NUM_BINS -- threadIdx.x is
// used both to zero and to flush exactly one shared bin per thread; confirm
// at the launch site.
__global__ void cudaHistogramCreate( CudaImageContainer imageIn, size_t* histogram )
{
//This code is modified from that of Sanders - Cuda by Example
// Per-block shared histogram keeps the hot atomics off global memory.
__shared__ size_t tempHisto[NUM_BINS];
tempHisto[threadIdx.x] = 0;
__syncthreads();
// Grid-stride walk over the flattened image; the pixel value is the bin.
// NOTE(review): i and stride are int -- images with >2^31 voxels would
// overflow; consider size_t.
int i = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
while (i < imageIn.getDeviceDims().product())
{
atomicAdd(&(tempHisto[imageIn[i]]), 1);
i += stride;
}
__syncthreads();
// Merge this block's partial histogram into the global result.
atomicAdd(&(histogram[threadIdx.x]), tempHisto[threadIdx.x]);
}
This diff is collapsed.
......@@ -6,64 +6,57 @@
#include "cuda_runtime.h"
#include "CudaImageContainer.cuh"
__constant__ float cudaConstKernel[MAX_KERNEL_DIM*MAX_KERNEL_DIM*MAX_KERNEL_DIM];
__constant__ extern float cudaConstKernel[MAX_KERNEL_DIM*MAX_KERNEL_DIM*MAX_KERNEL_DIM];
__device__ DevicePixelType* SubDivide(DevicePixelType* pB, DevicePixelType* pE);
__global__ void cudaAddFactor(CudaImageContainer imageIn1, CudaImageContainer imageOut, double factor, DevicePixelType minValue,
DevicePixelType maxValue);
__device__ void SelectElement(DevicePixelType* pB, DevicePixelType* pE, size_t k);
__global__ void cudaAddTwoImagesWithFactor(CudaImageContainer imageIn1, CudaImageContainer imageIn2, CudaImageContainer imageOut,
double factor, DevicePixelType minValue, DevicePixelType maxValue);
__device__ DevicePixelType cudaFindMedian(DevicePixelType* vals, int numVals);
__global__ void cudaMedianFilter(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostKernelDims);
__global__ void cudaFindMinMax(CudaImageContainer imageIn, double* minArrayOut, double* maxArrayOut, size_t n);
__global__ void cudaMeanFilter(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostKernelDims);
__global__ void cudaGetROI(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostStartPos, Vec<size_t> hostNewSize);
__global__ void cudaMultiplyImage(CudaImageContainer imageIn, CudaImageContainer imageOut, double factor, DevicePixelType minValue,
DevicePixelType maxValue);
__global__ void cudaHistogramCreate( CudaImageContainer imageIn, size_t* histogram);
__global__ void cudaAddTwoImagesWithFactor(CudaImageContainer imageIn1, CudaImageContainer imageIn2, CudaImageContainer imageOut,
double factor, DevicePixelType minValue, DevicePixelType maxValue);
__global__ void cudaMask(const CudaImageContainer imageIn1, const CudaImageContainer imageIn2, CudaImageContainer imageOut,
DevicePixelType threshold);
__global__ void cudaMultiplyTwoImages(CudaImageContainer imageIn1, CudaImageContainer imageIn2, CudaImageContainer imageOut);
__global__ void cudaMaxFilter(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostKernelDims);
__global__ void cudaAddFactor(CudaImageContainer imageIn1, CudaImageContainer imageOut, double factor, DevicePixelType minValue,
DevicePixelType maxValue);
__global__ void cudaMaximumIntensityProjection(CudaImageContainer imageIn, CudaImageContainer imageOut);
__global__ void cudaMultAddFilter(CudaImageContainer* imageIn, CudaImageContainer* imageOut, Vec<size_t> hostKernelDims, size_t kernelOffset=0);
__global__ void cudaMeanFilter(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostKernelDims);
__global__ void cudaMinFilter(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostKernelDims);
__global__ void cudaMeanImageReduction(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions);
__global__ void cudaMaxFilter(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostKernelDims);
__global__ void cudaMedianFilter(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostKernelDims);
__global__ void cudaHistogramCreate( CudaImageContainer imageIn, size_t* histogram);
__global__ void cudaMedianImageReduction(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions);
__global__ void cudaHistogramCreateROI(CudaImageContainer imageIn, size_t* histogram, Vec<size_t> starts,
Vec<size_t> sizes);
__global__ void cudaMinFilter(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostKernelDims);
__global__ void cudaNormalizeHistogram(size_t* histogram, double* normHistogram, Vec<size_t> imageDims);
__global__ void cudaMultAddFilter(CudaImageContainer* imageIn, CudaImageContainer* imageOut, Vec<size_t> hostKernelDims, size_t kernelOffset=0);
__global__ void cudaThresholdImage(CudaImageContainer imageIn, CudaImageContainer imageOut, DevicePixelType threshold,
DevicePixelType minValue, DevicePixelType maxValue);
__global__ void cudaMultiplyImage(CudaImageContainer imageIn, CudaImageContainer imageOut, double factor, DevicePixelType minValue,
DevicePixelType maxValue);
__global__ void cudaFindMinMax(CudaImageContainer imageIn, double* minArrayOut, double* maxArrayOut, size_t n);
__global__ void cudaMultiplyTwoImages(CudaImageContainer imageIn1, CudaImageContainer imageIn2, CudaImageContainer imageOut);
__global__ void cudaNormalizeHistogram(size_t* histogram, double* normHistogram, Vec<size_t> imageDims);
__global__ void cudaPolyTransferFuncImage(CudaImageContainer imageIn, CudaImageContainer imageOut, double a, double b, double c,
DevicePixelType minPixelValue, DevicePixelType maxPixelValue);
__global__ void cudaSumArray(CudaImageContainer arrayIn, double* arrayOut, size_t n);
__global__ void cudaMeanImageReduction(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions);
__global__ void cudaMedianImageReduction(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions);
__global__ void cudaMaximumIntensityProjection(CudaImageContainer imageIn, CudaImageContainer imageOut);
__global__ void cudaPow(CudaImageContainer imageIn, CudaImageContainer imageOut, double p);
__global__ void cudaGetROI(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostStartPos, Vec<size_t> hostNewSize);
__global__ void cudaSumArray(CudaImageContainer arrayIn, double* arrayOut, size_t n);
__global__ void cudaPow(CudaImageContainer imageIn, CudaImageContainer imageOut, double p);
__global__ void cudaThresholdImage(CudaImageContainer imageIn, CudaImageContainer imageOut, DevicePixelType threshold,
DevicePixelType minValue, DevicePixelType maxValue);
__global__ void cudaUnmixing(const CudaImageContainer imageIn1, const CudaImageContainer imageIn2, CudaImageContainer imageOut1,
Vec<size_t> hostKernelDims, DevicePixelType minPixelValue, DevicePixelType maxPixelValue);
__global__ void cudaMask(const CudaImageContainer imageIn1, const CudaImageContainer imageIn2, CudaImageContainer imageOut,
DevicePixelType threshold);
\ No newline at end of file
#include "CudaKernels.cuh"
// Masked copy: where imageIn2 is at or below `threshold` the voxel of
// imageIn1 passes through; everywhere else the output is zeroed.
__global__ void cudaMask( const CudaImageContainer imageIn1, const CudaImageContainer imageIn2, CudaImageContainer imageOut,
						 DevicePixelType threshold )
{
	DeviceVec<size_t> pos;
	pos.x = blockDim.x*blockIdx.x + threadIdx.x;
	pos.y = blockDim.y*blockIdx.y + threadIdx.y;
	pos.z = blockDim.z*blockIdx.z + threadIdx.z;

	if (pos<imageIn1.getDeviceDims())
	{
		DevicePixelType passThrough = 0;
		if (imageIn2[pos] <= threshold)
			passThrough = imageIn1[pos];
		imageOut[pos] = passThrough;
	}
}
#include "CudaKernels.cuh"
__constant__ float cudaConstKernel[MAX_KERNEL_DIM*MAX_KERNEL_DIM*MAX_KERNEL_DIM];
// Morphological max (dilation) over the neighborhood selected by the
// positive entries of cudaConstKernel (dims = hostKernelDims); the kernel
// value scales the neighbor before the max is taken.  The window is clipped
// at the image edges.
// FIX: the kernel midpoint is now kernelDims/2, matching cudaMinFilter and
// cudaMultAddFilter.  It was (kernelDims+1)/2 here, which shifted the window
// by one voxel for odd kernel sizes relative to its min counterpart.
__global__ void cudaMaxFilter( CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostKernelDims )
{
	DeviceVec<size_t> coordinate;
	coordinate.x = threadIdx.x + blockIdx.x * blockDim.x;
	coordinate.y = threadIdx.y + blockIdx.y * blockDim.y;
	coordinate.z = threadIdx.z + blockIdx.z * blockDim.z;

	if (coordinate<imageIn.getDeviceDims())
	{
		DevicePixelType maxVal = imageIn[coordinate];
		DeviceVec<size_t> kernelDims = hostKernelDims;
		DeviceVec<size_t> kernelMidIdx;
		DeviceVec<size_t> curCoordIm;
		DeviceVec<size_t> curCoordKrn;

		kernelMidIdx.x = kernelDims.x/2;
		kernelMidIdx.y = kernelDims.y/2;
		kernelMidIdx.z = kernelDims.z/2;

		// Clamp the window start at the low image edge and start the kernel
		// index at the matching offset so image and kernel stay aligned.
		curCoordIm.z = (size_t) max(0,(int)coordinate.z-(int)kernelMidIdx.z);
		curCoordKrn.z = ((int)coordinate.z-(int)kernelMidIdx.z>=0) ? (0) : (kernelMidIdx.z-coordinate.z);
		for (; curCoordIm.z<imageIn.getDepth() && curCoordKrn.z<kernelDims.z; ++curCoordIm.z, ++curCoordKrn.z)
		{
			curCoordIm.y = (size_t)max(0,(int)coordinate.y-(int)kernelMidIdx.y);
			curCoordKrn.y = ((int)coordinate.y-(int)kernelMidIdx.y>=0) ? (0) : (kernelMidIdx.y-coordinate.y);
			for (; curCoordIm.y<imageIn.getHeight() && curCoordKrn.y<kernelDims.y; ++curCoordIm.y, ++curCoordKrn.y)
			{
				curCoordIm.x = (size_t)max(0,(int)coordinate.x-(int)kernelMidIdx.x);
				curCoordKrn.x = ((int)coordinate.x-(int)kernelMidIdx.x>=0) ? (0) : (kernelMidIdx.x-coordinate.x);
				for (; curCoordIm.x<imageIn.getWidth() && curCoordKrn.x<kernelDims.x; ++curCoordIm.x, ++curCoordKrn.x)
				{
					// Only structuring-element entries > 0 participate.
					if (cudaConstKernel[kernelDims.linearAddressAt(curCoordKrn)]>0)
					{
						maxVal = (DevicePixelType)max((float)maxVal, imageIn[curCoordIm]*
							cudaConstKernel[kernelDims.linearAddressAt(curCoordKrn)]);
					}
				}
			}
		}
		imageOut[coordinate] = maxVal;
	}
}
#include "CudaKernels.cuh"
// Maximum intensity projection along z: for each (x,y) column, writes the
// brightest voxel of imageIn into slice z=0 of imageOut.  Only threads with
// z==0 do work; each walks its entire column.
__global__ void cudaMaximumIntensityProjection( CudaImageContainer imageIn, CudaImageContainer imageOut )
{
	DeviceVec<size_t> col;
	col.x = blockDim.x*blockIdx.x + threadIdx.x;
	col.y = blockDim.y*blockIdx.y + threadIdx.y;
	col.z = blockDim.z*blockIdx.z + threadIdx.z;

	if (!(col<imageIn.getDeviceDims()) || col.z!=0)
		return;

	DevicePixelType brightest = 0;
	for (; col.z<imageIn.getDepth(); ++col.z)
	{
		DevicePixelType cur = imageIn[col];
		if (brightest<cur)
			brightest = cur;
	}

	col.z = 0;
	imageOut[col] = brightest;
}
#include "CudaKernels.cuh"
// Mean (box) filter over a window of hostKernelDims around each voxel; the
// window is clipped at the image edges and the divisor is the number of
// voxels actually visited.
// FIXES vs. original:
//  * The output was debug residue (`imageOut[coordinate] = coordinate.y;`)
//    with the real mean assignment commented out; it is restored.
//  * The y/x window starts subtracted halfKernal/2 instead of halfKernal
//    (z was already correct), shrinking the window asymmetrically.
//  * Degenerate kernels (halfKernal == 0) visit nothing; guarded to pass the
//    input through instead of dividing by zero.
__global__ void cudaMeanFilter( CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostKernelDims )
{
	DeviceVec<size_t> coordinate;
	coordinate.x = threadIdx.x + blockIdx.x * blockDim.x;
	coordinate.y = threadIdx.y + blockIdx.y * blockDim.y;
	coordinate.z = threadIdx.z + blockIdx.z * blockDim.z;

	if (coordinate<imageIn.getDeviceDims())
	{
		double val = 0;
		double kernelVolume = 0;
		DeviceVec<size_t> kernelDims = hostKernelDims;
		DeviceVec<size_t> halfKernal = kernelDims/2;
		DeviceVec<size_t> curCoordIm;

		// Clamp each window start at 0 (size_t coords cannot go negative).
		curCoordIm.z = (coordinate.z<halfKernal.z) ? 0 : coordinate.z-halfKernal.z;
		for (; curCoordIm.z<coordinate.z+halfKernal.z && curCoordIm.z<imageIn.getDeviceDims().z; ++curCoordIm.z)
		{
			curCoordIm.y = (coordinate.y<halfKernal.y) ? 0 : coordinate.y-halfKernal.y;
			for (; curCoordIm.y<coordinate.y+halfKernal.y && curCoordIm.y<imageIn.getDeviceDims().y; ++curCoordIm.y)
			{
				curCoordIm.x = (coordinate.x<halfKernal.x) ? 0 : coordinate.x-halfKernal.x;
				for (; curCoordIm.x<coordinate.x+halfKernal.x && curCoordIm.x<imageIn.getDeviceDims().x; ++curCoordIm.x)
				{
					val += imageIn[curCoordIm];
					++kernelVolume;
				}
			}
		}
		imageOut[coordinate] = (kernelVolume>0) ? (val/kernelVolume) : (double)imageIn[coordinate];
	}
}
#include "CudaKernels.cuh"
// Downsamples imageIn by integer factors hostReductions: each output voxel
// is the mean of its corresponding input sub-block, clipped at the image
// edge.
__global__ void cudaMeanImageReduction(CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions)
{
	DeviceVec<size_t> reductions = hostReductions;
	DeviceVec<size_t> outVox;
	outVox.x = blockDim.x*blockIdx.x + threadIdx.x;
	outVox.y = blockDim.y*blockIdx.y + threadIdx.y;
	outVox.z = blockDim.z*blockIdx.z + threadIdx.z;

	if (!(outVox<imageOut.getDeviceDims()))
		return;

	// Input sub-block covered by this output voxel, clipped to the image.
	DeviceVec<size_t> lo(outVox*reductions);
	DeviceVec<size_t> hi = DeviceVec<size_t>::min(lo+reductions, imageIn.getDeviceDims());

	double sum = 0;
	int count = 0;
	DeviceVec<size_t> in(lo);
	for (in.z=lo.z; in.z<hi.z; ++in.z)
		for (in.y=lo.y; in.y<hi.y; ++in.y)
			for (in.x=lo.x; in.x<hi.x; ++in.x)
			{
				sum += (double)imageIn[in];
				++count;
			}

	imageOut[outVox] = sum/count;
}
#include "CudaKernels.cuh"
// Median filter over a (clipped) window of hostKernelDims around each voxel.
// Requires dynamic shared memory of kernelDims.product()*sizeof(DevicePixelType)
// per thread in the block (scratch for the quickselect).
// FIXES vs. original:
//  * The y/x window starts subtracted halfKernal/2 instead of halfKernal
//    (z was correct), shrinking the window asymmetrically.
//  * Removed a second, shadowing declaration of kernelDims inside the if.
//  * Guarded against an empty window (degenerate kernel) so cudaFindMedian
//    is never called with zero elements.
__global__ void cudaMedianFilter( CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostKernelDims )
{
	extern __shared__ DevicePixelType vals[];
	DeviceVec<size_t> kernelDims = hostKernelDims;
	DeviceVec<size_t> coordinate;
	coordinate.x = threadIdx.x + blockIdx.x * blockDim.x;
	coordinate.y = threadIdx.y + blockIdx.y * blockDim.y;
	coordinate.z = threadIdx.z + blockIdx.z * blockDim.z;

	// Each thread gets its own kernel-sized slice of the shared scratch.
	int offset = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
	offset *= kernelDims.product();

	if (coordinate<imageIn.getDeviceDims())
	{
		int kernelVolume = 0;
		DeviceVec<size_t> halfKernal = kernelDims/2;
		DeviceVec<size_t> curCoordIm;

		curCoordIm.z = (coordinate.z<halfKernal.z) ? 0 : coordinate.z-halfKernal.z;
		for (; curCoordIm.z<coordinate.z+halfKernal.z && curCoordIm.z<imageIn.getDeviceDims().z; ++curCoordIm.z)
		{
			curCoordIm.y = (coordinate.y<halfKernal.y) ? 0 : coordinate.y-halfKernal.y;
			for (; curCoordIm.y<coordinate.y+halfKernal.y && curCoordIm.y<imageIn.getDeviceDims().y; ++curCoordIm.y)
			{
				curCoordIm.x = (coordinate.x<halfKernal.x) ? 0 : coordinate.x-halfKernal.x;
				for (; curCoordIm.x<coordinate.x+halfKernal.x && curCoordIm.x<imageIn.getDeviceDims().x; ++curCoordIm.x)
				{
					vals[kernelVolume+offset] = imageIn[curCoordIm];
					++kernelVolume;
				}
			}
		}
		// Empty window (1-voxel kernel): pass the input voxel through.
		imageOut[coordinate] = (kernelVolume>0) ? cudaFindMedian(vals+offset,kernelVolume) : imageIn[coordinate];
	}
	__syncthreads();
}
#include "CudaKernels.cuh"
// Downsamples imageIn by integer factors hostReductions, writing the MEDIAN
// of each input sub-block to the corresponding output voxel.  Requires
// dynamic shared memory of reductions.product()*sizeof(DevicePixelType) per
// thread in the block.
__global__ void cudaMedianImageReduction( CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostReductions)
{
	extern __shared__ DevicePixelType vals[];
	DeviceVec<size_t> reductions = hostReductions;

	DeviceVec<size_t> outVox;
	outVox.x = blockDim.x*blockIdx.x + threadIdx.x;
	outVox.y = blockDim.y*blockIdx.y + threadIdx.y;
	outVox.z = blockDim.z*blockIdx.z + threadIdx.z;

	// Per-thread slice of the shared scratch buffer.
	int offset = (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x;
	offset *= reductions.product();

	if (outVox<imageOut.getDeviceDims())
	{
		// Input sub-block covered by this output voxel, clipped to the image.
		DeviceVec<size_t> lo(outVox*reductions);
		DeviceVec<size_t> hi = DeviceVec<size_t>::min(lo+reductions, imageIn.getDeviceDims());

		int count = 0;
		DeviceVec<size_t> in(lo);
		for (in.z=lo.z; in.z<hi.z; ++in.z)
			for (in.y=lo.y; in.y<hi.y; ++in.y)
				for (in.x=lo.x; in.x<hi.x; ++in.x)
				{
					vals[offset+count] = imageIn[in];
					++count;
				}

		imageOut[outVox] = cudaFindMedian(vals+offset, count);
	}
	__syncthreads();
}
#include "CudaKernels.cuh"
// Morphological min (erosion) over the neighborhood selected by the positive
// entries of cudaConstKernel (dims = hostKernelDims); the kernel value
// scales the neighbor before the min is taken.  The window is clipped at the
// image edges.
__global__ void cudaMinFilter( CudaImageContainer imageIn, CudaImageContainer imageOut, Vec<size_t> hostKernelDims )
{
DeviceVec<size_t> coordinate;
coordinate.x = threadIdx.x + blockIdx.x * blockDim.x;
coordinate.y = threadIdx.y + blockIdx.y * blockDim.y;
coordinate.z = threadIdx.z + blockIdx.z * blockDim.z;
if (coordinate<imageIn.getDeviceDims())
{
// Start from the center voxel so the result is never brighter than it.
DevicePixelType minVal = imageIn[coordinate];
DeviceVec<size_t> kernelDims = hostKernelDims;
DeviceVec<size_t> kernelMidIdx;
DeviceVec<size_t> curCoordIm;
DeviceVec<size_t> curCoordKrn;
kernelMidIdx.x = kernelDims.x/2;
kernelMidIdx.y = kernelDims.y/2;
kernelMidIdx.z = kernelDims.z/2;
//find if the kernel will go off the edge of the image
// Clamp the window start at the low image edge and offset the kernel index
// to match, keeping image and kernel positions aligned.
curCoordIm.z = (size_t) max(0,(int)coordinate.z-(int)kernelMidIdx.z);
curCoordKrn.z = ((int)coordinate.z-(int)kernelMidIdx.z>=0) ? (0) : (kernelMidIdx.z-coordinate.z);
for (; curCoordIm.z<imageIn.getDepth() && curCoordKrn.z<kernelDims.z; ++curCoordIm.z, ++curCoordKrn.z)
{
curCoordIm.y = (size_t)max(0,(int)coordinate.y-(int)kernelMidIdx.y);
curCoordKrn.y = ((int)coordinate.y-(int)kernelMidIdx.y>=0) ? (0) : (kernelMidIdx.y-coordinate.y);
for (; curCoordIm.y<imageIn.getHeight() && curCoordKrn.y<kernelDims.y; ++curCoordIm.y, ++curCoordKrn.y)
{
curCoordIm.x = (size_t)max(0,(int)coordinate.x-(int)kernelMidIdx.x);
curCoordKrn.x = ((int)coordinate.x-(int)kernelMidIdx.x>=0) ? (0) : (kernelMidIdx.x-coordinate.x);
for (; curCoordIm.x<imageIn.getWidth() && curCoordKrn.x<kernelDims.x; ++curCoordIm.x, ++curCoordKrn.x)
{
// Only structuring-element entries > 0 participate.
if(cudaConstKernel[kernelDims.linearAddressAt(curCoordKrn)]>0)
{
minVal = (DevicePixelType)min((float)minVal, imageIn[curCoordIm]*
cudaConstKernel[kernelDims.linearAddressAt(curCoordKrn)]);
}
}
}
}
imageOut[coordinate] = minVal;
}
}
#include "CudaKernels.cuh"
// Normalized convolution: for each voxel, sums neighbor * kernel-weight over
// a hostKernelDims window of cudaConstKernel (starting at kernelOffset), then
// divides by the sum of the weights actually applied so edge-clipped windows
// stay normalized.
// NOTE(review): if the visited weights sum to zero (e.g. a derivative
// kernel), val/kernFactor divides by zero -- confirm callers only pass
// kernels with a positive sum.
__global__ void cudaMultAddFilter( CudaImageContainer* imageIn, CudaImageContainer* imageOut, Vec<size_t> hostKernelDims, size_t kernelOffset/*=0*/ )
{
DeviceVec<size_t> coordinate;
coordinate.x = threadIdx.x + blockIdx.x * blockDim.x;
coordinate.y = threadIdx.y + blockIdx.y * blockDim.y;
coordinate.z = threadIdx.z + blockIdx.z * blockDim.z;
if (coordinate<imageIn->getDeviceDims())
{
double val = 0;
double kernFactor = 0;
DeviceVec<size_t> kernelDims = hostKernelDims;
DeviceVec<size_t> kernelMidIdx;
DeviceVec<size_t> curCoordIm;
DeviceVec<size_t> curCoordKrn;
kernelMidIdx.x = kernelDims.x/2;
kernelMidIdx.y = kernelDims.y/2;
kernelMidIdx.z = kernelDims.z/2;
//find if the kernel will go off the edge of the image
// Clamp the window start at the low image edge and offset the kernel index
// to match, keeping image and kernel positions aligned.
curCoordIm.z = (size_t) max(0,(int)coordinate.z-(int)kernelMidIdx.z);
curCoordKrn.z = ((int)coordinate.z-(int)kernelMidIdx.z>=0) ? (0) : (kernelMidIdx.z-coordinate.z);
for (; curCoordIm.z<imageIn->getDepth() && curCoordKrn.z<kernelDims.z; ++curCoordIm.z, ++curCoordKrn.z)
{
curCoordIm.y = (size_t)max(0,(int)coordinate.y-(int)kernelMidIdx.y);
curCoordKrn.y = ((int)coordinate.y-(int)kernelMidIdx.y>=0) ? (0) : (kernelMidIdx.y-coordinate.y);
for (; curCoordIm.y<imageIn->getHeight() && curCoordKrn.y<kernelDims.y; ++curCoordIm.y, ++curCoordKrn.y)
{
curCoordIm.x = (size_t)max(0,(int)coordinate.x-(int)kernelMidIdx.x);
curCoordKrn.x = ((int)coordinate.x-(int)kernelMidIdx.x>=0) ? (0) : (kernelMidIdx.x-coordinate.x);
for (; curCoordIm.x<imageIn->getWidth() && curCoordKrn.x<kernelDims.x; ++curCoordIm.x, ++curCoordKrn.x)
{
// kernelOffset selects among kernels packed into constant memory.
size_t kernIdx = kernelDims.linearAddressAt(curCoordKrn)+kernelOffset;
kernFactor += cudaConstKernel[kernIdx];
val += (*imageIn)[curCoordIm] * cudaConstKernel[kernIdx];
}
}
}
// Normalize by the weights actually applied (handles clipped windows).
(*imageOut)[coordinate] = val/kernFactor;
}
}
#include "CudaKernels.cuh"
// Scales every voxel of imageIn by `factor`, clamping the result to
// [minValue, maxValue], and writes it to imageOut.
__global__ void cudaMultiplyImage( CudaImageContainer imageIn, CudaImageContainer imageOut, double factor, DevicePixelType minValue,
								  DevicePixelType maxValue )
{
	DeviceVec<size_t> vox;
	vox.x = blockDim.x*blockIdx.x + threadIdx.x;
	vox.y = blockDim.y*blockIdx.y + threadIdx.y;
	vox.z = blockDim.z*blockIdx.z + threadIdx.z;

	if (!(vox<imageIn.getDeviceDims()))
		return;

	// Scale in double, clamp, then store (implicit conversion to pixel type).
	double scaled = factor*imageIn[vox];
	scaled = max((double)minValue, scaled);
	imageOut[vox] = min((double)maxValue, scaled);
}
#include "CudaKernels.cuh"
// Voxel-wise product: imageOut = imageIn1 * imageIn2.
// FIX: removed two unused locals (val1/val2) that were assigned but never
// read.
// NOTE(review): unlike the other arithmetic kernels there is no min/max
// clamping here, so the product may overflow DevicePixelType -- confirm
// callers expect that.
__global__ void cudaMultiplyTwoImages( CudaImageContainer imageIn1, CudaImageContainer imageIn2, CudaImageContainer imageOut )
{
	DeviceVec<size_t> coordinate;
	coordinate.x = threadIdx.x + blockIdx.x * blockDim.x;
	coordinate.y = threadIdx.y + blockIdx.y * blockDim.y;
	coordinate.z = threadIdx.z + blockIdx.z * blockDim.z;

	if (coordinate<imageIn1.getDeviceDims())
	{
		imageOut[coordinate] = imageIn1[coordinate] * imageIn2[coordinate];
	}
}
#include "CudaKernels.cuh"
// Converts raw bin counts to frequencies: one block per bin,
// normHistogram[bin] = histogram[bin] / totalVoxels.
__global__ void cudaNormalizeHistogram(size_t* histogram, double* normHistogram, Vec<size_t> imageDims)
{
	int bin = blockIdx.x;
	size_t totalVoxels = imageDims.x*imageDims.y*imageDims.z;
	normHistogram[bin] = (double)(histogram[bin]) / totalVoxels;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment