Newer
Older
#include "Vec.h"
#include "CudaUtilities.cuh"
// Computes one partial sum per thread block over arrayIn[0..n).
//
// Each thread accumulates, in double precision, the elements it reaches by
// starting at its global thread index and stepping by the total number of
// launched threads. The block then combines the per-thread values in shared
// memory and thread 0 stores the block's result in arrayOut[blockIdx.x].
// The caller is responsible for adding up the gridDim.x entries of arrayOut.
//
// Launch requirements:
//   - 1D grid and 1D block (only the .x components are used).
//   - Dynamic shared memory of at least blockDim.x*sizeof(double) bytes.
//   - arrayOut must have room for gridDim.x doubles.
//
// Parameters:
//   arrayIn  - device pointer to n input values.
//   arrayOut - device pointer receiving one partial sum per block.
//   n        - number of valid elements in arrayIn (may be 0).
//
// NOTE(review): the declaration of `sums` and the definition of `imStride`
// were not visible in the reviewed excerpt; they are written here as a
// dynamic shared array and as blockDim.x*gridDim.x respectively, which is
// what the launch in sumArray (fewer threads than elements, shared size of
// one double per thread) appears to expect - confirm against the full file.
template <class PixelType>
__global__ void cudaSum(PixelType* arrayIn, double* arrayOut, size_t n)
{
	extern __shared__ double sums[];

	// Widen before multiplying so the index math cannot wrap in 32 bits.
	const size_t imStride = (size_t)blockDim.x*gridDim.x;

	// Accumulate in a register; the i<n test guards every read, so threads
	// past the end of the data simply contribute 0.
	double threadSum = 0.0;
	for (size_t i = threadIdx.x + (size_t)blockIdx.x*blockDim.x; i<n; i += imStride)
		threadSum += (double)(arrayIn[i]);

	sums[threadIdx.x] = threadSum;

	// Every slot must be written before any thread reads a neighbour's slot.
	__syncthreads();

	// Tree reduction in shared memory. The number of live slots is halved
	// (rounding up) each step, so block sizes that are not a power of two are
	// summed completely. For power-of-two sizes the pairing is the usual
	// sums[t] += sums[t + blockDim.x/2^k].
	unsigned int active = blockDim.x;
	while (active>1)
	{
		const unsigned int offset = (active+1)/2;

		if (threadIdx.x+offset<active)
			sums[threadIdx.x] += sums[threadIdx.x+offset];

		// The barrier sits outside the conditional and no thread leaves the
		// loop early: __syncthreads() must be executed by all threads of the
		// block, otherwise behaviour is undefined.
		__syncthreads();

		active = offset;
	}

	if (threadIdx.x==0)
	{
		arrayOut[blockIdx.x] = sums[0];
	}
}
template <class PixelType>
double sumArray(const PixelType* imageIn, size_t n, int device=0)
{
double sum = 0.0;
double* deviceSum;
double* hostSum;
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device);
size_t availMem, total;
cudaMemGetInfo(&availMem,&total);
size_t numValsPerChunk = MIN(n,(size_t)((availMem*MAX_MEM_AVAIL)/sizeof(PixelType)));
int maxBlocks = (int)ceil((double)numValsPerChunk/(2*props.maxThreadsPerBlock));
int threads = props.maxThreadsPerBlock;
HANDLE_ERROR(cudaMalloc((void**)&deviceBuffer,sizeof(PixelType)*numValsPerChunk));
HANDLE_ERROR(cudaMalloc((void**)&deviceSum,sizeof(double)*maxBlocks));
for (size_t startIdx=0; startIdx<n; startIdx += numValsPerChunk)
size_t curNumVals = MIN(numValsPerChunk,n-startIdx);
HANDLE_ERROR(cudaMemcpy(deviceBuffer,imageIn+startIdx,sizeof(PixelType)*curNumVals,cudaMemcpyHostToDevice));
int blocks = (int)ceil((double)curNumVals/(2*props.maxThreadsPerBlock));
size_t sharedMemSize = sizeof(double)*props.maxThreadsPerBlock;
cudaSum<<<blocks,threads,sharedMemSize>>>(deviceBuffer,deviceSum,curNumVals);
DEBUG_KERNEL_CHECK();
HANDLE_ERROR(cudaMemcpy(hostSum,deviceSum,sizeof(double)*blocks,cudaMemcpyDeviceToHost));