Newer
Older
#include "Vec.h"
#include "CudaUtilities.cuh"
template <class PixelType>
__global__ void cudaSum(PixelType* arrayIn, double* arrayOut, size_t n)
size_t i = threadIdx.x + blockIdx.x*blockDim.x;
size_t stride = blockDim.x*gridDim.x;
if (i<n)
while (i<n)
{
sums[threadIdx.x] += (double)(arrayIn[i]);
for (int reduceUpTo = blockDim.x/2; reduceUpTo>0; reduceUpTo /= 2)
if (threadIdx.x<reduceUpTo)
sums[threadIdx.x] += sums[threadIdx.x+reduceUpTo];
__syncthreads();
}
template <class PixelType>
double sumArray(const PixelType* imageIn, size_t n, int device=0)
{
double sum = 0.0;
double* deviceSum;
double* hostSum;
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device);
size_t availMem, total;
cudaMemGetInfo(&availMem,&total);
size_t numValsPerChunk = MIN(n,(size_t)((availMem*MAX_MEM_AVAIL)/sizeof(PixelType)));
HANDLE_ERROR(cudaMalloc((void**)&deviceBuffer,sizeof(PixelType)*numValsPerChunk));
HANDLE_ERROR(cudaMalloc((void**)&deviceSum,sizeof(double)*props.multiProcessorCount));
hostSum = new double[props.multiProcessorCount];
for (size_t startIdx=0; startIdx<n; startIdx += numValsPerChunk)
size_t curNumVals = MIN(numValsPerChunk,n-startIdx);
HANDLE_ERROR(cudaMemcpy(deviceBuffer,imageIn+startIdx,sizeof(PixelType)*curNumVals,cudaMemcpyHostToDevice));
int threads = (int)MIN((size_t)props.maxThreadsPerBlock,curNumVals);
int blocks = MIN(props.multiProcessorCount,(int)ceil((double)curNumVals/threads));
cudaSum<<<blocks,threads,sizeof(double)*threads>>>(deviceBuffer,deviceSum,curNumVals);
DEBUG_KERNEL_CHECK();
HANDLE_ERROR(cudaMemcpy(hostSum,deviceSum,sizeof(double)*blocks,cudaMemcpyDeviceToHost));
}
}
HANDLE_ERROR(cudaFree(deviceSum));