Skip to content
Snippets Groups Projects
CudaSum.cuh 2.32 KiB
Newer Older
Eric Wait's avatar
Eric Wait committed
#pragma once
#include "Vec.h"
#include "CudaUtilities.cuh"
Eric Wait's avatar
Eric Wait committed
#include "cuda_runtime.h"
Eric Wait's avatar
Eric Wait committed

Eric Wait's avatar
Eric Wait committed
template <class PixelTypeIn, class OutType>
__global__ void cudaSum(PixelTypeIn* arrayIn, OutType* arrayOut, size_t n)
Eric Wait's avatar
Eric Wait committed
{
Eric Wait's avatar
Eric Wait committed
	extern __shared__ unsigned char sharedMem[];
	OutType* sums = (OutType*)sharedMem;
Eric Wait's avatar
Eric Wait committed

	size_t i = threadIdx.x + blockIdx.x*blockDim.x;
Eric Wait's avatar
Eric Wait committed
	size_t imStride = blockDim.x*gridDim.x;

	if (i<n)
Eric Wait's avatar
Eric Wait committed
	{
Eric Wait's avatar
Eric Wait committed
		sums[threadIdx.x] = (OutType)(arrayIn[i]);
Eric Wait's avatar
Eric Wait committed
		while (i+imStride<n)
Eric Wait's avatar
Eric Wait committed
			sums[threadIdx.x] += (OutType)(arrayIn[i+imStride]);
Eric Wait's avatar
Eric Wait committed
			i += imStride;
Eric Wait's avatar
Eric Wait committed
	}
	else
	{
		sums[threadIdx.x] = 0;
	}
Eric Wait's avatar
Eric Wait committed

Eric Wait's avatar
Eric Wait committed
	__syncthreads();
Eric Wait's avatar
Eric Wait committed

Eric Wait's avatar
Eric Wait committed
	for (int localStride=blockDim.x/2; localStride>0; localStride=localStride/2)
	{
		if (threadIdx.x<localStride)
			sums[threadIdx.x] += sums[threadIdx.x+localStride];
Eric Wait's avatar
Eric Wait committed
		__syncthreads();
	}
Eric Wait's avatar
Eric Wait committed
	if (threadIdx.x==0)
	{
		arrayOut[blockIdx.x] = sums[0];
Eric Wait's avatar
Eric Wait committed
	}
	__syncthreads();
Eric Wait's avatar
Eric Wait committed
template < class OutType, class PixelTypeIn>
OutType sumArray(const PixelTypeIn* imageIn, size_t n, int device=0)
Eric Wait's avatar
Eric Wait committed
	OutType sum = 0;
	OutType* deviceSum;
	OutType* hostSum;
	PixelTypeIn* deviceBuffer;

	cudaDeviceProp props;
	cudaGetDeviceProperties(&props, device);

	size_t availMem, total;
	cudaMemGetInfo(&availMem,&total);

Eric Wait's avatar
Eric Wait committed
	size_t numValsPerChunk = MIN(n,(size_t)((availMem*MAX_MEM_AVAIL)/sizeof(PixelTypeIn)));
Eric Wait's avatar
Eric Wait committed
	int threads = props.maxThreadsPerBlock;
Eric Wait's avatar
Eric Wait committed
	int maxBlocks = (int)ceil((double)numValsPerChunk/(threads*2)); 
Eric Wait's avatar
Eric Wait committed

Eric Wait's avatar
Eric Wait committed
	HANDLE_ERROR(cudaMalloc((void**)&deviceBuffer,sizeof(PixelTypeIn)*numValsPerChunk));
	HANDLE_ERROR(cudaMalloc((void**)&deviceSum,sizeof(OutType)*maxBlocks));
Eric Wait's avatar
Eric Wait committed
	hostSum = new OutType[maxBlocks];
	for (size_t startIdx=0; startIdx<n; startIdx += numValsPerChunk)
		size_t curNumVals = MIN(numValsPerChunk,n-startIdx);

Eric Wait's avatar
Eric Wait committed
		HANDLE_ERROR(cudaMemcpy(deviceBuffer,imageIn+startIdx,sizeof(PixelTypeIn)*curNumVals,cudaMemcpyHostToDevice));
Eric Wait's avatar
Eric Wait committed
		int blocks = (int)ceil((double)curNumVals/(threads*2));
		size_t sharedMemSize = sizeof(OutType)*threads;
Eric Wait's avatar
Eric Wait committed
		cudaSum<<<blocks,threads,sharedMemSize>>>(deviceBuffer,deviceSum,curNumVals);
		DEBUG_KERNEL_CHECK();

Eric Wait's avatar
Eric Wait committed
		HANDLE_ERROR(cudaMemcpy(hostSum,deviceSum,sizeof(OutType)*blocks,cudaMemcpyDeviceToHost));
		for (int i=0; i<blocks; ++i)
Eric Wait's avatar
Eric Wait committed
			sum += hostSum[i];
Eric Wait's avatar
Eric Wait committed

Eric Wait's avatar
Eric Wait committed
		memset(hostSum,0,sizeof(OutType)*maxBlocks);
	}

	HANDLE_ERROR(cudaFree(deviceSum));
	HANDLE_ERROR(cudaFree(deviceBuffer));

	delete[] hostSum;

	return sum;
Eric Wait's avatar
Eric Wait committed
}