// Block-level sum reduction, adapted from Mark Harris, "Optimizing Parallel
// Reduction in CUDA" (NVIDIA):
// http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf
// accessed 4/28/2013
//
// Launch contract:
//   * blockDim.x must be a power of two (tree reduction halves it each step).
//   * Dynamic shared memory of blockDim.x * sizeof(double) bytes must be
//     passed as the third launch parameter (backs `sdata`).
//   * arrayOut must hold at least gridDim.x elements; each block writes its
//     partial sum to arrayOut[blockIdx.x]. Combine the per-block partials
//     with a second pass (or on the host) to get the total.
//
// Fixes relative to the previous revision:
//   * Out-of-bounds read: arrayIn[i] was loaded with no `i < n` guard, so any
//     launch where gridDim.x*blockDim.x > n read past the buffer. A
//     grid-stride accumulation loop now bounds-checks every access and lets
//     one launch cover arbitrary n (the old code computed gridSize but never
//     used it — the accumulation loop from the reference had been dropped).
//   * __syncthreads() was executed inside the divergent `if (tid < 32)`
//     region, which is undefined behavior: not every thread of the block
//     reaches the barrier. The unrolled tail has been replaced by a single
//     uniform loop in which all threads execute every __syncthreads().
template <class PixelType>
__global__ void cudaSum(PixelType* arrayIn, double* arrayOut, size_t n)
{
    extern __shared__ double sdata[];

    size_t tid = threadIdx.x;
    size_t gridSize = (size_t)blockDim.x * gridDim.x;

    // Grid-stride accumulation: each thread privately sums every gridSize-th
    // element starting at its global index. Out-of-range indices are never
    // touched, and threads with no elements contribute 0.0.
    double sum = 0.0;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + tid; i < n; i += gridSize)
        sum += (double)(arrayIn[i]);
    sdata[tid] = sum;
    __syncthreads();

    // Tree reduction in shared memory. The loop bound is uniform across the
    // block, so every thread reaches every barrier — no divergent sync.
    // Requires blockDim.x to be a power of two.
    for (unsigned int s = blockDim.x >> 1; s > 0; s >>= 1)
    {
        if (tid < s)
            sdata[tid] += sdata[tid + s];
        __syncthreads();
    }

    // Thread 0 publishes this block's partial sum.
    if (tid == 0)
        arrayOut[blockIdx.x] = sdata[0];