#include "CudaKernels.cuh" __global__ void cudaSumArray(DevicePixelType* arrayIn, double* arrayOut, size_t n) { //This algorithm was used from a this website: // http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf // accessed 4/28/2013 extern __shared__ double sdata[]; size_t tid = threadIdx.x; size_t i = blockIdx.x*blockDim.x + tid; size_t gridSize = blockDim.x*gridDim.x; sdata[tid] = (double)(arrayIn[i]); do { if (i+blockDim.x<n) sdata[tid] += (double)(arrayIn[i+blockDim.x]); i += gridSize; }while (i<n); __syncthreads(); if (blockDim.x >= 2048) { if (tid < 1024) sdata[tid] += sdata[tid + 1024]; __syncthreads(); } if (blockDim.x >= 1024) { if (tid < 512) sdata[tid] += sdata[tid + 512]; __syncthreads(); } if (blockDim.x >= 512) { if (tid < 256) sdata[tid] += sdata[tid + 256]; __syncthreads(); } if (blockDim.x >= 256) { if (tid < 128) sdata[tid] += sdata[tid + 128]; __syncthreads(); } if (blockDim.x >= 128) { if (tid < 64) sdata[tid] += sdata[tid + 64]; __syncthreads(); } if (tid < 32) { if (blockDim.x >= 64) { sdata[tid] += sdata[tid + 32]; __syncthreads(); } if (blockDim.x >= 32) { sdata[tid] += sdata[tid + 16]; __syncthreads(); } if (blockDim.x >= 16) { sdata[tid] += sdata[tid + 8]; __syncthreads(); } if (blockDim.x >= 8) { sdata[tid] += sdata[tid + 4]; __syncthreads(); } if (blockDim.x >= 4) { sdata[tid] += sdata[tid + 2]; __syncthreads(); } if (blockDim.x >= 2) { sdata[tid] += sdata[tid + 1]; __syncthreads(); } } if (tid==0) arrayOut[blockIdx.x] = sdata[0]; }