Fixed LoG (still bleeds on edges the same way MATLAB does)

29f554c6 · Eric Wait · 9e5d22a3 · 29f554c6 · 29f554c6 · 29f554c6
Commit 29f554c6 authored 6 years ago by Eric Wait
--- a/src/c/Cuda/CudaLoG.cuh
+++ b/src/c/Cuda/CudaLoG.cuh
@@ -10,7 +10,7 @@
 #include "Defines.h"
 #include "Vec.h"
 #include "KernelGenerators.h"
-#include "SeparableMultiplySum.cuh"
+#include "CudaAddTwoImages.cuh"

 #include <cuda_runtime.h>
 #include <limits>
@@ -21,16 +21,16 @@ void cLoG(ImageContainer<PixelTypeIn> imageIn, ImageContainer<float>& imageOut,
 {
 	const float MIN_VAL = std::numeric_limits<float>::lowest();
 	const float MAX_VAL = std::numeric_limits<float>::max();
-	const int NUM_BUFF_NEEDED = 2;
+	const int NUM_BUFF_NEEDED = 3;

 	setUpOutIm<float>(imageIn.getDims(), imageOut);

-	CudaDevices cudaDevs(cudaMultiplySum<float, float>, device);
+	CudaDevices cudaDevs(cudaAddTwoImages<float,float,float>, device);

-	Vec<size_t> kernDims(0);
-	float* hostKernels = createGaussianKernel(sigmas, kernDims);
+	Vec<size_t> kernelDims(0);
+	float* hostLoG_GausKernels = createLoG_GausKernels(sigmas, kernelDims);

-	std::vector<ImageChunk> chunks = calculateBuffers(imageIn.getDims(), NUM_BUFF_NEEDED, cudaDevs, sizeof(float), kernDims);
+	std::vector<ImageChunk> chunks = calculateBuffers(imageIn.getDims(), NUM_BUFF_NEEDED, cudaDevs, sizeof(float), kernelDims);

 	Vec<size_t> maxDeviceDims;
 	setMaxDeviceDims(chunks, maxDeviceDims);
@@ -42,29 +42,96 @@ void cLoG(ImageContainer<PixelTypeIn> imageIn, ImageContainer<float>& imageOut,
 		const int N_THREADS = omp_get_num_threads();
 		const int CUR_DEVICE = cudaDevs.getDeviceIdx(CUDA_IDX);

-		CudaDeviceImages<float> deviceImages(NUM_BUFF_NEEDED, maxDeviceDims, CUR_DEVICE);
+		CudaDeviceImages<float> deviceImages(NUM_BUFF_NEEDED-1, maxDeviceDims, CUR_DEVICE);
+		CudaDeviceImages<float> deviceImagesScratch(1, maxDeviceDims, CUR_DEVICE);

-		Kernel constFullKern(kernDims.sum(), hostKernels, CUR_DEVICE);
-		Kernel constKernelMem_x = constFullKern.getOffsetCopy(Vec<size_t>(kernDims.x, 1, 1), 0);
-		Kernel constKernelMem_y = constFullKern.getOffsetCopy(Vec<size_t>(1, kernDims.y, 1), kernDims.x);
-		Kernel constKernelMem_z = constFullKern.getOffsetCopy(Vec<size_t>(1, 1, kernDims.z), kernDims.x + kernDims.y);
+		size_t logStride = kernelDims.sum();
+		Kernel constFullKern(logStride*2, hostLoG_GausKernels, CUR_DEVICE);
+
+		Kernel constLoGKernelMem_x = constFullKern.getOffsetCopy(Vec<size_t>(kernelDims.x, 1, 1), 0);
+		Kernel constLoGKernelMem_y = constFullKern.getOffsetCopy(Vec<size_t>(1, kernelDims.y, 1), kernelDims.x);
+		Kernel constLoGKernelMem_z = constFullKern.getOffsetCopy(Vec<size_t>(1, 1, kernelDims.z), kernelDims.x + kernelDims.y);
+
+		Kernel constGausKernelMem_x = constFullKern.getOffsetCopy(Vec<size_t>(kernelDims.x, 1, 1), logStride);
+		Kernel constGausKernelMem_y = constFullKern.getOffsetCopy(Vec<size_t>(1, kernelDims.y, 1), kernelDims.x + logStride);
+		Kernel constGausKernelMem_z = constFullKern.getOffsetCopy(Vec<size_t>(1, 1, kernelDims.z), kernelDims.x + kernelDims.y + logStride);

 		for (int i = CUDA_IDX; i < chunks.size(); i += N_THREADS)
 		{
-			if (!chunks[i].sendROI(imageIn, deviceImages.getCurBuffer()))
-				std::runtime_error("Error sending ROI to device!");
-
+			size_t memsize = sizeof(float)*chunks[i].getFullChunkSize().product();
 			deviceImages.setAllDims(chunks[i].getFullChunkSize());
+			deviceImagesScratch.setAllDims(chunks[i].getFullChunkSize());

-			for (int j = 0; j < numIterations; ++j)
+			HANDLE_ERROR(cudaMemset(deviceImagesScratch.getCurBuffer()->getDeviceImagePointer(), 0, memsize));
+
+			// apply LoG in X
+			if (sigmas.x!=0)
 			{
-				SeparableMultiplySum(chunks[i], deviceImages, constKernelMem_x, constKernelMem_y, constKernelMem_z, MIN_VAL, MAX_VAL);
+				if (!chunks[i].sendROI(imageIn, deviceImages.getCurBuffer()))
+					std::runtime_error("Error sending ROI to device!");
+				cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constLoGKernelMem_x, MIN_VAL, MAX_VAL, false);
+				deviceImages.incrementBuffer();
+				if (sigmas.y!=0)
+				{
+					cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constGausKernelMem_y, MIN_VAL, MAX_VAL);
+					deviceImages.incrementBuffer();
+				}
+				if (sigmas.z!=0)
+				{
+					cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constGausKernelMem_z, MIN_VAL, MAX_VAL);
+					deviceImages.incrementBuffer();
+				}
+				cudaAddTwoImages << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImagesScratch.getCurBuffer()), *(deviceImagesScratch.getCurBuffer()), MIN_VAL, MAX_VAL);
+				DEBUG_KERNEL_CHECK();
 			}
-			chunks[i].retriveROI(imageOut, deviceImages.getCurBuffer());
+
+			// apply LoG in Y
+			if (sigmas.y!=0)
+			{
+				if (!chunks[i].sendROI(imageIn, deviceImages.getCurBuffer()))
+					std::runtime_error("Error sending ROI to device!");
+				if (sigmas.x!=0)
+				{
+					cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constGausKernelMem_x, MIN_VAL, MAX_VAL);
+					deviceImages.incrementBuffer();
+				}
+				cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constLoGKernelMem_y, MIN_VAL, MAX_VAL, false);
+				deviceImages.incrementBuffer();
+				if (sigmas.z!=0)
+				{
+					cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constGausKernelMem_z, MIN_VAL, MAX_VAL);
+					deviceImages.incrementBuffer();
+				}
+				cudaAddTwoImages << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImagesScratch.getCurBuffer()), *(deviceImagesScratch.getCurBuffer()), MIN_VAL, MAX_VAL);
+				DEBUG_KERNEL_CHECK();
+			}
+
+			// apply LoG in Z
+			if (sigmas.z!=0)
+			{
+				if (!chunks[i].sendROI(imageIn, deviceImages.getCurBuffer()))
+					std::runtime_error("Error sending ROI to device!");
+				if (sigmas.x!=0)
+				{
+					cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constGausKernelMem_x, MIN_VAL, MAX_VAL);
+					deviceImages.incrementBuffer();
+				}
+				if(sigmas.y!=0)
+				{
+					cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constGausKernelMem_y, MIN_VAL, MAX_VAL);
+					deviceImages.incrementBuffer();
+				}
+				cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constLoGKernelMem_z, MIN_VAL, MAX_VAL, false);
+				deviceImages.incrementBuffer();
+				cudaAddTwoImages << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImagesScratch.getCurBuffer()), *(deviceImagesScratch.getCurBuffer()), MIN_VAL, MAX_VAL);
+				DEBUG_KERNEL_CHECK();
+			}
+
+			chunks[i].retriveROI(imageOut, deviceImagesScratch.getCurBuffer());
 		}

 		constFullKern.clean();
 	}

-	delete[] hostKernels;
+	delete[] hostLoG_GausKernels;
 }
--- a/src/c/Cuda/KernelGenerators.h
+++ b/src/c/Cuda/KernelGenerators.h
@@ -5,4 +5,4 @@
 #include <vector>

 float* createGaussianKernel(Vec<double> sigmas, Vec<size_t>& dimsOut);
-float* createLoGKernel(Vec<double> sigmas, Vec<size_t>& dimsOut);
+float* createLoG_GausKernels(Vec<double> sigmas, Vec<size_t>& dimsOut);
--- a/src/c/Cuda/LoGKernel.cpp
+++ b/src/c/Cuda/LoGKernel.cpp
@@ -2,7 +2,7 @@

 #include <functional>

-float* createLoGKernel(Vec<double> sigmas, Vec<size_t>& dimsOut)
+float* createLoG_GausKernels(Vec<double> sigmas, Vec<size_t>& dimsOut)
 {
 	const double PI = std::atan(1.0) * 4;

@@ -16,79 +16,124 @@ float* createLoGKernel(Vec<double> sigmas, Vec<size_t>& dimsOut)

 	Vec<double> mid = Vec<double>(dimsOut) / 2.0 - 0.5;

-	float* kernelOut = new float[dimsOut.sum()];
-
-	bool is3d = sigmas != Vec<double>(0.0);
+	float* kernelOut = new float[dimsOut.sum()*2];

 	Vec<double> sigmaSqr = sigmas.pwr(2);
+	Vec<double> oneOverSigSqr = Vec<double>(1.0) / sigmaSqr;
 	Vec<double> twoSigmaSqr = sigmaSqr * 2;
+	Vec<double> sigmaForth = sigmas.pwr(4);
+
+	int loGstride = dimsOut.sum();

-	if (is3d)
+	for (int i = 0; i < 3; ++i)
 	{
-		// LaTeX form of LoG
-		// $\frac{\Big(\frac{(x-\mu_x)^2}{\sigma_x^4}-\frac{1}{\sigma_x^2}+\frac{(y-\mu_y)^2}{\sigma_y^4}-\frac{1}{\sigma_y^2}+\frac{(z-\mu_z)^2}{\sigma_z^4}-\frac{1}{\sigma_z^2}\Big)\exp\Big(-\frac{(x-\mu_x)^2}{2\sigma_x^2}-\frac{(y-\mu_y)^2}{2\sigma_y^2}-\frac{(z-\mu)^2}{2\sigma_z^2}\Big)}{(2\pi)^{\frac{3}{2}}\sigma_x\sigma_y\sigma_z}$
-		Vec<double> sigma4th = sigmas.pwr(4);
-		double subtractor = (Vec<double>(1.0f, 1.0f, 1.0f) / sigmaSqr).sum();
-		double denominator = pow(2.0*PI, 3.0 / 2.0)*sigmas.product();
+		int stride = 0;
+		if (i > 0)
+			stride = dimsOut.x;
+		if (i > 1)
+			stride += dimsOut.y;

-		for (int i =0; i<3; ++i)
+		if (sigmas.e[i] == 0)
 		{
-			size_t startOffset = 0;
-			if (i > 0)
-				startOffset += dimsOut.x;
-
-			if (i > 1)
-				startOffset += dimsOut.y;
-
-			for (int j = 0; j<dimsOut.e[i]; ++j)
-			{
-				int firstOther = 0;
-				if (j == 0)
-					firstOther = 1;
-				int secondOther = 2;
-				if (j == 2)
-					secondOther = 1;
-
-				double firstAdditive = -1.0 / sigmaSqr.e[firstOther];
-				double secondAdditive = -1.0 / sigmaSqr.e[secondOther];
-
-				double muSub = SQR(j - mid.e[i]);
-				double muSubSigSqr = muSub / (2 * sigmaSqr.e[i]);
-				double additive = muSub / sigma4th.e[i] - 1.0 / sigmaSqr.e[i];
+			kernelOut[stride] = 0.0f;
+			kernelOut[stride + loGstride] = 1.0f;
+			continue;
+		}

-				double posVal = ((additive + firstAdditive + secondAdditive) * exp(-muSubSigSqr)) / denominator;
-				kernelOut[startOffset + j] = (float)posVal;
-			}
+		double gaussSum = 0.0;
+		for (int j = 0; j < dimsOut.e[i]; ++j)
+		{
+			double pos = j - mid.e[i];
+			double posSqr = SQR(pos);
+			double gauss = exp(-(posSqr / twoSigmaSqr.e[i]));
+			double logVal = (posSqr / sigmaForth.e[i] - oneOverSigSqr.e[i])*gauss;
+			kernelOut[j + stride] = (float)logVal;
+			kernelOut[j + stride + loGstride] = gauss;
+			gaussSum += gauss;
+		}

+		double sumVal = 0.0;
+		for (int j = 0; j < dimsOut.e[i]; ++j)
+		{
+			kernelOut[j + stride] /= (float)gaussSum;
+			sumVal += kernelOut[j + stride];
+			kernelOut[j + stride + loGstride] /= (float)gaussSum;
 		}
-	}
-	else
-	{
-		// LaTeX form of LoG
-		// $\frac{-1}{\pi\sigma_x^2\sigma_y^2}\Bigg(1-\frac{(x-\mu_x)^2}{2\sigma_x^2}-\frac{(y-\mu_y)^2}{2\sigma_y^2}\Bigg)\exp\Bigg(-\frac{(x-\mu_x)^2}{2\sigma_x^2}-\frac{(y-\mu_y)^2}{2\sigma_y^2}\Bigg)$
-		// figure out which dim is zero
-		double sigProd = 1.0;
-		if (sigmas.x != 0)
-			sigProd *= sigmas.x;
-		if (sigmas.y != 0)
-			sigProd *= sigmas.y;
-		if (sigmas.z != 0)
-			sigProd *= sigmas.z;
-
-		double denominator = -PI*sigProd;
-
-		for (int i=0; i<2; ++i)
+
+		for (int j = 0; j < dimsOut.e[i]; ++j)
 		{
-			size_t startOffset = i*dimsOut.e[0];
-
-			for (int j = 0; j < dimsOut.e[i]; ++j)
-			{
-				double gaussComp = SQR(j - mid.e[i]) / twoSigmaSqr.e[i];
-				double posVal = ((1.0 - gaussComp)*exp(-gaussComp)) / denominator;
-				kernelOut[startOffset + j] = (float)posVal;
-			}
+			kernelOut[j + stride] -= (float)sumVal;
 		}
 	}

+	//bool is3d = sigmas != Vec<double>(0.0);
+	//
+	//if (is3d)
+	//{
+	//	// LaTeX form of LoG
+	//	// $\frac{\Big(\frac{(x-\mu_x)^2}{\sigma_x^4}-\frac{1}{\sigma_x^2}+\frac{(y-\mu_y)^2}{\sigma_y^4}-\frac{1}{\sigma_y^2}+\frac{(z-\mu_z)^2}{\sigma_z^4}-\frac{1}{\sigma_z^2}\Big)\exp\Big(-\frac{(x-\mu_x)^2}{2\sigma_x^2}-\frac{(y-\mu_y)^2}{2\sigma_y^2}-\frac{(z-\mu)^2}{2\sigma_z^2}\Big)}{(2\pi)^{\frac{3}{2}}\sigma_x\sigma_y\sigma_z}$
+	//	Vec<double> sigma4th = sigmas.pwr(4);
+	//	double subtractor = (Vec<double>(1.0f, 1.0f, 1.0f) / sigmaSqr).sum();
+	//	double denominator = pow(2.0*PI, 3.0 / 2.0)*sigmas.product();
+
+	//	for (int i =0; i<3; ++i)
+	//	{
+	//		size_t startOffset = 0;
+	//		if (i > 0)
+	//			startOffset += dimsOut.x;
+
+	//		if (i > 1)
+	//			startOffset += dimsOut.y;
+
+	//		for (int j = 0; j<dimsOut.e[i]; ++j)
+	//		{
+	//			int firstOther = 0;
+	//			if (j == 0)
+	//				firstOther = 1;
+	//			int secondOther = 2;
+	//			if (j == 2)
+	//				secondOther = 1;
+
+	//			double firstAdditive = -1.0 / sigmaSqr.e[firstOther];
+	//			double secondAdditive = -1.0 / sigmaSqr.e[secondOther];
+
+	//			double muSub = SQR(j - mid.e[i]);
+	//			double muSubSigSqr = muSub / (2 * sigmaSqr.e[i]);
+	//			double additive = muSub / sigma4th.e[i] - 1.0 / sigmaSqr.e[i];
+
+	//			double posVal = ((additive + firstAdditive + secondAdditive) * exp(-muSubSigSqr)) / denominator;
+	//			kernelOut[startOffset + j] = (float)posVal;
+	//		}
+
+	//	}
+	//}
+	//else
+	//{
+	//	// LaTeX form of LoG
+	//	// $\frac{-1}{\pi\sigma_x^2\sigma_y^2}\Bigg(1-\frac{(x-\mu_x)^2}{2\sigma_x^2}-\frac{(y-\mu_y)^2}{2\sigma_y^2}\Bigg)\exp\Bigg(-\frac{(x-\mu_x)^2}{2\sigma_x^2}-\frac{(y-\mu_y)^2}{2\sigma_y^2}\Bigg)$
+	//	// figure out which dim is zero
+	//	double sigProd = 1.0;
+	//	if (sigmas.x != 0)
+	//		sigProd *= sigmas.x;
+	//	if (sigmas.y != 0)
+	//		sigProd *= sigmas.y;
+	//	if (sigmas.z != 0)
+	//		sigProd *= sigmas.z;
+
+	//	double denominator = -PI*sigProd;
+
+	//	for (int i=0; i<2; ++i)
+	//	{
+	//		size_t startOffset = i*dimsOut.e[0];
+
+	//		for (int j = 0; j < dimsOut.e[i]; ++j)
+	//		{
+	//			double gaussComp = SQR(j - mid.e[i]) / twoSigmaSqr.e[i];
+	//			double posVal = ((1.0 - gaussComp)*exp(-gaussComp)) / denominator;
+	//			kernelOut[startOffset + j] = (float)posVal;
+	//		}
+	//	}
+	//}
+
 	return kernelOut;
 }