Skip to content
Snippets Groups Projects
Commit 29f554c6 authored by Eric Wait's avatar Eric Wait
Browse files

Fixed LoG (still bleeds on edges the same way MATLAB does)

parent 9e5d22a3
No related branches found
No related tags found
No related merge requests found
......@@ -10,7 +10,7 @@
#include "Defines.h"
#include "Vec.h"
#include "KernelGenerators.h"
#include "SeparableMultiplySum.cuh"
#include "CudaAddTwoImages.cuh"
#include <cuda_runtime.h>
#include <limits>
......@@ -21,16 +21,16 @@ void cLoG(ImageContainer<PixelTypeIn> imageIn, ImageContainer<float>& imageOut,
{
const float MIN_VAL = std::numeric_limits<float>::lowest();
const float MAX_VAL = std::numeric_limits<float>::max();
const int NUM_BUFF_NEEDED = 2;
const int NUM_BUFF_NEEDED = 3;
setUpOutIm<float>(imageIn.getDims(), imageOut);
CudaDevices cudaDevs(cudaMultiplySum<float, float>, device);
CudaDevices cudaDevs(cudaAddTwoImages<float,float,float>, device);
Vec<size_t> kernDims(0);
float* hostKernels = createGaussianKernel(sigmas, kernDims);
Vec<size_t> kernelDims(0);
float* hostLoG_GausKernels = createLoG_GausKernels(sigmas, kernelDims);
std::vector<ImageChunk> chunks = calculateBuffers(imageIn.getDims(), NUM_BUFF_NEEDED, cudaDevs, sizeof(float), kernDims);
std::vector<ImageChunk> chunks = calculateBuffers(imageIn.getDims(), NUM_BUFF_NEEDED, cudaDevs, sizeof(float), kernelDims);
Vec<size_t> maxDeviceDims;
setMaxDeviceDims(chunks, maxDeviceDims);
......@@ -42,29 +42,96 @@ void cLoG(ImageContainer<PixelTypeIn> imageIn, ImageContainer<float>& imageOut,
const int N_THREADS = omp_get_num_threads();
const int CUR_DEVICE = cudaDevs.getDeviceIdx(CUDA_IDX);
CudaDeviceImages<float> deviceImages(NUM_BUFF_NEEDED, maxDeviceDims, CUR_DEVICE);
CudaDeviceImages<float> deviceImages(NUM_BUFF_NEEDED-1, maxDeviceDims, CUR_DEVICE);
CudaDeviceImages<float> deviceImagesScratch(1, maxDeviceDims, CUR_DEVICE);
Kernel constFullKern(kernDims.sum(), hostKernels, CUR_DEVICE);
Kernel constKernelMem_x = constFullKern.getOffsetCopy(Vec<size_t>(kernDims.x, 1, 1), 0);
Kernel constKernelMem_y = constFullKern.getOffsetCopy(Vec<size_t>(1, kernDims.y, 1), kernDims.x);
Kernel constKernelMem_z = constFullKern.getOffsetCopy(Vec<size_t>(1, 1, kernDims.z), kernDims.x + kernDims.y);
size_t logStride = kernelDims.sum();
Kernel constFullKern(logStride*2, hostLoG_GausKernels, CUR_DEVICE);
Kernel constLoGKernelMem_x = constFullKern.getOffsetCopy(Vec<size_t>(kernelDims.x, 1, 1), 0);
Kernel constLoGKernelMem_y = constFullKern.getOffsetCopy(Vec<size_t>(1, kernelDims.y, 1), kernelDims.x);
Kernel constLoGKernelMem_z = constFullKern.getOffsetCopy(Vec<size_t>(1, 1, kernelDims.z), kernelDims.x + kernelDims.y);
Kernel constGausKernelMem_x = constFullKern.getOffsetCopy(Vec<size_t>(kernelDims.x, 1, 1), logStride);
Kernel constGausKernelMem_y = constFullKern.getOffsetCopy(Vec<size_t>(1, kernelDims.y, 1), kernelDims.x + logStride);
Kernel constGausKernelMem_z = constFullKern.getOffsetCopy(Vec<size_t>(1, 1, kernelDims.z), kernelDims.x + kernelDims.y + logStride);
for (int i = CUDA_IDX; i < chunks.size(); i += N_THREADS)
{
if (!chunks[i].sendROI(imageIn, deviceImages.getCurBuffer()))
std::runtime_error("Error sending ROI to device!");
size_t memsize = sizeof(float)*chunks[i].getFullChunkSize().product();
deviceImages.setAllDims(chunks[i].getFullChunkSize());
deviceImagesScratch.setAllDims(chunks[i].getFullChunkSize());
for (int j = 0; j < numIterations; ++j)
HANDLE_ERROR(cudaMemset(deviceImagesScratch.getCurBuffer()->getDeviceImagePointer(), 0, memsize));
// apply LoG in X
if (sigmas.x!=0)
{
SeparableMultiplySum(chunks[i], deviceImages, constKernelMem_x, constKernelMem_y, constKernelMem_z, MIN_VAL, MAX_VAL);
if (!chunks[i].sendROI(imageIn, deviceImages.getCurBuffer()))
std::runtime_error("Error sending ROI to device!");
cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constLoGKernelMem_x, MIN_VAL, MAX_VAL, false);
deviceImages.incrementBuffer();
if (sigmas.y!=0)
{
cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constGausKernelMem_y, MIN_VAL, MAX_VAL);
deviceImages.incrementBuffer();
}
if (sigmas.z!=0)
{
cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constGausKernelMem_z, MIN_VAL, MAX_VAL);
deviceImages.incrementBuffer();
}
cudaAddTwoImages << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImagesScratch.getCurBuffer()), *(deviceImagesScratch.getCurBuffer()), MIN_VAL, MAX_VAL);
DEBUG_KERNEL_CHECK();
}
chunks[i].retriveROI(imageOut, deviceImages.getCurBuffer());
// apply LoG in Y
if (sigmas.y!=0)
{
if (!chunks[i].sendROI(imageIn, deviceImages.getCurBuffer()))
std::runtime_error("Error sending ROI to device!");
if (sigmas.x!=0)
{
cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constGausKernelMem_x, MIN_VAL, MAX_VAL);
deviceImages.incrementBuffer();
}
cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constLoGKernelMem_y, MIN_VAL, MAX_VAL, false);
deviceImages.incrementBuffer();
if (sigmas.z!=0)
{
cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constGausKernelMem_z, MIN_VAL, MAX_VAL);
deviceImages.incrementBuffer();
}
cudaAddTwoImages << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImagesScratch.getCurBuffer()), *(deviceImagesScratch.getCurBuffer()), MIN_VAL, MAX_VAL);
DEBUG_KERNEL_CHECK();
}
// apply LoG in Z
if (sigmas.z!=0)
{
if (!chunks[i].sendROI(imageIn, deviceImages.getCurBuffer()))
std::runtime_error("Error sending ROI to device!");
if (sigmas.x!=0)
{
cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constGausKernelMem_x, MIN_VAL, MAX_VAL);
deviceImages.incrementBuffer();
}
if(sigmas.y!=0)
{
cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constGausKernelMem_y, MIN_VAL, MAX_VAL);
deviceImages.incrementBuffer();
}
cudaMultiplySum << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImages.getNextBuffer()), constLoGKernelMem_z, MIN_VAL, MAX_VAL, false);
deviceImages.incrementBuffer();
cudaAddTwoImages << <chunks[i].blocks, chunks[i].threads >> > (*(deviceImages.getCurBuffer()), *(deviceImagesScratch.getCurBuffer()), *(deviceImagesScratch.getCurBuffer()), MIN_VAL, MAX_VAL);
DEBUG_KERNEL_CHECK();
}
chunks[i].retriveROI(imageOut, deviceImagesScratch.getCurBuffer());
}
constFullKern.clean();
}
delete[] hostKernels;
delete[] hostLoG_GausKernels;
}
......@@ -5,4 +5,4 @@
#include <vector>
float* createGaussianKernel(Vec<double> sigmas, Vec<size_t>& dimsOut);
float* createLoGKernel(Vec<double> sigmas, Vec<size_t>& dimsOut);
float* createLoG_GausKernels(Vec<double> sigmas, Vec<size_t>& dimsOut);
......@@ -2,7 +2,7 @@
#include <functional>
float* createLoGKernel(Vec<double> sigmas, Vec<size_t>& dimsOut)
float* createLoG_GausKernels(Vec<double> sigmas, Vec<size_t>& dimsOut)
{
const double PI = std::atan(1.0) * 4;
......@@ -16,79 +16,124 @@ float* createLoGKernel(Vec<double> sigmas, Vec<size_t>& dimsOut)
Vec<double> mid = Vec<double>(dimsOut) / 2.0 - 0.5;
float* kernelOut = new float[dimsOut.sum()];
bool is3d = sigmas != Vec<double>(0.0);
float* kernelOut = new float[dimsOut.sum()*2];
Vec<double> sigmaSqr = sigmas.pwr(2);
Vec<double> oneOverSigSqr = Vec<double>(1.0) / sigmaSqr;
Vec<double> twoSigmaSqr = sigmaSqr * 2;
Vec<double> sigmaForth = sigmas.pwr(4);
int loGstride = dimsOut.sum();
if (is3d)
for (int i = 0; i < 3; ++i)
{
// LaTeX form of LoG
// $\frac{\Big(\frac{(x-\mu_x)^2}{\sigma_x^4}-\frac{1}{\sigma_x^2}+\frac{(y-\mu_y)^2}{\sigma_y^4}-\frac{1}{\sigma_y^2}+\frac{(z-\mu_z)^2}{\sigma_z^4}-\frac{1}{\sigma_z^2}\Big)\exp\Big(-\frac{(x-\mu_x)^2}{2\sigma_x^2}-\frac{(y-\mu_y)^2}{2\sigma_y^2}-\frac{(z-\mu)^2}{2\sigma_z^2}\Big)}{(2\pi)^{\frac{3}{2}}\sigma_x\sigma_y\sigma_z}$
Vec<double> sigma4th = sigmas.pwr(4);
double subtractor = (Vec<double>(1.0f, 1.0f, 1.0f) / sigmaSqr).sum();
double denominator = pow(2.0*PI, 3.0 / 2.0)*sigmas.product();
int stride = 0;
if (i > 0)
stride = dimsOut.x;
if (i > 1)
stride += dimsOut.y;
for (int i =0; i<3; ++i)
if (sigmas.e[i] == 0)
{
size_t startOffset = 0;
if (i > 0)
startOffset += dimsOut.x;
if (i > 1)
startOffset += dimsOut.y;
for (int j = 0; j<dimsOut.e[i]; ++j)
{
int firstOther = 0;
if (j == 0)
firstOther = 1;
int secondOther = 2;
if (j == 2)
secondOther = 1;
double firstAdditive = -1.0 / sigmaSqr.e[firstOther];
double secondAdditive = -1.0 / sigmaSqr.e[secondOther];
double muSub = SQR(j - mid.e[i]);
double muSubSigSqr = muSub / (2 * sigmaSqr.e[i]);
double additive = muSub / sigma4th.e[i] - 1.0 / sigmaSqr.e[i];
kernelOut[stride] = 0.0f;
kernelOut[stride + loGstride] = 1.0f;
continue;
}
double posVal = ((additive + firstAdditive + secondAdditive) * exp(-muSubSigSqr)) / denominator;
kernelOut[startOffset + j] = (float)posVal;
}
double gaussSum = 0.0;
for (int j = 0; j < dimsOut.e[i]; ++j)
{
double pos = j - mid.e[i];
double posSqr = SQR(pos);
double gauss = exp(-(posSqr / twoSigmaSqr.e[i]));
double logVal = (posSqr / sigmaForth.e[i] - oneOverSigSqr.e[i])*gauss;
kernelOut[j + stride] = (float)logVal;
kernelOut[j + stride + loGstride] = gauss;
gaussSum += gauss;
}
double sumVal = 0.0;
for (int j = 0; j < dimsOut.e[i]; ++j)
{
kernelOut[j + stride] /= (float)gaussSum;
sumVal += kernelOut[j + stride];
kernelOut[j + stride + loGstride] /= (float)gaussSum;
}
}
else
{
// LaTeX form of LoG
// $\frac{-1}{\pi\sigma_x^2\sigma_y^2}\Bigg(1-\frac{(x-\mu_x)^2}{2\sigma_x^2}-\frac{(y-\mu_y)^2}{2\sigma_y^2}\Bigg)\exp\Bigg(-\frac{(x-\mu_x)^2}{2\sigma_x^2}-\frac{(y-\mu_y)^2}{2\sigma_y^2}\Bigg)$
// figure out which dim is zero
double sigProd = 1.0;
if (sigmas.x != 0)
sigProd *= sigmas.x;
if (sigmas.y != 0)
sigProd *= sigmas.y;
if (sigmas.z != 0)
sigProd *= sigmas.z;
double denominator = -PI*sigProd;
for (int i=0; i<2; ++i)
for (int j = 0; j < dimsOut.e[i]; ++j)
{
size_t startOffset = i*dimsOut.e[0];
for (int j = 0; j < dimsOut.e[i]; ++j)
{
double gaussComp = SQR(j - mid.e[i]) / twoSigmaSqr.e[i];
double posVal = ((1.0 - gaussComp)*exp(-gaussComp)) / denominator;
kernelOut[startOffset + j] = (float)posVal;
}
kernelOut[j + stride] -= (float)sumVal;
}
}
//bool is3d = sigmas != Vec<double>(0.0);
//
//if (is3d)
//{
// // LaTeX form of LoG
// // $\frac{\Big(\frac{(x-\mu_x)^2}{\sigma_x^4}-\frac{1}{\sigma_x^2}+\frac{(y-\mu_y)^2}{\sigma_y^4}-\frac{1}{\sigma_y^2}+\frac{(z-\mu_z)^2}{\sigma_z^4}-\frac{1}{\sigma_z^2}\Big)\exp\Big(-\frac{(x-\mu_x)^2}{2\sigma_x^2}-\frac{(y-\mu_y)^2}{2\sigma_y^2}-\frac{(z-\mu)^2}{2\sigma_z^2}\Big)}{(2\pi)^{\frac{3}{2}}\sigma_x\sigma_y\sigma_z}$
// Vec<double> sigma4th = sigmas.pwr(4);
// double subtractor = (Vec<double>(1.0f, 1.0f, 1.0f) / sigmaSqr).sum();
// double denominator = pow(2.0*PI, 3.0 / 2.0)*sigmas.product();
// for (int i =0; i<3; ++i)
// {
// size_t startOffset = 0;
// if (i > 0)
// startOffset += dimsOut.x;
// if (i > 1)
// startOffset += dimsOut.y;
// for (int j = 0; j<dimsOut.e[i]; ++j)
// {
// int firstOther = 0;
// if (j == 0)
// firstOther = 1;
// int secondOther = 2;
// if (j == 2)
// secondOther = 1;
// double firstAdditive = -1.0 / sigmaSqr.e[firstOther];
// double secondAdditive = -1.0 / sigmaSqr.e[secondOther];
// double muSub = SQR(j - mid.e[i]);
// double muSubSigSqr = muSub / (2 * sigmaSqr.e[i]);
// double additive = muSub / sigma4th.e[i] - 1.0 / sigmaSqr.e[i];
// double posVal = ((additive + firstAdditive + secondAdditive) * exp(-muSubSigSqr)) / denominator;
// kernelOut[startOffset + j] = (float)posVal;
// }
// }
//}
//else
//{
// // LaTeX form of LoG
// // $\frac{-1}{\pi\sigma_x^2\sigma_y^2}\Bigg(1-\frac{(x-\mu_x)^2}{2\sigma_x^2}-\frac{(y-\mu_y)^2}{2\sigma_y^2}\Bigg)\exp\Bigg(-\frac{(x-\mu_x)^2}{2\sigma_x^2}-\frac{(y-\mu_y)^2}{2\sigma_y^2}\Bigg)$
// // figure out which dim is zero
// double sigProd = 1.0;
// if (sigmas.x != 0)
// sigProd *= sigmas.x;
// if (sigmas.y != 0)
// sigProd *= sigmas.y;
// if (sigmas.z != 0)
// sigProd *= sigmas.z;
// double denominator = -PI*sigProd;
// for (int i=0; i<2; ++i)
// {
// size_t startOffset = i*dimsOut.e[0];
// for (int j = 0; j < dimsOut.e[i]; ++j)
// {
// double gaussComp = SQR(j - mid.e[i]) / twoSigmaSqr.e[i];
// double posVal = ((1.0 - gaussComp)*exp(-gaussComp)) / denominator;
// kernelOut[startOffset + j] = (float)posVal;
// }
// }
//}
return kernelOut;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment