filter.h
2.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stim/visualization/colormap.h>
#include <sstream>
#define IMAD(a,b,c) ( __mul24((a), (b)) + (c) )
int kr;
int kl;
float sigma;
float* LoG;
float* result;
cudaArray* srcArray;
texture<uchar, cudaTextureType2D, cudaReadModeElementType> texIn;
__device__ float filterKernel ()
{
float t = 0;
idx = j*kl+i;
for(int i = 0; i < kl; i++){
for(int j = 0; j < kl; j++){
x = i - floor(kl);
y = j - floor(kl);
LoG(idx) = (-1/M_PI/sigma^4)* (1 - (x^2+y^2)/2/sigma^2)
*exp(-(x^2+y^2)/2/sigma^2);
t +=LoG(idx);
}
}
LoG =/ t;
}
void initArray(cudaGraphicsResource_t src, int DIM_X, int DIM_Y)
{
HANDLE_ERROR(
cudaGraphicsMapResources(1, &src)
);
HANDLE_ERROR(
cudaGraphicsSubResourceGetMappedArray(&srcArray, src, 0,0)
);
HANDLE_ERROR(
cudaBindTertureToArray(texIn, srcArray)
);
cudaMalloc( (void**) &LoG, kl*kl*sizeof(float));
checkCUDAerrors("Memory Allocation, LoG");
cudaMalloc( (void**) &result, DIM_Y*DIM_X*sizeof(float));
checkCUDAerrors("Memory Allocation, Result");
}
void cleanUp(cudaGraphicsResource_t src);
{
HANDLE_ERROR(
cudaUnbindTexture(texIn)
);
HANDLE_ERROR(
cudaFree(LoG)
);
HANDLE_ERROR(
cudaFree(result)
);
HANDLE_ERROR(
cudaGraphicsUnmapResources(1, &src)
);
}
//Shared memory would be better.
__global__
void
applyFilter(unsigned int DIM_X, unsigned int DIM_Y){
//R = floor(size/2)
//THIS IS A NAIVE WAY TO DO IT, and there is a better way)
//__shared__ float shared[(DIM_X+2*R), (DIM_Y+2*R)];
const int x = IMAD(blockDim.x, blockIdx.x, threadIdx.x);
const int y = IMAD(blockDim.y, blockIdx.y, threadIdx.y);
float val = 0;
//x = max(0,x);
//x = min(x, width-1);
//y = max(y, 0);
//y = min(y, height - 1);
int idx = y*DIM_X+x;
//unsigned int bindex = threadIdx.y * blockDim.y + threadIdx.x;
//float valIn = tex2D(texIn, x, y);
for (int i = -kr; i <= kr; i++){ //rows
for (int j = -kr; i <= kr; j++){ //colls
k_idx = (j+kr)+(i+kr)*kl;
xi = max(0, x+i);
xi = min(x+i, DIM_X-1);
yj = max(y+j, 0);
yj = min(y+j, DIM_Y-1);
val += LoG(k_idx)*tex2D(texIn,x+i, y+j);
}
}
result[idx] = val;
}