Commit 6b747d25f8386ac98393a6ae14a4eb1803c8ffa3

Authored by David Mayerich
2 parents eb303393 42890942

Merge branch 'bsds500' of git.stim.ee.uh.edu:codebase/stimlib

stim/cuda/array_add.cuh 0 → 100644
+#ifndef STIM_CUDA_ARRAY_ADD_H
+#define STIM_CUDA_ARRAY_ADD_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/error.h>
+
+namespace stim{
+    namespace cuda{
+
+        template<typename T>
+        __global__ void cuda_add(T* ptr1, T* ptr2, T* sum, unsigned int N){
+
+            //calculate the 1D index for this thread
+            unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if(idx < N){
+                sum[idx] = ptr1[idx] + ptr2[idx];
+            }
+        }
+
+        template<typename T>
+        void gpu_add(T* ptr1, T* ptr2, T* sum, unsigned int N){
+
+            //get the maximum number of threads per block for the CUDA device
+            int threads = stim::maxThreadsPerBlock();
+
+            //calculate the number of blocks required to cover all N elements
+            int blocks = N / threads + (N % threads == 0 ? 0 : 1);
+
+            //call the kernel to perform the addition
+            cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N);
+        }
+
+        template<typename T>
+        void cpu_add(T* ptr1, T* ptr2, T* cpu_sum, unsigned int N){
+
+            //allocate memory on the GPU for the arrays
+            T* gpu_ptr1;
+            T* gpu_ptr2;
+            T* gpu_sum;
+            HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_sum, N * sizeof(T) ) );
+
+            //copy the input arrays to the GPU
+            HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
+            HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );
+
+            //call the GPU version of this function
+            gpu_add<T>(gpu_ptr1, gpu_ptr2, gpu_sum, N);
+
+            //copy the result back to the CPU
+            HANDLE_ERROR( cudaMemcpy( cpu_sum, gpu_sum, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+            //free allocated memory
+            cudaFree(gpu_ptr1);
+            cudaFree(gpu_ptr2);
+            cudaFree(gpu_sum);
+        }
+
+    }
+}
+
+#endif
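For reference, here is a minimal host-side sketch of how the new header could be exercised; the array size and test values are hypothetical and not part of this commit:

    #include <stim/cuda/array_add.cuh>

    int main(){
        const unsigned int N = 1024;                //hypothetical array size
        float a[N], b[N], sum[N];                   //host input and output arrays
        for(unsigned int i = 0; i < N; i++){        //fill the inputs with test values
            a[i] = (float)i;
            b[i] = 2.0f * (float)i;
        }
        stim::cuda::cpu_add(a, b, sum, N);          //computes sum[i] = a[i] + b[i] on the GPU
        return 0;
    }

cpu_add handles device allocation, the transfers, and cleanup internally, so the caller only supplies host pointers.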
stim/cuda/arraymath.cuh
@@ -5,6 +5,8 @@
 #include <stim/cuda/array_multiply.cuh>
 #include <stim/cuda/array_abs.cuh>
 #include <stim/cuda/array_cart2polar.cuh>
+#include <stim/cuda/gaussian_blur.cuh>
+#include <stim/cuda/conv2.cuh>
 
 namespace stim{
 namespace cuda{
stim/cuda/conv2.cuh 0 → 100644
+#ifndef STIM_CUDA_CONV2_H
+#define STIM_CUDA_CONV2_H
+
+#include <iostream>
+#include <cstring>                                 //for memset
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/error.h>
+
+namespace stim{
+    namespace cuda{
+
+        //2D convolution kernel: the image is read through a texture object, so
+        //out-of-range samples are handled by the texture addressing mode
+        template<typename T>
+        __global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned int M){
+
+            //note: the image is sampled through texObj; the img pointer is kept for interface consistency
+
+            //the radius of the mask (M is assumed to be odd)
+            int r = (M - 1) / 2;
+
+            //calculate the 2D image coordinates for this thread
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+            int j = blockIdx.y;
+
+            //calculate the 1D index into the output image
+            int idx = j * w + i;
+
+            if(i < w && j < h){
+
+                //running sum for the convolution at (i, j)
+                T sum = 0;
+
+                for (int y = j - r; y <= j + r; y++){
+                    for (int x = i - r; x <= i + r; x++){
+
+                        //map image coordinates (x, y) to mask coordinates (xx, yy)
+                        unsigned int xx = x - (i - r);
+                        unsigned int yy = y - (j - r);
+
+                        //sample the image through the texture and weight by the mask
+                        sum += tex2D<float>(texObj, x, y) * mask[yy * M + xx];
+                    }
+                }
+                copy[idx] = sum;
+            }
+        }
+
+        template<typename T>
+        void gpu_conv2(T* img, T* mask, T* copy, unsigned int w, unsigned int h, unsigned int M){
+
+            unsigned long N = w * h;
+
+            //allocate a CUDA array in device memory:
+            //define a channel descriptor for a single 32-bit channel
+            //(the texture path assumes that T is a 32-bit float)
+            cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+            cudaArray* cuArray;                                //declare the cuda array
+            cudaMallocArray(&cuArray, &channelDesc, w, h);     //allocate the cuda array
+
+            //copy the image data from global memory to the array
+            cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T), cudaMemcpyDeviceToDevice);
+
+            //specify the texture resource
+            struct cudaResourceDesc resDesc;                   //create a resource descriptor
+            memset(&resDesc, 0, sizeof(resDesc));              //set all values to zero
+            resDesc.resType = cudaResourceTypeArray;           //specify the resource descriptor type
+            resDesc.res.array.array = cuArray;                 //add a pointer to the cuda array
+
+            //specify the texture object parameters
+            struct cudaTextureDesc texDesc;                    //create a texture descriptor
+            memset(&texDesc, 0, sizeof(texDesc));              //set all values in the texture descriptor to zero
+            texDesc.addressMode[0] = cudaAddressModeWrap;      //wrap at the edges (falls back to clamping for unnormalized coordinates)
+            texDesc.addressMode[1] = cudaAddressModeWrap;
+            texDesc.filterMode = cudaFilterModePoint;          //use point (nearest-neighbor) sampling
+            texDesc.readMode = cudaReadModeElementType;        //read data based on the element type (32-bit floats)
+            texDesc.normalizedCoords = 0;                      //do not use normalized coordinates
+
+            //create the texture object
+            cudaTextureObject_t texObj = 0;
+            cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+
+            //get the maximum number of threads per block for the CUDA device
+            int threads = stim::maxThreadsPerBlock();
+
+            //calculate the number of blocks (one grid row of blocks per image row)
+            dim3 blocks(w / threads + 1, h);
+
+            //call the kernel to perform the convolution
+            cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M);
+
+            //wait for the kernel to finish, then release the texture object and the cuda array
+            cudaDeviceSynchronize();
+            cudaDestroyTextureObject(texObj);
+            cudaFreeArray(cuArray);
+        }
+
+        template<typename T>
+        void cpu_conv2(T* img, T* mask, T* cpu_copy, unsigned int w, unsigned int h, unsigned int M){
+
+            unsigned long N = w * h;
+
+            //allocate memory on the GPU for the arrays
+            T* gpu_img;
+            T* gpu_mask;
+            T* gpu_copy;
+            HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_mask, M * M * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) );
+
+            //copy the image and the mask to the GPU
+            HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) );
+            HANDLE_ERROR( cudaMemcpy( gpu_mask, mask, M * M * sizeof(T), cudaMemcpyHostToDevice) );
+
+            //call the GPU version of this function
+            gpu_conv2<T>(gpu_img, gpu_mask, gpu_copy, w, h, M);
+
+            //copy the result back to the CPU
+            HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+            //free allocated memory
+            cudaFree(gpu_img);
+            cudaFree(gpu_mask);
+            cudaFree(gpu_copy);
+        }
+
+    }
+}
+
+#endif
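For reference, a minimal host-side sketch of how the new convolution header could be called; the image dimensions and the 3x3 averaging mask below are hypothetical and not part of this commit. Note that gpu_conv2 builds a 32-bit float texture, so T is effectively restricted to float:

    #include <stim/cuda/conv2.cuh>

    int main(){
        const unsigned int w = 512;                 //hypothetical image width
        const unsigned int h = 512;                 //hypothetical image height
        const unsigned int M = 3;                   //the mask is M x M (M odd)

        float* img = new float[w * h];              //host input image
        float* out = new float[w * h];              //host output image
        for(unsigned int i = 0; i < w * h; i++)     //fill the image with test values
            img[i] = (float)(i % 256);

        float mask[M * M];                          //averaging (box) mask
        for(unsigned int i = 0; i < M * M; i++)
            mask[i] = 1.0f / (M * M);

        stim::cuda::cpu_conv2(img, mask, out, w, h, M);  //convolve on the GPU

        delete[] img;
        delete[] out;
        return 0;
    }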