#ifndef STIM_CUDA_SET_RMAX_H
#define STIM_CUDA_SET_RMAX_H

#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/sharedmem.cuh>
#include <stim/cuda/cudatools/error.h>

/// Kernel: writes the triple (rx, ry, rz) into every voxel of an x-by-y-by-z volume.
///
/// Layout of gpu_r: 3 consecutive values per voxel, voxels ordered x-fastest,
/// then y, then z (element index = ((zi * y + yi) * x + xi) * 3 + component).
///
/// Expected launch: a 2D grid of 2D blocks. The grid's x dimension covers the
/// volume's x axis. The grid's y dimension packs the z slabs one after another:
/// gridDim.y must be (blocks needed to cover y) * z. Threads that fall outside
/// the volume do nothing.
///
/// @param gpu_r	device pointer to at least x*y*z*3 elements
/// @param rx,ry,rz	values stored in components 0, 1 and 2 of every voxel
/// @param x,y,z	size of the volume along each axis
template<typename T>
	__global__ void cuda_set_rmax(T* gpu_r, int rx, int ry, int rz, int x, int y, int z){

		//an empty volume has nothing to write (this also protects the division by z below)
		if(x <= 0 || y <= 0 || z <= 0) return;

		//the x coordinate comes directly from the x dimension of the launch
		int xi = blockIdx.x * blockDim.x + threadIdx.x;

		//number of blocks per z slab along y, taken from the actual launch so that
		//it always agrees with the host (y / blockDim.y rounds down and is 0 for small y)
		int grid_y = gridDim.y / z;
		if(grid_y == 0) return;					//grid does not hold one block per slab

		//unpack the y and z coordinates from the y dimension of the grid
		int blockidx_y = blockIdx.y % grid_y;
		int yi = blockidx_y * blockDim.y + threadIdx.y;
		int zi = blockIdx.y / grid_y;

		//threads in the padding of the grid have no voxel to write
		if(xi>=x || yi>=y || zi>=z) return;

		//flat voxel index, computed in size_t so large volumes do not overflow int
		size_t i = ((size_t)zi * y + yi) * x + xi;

		gpu_r[i*3+0] = rx;
		gpu_r[i*3+1] = ry;
		gpu_r[i*3+2] = rz;

	}

/// Launches cuda_set_rmax so that every voxel of an x-by-y-by-z volume that is
/// already resident on the device receives the triple (r[0], r[1], r[2]).
///
/// @param gpu_r	device pointer to x*y*z*3 elements (3 consecutive values per voxel)
/// @param r		host array of at least 3 values to store in each voxel
/// @param x,y,z	size of the volume along each axis
///
/// A launch failure is reported on std::cerr; the function itself returns nothing.
template<typename T>
		void gpu_set_rmax(T* gpu_r, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){

			//an empty volume has nothing to set, and a zero-sized grid is an invalid launch
			if(x == 0 || y == 0 || z == 0) return;

			//use a square 2D block whose thread count does not exceed the per-block limit
			unsigned int max_threads = stim::maxThreadsPerBlock();
			unsigned int side = (unsigned int)sqrt((double)max_threads);
			if(side == 0) side = 1;
			dim3 threads(side, side);

			//round up so partial blocks at the edges are covered; the y dimension of
			//the grid holds one run of y-blocks for each of the z slabs
			unsigned int grid_x = (x + threads.x - 1) / threads.x;
			unsigned int grid_y = (y + threads.y - 1) / threads.y;
			dim3 blocks(grid_x, grid_y * z);

			//call the kernel to fill the volume
			cuda_set_rmax <T> <<< blocks, threads >>>(gpu_r, r[0], r[1], r[2], x , y, z);

			//a bad launch configuration is only visible through cudaGetLastError()
			cudaError_t err = cudaGetLastError();
			if(err != cudaSuccess)
				std::cerr << "gpu_set_rmax: kernel launch failed: " << cudaGetErrorString(err) << std::endl;

		}
/// Host-side convenience wrapper: sets every voxel of a host-resident
/// x-by-y-by-z volume to the triple (r[0], r[1], r[2]) by round-tripping the
/// data through the GPU.
///
/// @param cpu_r	host pointer to x*y*z*3 elements (3 consecutive values per voxel);
///					overwritten with the result
/// @param r		host array of at least 3 values to store in each voxel
/// @param x,y,z	size of the volume along each axis
///
/// CUDA failures are reported on std::cerr; on failure cpu_r may be left unchanged.
template<typename T>
		void cpu_set_rmax(T* cpu_r, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){

			//calculate the number of bytes in the array (3 values per voxel);
			//size_t avoids wrapping around for large volumes
			size_t bytes = (size_t)x * y * z * 3 * sizeof(T);
			if(bytes == 0) return;					//empty volume: nothing to do

			//allocate space on the GPU for the rmax
			T* gpu_r = 0;
			cudaError_t err = cudaMalloc(&gpu_r, bytes);
			if(err != cudaSuccess){
				std::cerr << "cpu_set_rmax: cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
				return;
			}

			//copy the current contents to the GPU
			err = cudaMemcpy(gpu_r, cpu_r, bytes, cudaMemcpyHostToDevice);

			if(err == cudaSuccess){
				//call the GPU version of the function
				gpu_set_rmax<T>(gpu_r, r, x , y, z);

				//copy the result back to the CPU (this blocking copy also waits for the kernel)
				err = cudaMemcpy(cpu_r, gpu_r, bytes, cudaMemcpyDeviceToHost);
			}

			if(err != cudaSuccess)
				std::cerr << "cpu_set_rmax: CUDA error: " << cudaGetErrorString(err) << std::endl;

			//free allocated memory
			cudaFree(gpu_r);

		}


#endif