вот написал фильтр собеля, но не работает. Помогите найти ошибку
исходное изображение
после
const int FILTER_RADIUS = 1;
const int TILE_WIDTH = 16;
const int TILE_HEIGHT = 16;
const int BLOCK_WIDTH = TILE_WIDTH + 2 * FILTER_RADIUS;
const int BLOCK_HEIGHT = TILE_HEIGHT + 2 * FILTER_RADIUS;
__device__
unsigned char compute_sobel(unsigned char up_left, unsigned char up_middle, unsigned char up_right,
unsigned char mid_left, unsigned char mid_middle, unsigned char mid_right,
unsigned char low_left, unsigned char low_middle, unsigned char low_right,
float fscale)
{
short horizont = up_right + 2 * mid_right + low_right - up_left - 2 * mid_left - low_left;
short vertical = up_left + 2 * up_middle + up_left - low_left - 2 * low_middle - low_right;
short sum = (short)(fscale*(abs((int)horizont) + abs((int)vertical)));
if (sum < 0)
return 0;
else if (sum>0xff)
return 0xff;
return (unsigned char)sum;
}
__global__
void sobel_filter(unsigned char * image_original_array, unsigned char * image_out_array, int width, int height)
{
__shared__ unsigned char shared_mem[BLOCK_HEIGHT*BLOCK_WIDTH];
int x = blockIdx.x*TILE_WIDTH + threadIdx.x;
int y = blockIdx.y*TILE_HEIGHT + threadIdx.y ;
int index = y*width + x;
int shared_index = threadIdx.y*blockDim.y + threadIdx.x;
shared_mem[shared_index] = image_original_array[index];
__syncthreads();
if (threadIdx.x >= FILTER_RADIUS&&threadIdx.x < BLOCK_WIDTH - FILTER_RADIUS&&
threadIdx.y >= FILTER_RADIUS && threadIdx.y < BLOCK_HEIGHT - FILTER_RADIUS)
{
unsigned char pix00 = shared_mem[shared_index + (-FILTER_RADIUS*blockDim.x - FILTER_RADIUS)];
unsigned char pix01 = shared_mem[shared_index + (-FILTER_RADIUS*blockDim.x - 0)];
unsigned char pix02 = shared_mem[shared_index + (-FILTER_RADIUS*blockDim.x + FILTER_RADIUS)];
unsigned char pix10 = shared_mem[shared_index + (0*blockDim.x - FILTER_RADIUS)];
unsigned char pix11 = shared_mem[shared_index + (0*blockDim.x - 0)];
unsigned char pix12 = shared_mem[shared_index + (0*blockDim.x + FILTER_RADIUS)];
unsigned char pix20 = shared_mem[shared_index + (FILTER_RADIUS*blockDim.x - FILTER_RADIUS)];
unsigned char pix21 = shared_mem[shared_index + (FILTER_RADIUS*blockDim.x - 0)];
unsigned char pix22 = shared_mem[shared_index + (FILTER_RADIUS*blockDim.x + FILTER_RADIUS)];
image_out_array[index] = compute_sobel(pix00, pix01, pix02, pix10, pix11, pix12, pix20, pix21, pix22, 1.f);
}
}
void cuda_filter(unsigned char * image_original_array, unsigned char * image_rezultat_array, int width, int height)
{
unsigned char *d_imageRGBA;
unsigned char *d_image_rezultat_array;
size_t num_pixels = width*height;
cudaSetDevice(0); // Устройство 0
cudaEvent_t event_1, event_2;
cudaEventCreate(&event_1);
cudaEventCreate(&event_2);
float malloc = 0;
cudaEventRecord(event_1);
checkCudaErrors(cudaMalloc(&d_imageRGBA, sizeof(unsigned char)* num_pixels));
checkCudaErrors(cudaMalloc(&d_image_rezultat_array, sizeof(unsigned char)* num_pixels));
cudaEventRecord(event_2);
cudaEventSynchronize(event_2);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
cudaEventElapsedTime(&malloc, event_1, event_2);
cudaEvent_t memcpy_1, memcpy_2;
cudaEventCreate(&memcpy_1);
cudaEventCreate(&memcpy_2);
float memcpy = 0;
cudaEventRecord(memcpy_1);
checkCudaErrors(cudaMemcpy(d_imageRGBA, image_original_array, sizeof(unsigned char)* num_pixels, cudaMemcpyHostToDevice));
cudaEventRecord(memcpy_2);
cudaEventSynchronize(memcpy_2);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
cudaEventElapsedTime(&malloc, memcpy_1, memcpy_2);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int grid_width = (width + TILE_WIDTH - 1) / TILE_WIDTH;
int grid_height = (height + TILE_HEIGHT - 1) / TILE_HEIGHT;
dim3 dim_grid(grid_height, grid_width);
dim3 dim_block(BLOCK_WIDTH, BLOCK_HEIGHT);
cudaEventRecord(start);
sobel_filter << <dim_grid, dim_block >> >(d_imageRGBA, d_image_rezultat_array, width, height);
//GPUFiltering(d_imageRGBA, d_image_rezultat_array, num_rows, num_cols); // вызов функции фильтра
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
cudaEventRecord(memcpy_1);
checkCudaErrors(cudaMemcpy(image_original_array, d_image_rezultat_array, sizeof(unsigned char)* num_pixels, cudaMemcpyDeviceToHost));
cudaEventRecord(memcpy_2);
cudaEventSynchronize(memcpy_2);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
cudaEventElapsedTime(&memcpy, memcpy_1, memcpy_2);
cudaEventDestroy(event_1);
cudaEventDestroy(event_2);
cudaEventDestroy(memcpy_1);
cudaEventDestroy(memcpy_2);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaFree(d_image_rezultat_array);
cudaFree(d_imageRGBA);
}