Как реализовать фильтр собеля?

Question

kill94 @kill94

Как реализовать фильтр собеля?

вот написал фильтр собеля, но не работает. Помогите найти ошибку
исходное изображение

после

const int FILTER_RADIUS = 1;
const int TILE_WIDTH = 16;
const int TILE_HEIGHT = 16;
const int BLOCK_WIDTH = TILE_WIDTH + 2 * FILTER_RADIUS;
const int BLOCK_HEIGHT = TILE_HEIGHT + 2 * FILTER_RADIUS;

__device__ 
unsigned char compute_sobel(unsigned char up_left, unsigned char up_middle, unsigned char up_right,
							unsigned char mid_left, unsigned char mid_middle, unsigned char mid_right,
							unsigned char low_left, unsigned char low_middle, unsigned char low_right,
							float fscale)
{
	short horizont = up_right + 2 * mid_right + low_right - up_left - 2 * mid_left - low_left;
	short vertical = up_left + 2 * up_middle + up_left - low_left - 2 * low_middle - low_right;
	short sum = (short)(fscale*(abs((int)horizont) + abs((int)vertical)));

	if (sum < 0)
		return 0;
	else if (sum>0xff)
		return 0xff;

	return (unsigned char)sum;
}

__global__
void sobel_filter(unsigned char * image_original_array, unsigned char * image_out_array, int width, int height)
{
	__shared__ unsigned char shared_mem[BLOCK_HEIGHT*BLOCK_WIDTH];
	int x = blockIdx.x*TILE_WIDTH + threadIdx.x;
	int y = blockIdx.y*TILE_HEIGHT + threadIdx.y ;

	int index = y*width + x;
	int shared_index = threadIdx.y*blockDim.y + threadIdx.x;

	shared_mem[shared_index] = image_original_array[index];

	__syncthreads();
	 
	if (threadIdx.x >= FILTER_RADIUS&&threadIdx.x < BLOCK_WIDTH - FILTER_RADIUS&&
		threadIdx.y >= FILTER_RADIUS && threadIdx.y < BLOCK_HEIGHT - FILTER_RADIUS)
	{
		unsigned char pix00 = shared_mem[shared_index + (-FILTER_RADIUS*blockDim.x - FILTER_RADIUS)];
		unsigned char pix01 = shared_mem[shared_index + (-FILTER_RADIUS*blockDim.x - 0)];
		unsigned char pix02 = shared_mem[shared_index + (-FILTER_RADIUS*blockDim.x + FILTER_RADIUS)];
		
		unsigned char pix10 = shared_mem[shared_index + (0*blockDim.x - FILTER_RADIUS)];
		unsigned char pix11 = shared_mem[shared_index + (0*blockDim.x - 0)];
		unsigned char pix12 = shared_mem[shared_index + (0*blockDim.x + FILTER_RADIUS)];

		unsigned char pix20 = shared_mem[shared_index + (FILTER_RADIUS*blockDim.x - FILTER_RADIUS)];
		unsigned char pix21 = shared_mem[shared_index + (FILTER_RADIUS*blockDim.x - 0)];
		unsigned char pix22 = shared_mem[shared_index + (FILTER_RADIUS*blockDim.x + FILTER_RADIUS)];

		image_out_array[index] = compute_sobel(pix00, pix01, pix02, pix10, pix11, pix12, pix20, pix21, pix22, 1.f);
	}
}

 
void cuda_filter(unsigned char *  image_original_array, unsigned char *  image_rezultat_array, int  width, int height)
{
	unsigned char *d_imageRGBA;
	unsigned char *d_image_rezultat_array;
	size_t num_pixels = width*height;
	cudaSetDevice(0); // Устройство 0

	cudaEvent_t event_1, event_2; 
	cudaEventCreate(&event_1);
	cudaEventCreate(&event_2);

	float malloc = 0;

	cudaEventRecord(event_1);
	checkCudaErrors(cudaMalloc(&d_imageRGBA, sizeof(unsigned char)* num_pixels));
	checkCudaErrors(cudaMalloc(&d_image_rezultat_array, sizeof(unsigned char)* num_pixels));
	cudaEventRecord(event_2);
	cudaEventSynchronize(event_2);
	cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
	cudaEventElapsedTime(&malloc, event_1, event_2);

	cudaEvent_t memcpy_1, memcpy_2; 
	cudaEventCreate(&memcpy_1);
	cudaEventCreate(&memcpy_2);
	float memcpy = 0;

	cudaEventRecord(memcpy_1);
	checkCudaErrors(cudaMemcpy(d_imageRGBA, image_original_array, sizeof(unsigned char)* num_pixels, cudaMemcpyHostToDevice));
	cudaEventRecord(memcpy_2);
	cudaEventSynchronize(memcpy_2);
	cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
	cudaEventElapsedTime(&malloc, memcpy_1, memcpy_2);

	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	int grid_width = (width + TILE_WIDTH - 1) / TILE_WIDTH;
	int grid_height = (height + TILE_HEIGHT - 1) / TILE_HEIGHT;
	dim3 dim_grid(grid_height, grid_width);
	dim3 dim_block(BLOCK_WIDTH, BLOCK_HEIGHT);

	cudaEventRecord(start);
	sobel_filter << <dim_grid, dim_block >> >(d_imageRGBA, d_image_rezultat_array, width, height);
	//GPUFiltering(d_imageRGBA, d_image_rezultat_array, num_rows, num_cols); // вызов функции фильтра
	cudaEventRecord(stop);
	cudaEventSynchronize(stop);
	cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
	float milliseconds = 0;
	cudaEventElapsedTime(&milliseconds, start, stop);

	cudaEventRecord(memcpy_1);
	checkCudaErrors(cudaMemcpy(image_original_array, d_image_rezultat_array, sizeof(unsigned char)* num_pixels, cudaMemcpyDeviceToHost));
	cudaEventRecord(memcpy_2);
	cudaEventSynchronize(memcpy_2);
	cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
	cudaEventElapsedTime(&memcpy, memcpy_1, memcpy_2);

	cudaEventDestroy(event_1);
	cudaEventDestroy(event_2);
	cudaEventDestroy(memcpy_1);
	cudaEventDestroy(memcpy_2);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaFree(d_image_rezultat_array);
	cudaFree(d_imageRGBA);
}