// Kernel: convert an interleaved 8-bit RGB image to 8-bit grayscale.
//
// Launch layout: a 2D grid of 2D blocks with one thread per pixel; the x
// dimension walks columns and the y dimension walks rows. Threads that fall
// outside the width x height rectangle return without touching memory, so the
// grid may overshoot the image.
//
// Memory layout: `input` is read as height * width * 3 bytes, row-major with
// the R, G, B bytes of each pixel adjacent; `output` is written as
// height * width bytes, row-major. The two buffers must not overlap
// (both pointers are declared __restrict__). No shared memory is used.
__global__ void img2gray_kernel(const uint8_t* __restrict__ input,
                                uint8_t* __restrict__ output,
                                int width,
                                int height) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Tail guard: the grid is rounded up to whole blocks.
    if (x >= width || y >= height) {
        return;
    }

    // Offsets are computed in size_t: in 32-bit int, (y * width + x) * 3
    // overflows once the image holds more than ~715M pixels.
    const size_t pixel = static_cast<size_t>(y) * static_cast<size_t>(width) +
                         static_cast<size_t>(x);
    const size_t rgb = pixel * 3;  // RGB has 3 channels

    const uint8_t r = input[rgb];
    const uint8_t g = input[rgb + 1];
    const uint8_t b = input[rgb + 2];

    // Luminosity method; the float result is truncated (not rounded) to 8 bits.
    const float luma = 0.21f * r + 0.72f * g + 0.07f * b;
    output[pixel] = static_cast<uint8_t>(luma);
}
// Host wrapper exposing img2gray_kernel through the PyTorch extension interface.
//
// input:  CUDA tensor read as (height, width, 3) contiguous bytes.
// output: CUDA tensor on the same device holding height * width contiguous
//         bytes; it is overwritten in place.
//
// Raises (via TORCH_CHECK) when a tensor is not on a CUDA device, the input is
// not (H, W, 3), either tensor is non-contiguous, the output element count
// does not match H * W, or the launch / execution reports a CUDA error.
// data_ptr<uint8_t>() additionally rejects tensors whose dtype is not uint8.
//
// The kernel is launched on the default stream and the call blocks until the
// device is idle, so the result is complete when this function returns.
// NOTE(review): the launch targets the current CUDA device; confirm callers
// never pass tensors that live on a different device.
void img2gray_cuda(torch::Tensor input, torch::Tensor output) {
    TORCH_CHECK(input.is_cuda() && output.is_cuda(),
                "img2gray_cuda: input and output must be CUDA tensors");
    TORCH_CHECK(input.device() == output.device(),
                "img2gray_cuda: input and output must be on the same device");
    TORCH_CHECK(input.dim() == 3 && input.size(2) == 3,
                "img2gray_cuda: input must have shape (H, W, 3)");
    TORCH_CHECK(input.is_contiguous() && output.is_contiguous(),
                "img2gray_cuda: input and output must be contiguous");
    TORCH_CHECK(output.numel() == input.size(0) * input.size(1),
                "img2gray_cuda: output must hold exactly H * W elements");

    const int width = static_cast<int>(input.size(1));
    const int height = static_cast<int>(input.size(0));

    // A zero-sized grid is an invalid launch configuration; nothing to convert.
    if (width == 0 || height == 0) {
        return;
    }

    const dim3 blockSize(16, 16);
    // Ceil-divide so partially filled edge tiles are still covered.
    const dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
                        (height + blockSize.y - 1) / blockSize.y);

    img2gray_kernel<<<gridSize, blockSize>>>(
        input.data_ptr<uint8_t>(),
        output.data_ptr<uint8_t>(),
        width,
        height
    );

    // Launch-configuration errors are only visible through cudaGetLastError().
    const cudaError_t launchStatus = cudaGetLastError();
    TORCH_CHECK(launchStatus == cudaSuccess,
                "img2gray_cuda: kernel launch failed: ",
                cudaGetErrorString(launchStatus));

    // Faults raised while the kernel runs surface at the next synchronizing call.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    TORCH_CHECK(syncStatus == cudaSuccess,
                "img2gray_cuda: kernel execution failed: ",
                cudaGetErrorString(syncStatus));
}
