#include <cuda_runtime.h>

// Excerpt from reluDerivativeKernel: each thread computes one output element.
int idx = blockIdx.x * blockDim.x + threadIdx.x;
output[idx] = (input[idx] > 0.0) ? 1.0 : 0.0;

// Excerpt from Matrix::relu_derivative(): configure the launch, run the kernel, then wait for it.
int size = rows * cols;
int threadsPerBlock = 256;
int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
reluDerivativeKernel<<<blocksPerGrid, threadsPerBlock>>>(d_data, result.d_data, size);
cudaDeviceSynchronize();
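The excerpt launches the kernel and synchronizes but shows no error handling. A minimal sketch of how the launch could be checked, reusing the configuration variables above and assuming <cstdio> is available for fprintf (error handling is not shown in the source, so this is illustrative only):

// Illustrative only: error checks around the launch shown in the excerpt above.
reluDerivativeKernel<<<blocksPerGrid, threadsPerBlock>>>(d_data, result.d_data, size);
cudaError_t launchErr = cudaGetLastError();        // reports invalid launch configurations
if (launchErr != cudaSuccess)
    fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(launchErr));
cudaError_t syncErr = cudaDeviceSynchronize();     // reports errors raised while the kernel ran
if (syncErr != cudaSuccess)
    fprintf(stderr, "kernel failed: %s\n", cudaGetErrorString(syncErr));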
Defines the Matrix class for GPU-accelerated matrix operations.

Represents a matrix with GPU-accelerated operations.
Matrix relu_derivative() const
Applies the derivative of the ReLU activation function to the matrix.
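A minimal sketch of how this method could be assembled from the excerpts above, assuming Matrix stores its dimensions in rows and cols, keeps its device buffer in d_data, and that a Matrix(rows, cols) constructor allocates device memory for the result (the constructor is not shown in the source):

Matrix Matrix::relu_derivative() const
{
    Matrix result(rows, cols);                 // assumed: allocates result.d_data on the device
    int size = rows * cols;

    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    reluDerivativeKernel<<<blocksPerGrid, threadsPerBlock>>>(d_data, result.d_data, size);
    cudaDeviceSynchronize();                   // block until the result is ready

    return result;
}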
__global__ void reluDerivativeKernel(const double *input, double *output, int size)
CUDA kernel for applying the ReLU derivative function element-wise.
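Combining this signature with the kernel excerpt above gives a one-element-per-thread implementation; the bounds check below is assumed, since the rounded-up grid may contain more threads than elements:

__global__ void reluDerivativeKernel(const double *input, double *output, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;   // global element index for this thread
    if (idx < size)                                     // assumed guard: the grid may overshoot size
        output[idx] = (input[idx] > 0.0) ? 1.0 : 0.0;  // ReLU derivative: 1 for x > 0, else 0
}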