7 #include <cuda_runtime.h>
/**
 * CUDA kernel: element-wise ReLU, output[i] = max(0, input[i]).
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. The grid may
 * contain more threads than elements (ceil-div launch); surplus threads exit
 * without touching memory. Uses no shared memory.
 *
 * @param input  Device-accessible array holding at least `size` doubles.
 * @param output Device-accessible array with room for at least `size` doubles.
 * @param size   Number of elements to process.
 */
__global__
void reluKernel(
    const double* input,
    double* output,
    int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard the grid tail: when size is not a multiple of blockDim.x the last
    // block has threads whose idx >= size, and they must not read or write.
    if (idx < size) {
        output[idx] = fmax(0.0, input[idx]);
    }
}
// Launch reluKernel over the flattened matrix: one thread per element,
// 256 threads per block, with a ceil-div grid so a partial final block
// covers the tail elements.
int size = rows * cols;
int threadsPerBlock = 256;
int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
// A grid dimension of 0 is an invalid launch configuration, so an empty
// matrix must skip the launch instead of raising a launch error.
if (blocksPerGrid > 0) {
    reluKernel<<<blocksPerGrid, threadsPerBlock>>>(d_data, result.d_data, size);
}
// Block the host until the device has finished before the result is used.
// NOTE(review): neither the launch status (cudaGetLastError) nor the return
// value of cudaDeviceSynchronize is checked here — wire these into the
// project's error-handling convention once confirmed.
cudaDeviceSynchronize();
Represents a matrix with GPU-accelerated operations.
Matrix relu() const
Applies the ReLU activation function element-wise to the matrix and returns the result as a Matrix.
Defines the Matrix class for GPU-accelerated matrix operations.
__global__ void reluKernel(const double *input, double *output, int size)
CUDA kernel for applying the ReLU activation function element-wise.