#include <cfloat>        // DBL_MAX
#include <cuda_runtime.h>
// CUDA kernel: numerically stable softmax applied column-wise to a
// row-major `rows` x `cols` matrix of doubles.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per column; the grid
// may overshoot `cols` (host uses ceil-div), so excess threads must exit.
// `input` and `output` are device pointers; adjacent threads read
// consecutive addresses (`row * cols + col`), so global accesses coalesce.
__global__ void softmaxKernel(const double* __restrict__ input,
                              double* __restrict__ output,
                              int rows,
                              int cols) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col >= cols) return;  // guard the ceil-div tail: no OOB reads/writes

    // Pass 1: column maximum. Subtracting it below keeps exp() arguments
    // <= 0, preventing overflow for large inputs (standard softmax trick).
    double max_val = -DBL_MAX;
    for (int row = 0; row < rows; ++row) {
        max_val = fmax(max_val, input[row * cols + col]);
    }

    // Pass 2: sum of shifted exponentials (the softmax denominator).
    double sum_exp = 0.0;
    for (int row = 0; row < rows; ++row) {
        sum_exp += exp(input[row * cols + col] - max_val);
    }

    // Pass 3: normalize each element of the column.
    for (int row = 0; row < rows; ++row) {
        output[row * cols + col] = exp(input[row * cols + col] - max_val) / sum_exp;
    }
}
58 int threadsPerBlock = 256;
61 int blocksPerGrid = (cols + threadsPerBlock - 1) / threadsPerBlock;
64 softmaxKernel<<<blocksPerGrid, threadsPerBlock>>>(d_data, result.d_data, rows, cols);
67 cudaDeviceSynchronize();
Represents a matrix with GPU-accelerated operations.
Matrix softmax() const
Applies the softmax function to each column of the matrix and returns the result as a new Matrix; the original matrix is not modified.
Defines the Matrix class for GPU-accelerated matrix operations.
__global__ void softmaxKernel(const double *input, double *output, int rows, int cols)
CUDA kernel that applies a numerically stable (max-subtracted) softmax to each column of a row-major rows x cols matrix; each thread processes one column.