7 #include <cuda_runtime.h>
16 __global__
void argmax_GPU(
const double *m,
double *result,
int rows,
int cols) {
18 int col = blockIdx.x * blockDim.x + threadIdx.x;
23 double max_val = m[col];
27 for (
int row = 1; row < rows; row++) {
28 double val = m[row * cols + col];
36 result[col] =
static_cast<double>(max_idx);
49 int threadsPerBlock = 256;
50 int blocksPerGrid = (cols + threadsPerBlock - 1) / threadsPerBlock;
53 argmax_GPU<<<blocksPerGrid, threadsPerBlock>>>(d_data, result.
get_data(), rows, cols);
56 cudaDeviceSynchronize();
Vector argmax() const
Computes the argmax of each column in the matrix.
Represents a vector with GPU-accelerated operations.
double * get_data() const
Get the raw data pointer of the vector.
Defines the Matrix class for GPU-accelerated matrix operations.
__global__ void argmax_GPU(const double *m, double *result, int rows, int cols)
CUDA kernel for computing the argmax of each column in a matrix; each column's winning index is written to result as a double.