8 #include <cuda_runtime.h>
21 int col = blockIdx.x * blockDim.x + threadIdx.x;
22 int row = blockIdx.y * blockDim.y + threadIdx.y;
25 if (row < rows && col < cols) {
27 int index = row * cols + col;
42 throw std::invalid_argument(
"Vector dimension must match matrix rows for addition");
46 dim3 threadsPerBlock(16, 16);
49 dim3 numBlocks((cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
50 (rows + threadsPerBlock.y - 1) / threadsPerBlock.y);
53 addVectorToMatrixKernel<<<numBlocks, threadsPerBlock>>>(d_data, v.
get_data(), rows, cols);
56 cudaError_t cudaStatus = cudaGetLastError();
57 if (cudaStatus != cudaSuccess) {
58 throw std::runtime_error(
"Kernel launch failed: " + std::string(cudaGetErrorString(cudaStatus)));
62 cudaDeviceSynchronize();
void add_vector(const Vector &v)
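Adds a vector to each column of the matrix. Throws std::invalid_argument when the vector dimension does not match the matrix rows, and std::runtime_error when the kernel launch fails.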
Represents a vector with GPU-accelerated operations.
int get_rows() const
Returns the number of elements in the vector.
double * get_data() const
Returns the raw data pointer of the vector.
Defines the Matrix class for GPU-accelerated matrix operations.
__global__ void addVectorToMatrixKernel(double *m, const double *v, int rows, int cols)
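CUDA kernel for adding a vector to each column of a matrix. Each thread derives a (row, col) pair from its 2D block and thread indices and, when row < rows and col < cols, addresses the matrix element at index row * cols + col.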
Defines the Vector class for GPU-accelerated vector operations.