7 #include <cuda_runtime.h>
27 int row = blockIdx.y * blockDim.y + threadIdx.y;
28 int col = blockIdx.x * blockDim.x + threadIdx.x;
31 if (row < m && col < k) {
36 for (
int i = 0; i < n; ++i) {
37 sum += a[row * n + i] * b[i * k + col];
41 c[row * k + col] = sum;
53 if (cols != other.rows) {
54 throw std::invalid_argument(
"Matrix dimensions are incompatible for multiplication");
58 Matrix result(rows, other.cols);
61 dim3 threadsPerBlock(16, 16);
64 dim3 numBlocks((other.cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
65 (rows + threadsPerBlock.y - 1) / threadsPerBlock.y);
68 matrixMultiplyKernel<<<numBlocks, threadsPerBlock>>>(d_data,
75 cudaError_t cudaStatus = cudaGetLastError();
76 if (cudaStatus != cudaSuccess) {
77 throw std::runtime_error(
"Kernel launch failed: " + std::string(cudaGetErrorString(cudaStatus)));
81 cudaDeviceSynchronize();
Represents a matrix with GPU-accelerated operations.
Matrix multiply(const Matrix &other) const
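Multiplies this matrix by another matrix on the GPU, returning a new (rows x other.cols) matrix; throws std::invalid_argument if this matrix's column count differs from the other matrix's row count.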
Defines the Matrix class for GPU-accelerated matrix operations.
__global__ void matrixMultiplyKernel(const double *a, const double *b, double *c, int m, int n, int k)
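CUDA kernel for matrix multiplication: each in-range thread computes one element c[row * k + col] of the m x k product of a (m x n) and b (n x k), with all three matrices indexed as flat row-major arrays.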