7 #include <cuda_runtime.h>
21 int row = blockIdx.y * blockDim.y + threadIdx.y;
22 int col = blockIdx.x * blockDim.x + threadIdx.x;
25 if (row < rows && col < cols) {
27 int index = row * cols + col;
30 c[index] = a[index] - b[index];
42 if (rows != other.rows || cols != other.cols) {
43 throw std::invalid_argument(
"Matrix dimensions must be identical for subtraction");
50 dim3 threadsPerBlock(16, 16);
53 dim3 numBlocks((cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
54 (rows + threadsPerBlock.y - 1) / threadsPerBlock.y);
57 matrixSubtractKernel<<<numBlocks, threadsPerBlock>>>(d_data, other.d_data, result.d_data, rows, cols);
60 cudaError_t cudaStatus = cudaGetLastError();
61 if (cudaStatus != cudaSuccess) {
62 throw std::runtime_error(
"Kernel launch failed: " + std::string(cudaGetErrorString(cudaStatus)));
66 cudaDeviceSynchronize();
Represents a matrix with GPU-accelerated operations.
Matrix subtract(const Matrix &other) const
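Subtracts another matrix from this matrix element-wise on the GPU. Throws std::invalid_argument if the two matrices do not have identical dimensions, and std::runtime_error if the kernel launch fails.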
Defines the Matrix class for GPU-accelerated matrix operations.
__global__ void matrixSubtractKernel(const double *a, const double *b, double *c, int rows, int cols)
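CUDA kernel for element-wise matrix subtraction: each in-bounds thread computes c[row * cols + col] = a[row * cols + col] - b[row * cols + col] for one (row, col) position; threads outside the rows x cols range do nothing.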