// Defines the Matrix class for GPU-accelerated matrix operations.
#include <cuda_runtime.h>
#include <cfloat>    // DBL_EPSILON, DBL_MAX
#include <cmath>     // fabs
#include <stdexcept> // std::invalid_argument, std::runtime_error
#include <string>

// CUDA kernel for dividing matrix elements by a scalar.
//
// Launch layout: 1-D grid of 1-D blocks; each thread owns one element and is
// bounds-guarded against the grid tail. No shared memory is used.
//
// Edge handling (as visible in the original fragment):
//  - |scalar| < DBL_EPSILON: division would overflow, so exact zeros stay 0.0
//    and every other element saturates to +/-DBL_MAX, preserving its sign.
//  - |data[idx]| > DBL_MAX / 2: the element is already near the representable
//    limit, so it is clamped to +/-DBL_MAX rather than divided.
__global__ void divideScalarKernel(double *data, double scalar, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size) {
        // Grid tail guard: blocks rarely divide `size` evenly.
        // NOTE(review): this guard sits in lines missing from the source
        // fragment; restored per standard CUDA practice — confirm.
        return;
    }

    if (fabs(scalar) < DBL_EPSILON) {
        // Near-zero divisor: saturate instead of producing Inf/NaN.
        data[idx] = (data[idx] == 0.0) ? 0.0 : ((data[idx] > 0.0) ? DBL_MAX : -DBL_MAX);
    }
    else if (fabs(data[idx]) > DBL_MAX / 2) {
        // Magnitude already near DBL_MAX: clamp to avoid overflow.
        data[idx] = (data[idx] > 0.0) ? DBL_MAX : -DBL_MAX;
    }
    else {
        // Ordinary case.
        // NOTE(review): the division statement itself was not visible in the
        // fragment (lines elided); reconstructed as the obvious operation.
        data[idx] /= scalar;
    }
}

// Divides all elements in the matrix by a scalar.
//
// NOTE(review): reads `rows`, `cols`, and `d_data` — presumably members of
// the enclosing Matrix class, with `d_data` already resident on the device
// (no host<->device copies are visible in this fragment). Confirm against
// the class definition.
//
// Throws:
//  - std::invalid_argument if `scalar` is exactly zero (NOTE(review): the
//    guard's condition line was not visible; reconstructed from the message).
//  - std::runtime_error if the kernel launch or its execution fails.
void divide_scalar(double scalar)
{
    if (scalar == 0.0) {
        throw std::invalid_argument(
            "Cannot divide by exactly zero");
    }

    int size = rows * cols;

    // Standard ceil-division launch configuration over a flat element range.
    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

    divideScalarKernel<<<blocksPerGrid, threadsPerBlock>>>(d_data, scalar, size);

    // Launch-configuration errors surface via cudaGetLastError(), not the
    // launch statement itself.
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        throw std::runtime_error(
            "Kernel launch failed: " + std::string(cudaGetErrorString(cudaStatus)));
    }

    // Asynchronous execution errors (e.g. illegal address) only surface at a
    // synchronizing call — check its status too instead of discarding it.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        throw std::runtime_error(
            "Kernel execution failed: " + std::string(cudaGetErrorString(cudaStatus)));
    }
}