7 #include <cuda_runtime.h>
20 int row = blockIdx.y * blockDim.y + threadIdx.y;
21 int col = blockIdx.x * blockDim.x + threadIdx.x;
24 if (row < rows && col < cols) {
26 int transposedIdx = col * rows + row;
27 int originalIdx = row * cols + col;
28 output[transposedIdx] = input[originalIdx];
41 dim3 threadsPerBlock(16, 16);
44 dim3 numBlocks((cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
45 (rows + threadsPerBlock.y - 1) / threadsPerBlock.y);
48 matrixTransposeKernel<<<numBlocks, threadsPerBlock>>>(d_data, result.d_data, rows, cols);
51 cudaError_t cudaStatus = cudaGetLastError();
52 if (cudaStatus != cudaSuccess) {
53 throw std::runtime_error(
"Kernel launch failed: " + std::string(cudaGetErrorString(cudaStatus)));
57 cudaDeviceSynchronize();
Represents a matrix with GPU-accelerated operations.
Matrix transpose() const
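Transposes the matrix and returns a new Matrix object. The transpose is computed on the GPU by launching matrixTransposeKernel with 16x16 thread blocks on a grid rounded up to cover every row and column; a failed kernel launch throws std::runtime_error, and the call waits on cudaDeviceSynchronize() after the launch check.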
Defines the Matrix class for GPU-accelerated matrix operations.
__global__ void matrixTransposeKernel(const double *input, double *output, int rows, int cols)
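CUDA kernel for matrix transposition. Each thread derives one (row, col) position from its 2D block and thread indices and, if that position lies inside the rows x cols bounds, copies input[row * cols + col] to output[col * rows + row]; threads outside the bounds do nothing.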