CUDA Networks
matrix_transpose.cu
Go to the documentation of this file.
1 /**
2  * @file matrix_transpose.cu
3  * @brief Implementation of the Matrix::transpose method for GPU-accelerated matrix transposition.
4  */
5 
6 #include "matrix.h"
7 #include <cuda_runtime.h>
8 #include <stdexcept>
9 #include <string>
10 
/**
 * @brief CUDA kernel for matrix transposition.
 *
 * Each thread handles at most one element. It derives its (row, col) position
 * from the 2D block/thread indices (x -> column, y -> row), reads
 * input[row * cols + col] and stores it at output[col * rows + row].
 * Threads that fall outside the matrix do nothing, so the launch grid only has
 * to cover at least `cols` threads in x and `rows` threads in y.
 *
 * @param input Pointer to the input matrix data, indexed as row * cols + col.
 * @param output Pointer to the output (transposed) matrix data, indexed as col * rows + row.
 * @param rows Number of rows in the input matrix.
 * @param cols Number of columns in the input matrix.
 */
__global__ void matrixTransposeKernel(const double* input, double* output, int rows, int cols) {
    // Calculate global thread indices
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Check if thread is within matrix bounds
    if (row < rows && col < cols) {
        // Widen to size_t *before* multiplying: rows * cols can exceed INT_MAX
        // for large matrices, and a 32-bit product would wrap silently.
        const size_t originalIdx = static_cast<size_t>(row) * static_cast<size_t>(cols) + static_cast<size_t>(col);
        const size_t transposedIdx = static_cast<size_t>(col) * static_cast<size_t>(rows) + static_cast<size_t>(row);
        output[transposedIdx] = input[originalIdx];
    }
}
31 
/**
 * @brief Transposes the matrix and returns a new Matrix object containing the transposed data.
 *
 * Allocates a result matrix with swapped dimensions (cols x rows), launches
 * matrixTransposeKernel over a 2D grid of 16x16 blocks covering the source
 * matrix, and waits for the device to finish before returning.
 *
 * @return A new Matrix object with transposed dimensions.
 * @throws std::runtime_error if the kernel launch is rejected or if the device
 *         reports an error while executing the kernel.
 */
Matrix Matrix::transpose() const {
    // Create a new matrix to hold the transposed data
    Matrix result(cols, rows);

    // Nothing to copy for an empty matrix; a zero-sized grid would otherwise be
    // rejected by the runtime as an invalid launch configuration.
    if (rows == 0 || cols == 0) {
        return result;
    }

    // Define block dimensions (16x16 is common for matrix operations)
    dim3 threadsPerBlock(16, 16);

    // Calculate grid dimensions to cover the entire matrix (ceil-division):
    // x spans the source columns, y spans the source rows.
    dim3 numBlocks((cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (rows + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Launch the CUDA kernel to perform transposition
    matrixTransposeKernel<<<numBlocks, threadsPerBlock>>>(d_data, result.d_data, rows, cols);

    // Check for kernel launch errors (bad configuration etc.)
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        throw std::runtime_error("Kernel launch failed: " + std::string(cudaGetErrorString(cudaStatus)));
    }

    // Synchronize device to ensure completion. Faults raised while the kernel
    // runs are only reported here, so the status must not be discarded.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        throw std::runtime_error("Kernel execution failed: " + std::string(cudaGetErrorString(cudaStatus)));
    }

    return result;
}
Represents a matrix with GPU-accelerated operations.
Definition: matrix.h:18
Matrix transpose() const
Transposes the matrix and returns a new Matrix object.
Defines the Matrix class for GPU-accelerated matrix operations.
__global__ void matrixTransposeKernel(const double *input, double *output, int rows, int cols)
CUDA kernel for matrix transposition.