7 #include <cuda_runtime.h>
/**
 * CUDA kernel that copies a rectangular sub-block out of a row-major matrix.
 *
 * Thread (row, col) of the launch copies the source element at
 * (start_row + row, start_col + col) into the destination element at
 * (row, col). Threads whose coordinates fall outside num_rows x num_cols do
 * nothing, so the grid may be rounded up to whole blocks.
 *
 * Launch layout: 2D grid of 2D blocks, with x spanning columns and y spanning
 * rows. The kernel uses no shared memory and performs no synchronization.
 *
 * @param src        Source elements, indexed as row * src_cols + col.
 * @param dst        Destination elements, indexed as row * dst_cols + col.
 * @param src_cols   Row stride (in elements) of the source.
 * @param dst_cols   Row stride (in elements) of the destination.
 * @param start_row  Source row that maps to destination row 0.
 * @param start_col  Source column that maps to destination column 0.
 * @param num_rows   Number of rows to copy.
 * @param num_cols   Number of columns to copy.
 */
__global__
void selectBatchKernel(
    const double* src,
    double* dst,
    int src_cols,
    int dst_cols,
    int start_row,
    int start_col,
    int num_rows,
    int num_cols) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < num_rows && col < num_cols) {
        // Widen to 64 bits before multiplying: row * stride exceeds the range
        // of a 32-bit int once a matrix holds more than 2^31 elements, and the
        // wrapped index would silently address the wrong memory.
        long long src_idx =
            static_cast<long long>(start_row + row) * src_cols + (start_col + col);
        long long dst_idx = static_cast<long long>(row) * dst_cols + col;

        dst[dst_idx] = src[src_idx];
    }
}
// NOTE(review): the enclosing function header, closing braces and return
// statement are not visible in this listing; the comments below describe only
// the statements shown.
// Reject a selection that extends outside [0, rows] x [0, cols] or whose start
// index is not strictly below its end index (empty or inverted range).
40 if (start_row < 0 || end_row > rows || start_col < 0 || end_col > cols ||
41 start_row >= end_row || start_col >= end_col) {
42 throw std::out_of_range(
"Invalid row or column range specified");
// Extent of the selection; the end indices are exclusive.
46 int num_rows = end_row - start_row;
47 int num_cols = end_col - start_col;
// Output matrix sized to the selection; its d_data is the kernel destination.
50 Matrix result(num_rows, num_cols);
// 16x16 threads per block. Block counts use ceiling division so the grid
// covers every selected column (x) and row (y) even when the extent is not a
// multiple of 16.
53 dim3 threadsPerBlock(16, 16);
54 dim3 numBlocks((num_cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
55 (num_rows + threadsPerBlock.y - 1) / threadsPerBlock.y);
// Source row stride is cols; destination row stride is num_cols.
58 selectBatchKernel<<<numBlocks, threadsPerBlock>>>(
59 d_data, result.d_data, cols, num_cols, start_row, start_col, num_rows, num_cols
// A kernel launch returns no status itself; cudaGetLastError() is queried to
// detect a failed launch and turn it into an exception.
63 cudaError_t cudaStatus = cudaGetLastError();
64 if (cudaStatus != cudaSuccess) {
65 throw std::runtime_error(
"Kernel launch failed: " + std::string(cudaGetErrorString(cudaStatus)));
// Block until the device has finished.
// NOTE(review): the return value of cudaDeviceSynchronize() is discarded, so
// an error raised while the kernel was executing is not reported here.
69 cudaDeviceSynchronize();
Represents a matrix with GPU-accelerated operations.
Matrix select_batch(int start_row, int end_row, int start_col, int end_col) const
Selects a subset of the matrix based on specified row and column ranges.
Defines the Matrix class for GPU-accelerated matrix operations.
__global__ void selectBatchKernel(const double *src, double *dst, int src_cols, int dst_cols, int start_row, int start_col, int num_rows, int num_cols)
CUDA kernel for selecting a subset of the matrix.