cuBLAS是CUDA基础线性代数子程序的API接口,它允许用户使用GPU的计算资源来进行加速计算。
cuBLAS接口可以分为三类:cuBLAS API、cuBLASXt API和cuBLASLt API。
代码引用头文件"cublas.h"或者"cublas_v2.h",编译时链接cuBLAS的动态库(Linux系统下为libcublas.so)。
示例代码:
//cublas_example.c, Application Using C and cuBLAS: 0-based indexing //----------------------------------------------------------- #include <stdio.h> #include <stdlib.h> #include <math.h> #include <cuda_runtime.h> #include "cublas_v2.h" // 包含cublas头文件 #define M 6 #define N 5 #define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 定义0-based数组的访问方式 static __inline__ void modify (cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta){ cublasSscal (handle, n-q, &alpha, &m[IDX2C(p,q,ldm)], ldm); cublasSscal (handle, ldm-p, &beta, &m[IDX2C(p,q,ldm)], 1); } int main (void){ cudaError_t cudaStat; cublasStatus_t stat; cublasHandle_t handle; int i, j; float* devPtrA; float* a = 0; a = (float *)malloc (M * N * sizeof (*a)); // 申请CPU数组内存 if (!a) { printf ("host memory allocation failed"); return EXIT_FAILURE; } for (j = 0; j < N; j++) { for (i = 0; i < M; i++) { a[IDX2C(i,j,M)] = (float)(i * N + j + 1); } } cudaStat = cudaMalloc ((void**)&devPtrA, M*N*sizeof(*a)); // 申请GPU数组内存 if (cudaStat != cudaSuccess) { printf ("device memory allocation failed"); return EXIT_FAILURE; } stat = cublasCreate(&handle); // 创建cublas上下文 if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); return EXIT_FAILURE; } stat = cublasSetMatrix (M, N, sizeof(*a), a, M, devPtrA, M); // 矩阵赋值 if (stat != CUBLAS_STATUS_SUCCESS) { printf ("data download failed"); cudaFree (devPtrA); cublasDestroy(handle); return EXIT_FAILURE; } modify (handle, devPtrA, M, N, 1, 2, 16.0f, 12.0f); // 矩阵计算 stat = cublasGetMatrix (M, N, sizeof(*a), devPtrA, M, a, M); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("data upload failed"); cudaFree (devPtrA); cublasDestroy(handle); return EXIT_FAILURE; } cudaFree (devPtrA); // 释放GPU内存 cublasDestroy(handle); // 销毁cublas上下文句柄 for (j = 0; j < N; j++) { for (i = 0; i < M; i++) { printf ("%7.0f", a[IDX2C(i,j,M)]); // 打印计算结果 } printf ("\n"); } free(a); // 释放CPU内存 return EXIT_SUCCESS;
编译示例代码:
nvcc cublas_example.c -lcublas -o cublas_example
示例代码执行结果:
      1      6     11     16     21     26
      2      7     12     17     22     27
      3   1536    156    216    276    336
      4    144     14     19     24     29
      5    160     15     20     25     30
官网参考链接:https://docs.nvidia.com/cuda/cublas/index.html#introduction