To illustrate the difference in code between the Runtime and Driver APIs, compare the two listings that follow: host code for adding two vectors using the CUDA Runtime, and host code for adding two vectors using the CUDA Driver API. Both are examples of a vector addition in which two arrays are added.
const unsigned int cnBlockSize = 512; const unsigned int cnBlocks = 3; const unsigned int cnDimension = cnBlocks * cnBlockSize; // create CUDA device & context cudaSetDevice( 0 ); // pick first device // allocate host vectors float * pA = new float[cnDimension]; float * pB = new float[cnDimension]; float * pC = new float[cnDimension]; // initialize host memory randomInit(pA, cnDimension); randomInit(pB, cnDimension); // allocate device memory float *pDeviceMemA, *pDeviceMemB, *pDeviceMemC; cudaMalloc(&pDeviceMemA, cnDimension * sizeof(float)); cudaMalloc(&pDeviceMemB, cnDimension * sizeof(float)); cudaMalloc(&pDeviceMemC, cnDimension * sizeof(float)); // copy host vectors to device cudaMemcpy(pDeviceMemA, pA, cnDimension * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(pDeviceMemB, pB, cnDimension * sizeof(float), cudaMemcpyHostToDevice); vectorAdd<<<cnBlocks, cnBlockSize>>> (pDeviceMemA, pDeviceMemB, pDeviceMemC); // copy result from device to host cudaMemcpy ((void *) pC, pDeviceMemC, cnDimension * sizeof(float), cudaMemcpyDeviceToHost); delete[] pA; delete[] pB; delete[] pC; cudaFree(pDeviceMemA); cudaFree(pDeviceMemB); cudaFree(pDeviceMemC);
The host code for adding two vectors using the CUDA Runtime (above) consists of 27 lines of code. The host code for adding two vectors using the CUDA Driver API (below) shows the same functionality implemented with the Driver API.
const unsigned int cnBlockSize = 512; const unsigned int cnBlocks = 3; const unsigned int cnDimension = cnBlocks * cnBlockSize; CUdevice hDevice; CUcontext hContext; CUmodule hModule; CUfunction hFunction; // create CUDA device & context cuInit(0); cuDeviceGet(&hContext, 0); // pick first device cuCtxCreate(&hContext, 0, hDevice)); cuModuleLoad(&hModule, "vectorAdd.cubin"); cuModuleGetFunction(&hFunction, hModule, "vectorAdd"); // allocate host vectors float * pA = new float[cnDimension]; float * pB = new float[cnDimension]; float * pC = new float[cnDimension]; // initialize host memory randomInit(pA, cnDimension); randomInit(pB, cnDimension); // allocate memory on the device CUdeviceptr pDeviceMemA, pDeviceMemB, pDeviceMemC; cuMemAlloc(&pDeviceMemA, cnDimension * sizeof(float)); cuMemAlloc(&pDeviceMemB, cnDimension * sizeof(float)); cuMemAlloc(&pDeviceMemC, cnDimension * sizeof(float)); // copy host vectors to device cuMemcpyHtoD(pDeviceMemA, pA, cnDimension * sizeof(float)); cuMemcpyHtoD(pDeviceMemB, pB, cnDimension * sizeof(float)); // set up parameter values cuFuncSetBlockShape(cuFunction, cnBlockSize, 1, 1); #define ALIGN_UP(offset, alignment) / (offset) = ((offset) + (alignment) – 1) & ~((alignment) – 1) int offset = 0; ALIGN_UP(offset, __alignof(pDeviceMemA)); cuParamSetv(cuFunction, offset, &ptr, sizeof(pDeviceMemA)); offset += sizeof(pDeviceMemA); ALIGN_UP(offset, __alignof(pDeviceMemB)); cuParamSetv(cuFunction, offset, &ptr, sizeof(pDeviceMemB)); offset += sizeof(pDeviceMemB); ALIGN_UP(offset, __alignof(pDeviceMemC)); cuParamSetv(cuFunction, offset, &ptr, sizeof(pDeviceMemC)); offset += sizeof(pDeviceMemC); cuParamSetSize(cuFunction, offset); // execute kernel cuLaunchGrid(cuFunction, cnBlocks, 1); // copy the result from device back to host cuMemcpyDtoH((void *) pC, pDeviceMemC, cnDimension * sizeof(float)); delete[] pA; delete[] pB; delete[] pC; cuMemFree(pDeviceMemA); cuMemFree(pDeviceMemB); cuMemFree(pDeviceMemC);
The host code for adding two vectors using the CUDA Driver API contains 50 lines of code and performs several more lower-level operations than the Runtime API version. These additional calls are evident in several places, especially in the setup that the Driver API requires before the kernel launch.