我制作了一个OpenCL程序并使用固定内存(CL_MEM_ALLOC_HOST_PTR
)来获得从设备到主机的更高传输速率.
传输速率按我的预期增加(使用AMD APP Profiler 2.4获得传输速率).问题是矩阵4096 x 4096(64 MB)的传输速率高于PCIe带宽(93703 GB/s).
当我使用零复制缓冲区(CL_MEM_ALLOC_HOST_PTR + clEnqueueMapBuffer)时也发生了这种情况.我查了一些资料,了解到虽然固定内存和零复制缓冲区具有较高的传输速率,但对于独立GPU来说,传输速率仍然受限于PCIe带宽.那么,如果传输速率超过PCIe带宽(使用PCIe带宽2.0 x 16),这是正常的吗?
我的操作系统是Windows 7 64位.我使用AMD APP SDK 2.6和独立GPU AMD HD 6630M.
编辑:这是代码:
#include#include #include #include using namespace std; #ifdef __APPLE__ #include #else #include #endif #define MAX_SOURCE_SIZE (0x100000) cl_context context = NULL; cl_command_queue queue = NULL; cl_program program = NULL; void MatrixMul(cl_mem d_A, cl_mem d_B, cl_mem d_C, int size) { cl_int err; cl_kernel naive; // Create Kernel Object Bound To Kernel Function naive = clCreateKernel(program, "naiveAlgorithm", &err); //Set size of global work item and work tem in each work goups int globalsize = size; int localsize; if(globalsize >= 16) { localsize =16; }else { localsize = globalsize; } size_t global_work_items [2] = {globalsize, globalsize}; size_t local_work_items [2] = {localsize, localsize}; // Setup Kernel Argument err = clSetKernelArg(naive, 0, sizeof(cl_mem), (void *)&d_A); err = clSetKernelArg(naive, 1, sizeof(cl_mem), (void *)&d_B); err = clSetKernelArg(naive, 2, sizeof(cl_mem), (void *)&d_C); err = clSetKernelArg(naive, 3, sizeof(cl_int), (void *)&size); // Execute OpenCL kernel for Naive Algorithm err = clEnqueueNDRangeKernel(queue, naive, 2, NULL, global_work_items, local_work_items, 0, NULL, NULL); clFinish(queue); //Release Kernel err = clReleaseKernel(naive); } void Naive(cl_float* matrixA, cl_float* matrixB, cl_float* matrixC, int size) { int err; // OpenCL device memory for matrices cl_mem d_A; cl_mem d_B; cl_mem d_C; // Allocate Device Memory For Input And Output d_A = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_float)*size*size, 0, &err); d_B = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_float)*size*size, 0, &err); d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR ,sizeof(cl_float)*size*size, 0,&err); // Copy Host Memory To Memory Device err = clEnqueueWriteBuffer(queue, d_A, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixA, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, d_B, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixB, 0, NULL, NULL); MatrixMul(d_A, d_B, d_C, size); err = clEnqueueReadBuffer(queue, 
d_C, CL_TRUE, 0, sizeof(cl_float)*size*size, matrixC, 0, NULL, NULL); err = clReleaseMemObject(d_A); err = clReleaseMemObject(d_B); err = clReleaseMemObject(d_C); } //Main Function int main(int argc, char **argv) { //Size of matrix for Strassen Algorithm cl_int size = 4096; //Matrix for input and output cl_float * matrixA; cl_float * matrixB; cl_float * matrixC; //Allocate and init memory for the host matrixA = (cl_float *) malloc(size*size*sizeof(cl_float)); matrixB = (cl_float *) malloc(size*size*sizeof(cl_float)); matrixC = (cl_float *) malloc(size*size*sizeof(cl_float)); //Fill matrix fillMatrix(matrixA,size); fillMatrix(matrixB,size); //print input for matrix A and B cout<<"Input for matrix A :"< (platforms [platformtype]), 0, 0 }; context = clCreateContext(contextProperties, 1, &device, NULL, NULL, &err); ![enter image description here][2]queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); //Load Kernel Source FILE *fp; const char fileName[] = "./MatMul_Kernel.cl"; size_t source_size; char *source_str; fp = fopen(fileName, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); // Create Program Object program = clCreateProgramWithSource(context, 1, (const char **) &source_str,(const size_t *), &source_size, &err); // Build Program err = clBuildProgram(program, 1, &device, NULL, NULL, NULL); Naive(matrixA, matrixB, matrixC, size); //Cleanup all memory err = clFlush(queue); err = clFinish(queue); err = clReleaseProgram(program); err = clReleaseCommandQueue(queue); err = clReleaseContext(context); // Display result of matrix multiplication cout<<"Output for matrix C :"< 这是内核代码:
__kernel void naiveAlgorithm(__global float *A, __global float *B, __global float *C, int size) { int tx = get_global_id(0); //2D Thread IDx int ty = get_global_id(1); //2D Thread IDy float sum = 0; //Calculate result of one element of Matrix C for (int k = 0; k < size; k++) { sum += A[ty*size+k] * B[k*size+tx]; } C[ty*size+tx] = sum; }这是图像: