CUBLAS 矩阵乘法

发布于 2024-10-30 14:01:25 字数 4597 浏览 10 评论 0原文

使用 CUDA 实现矩阵乘法后。我尝试用 CUBLAS 来实现它（感谢论坛中一些人的建议）。

我可以乘方阵，但是（是的，再一次......）我在处理非方阵时遇到困难。唯一有效的非方阵乘法类型是当您改变矩阵 A 的宽度(A*B=C) 时。

我没有收到任何错误，但结果矩阵返回错误的值。这是我的代码（它基本上是 simpleCUBLAS SDK 示例的改编）：

#include <stdlib.h>
#include <stdio.h>
#include "cublas.h"
#define HA 2
#define WA 9
#define WB 2
#define HB WA 
#define WC WB   
#define HC HA  
#define index(i,j,ld) (((j)*(ld))+(i))

void printMat(float*P,int uWP,int uHP){
//printf("\n %f",P[1]);
int i,j;
for(i=0;i<uHP;i++){

    printf("\n");

    for(j=0;j<uWP;j++)
        printf("%f ",P[index(i,j,uHP)]);
        //printf("%f ",P[i*uWP+j]);
}
}




 int  main (int argc, char** argv) {
    cublasStatus status;
        int i,j;
        cublasInit();

        float *A = (float*)malloc(HA*WA*sizeof(float));
        float *B = (float*)malloc(HB*WB*sizeof(float));
        float *C = (float*)malloc(HC*WC*sizeof(float));
    if (A == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    if (B == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    if (C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
        }


        for (i=0;i<HA;i++)
     for (j=0;j<WA;j++)
        A[index(i,j,HA)] = (float) index(i,j,HA);   
        for (i=0;i<HB;i++)
     for (j=0;j<WB;j++)
        B[index(i,j,HB)] = (float) index(i,j,HB); 
    /*
    for (i=0;i<HA*WA;i++)
    A[i]=(float) i;
    for (i=0;i<HB*WB;i++)
    B[i]=(float) i;         */  


        float* AA; float* BB; float* CC;

    /*ALLOCATE ON THE DEVICE*/
    status=cublasAlloc(HA*WA,sizeof(float),(void**)&AA);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
        }

        status=cublasAlloc(HB*WB,sizeof(float),(void**)&BB);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
        }

        status=cublasAlloc(HC*WC,sizeof(float),(void**)&CC);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
        }

    /*SET MATRIX*/
        status=cublasSetMatrix(HA,WA,sizeof(float),A,HA,AA,HA);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
        }

        status=cublasSetMatrix(HB,WB,sizeof(float),B,HB,BB,HB);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
        }

    /*KERNEL*/
        cublasSgemm('n','n',HA,WB,WA,1,AA,HA,BB,HB,0,CC,HC);

        status = cublasGetError();
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
        }
        cublasGetMatrix(HC,WC,sizeof(float),CC,HC,C,HC);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device read error (A)\n");
        return EXIT_FAILURE;
        }


    /* PERFORMANCE OUTPUT*/

    printf("\nMatriz A:\n");
    printMat(A,WA,HA);
    printf("\nMatriz B:\n");
    printMat(B,WB,HB);
    printf("\nMatriz C:\n");
    printMat(C,WC,HC);

        free( A );  free( B );  free ( C );
        status = cublasFree(AA);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
        }
        status = cublasFree(BB);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
        }
        status = cublasFree(CC);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (C)\n");
    return EXIT_FAILURE;
    }

        /* Shutdown */
        status = cublasShutdown();
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
        }

    if (argc > 1) {
        if (!strcmp(argv[1], "-noprompt") ||!strcmp(argv[1], "-qatest") ) 
        {
    return EXIT_SUCCESS;
        }
        } 
        else
        {
            printf("\nPress ENTER to exit...\n");
            getchar();
        }

return EXIT_SUCCESS;


  }

有什么想法吗？另外，有没有人在 CUBLAS 中有一个可以工作的矩阵乘法实现，所以我可以比较？提前致谢。

原文

After implementing matrix multiplication with CUDA. I tried to implement it with CUBLAS(thanks to the advice of some people here in the forum).

I can multiply square matrices but (yes once again...) I am having difficulties working with non square matrices. The only type of non square matrix multiplication that works is when you vary Matrix A's Width(A*B=C).

I don't get any errors but the resulting matrix returns wrong values. Here is my code(it is basically an adaptation of the simpleCUBLAS SDK example):

#include <stdlib.h>
#include <stdio.h>
#include "cublas.h"
#define HA 2
#define WA 9
#define WB 2
#define HB WA 
#define WC WB   
#define HC HA  
#define index(i,j,ld) (((j)*(ld))+(i))

void printMat(float*P,int uWP,int uHP){
//printf("\n %f",P[1]);
int i,j;
for(i=0;i<uHP;i++){

    printf("\n");

    for(j=0;j<uWP;j++)
        printf("%f ",P[index(i,j,uHP)]);
        //printf("%f ",P[i*uWP+j]);
}
}




 int  main (int argc, char** argv) {
    cublasStatus status;
        int i,j;
        cublasInit();

        float *A = (float*)malloc(HA*WA*sizeof(float));
        float *B = (float*)malloc(HB*WB*sizeof(float));
        float *C = (float*)malloc(HC*WC*sizeof(float));
    if (A == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    if (B == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    if (C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
        }


        for (i=0;i<HA;i++)
     for (j=0;j<WA;j++)
        A[index(i,j,HA)] = (float) index(i,j,HA);   
        for (i=0;i<HB;i++)
     for (j=0;j<WB;j++)
        B[index(i,j,HB)] = (float) index(i,j,HB); 
    /*
    for (i=0;i<HA*WA;i++)
    A[i]=(float) i;
    for (i=0;i<HB*WB;i++)
    B[i]=(float) i;         */  


        float* AA; float* BB; float* CC;

    /*ALLOCATE ON THE DEVICE*/
    status=cublasAlloc(HA*WA,sizeof(float),(void**)&AA);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
        }

        status=cublasAlloc(HB*WB,sizeof(float),(void**)&BB);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
        }

        status=cublasAlloc(HC*WC,sizeof(float),(void**)&CC);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
        }

    /*SET MATRIX*/
        status=cublasSetMatrix(HA,WA,sizeof(float),A,HA,AA,HA);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
        }

        status=cublasSetMatrix(HB,WB,sizeof(float),B,HB,BB,HB);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
        }

    /*KERNEL*/
        cublasSgemm('n','n',HA,WB,WA,1,AA,HA,BB,HB,0,CC,HC);

        status = cublasGetError();
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
        }
        cublasGetMatrix(HC,WC,sizeof(float),CC,HC,C,HC);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device read error (A)\n");
        return EXIT_FAILURE;
        }


    /* PERFORMANCE OUTPUT*/

    printf("\nMatriz A:\n");
    printMat(A,WA,HA);
    printf("\nMatriz B:\n");
    printMat(B,WB,HB);
    printf("\nMatriz C:\n");
    printMat(C,WC,HC);

        free( A );  free( B );  free ( C );
        status = cublasFree(AA);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
        }
        status = cublasFree(BB);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
        }
        status = cublasFree(CC);
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (C)\n");
    return EXIT_FAILURE;
    }

        /* Shutdown */
        status = cublasShutdown();
        if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
        }

    if (argc > 1) {
        if (!strcmp(argv[1], "-noprompt") ||!strcmp(argv[1], "-qatest") ) 
        {
    return EXIT_SUCCESS;
        }
        } 
        else
        {
            printf("\nPress ENTER to exit...\n");
            getchar();
        }

return EXIT_SUCCESS;


  }

Any thoughts? Also, does anyone has a matrix multiplication implementation in CUBLAS that is working, so i could compare? Thanks in advance.

分享到QQ

分享到微博