如何将嵌套结构的成员复制到 CUDA 设备的内存空间？

发布于 2024-12-09 06:12:38 字数 3968 浏览 4 评论 0原文

我正在尝试将一些嵌套结构复制到设备内存，以便在 CUDA 加速神经网络模拟器中使用内核。此代码链接并运行，但它抛出一些异常和 CUDA 错误：

// One network layer as laid out for device-side use. In the setup loop shown in
// this post, a host-side rdLayer (rdlSource) is filled in and its four pointer
// members receive addresses returned by cudaMalloc.
typedef struct rdLayer
{
    long NeuronQty;     // presumably the number of neurons in this layer -- TODO confirm
    long DendriteQty;   // presumably the number of dendrites (inputs) per neuron -- TODO confirm

    // The posted setup loop sizes each buffer below as
    // (host dendrite count + 1) * (host neuron count + 1) cuDoubleComplex elements.
    cuDoubleComplex* gpuWeights;     // presumably the complex weights -- TODO confirm
    cuDoubleComplex* gpuZOutputs;    // presumably the layer outputs -- TODO confirm
    cuDoubleComplex* gpuDeltas;      // presumably the error deltas -- TODO confirm
    cuDoubleComplex* gpuUnWeights;   // meaning not visible in the posted code -- TODO confirm
} rdLayer;

// Top-level network record. The posted setup code allocates one of these with
// cudaMalloc and then writes the rLayer member through cudaMemcpy.
typedef struct rdNetwork
{
    long SectorQty;                   // presumably the number of sectors -- TODO confirm
    double K_DIV_TWO_PI;              // name suggests K / (2*pi) -- TODO confirm
    double two_pi_div_sect_qty;       // name suggests (2*pi) / SectorQty -- TODO confirm
    cuDoubleComplex* gpuSectorBdry;   // presumably sector boundary values on the GPU -- TODO confirm
    long LayerQty;                    // presumably the number of entries reachable through rLayer -- TODO confirm
    rdLayer* rLayer;                  // set in the posted code to the address of an rdLayer array allocated with cudaMalloc
} rdNetwork;

// Learning-set record. The posted setup code allocates one of these with
// cudaMalloc and stores device buffer addresses into gpuXInputs and gpuDOutputs.
struct rdLearningSet
{
    long EvalMode;       // meaning not visible in the posted code -- TODO confirm
    long SampleQty;      // presumably the number of samples -- TODO confirm
    long InputQty;       // presumably the inputs per sample -- TODO confirm
    long OutputQty;      // presumably the outputs per sample -- TODO confirm
    long ContOutputs;    // meaning not visible in the posted code -- TODO confirm
    long SampleIdxReq;   // meaning not visible in the posted code -- TODO confirm

    cuDoubleComplex* gpuXInputs;     // posted code sizes this buffer as SampleQty * (InputQty + 1) elements (host-side counts)
    cuDoubleComplex* gpuDOutputs;    // posted code sizes this buffer as SampleQty * (OutputQty + 1) elements (host-side counts)
    cuDoubleComplex* gpuYOutputs;    // allocation not shown in the posted code -- TODO confirm
    double* gpudSE1024;              // allocation not shown in the posted code -- TODO confirm
    cuDoubleComplex* gpuOutScalar;   // allocation not shown in the posted code -- TODO confirm
};

[...]
    // Host variables that will hold the device addresses of the two top-level structs.
    struct rdLearningSet * rdLearn;
    struct rdNetwork * rdNet;
[...]
    // Allocate device storage for one rdNetwork and one rdLearningSet. After these
    // calls rdNet and rdLearn hold device addresses, so an expression such as
    // &rdNet->rLayer is only address arithmetic on the host and is meaningful solely
    // as a cudaMemcpy source or destination.
    // NOTE(review): no mCheckCudaWorked follows these two calls in the posted excerpt.
    cudaMalloc(&rdNet, sizeof(rdNetwork));
    cudaMalloc(&rdLearn, sizeof(rdLearningSet));
[...]
    // Scratch host variables used below: dummy receives the device address of each
    // data buffer, rdlSource is a host-side staging copy of one layer record, and
    // rdldummy receives the device address of the rdLayer array.
    cuDoubleComplex * dummy;
    struct rdLayer rdlSource, * rdldummy;
[...]
    //rdLayer *rLayer;
    // Allocate a device array of rdLayer records (host LayerQty entries) and store
    // its device address into the rLayer member of the device-resident rdNetwork.
    cudaMalloc(&rdldummy, sizeof(rdLayer)*rSes.rNet->LayerQty);
    cudaMemcpy( &rdNet->rLayer, &rdldummy, sizeof(rdLayer*), cudaMemcpyHostToDevice);
    // NOTE(review): the loop starts at L=1, so element 0 of the device array is never
    // written in this excerpt -- confirm that is intended.
    for (int L=1; L<rSes.rNet->LayerQty; L++){
            // construct layer to be copied
            // NOTE(review): as posted, the assignments and cudaMalloc calls below have no
            // terminating semicolon -- presumably lost when the excerpt was trimmed.
            rdlSource.NeuronQty=rSes.rNet->rLayer[L].iNeuronQty 
            rdlSource.DendriteQty=rSes.rNet->rLayer[L].iDendriteQty 
            // Allocate the four per-layer device buffers; each address lands in the
            // host-side staging struct rdlSource.
            // NOTE(review): the first two sizes read DendriteQty/NeuronQty from the host
            // layer while the last two read iDendriteQty/iNeuronQty -- confirm both
            // spellings exist on the host layer type and hold the same values.
            cudaMalloc( &rdlSource.gpuWeights, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].DendriteQty+1) * (rSes.rNet->rLayer[L].NeuronQty+1) ) 
                    mCheckCudaWorked
            cudaMalloc( &rdlSource.gpuZOutputs, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].DendriteQty+1) * (rSes.rNet->rLayer[L].NeuronQty+1) ) 
                    mCheckCudaWorked
            cudaMalloc( &rdlSource.gpuDeltas, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].iDendriteQty+1) * (rSes.rNet->rLayer[L].iNeuronQty+1) ) 
                    mCheckCudaWorked
            cudaMalloc( &rdlSource.gpuUnWeights, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].iDendriteQty+1) * (rSes.rNet->rLayer[L].iNeuronQty+1) ) 
                    mCheckCudaWorked
            // copy layer structure to device memory
            // NOTE(review): cudaMemcpyToSymbol expects a variable declared with __device__
            // or __constant__. The first argument here is a string literal spelling an
            // expression on a pointer obtained from cudaMalloc, not such a symbol; the post
            // reports the error invalid device symbol for this call. The array allocated
            // above is reachable through rdldummy, so a plain
            //     cudaMemcpy( rdldummy + L, &rdlSource, sizeof(rdLayer), cudaMemcpyHostToDevice )
            // would address element L of that array directly.
            cudaMemcpyToSymbol( "rdNet->rLayer", &rdlSource, sizeof(rdLayer), sizeof(rdLayer) * L, cudaMemcpyHostToDevice );/*! 2D neuron cx weight matrix on GPU */
                    mCheckCudaWorked
    }
[...]   
    // Allocate the device buffer for the input samples, then store its device address
    // into the gpuXInputs member of the device-resident rdLearningSet.
    cudaMalloc(&dummy, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->InputQty+1) ); /*! 2D complex input tuples in GPU. */
            cudaMemcpy( &rdLearn->gpuXInputs, &dummy, sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice );
                    // NOTE(review): the destination below is &dummy, the host address of the
                    // pointer variable, rather than the device buffer address it holds, and
                    // the source is the address of the host member rather than its value.
                    // The post reports the error invalid argument for this call. Passing
                    // dummy and rSes.rLearn->gpuXInputs without the address-of operator would
                    // target the new buffer -- assuming the host member points at host data,
                    // TODO confirm.
                    cudaMemcpy( &dummy, &rSes.rLearn->gpuXInputs, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->InputQty+1), cudaMemcpyHostToDevice); 
                    mCheckCudaWorked        
    // Same sequence for the desired-output samples; dummy is reused, so the previous
    // buffer address survives only in the device-resident struct.
    cudaMalloc(&dummy, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->OutputQty+1) ); /*! 2D desired complex outputs in GPU. */
            cudaMemcpy( &rdLearn->gpuDOutputs, &dummy, sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice );
                    // NOTE(review): same issue as the gpuXInputs copy: &dummy and
                    // &rSes.rLearn->gpuDOutputs are host addresses of pointer variables; the
                    // post reports the error invalid argument for this call as well.
                    cudaMemcpy( &dummy, &rSes.rLearn->gpuDOutputs, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->OutputQty+1), cudaMemcpyHostToDevice); 
                    mCheckCudaWorked
[...]

不幸的是，cudaMemcpyToSymbol 调用返回一个错误，mCheckCudaWorked 宏将其报告为 “invalid device symbol”（无效的设备符号），而最后一个 (cudaMemcpy( &dummy, &rSes.rLearn->gpuDOutputs...) 和倒数第三个 (cudaMemcpy( &dummy, &rSes.rLearn->gpuXInputs...) cudaMemcpy 调用返回 “invalid argument”（无效参数）。

我不知道该如何继续，才能把这些项目复制到设备内存并让内核代码能够寻址。&dummy 和 &rdldummy 确实返回了指向已分配设备内存地址的指针，我也可以把这些指针写入设备内存，但我无法把大部分成员值复制到这些指针所指向的分配空间中。求助？

原文

I'm trying to copy some nested structs to device memory for kernel use in a CUDA-accelerated neural network simulator. This code links and runs, but it throws some exceptions and CUDA errors:

// One network layer as laid out for device-side use. In the setup loop shown in
// this post, a host-side rdLayer (rdlSource) is filled in and its four pointer
// members receive addresses returned by cudaMalloc.
typedef struct rdLayer
{
    long NeuronQty;     // presumably the number of neurons in this layer -- TODO confirm
    long DendriteQty;   // presumably the number of dendrites (inputs) per neuron -- TODO confirm

    // The posted setup loop sizes each buffer below as
    // (host dendrite count + 1) * (host neuron count + 1) cuDoubleComplex elements.
    cuDoubleComplex* gpuWeights;     // presumably the complex weights -- TODO confirm
    cuDoubleComplex* gpuZOutputs;    // presumably the layer outputs -- TODO confirm
    cuDoubleComplex* gpuDeltas;      // presumably the error deltas -- TODO confirm
    cuDoubleComplex* gpuUnWeights;   // meaning not visible in the posted code -- TODO confirm
} rdLayer;

// Top-level network record. The posted setup code allocates one of these with
// cudaMalloc and then writes the rLayer member through cudaMemcpy.
typedef struct rdNetwork
{
    long SectorQty;                   // presumably the number of sectors -- TODO confirm
    double K_DIV_TWO_PI;              // name suggests K / (2*pi) -- TODO confirm
    double two_pi_div_sect_qty;       // name suggests (2*pi) / SectorQty -- TODO confirm
    cuDoubleComplex* gpuSectorBdry;   // presumably sector boundary values on the GPU -- TODO confirm
    long LayerQty;                    // presumably the number of entries reachable through rLayer -- TODO confirm
    rdLayer* rLayer;                  // set in the posted code to the address of an rdLayer array allocated with cudaMalloc
} rdNetwork;

// Learning-set record. The posted setup code allocates one of these with
// cudaMalloc and stores device buffer addresses into gpuXInputs and gpuDOutputs.
struct rdLearningSet
{
    long EvalMode;       // meaning not visible in the posted code -- TODO confirm
    long SampleQty;      // presumably the number of samples -- TODO confirm
    long InputQty;       // presumably the inputs per sample -- TODO confirm
    long OutputQty;      // presumably the outputs per sample -- TODO confirm
    long ContOutputs;    // meaning not visible in the posted code -- TODO confirm
    long SampleIdxReq;   // meaning not visible in the posted code -- TODO confirm

    cuDoubleComplex* gpuXInputs;     // posted code sizes this buffer as SampleQty * (InputQty + 1) elements (host-side counts)
    cuDoubleComplex* gpuDOutputs;    // posted code sizes this buffer as SampleQty * (OutputQty + 1) elements (host-side counts)
    cuDoubleComplex* gpuYOutputs;    // allocation not shown in the posted code -- TODO confirm
    double* gpudSE1024;              // allocation not shown in the posted code -- TODO confirm
    cuDoubleComplex* gpuOutScalar;   // allocation not shown in the posted code -- TODO confirm
};

[...]
    // Host variables that will hold the device addresses of the two top-level structs.
    struct rdLearningSet * rdLearn;
    struct rdNetwork * rdNet;
[...]
    // Allocate device storage for one rdNetwork and one rdLearningSet. After these
    // calls rdNet and rdLearn hold device addresses, so an expression such as
    // &rdNet->rLayer is only address arithmetic on the host and is meaningful solely
    // as a cudaMemcpy source or destination.
    // NOTE(review): no mCheckCudaWorked follows these two calls in the posted excerpt.
    cudaMalloc(&rdNet, sizeof(rdNetwork));
    cudaMalloc(&rdLearn, sizeof(rdLearningSet));
[...]
    // Scratch host variables used below: dummy receives the device address of each
    // data buffer, rdlSource is a host-side staging copy of one layer record, and
    // rdldummy receives the device address of the rdLayer array.
    cuDoubleComplex * dummy;
    struct rdLayer rdlSource, * rdldummy;
[...]
    //rdLayer *rLayer;
    // Allocate a device array of rdLayer records (host LayerQty entries) and store
    // its device address into the rLayer member of the device-resident rdNetwork.
    cudaMalloc(&rdldummy, sizeof(rdLayer)*rSes.rNet->LayerQty);
    cudaMemcpy( &rdNet->rLayer, &rdldummy, sizeof(rdLayer*), cudaMemcpyHostToDevice);
    // NOTE(review): the loop starts at L=1, so element 0 of the device array is never
    // written in this excerpt -- confirm that is intended.
    for (int L=1; L<rSes.rNet->LayerQty; L++){
            // construct layer to be copied
            // NOTE(review): as posted, the assignments and cudaMalloc calls below have no
            // terminating semicolon -- presumably lost when the excerpt was trimmed.
            rdlSource.NeuronQty=rSes.rNet->rLayer[L].iNeuronQty 
            rdlSource.DendriteQty=rSes.rNet->rLayer[L].iDendriteQty 
            // Allocate the four per-layer device buffers; each address lands in the
            // host-side staging struct rdlSource.
            // NOTE(review): the first two sizes read DendriteQty/NeuronQty from the host
            // layer while the last two read iDendriteQty/iNeuronQty -- confirm both
            // spellings exist on the host layer type and hold the same values.
            cudaMalloc( &rdlSource.gpuWeights, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].DendriteQty+1) * (rSes.rNet->rLayer[L].NeuronQty+1) ) 
                    mCheckCudaWorked
            cudaMalloc( &rdlSource.gpuZOutputs, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].DendriteQty+1) * (rSes.rNet->rLayer[L].NeuronQty+1) ) 
                    mCheckCudaWorked
            cudaMalloc( &rdlSource.gpuDeltas, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].iDendriteQty+1) * (rSes.rNet->rLayer[L].iNeuronQty+1) ) 
                    mCheckCudaWorked
            cudaMalloc( &rdlSource.gpuUnWeights, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].iDendriteQty+1) * (rSes.rNet->rLayer[L].iNeuronQty+1) ) 
                    mCheckCudaWorked
            // copy layer structure to device memory
            // NOTE(review): cudaMemcpyToSymbol expects a variable declared with __device__
            // or __constant__. The first argument here is a string literal spelling an
            // expression on a pointer obtained from cudaMalloc, not such a symbol; the post
            // reports the error invalid device symbol for this call. The array allocated
            // above is reachable through rdldummy, so a plain
            //     cudaMemcpy( rdldummy + L, &rdlSource, sizeof(rdLayer), cudaMemcpyHostToDevice )
            // would address element L of that array directly.
            cudaMemcpyToSymbol( "rdNet->rLayer", &rdlSource, sizeof(rdLayer), sizeof(rdLayer) * L, cudaMemcpyHostToDevice );/*! 2D neuron cx weight matrix on GPU */
                    mCheckCudaWorked
    }
[...]   
    // Allocate the device buffer for the input samples, then store its device address
    // into the gpuXInputs member of the device-resident rdLearningSet.
    cudaMalloc(&dummy, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->InputQty+1) ); /*! 2D complex input tuples in GPU. */
            cudaMemcpy( &rdLearn->gpuXInputs, &dummy, sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice );
                    // NOTE(review): the destination below is &dummy, the host address of the
                    // pointer variable, rather than the device buffer address it holds, and
                    // the source is the address of the host member rather than its value.
                    // The post reports the error invalid argument for this call. Passing
                    // dummy and rSes.rLearn->gpuXInputs without the address-of operator would
                    // target the new buffer -- assuming the host member points at host data,
                    // TODO confirm.
                    cudaMemcpy( &dummy, &rSes.rLearn->gpuXInputs, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->InputQty+1), cudaMemcpyHostToDevice); 
                    mCheckCudaWorked        
    // Same sequence for the desired-output samples; dummy is reused, so the previous
    // buffer address survives only in the device-resident struct.
    cudaMalloc(&dummy, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->OutputQty+1) ); /*! 2D desired complex outputs in GPU. */
            cudaMemcpy( &rdLearn->gpuDOutputs, &dummy, sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice );
                    // NOTE(review): same issue as the gpuXInputs copy: &dummy and
                    // &rSes.rLearn->gpuDOutputs are host addresses of pointer variables; the
                    // post reports the error invalid argument for this call as well.
                    cudaMemcpy( &dummy, &rSes.rLearn->gpuDOutputs, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->OutputQty+1), cudaMemcpyHostToDevice); 
                    mCheckCudaWorked
[...]

Unfortunately, the cudaMemcpyToSymbol call returns an error that the mCheckCudaWorked macro says is "invalid device symbol", while the last (cudaMemcpy( &dummy, &rSes.rLearn->gpuDOutputs...) and third-from-last (cudaMemcpy( &dummy, &rSes.rLearn->gpuXInputs...) cudaMemcpy calls return "invalid argument".

I am at a loss as to how to proceed to get these items copied to device memory and addressable from kernel code. &dummy and &rdldummy are positively being returned as the pointers to the device memory addresses where the allocated memory awaits, and I can write those pointers to the device memory, but I cannot coax the bulk of the member values into being copied to the pointed-at allocations. Help?

分享到QQ

分享到微博