How to successfully read a 2D texture
How can I:
- Bind cudaMallocPitch float memory to a 2D texture reference
- Copy some host data to the 2D array on the device
- Add one to the texture reference and write to either a.) the Pitch 2D array OR b.) write to a linear memory array
- Read the answer back and display it.
Below is code that should accomplish this. Note that for NxN array sizes, my code works. For NxM where N!=M, my code bites the dust (not the correct result). If you can solve this problem I will award you 1 internets (supply limited). Maybe I'm crazy, but according to the documentation this should work (and it does work for square arrays!). The attached code should run with 'nvcc whateveryoucallit.cu -o runit'.
Help is appreciated!
#include <stdio.h>
#include <cuda.h>
#include <iostream>

#define height 16
#define width 11
#define BLOCKSIZE 16

using namespace std;

// Device Kernels
// Texture reference Declaration
texture<float,2> texRefEx;

__global__ void kernel_w_textures(float* devMPPtr, float* devMPtr, int pitch)
{
    // Thread indexes
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int idy = blockIdx.y*blockDim.y + threadIdx.y;
    // Texture coordinates
    float u = (idx)/float(width);
    float v = (idy)/float(height);
    devMPtr[idy*width+idx] = devMPPtr[idy*pitch/sizeof(float)+idx];
    // Write Texture Contents to malloc array +1
    devMPtr[idy*width+idx] = tex2D(texRefEx,u,v);//+1.0f;
}

int main()
{
    // memory size
    size_t memsize = height*width;
    size_t offset;
    float *data,      // input from host
          *h_out,     // host space for output
          *devMPPtr,  // malloc Pitch ptr
          *devMPtr;   // malloc ptr
    size_t pitch;

    // Allocate space on the host
    data  = (float *)malloc(sizeof(float)*memsize);
    h_out = (float *)malloc(sizeof(float)*memsize);

    // Define data
    for (int i = 0; i < height; i++)
        for (int j = 0; j < width; j++)
            data[i*width+j] = float(j);

    // Define the grid
    dim3 grid((int)(width/BLOCKSIZE)+1,(int)(height/BLOCKSIZE)+1), threads(BLOCKSIZE,BLOCKSIZE);

    // allocate Malloc Pitch
    cudaMallocPitch((void**)&devMPPtr, &pitch, width * sizeof(float), height);
    // Print the pitch
    printf("The pitch is %d \n", pitch/sizeof(float));

    // Texture Channel Description
    //cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindFloat);

    // Bind texture to pitch mem:
    cudaBindTexture2D(&offset, &texRefEx, devMPPtr, &channelDesc, width, height, pitch);
    cout << "My Description x is " << channelDesc.x << endl;
    cout << "My Description y is " << channelDesc.y << endl;
    cout << "My Description z is " << channelDesc.z << endl;
    cout << "My Description w is " << channelDesc.w << endl;
    cout << "My Description kind is " << channelDesc.f << endl;
    cout << "Offset is " << offset << endl;

    // Set mutable properties:
    texRefEx.normalized = true;
    texRefEx.addressMode[0] = cudaAddressModeWrap;
    texRefEx.addressMode[1] = cudaAddressModeWrap;
    texRefEx.filterMode = cudaFilterModePoint;

    // Allocate cudaMalloc memory
    cudaMalloc((void**)&devMPtr, memsize*sizeof(float));

    // Read data from host to device
    cudaMemcpy2D((void*)devMPPtr, pitch, (void*)data, sizeof(float)*width,
                 sizeof(float)*width, height, cudaMemcpyHostToDevice);
    // Read back and check this memory
    cudaMemcpy2D((void*)h_out, width*sizeof(float), (void*)devMPPtr, pitch,
                 sizeof(float)*width, height, cudaMemcpyDeviceToHost);

    // Print the memory
    for (int i = 0; i < height; i++){
        for (int j = 0; j < width; j++){
            printf("%2.2f ", h_out[i*width+j]);
        }
        cout << endl;
    }
    cout << "Done" << endl;

    // Memory is fine...
    kernel_w_textures<<<grid,threads>>>(devMPPtr, devMPtr, pitch);

    // Copy back data to host
    cudaMemcpy((void*)h_out, (void*)devMPtr, width*height*sizeof(float), cudaMemcpyDeviceToHost);

    // Print the Result
    cout << endl;
    for (int i = 0; i < height; i++){
        for (int j = 0; j < width; j++){
            printf("%2.2f ", h_out[i*width+j]);
        }
        cout << endl;
    }
    cout << "Done" << endl;
    return(0);
}
Edit October 17: So I still haven't found a solution to this issue. Nvidia is pretty silent on this, and it seems the world is too. I found a workaround using shared memory, but if anyone has a texture solution I would be very pleased.
Edit October 26: Still no solution, but I'm still interested in one if anyone knows.
Edit July 26: Wow it has been 9 months - and I had overlooked the correct answer the whole time. The trick was:
if ( idx < width && idy < height){//.... code }
As had been pointed out before. Thanks to all of those who contributed!
5 Answers
It might have to do with your block size. In this code you are trying to have a block of 16x16 threads write to an 11x16 memory block. That means that some of your threads are writing to unallocated memory. That also explains why your tests of (16*M by 32*N) worked: there were no threads writing to unallocated memory, since your dimensions were multiples of 16.
An easy way to fix this problem is something like this:
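The code block from this answer did not survive in this copy. Given the fix quoted in the question's final edit, it was presumably a bounds check along these lines; a minimal sketch, assuming the dimensions are passed in as extra kernel arguments w and h:

__global__ void kernel_w_textures(float* devMPPtr, float* devMPtr, int pitch,
                                  unsigned int w, unsigned int h)
{
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int idy = blockIdx.y*blockDim.y + threadIdx.y;
    if (idx < w && idy < h) {   // threads outside the w x h region do nothing
        float u = idx/float(w);
        float v = idy/float(h);
        devMPtr[idy*w+idx] = tex2D(texRefEx, u, v);
    }
}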
You'll need to either pass the height and width to the kernel function or copy a constant to the card before you call the kernel.
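For the second option, a minimal sketch using constant memory; the symbol names d_width and d_height are only illustrative, not from the original answer:

__constant__ unsigned int d_width, d_height;   // device-side copies of the dimensions

// host side, before the kernel launch:
unsigned int w = width, h = height;
cudaMemcpyToSymbol(d_width,  &w, sizeof(w));
cudaMemcpyToSymbol(d_height, &h, sizeof(h));

// inside the kernel the guard then reads:
// if (idx < d_width && idy < d_height) { ... }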
I think:
should be
In order to get identical input and output; otherwise, the second column of the output equals the first column of the input rather than the second, and the second-to-last column of the output is also wrong.
Please correct me if you have a different observation.
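The two snippets this answer compares are missing from this copy. Judging from the column shift described above and the texel-center remark in the next answer, the change is presumably to the texture coordinates in the kernel, something like:

float u = (idx + 0.5f)/float(width);   // instead of idx/float(width)
float v = (idy + 0.5f)/float(height);  // instead of idy/float(height)

With normalized coordinates and point filtering, the +0.5 samples the center of texel (idx, idy) rather than its edge.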
You need an offset to get to the center of the texel. I think there might have been some rounding error for your non-multiple of 16 textures. I tried this and it worked for me (both outputs were identical).
Graphics cards usually expect textures to have dimensions that are powers of 2; this is especially true for nVidia cards. CUDA's cudaMallocPitch and cudaMemcpy2D work with these pitches, and looking at your code, the safest solution is to adjust the width and height yourself. Otherwise, CUDA might write to invalid memory because it would be expecting the wrong offsets:
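The code this answer refers to is also missing here. A rough sketch of the padding idea, using the horizontal_pitch/vertical_pitch naming mentioned below; the next_pow2 helper is assumed, not from the original answer:

// round a dimension up to the next power of two
unsigned int next_pow2(unsigned int v) { unsigned int p = 1; while (p < v) p <<= 1; return p; }

// in main(), pad the allocation and the texture binding:
const int horizontal_pitch = next_pow2(width);   // 11 -> 16
const int vertical_pitch   = next_pow2(height);  // 16 -> 16
cudaMallocPitch((void**)&devMPPtr, &pitch, horizontal_pitch * sizeof(float), vertical_pitch);
cudaBindTexture2D(&offset, &texRefEx, devMPPtr, &channelDesc,
                  horizontal_pitch, vertical_pitch, pitch);
// the 2D copies and the kernel indexing would use the same padded width,
// while only the valid width x height region is actually read and written.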
Hopefully I haven't overlooked any place where horizontal_pitch/vertical_pitch should be used instead of plain width/height.
Perhaps take a look at this thread: http://forums.nvidia.com/index.php?showtopic=186585
Another very helpful sample piece of code is currently in the NVIDIA SDK; as mentioned in the above thread on the NVIDIA forums, the simplePitchLinearTexture example works well.
Since we are using texture memory, I believe that the sizes of the 2D grid must be powers of 2 on some hardware, as also suggested in one of the answers above.