CUDA 代码不起作用,为什么?
我有这段代码,但有时它有效,有时不行(写 printf("ERR:%d\n", id))。 我在 CUDA 4.1 下工作,并拥有计算能力 2.1 的 GTS450。
该代码没有更高的目的,我只是想找出为什么它不起作用,因为我的想法告诉我,没错:]
如果你想运行它,也许你需要执行几次,当“错误”出现或更改网格时尺寸!
PS:在这里你可以下载win64的exe文件 - 你需要有cuda4.1驱动
class MAN
{
public:
int m_id;
int m_use;
__device__
MAN()
{
m_id = -1;
m_use = 0;
}
};
__device__ int* d_ids = NULL;
__device__ int d_last_ids = 0;
__device__ MAN* d_mans = NULL;
__global__ void init()
{
d_mans = new MAN[500]; //note: 500 is more than enough!
d_ids = new int[500];
for(int i=0; i < 500; i++)
d_ids[i] = 0;
}
__device__ int getMAN() //every block get unique number, so at one moment all running blocks has different id
{
while(true)
{
for(int i=0; i < 500; i++)
if(atomicCAS(&(d_mans[i].m_use), 0, 1)==0)
return i;
}
}
__device__ void returnMAN(int id)
{
int s = atomicExch(&(d_mans[id].m_use), 0);
}
__global__ void testIt()
{
if(threadIdx.x==0)
{
int man = getMAN();
int id = d_mans[man].m_id;
if(id == -1) //If It never works with this "id", its creating new
{
id = atomicAdd(&d_last_ids, 2);
d_ids[id] = 10; //set to non-zero
d_mans[man].m_id = id; //save new id for next time
printf("ADD:%d\n", id);
}
if(d_ids[id]==0)
printf("ERR:%d\n", id); //THIS SHOULD NEVER HAPPEN, BUT BECOMES !!!
returnMAN(man);
}
}
int main()
{
init<<<1, 1>>>();
printf("init() err: %d\n", cudaDeviceSynchronize());
testIt<<<20000, 512>>>();
printf("testIt() err: %d\n", cudaDeviceSynchronize());
getchar();
return 0;
}
I have this code, but sometimes It works, sometimes NOT(write printf("ERR:%d\n", id)).
I work under CUDA 4.1 and have GTS450 which is compute capability 2.1.
The code doesnt have higher purpose, Iam just trying to find why Its not working, because My mind tell me, that right :]
If you want to run it, maybe you need to execute few times, when "error" appear or change grid size!
PS: here you can download exe file for win64 - you need to have cuda4.1 driver
class MAN
{
public:
int m_id;
int m_use;
__device__
MAN()
{
m_id = -1;
m_use = 0;
}
};
__device__ int* d_ids = NULL;
__device__ int d_last_ids = 0;
__device__ MAN* d_mans = NULL;
__global__ void init()
{
d_mans = new MAN[500]; //note: 500 is more than enough!
d_ids = new int[500];
for(int i=0; i < 500; i++)
d_ids[i] = 0;
}
__device__ int getMAN() //every block get unique number, so at one moment all running blocks has different id
{
while(true)
{
for(int i=0; i < 500; i++)
if(atomicCAS(&(d_mans[i].m_use), 0, 1)==0)
return i;
}
}
__device__ void returnMAN(int id)
{
int s = atomicExch(&(d_mans[id].m_use), 0);
}
__global__ void testIt()
{
if(threadIdx.x==0)
{
int man = getMAN();
int id = d_mans[man].m_id;
if(id == -1) //If It never works with this "id", its creating new
{
id = atomicAdd(&d_last_ids, 2);
d_ids[id] = 10; //set to non-zero
d_mans[man].m_id = id; //save new id for next time
printf("ADD:%d\n", id);
}
if(d_ids[id]==0)
printf("ERR:%d\n", id); //THIS SHOULD NEVER HAPPEN, BUT BECOMES !!!
returnMAN(man);
}
}
int main()
{
init<<<1, 1>>>();
printf("init() err: %d\n", cudaDeviceSynchronize());
testIt<<<20000, 512>>>();
printf("testIt() err: %d\n", cudaDeviceSynchronize());
getchar();
return 0;
}
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(2)
这似乎发生了,因为
此代码
如果某个块写入 d_mans[man].m_id,但仍未写入 d_ids[id],则 包含竞争条件。编译器可能会交换指令“设置为非零”和“保存新的 id 以供下次使用”,或者缓存只是没有及时更新。
实际上,问题出在你的分配器上——记住最后使用的“man”的索引比寻找它更好。
This seems to happen, because
this code
Contains race condition if some block wrote to d_mans[man].m_id, but still haven't wrote to d_ids[id]. Probably compiler exchanges instruction "set to non-zero" and "save new id for next time" or cache just don't get updated in-time.
Actually, problem is with your allocator -- it is better to remember index of last used 'man' than look for it.
我已经将这个: 更改
为:
并且它工作正常!
甚至它也不需要 __threadfence();
I have changed this:
to this:
and it works ok!!!
And even It doesnt need __threadfence();