Unity Compute着色器的运行速度明显慢于CPU实现。我可以在某个地方进行优化吗?
我刚开始接触 Unity 的 Compute Shader。我分别用脚本(CPU)和计算着色器实现了同一个流体求解器,但即使要模拟的点数很多,脚本版本仍然明显更快。我实现着色器的方式是否存在重大问题?我怀疑瓶颈可能出在我传输的数据上,但并不确定。下面是运行着色器的函数:
/// <summary>
/// Per-cell record exchanged with the compute shader: 5 Vector4 values followed by
/// 5 floats (25 floats in total). Field order must stay in step with the
/// shader-side struct of the same name.
/// </summary>
public struct dataLaxF
{
    // Inputs: neighbour samples of each field, packed by SimulateGPU as
    // (x+1, x-1, y+1, y-1).
    public Vector4 heightData, fUData, fVData, uData, vData;

    // Outputs: seeded with the cell's current values before upload and
    // overwritten by the kernel with the updated values.
    public float outputHeight, outputFU, outputFV, outputU, outputV;
}
/// <summary>
/// Runs one simulation step on the GPU.
/// </summary>
/// <remarks>
/// Packs the four neighbours and the current values of every interior cell into
/// <c>data</c>, uploads the array, dispatches kernel 0 of <c>computeShader</c>,
/// reads the array back and copies the five output values into the
/// <c>height</c>, <c>fU</c>, <c>fV</c>, <c>u</c> and <c>v</c> grids.
/// Border cells (index 0 and size-1 on either axis) are never written.
/// NOTE(review): the per-call buffer allocation, the CPU-side packing loops and the
/// blocking read-back are likely what makes this slower than the CPU path; keeping
/// the fields resident on the GPU between steps would avoid them - confirm with the profiler.
/// </remarks>
public void SimulateGPU()
{
    // Must match [numthreads(128,1,1)] in the compute shader.
    const int ThreadsPerGroup = 128;
    // dataLaxF is 5 Vector4 + 5 float = 25 floats.
    const int FloatsPerCell = 25;

    //Profiler Testing Start
    Profiler.BeginSample("GPULaxF");
    try
    {
        //Set data to structs for GPU
        for (int x = 1; x < xGridSize - 1; x++)
        {
            for (int y = 1; y < yGridSize - 1; y++)
            {
                dataLaxF inData = new dataLaxF();
                inData.heightData = new Vector4(height[x + 1, y], height[x - 1, y], height[x, y + 1], height[x, y - 1]);
                inData.fUData = new Vector4(fU[x + 1, y], fU[x - 1, y], fU[x, y + 1], fU[x, y - 1]);
                inData.fVData = new Vector4(fV[x + 1, y], fV[x - 1, y], fV[x, y + 1], fV[x, y - 1]);
                inData.uData = new Vector4(u[x + 1, y], u[x - 1, y], u[x, y + 1], u[x, y - 1]);
                inData.vData = new Vector4(v[x + 1, y], v[x - 1, y], v[x, y + 1], v[x, y - 1]);
                inData.outputHeight = height[x, y];
                inData.outputFU = fU[x, y];
                inData.outputFV = fV[x, y];
                inData.outputU = u[x, y];
                inData.outputV = v[x, y];
                // y is the inner axis, so the row stride is yGridSize. Using xGridSize
                // here made rows overlap or overrun on non-square grids.
                data[(x * yGridSize) + y] = inData;
            }
        }

        //Send data to GPU
        int stride = sizeof(float) * FloatsPerCell;
        // Round the group count up so a trailing partial group is still processed.
        int groupCount = (data.Length + ThreadsPerGroup - 1) / ThreadsPerGroup;
        // Pad the buffer to a whole number of groups so the extra threads of the last
        // group stay inside the allocation; the padding is never read back.
        int paddedCount = groupCount * ThreadsPerGroup;

        // 'using' releases the GPU buffer even if an exception is thrown.
        using (ComputeBuffer dataBuffer = new ComputeBuffer(paddedCount, stride))
        {
            dataBuffer.SetData(data, 0, 0, data.Length);
            computeShader.SetBuffer(0, "vDatas", dataBuffer);
            computeShader.Dispatch(0, groupCount, 1, 1);
            dataBuffer.GetData(data, 0, 0, data.Length);
        }

        //Receive and read data
        for (int x = 1; x < xGridSize - 1; x++)
        {
            // Bounded by yGridSize (was xGridSize) to mirror the packing loop.
            for (int y = 1; y < yGridSize - 1; y++)
            {
                dataLaxF outData = data[(x * yGridSize) + y];
                height[x, y] = outData.outputHeight;
                fU[x, y] = outData.outputFU;
                fV[x, y] = outData.outputFV;
                u[x, y] = outData.outputU;
                v[x, y] = outData.outputV;
            }
        }
    }
    finally
    {
        // Always close the profiler sample, even on failure.
        Profiler.EndSample();
    }
}
这是计算着色器:
#pragma kernel CSMain
// Per-cell record shared with the C# side: 5 float4 values followed by 5 floats
// (25 floats in total). Member order must stay in step with the C# struct.
struct dataLaxF
{
    // Inputs: neighbour samples of each field, one float4 per field.
    float4 heightData, fUData, fVData, uData, vData;

    // Outputs: written by the kernel.
    float outputHeight, outputFU, outputFV, outputU, outputV;
};
// Data struct array: one dataLaxF per cell, read and updated in place by CSMain.
RWStructuredBuffer<dataLaxF> vDatas;

// Kernel for Lax Friedrichs.
// Each thread handles the element at index id.x: it reads the packed neighbour
// samples and writes the five output members of that same element.
// The C# side packs each float4 as (x+1, x-1, y+1, y-1), i.e. .x/.y are the
// x-axis neighbours and .z/.w the y-axis neighbours.
[numthreads(128,1,1)]
void CSMain(uint3 id : SV_DispatchThreadID)
{
    // Threads past the end of the buffer (last, partially filled group) do nothing.
    uint cellCount, cellStride;
    vDatas.GetDimensions(cellCount, cellStride);
    if (id.x >= cellCount)
    {
        return;
    }

    const float alpha = 0.02f;
    const float g = 9.8f;

    dataLaxF data = vDatas[id.x];
    float4 h = data.heightData;
    float4 fU = data.fUData;
    float4 fV = data.fVData;
    float4 u = data.uData;
    float4 v = data.vData;

    //Height
    float newHeight = 0.25f * (h.x + h.y + h.z + h.w) - alpha * (fU.x - fU.y) - alpha * (fV.z - fV.w);

    //FU
    float newFU = 0.25f * (fU.x + fU.y + fU.z + fU.w);
    newFU = newFU - alpha * ((h.x * (u.x * u.x) + (0.5f * g * (h.x * h.x))) - (h.y * (u.y * u.y) + (0.5f * g * (h.y * h.y))));
    newFU = newFU - alpha * (h.z * v.z * u.z - h.w * v.w * u.w);

    //FV
    float newFV = 0.25f * (fV.x + fV.y + fV.z + fV.w);
    newFV = newFV - alpha * (h.x * v.x * u.x - h.y * v.y * u.y);
    newFV = newFV - alpha * ((h.z * (v.z * v.z) + (0.5f * g * (h.z * h.z))) - (h.w * (v.w * v.w) + (0.5f * g * (h.w * h.w))));

    // Only the five outputs change, so write just those instead of the whole struct.
    // NOTE(review): newHeight == 0 makes the two divisions below produce inf/NaN,
    // exactly as before; confirm whether callers need a guard.
    vDatas[id.x].outputHeight = newHeight;
    vDatas[id.x].outputFU = newFU;
    vDatas[id.x].outputFV = newFV;
    //U
    vDatas[id.x].outputU = newFU / newHeight;
    //V
    vDatas[id.x].outputV = newFV / newHeight;
}
谢谢。
I'm new to Unity compute shaders. I have implemented a fluid solver both in a script and in a compute shader, but the script is significantly faster, even when simulating a large number of points.
Is there anything majorly wrong with how I've implemented the shader? I think it might be the data I'm sending causing a bottleneck but I'm not entirely sure. Here is the function that runs the shader:
/// <summary>
/// Per-cell record exchanged with the compute shader: 5 Vector4 values followed by
/// 5 floats (25 floats in total). Field order must stay in step with the
/// shader-side struct of the same name.
/// </summary>
public struct dataLaxF
{
    // Inputs: neighbour samples of each field, packed by SimulateGPU as
    // (x+1, x-1, y+1, y-1).
    public Vector4 heightData, fUData, fVData, uData, vData;

    // Outputs: seeded with the cell's current values before upload and
    // overwritten by the kernel with the updated values.
    public float outputHeight, outputFU, outputFV, outputU, outputV;
}
/// <summary>
/// Runs one simulation step on the GPU.
/// </summary>
/// <remarks>
/// Packs the four neighbours and the current values of every interior cell into
/// <c>data</c>, uploads the array, dispatches kernel 0 of <c>computeShader</c>,
/// reads the array back and copies the five output values into the
/// <c>height</c>, <c>fU</c>, <c>fV</c>, <c>u</c> and <c>v</c> grids.
/// Border cells (index 0 and size-1 on either axis) are never written.
/// NOTE(review): the per-call buffer allocation, the CPU-side packing loops and the
/// blocking read-back are likely what makes this slower than the CPU path; keeping
/// the fields resident on the GPU between steps would avoid them - confirm with the profiler.
/// </remarks>
public void SimulateGPU()
{
    // Must match [numthreads(128,1,1)] in the compute shader.
    const int ThreadsPerGroup = 128;
    // dataLaxF is 5 Vector4 + 5 float = 25 floats.
    const int FloatsPerCell = 25;

    //Profiler Testing Start
    Profiler.BeginSample("GPULaxF");
    try
    {
        //Set data to structs for GPU
        for (int x = 1; x < xGridSize - 1; x++)
        {
            for (int y = 1; y < yGridSize - 1; y++)
            {
                dataLaxF inData = new dataLaxF();
                inData.heightData = new Vector4(height[x + 1, y], height[x - 1, y], height[x, y + 1], height[x, y - 1]);
                inData.fUData = new Vector4(fU[x + 1, y], fU[x - 1, y], fU[x, y + 1], fU[x, y - 1]);
                inData.fVData = new Vector4(fV[x + 1, y], fV[x - 1, y], fV[x, y + 1], fV[x, y - 1]);
                inData.uData = new Vector4(u[x + 1, y], u[x - 1, y], u[x, y + 1], u[x, y - 1]);
                inData.vData = new Vector4(v[x + 1, y], v[x - 1, y], v[x, y + 1], v[x, y - 1]);
                inData.outputHeight = height[x, y];
                inData.outputFU = fU[x, y];
                inData.outputFV = fV[x, y];
                inData.outputU = u[x, y];
                inData.outputV = v[x, y];
                // y is the inner axis, so the row stride is yGridSize. Using xGridSize
                // here made rows overlap or overrun on non-square grids.
                data[(x * yGridSize) + y] = inData;
            }
        }

        //Send data to GPU
        int stride = sizeof(float) * FloatsPerCell;
        // Round the group count up so a trailing partial group is still processed.
        int groupCount = (data.Length + ThreadsPerGroup - 1) / ThreadsPerGroup;
        // Pad the buffer to a whole number of groups so the extra threads of the last
        // group stay inside the allocation; the padding is never read back.
        int paddedCount = groupCount * ThreadsPerGroup;

        // 'using' releases the GPU buffer even if an exception is thrown.
        using (ComputeBuffer dataBuffer = new ComputeBuffer(paddedCount, stride))
        {
            dataBuffer.SetData(data, 0, 0, data.Length);
            computeShader.SetBuffer(0, "vDatas", dataBuffer);
            computeShader.Dispatch(0, groupCount, 1, 1);
            dataBuffer.GetData(data, 0, 0, data.Length);
        }

        //Receive and read data
        for (int x = 1; x < xGridSize - 1; x++)
        {
            // Bounded by yGridSize (was xGridSize) to mirror the packing loop.
            for (int y = 1; y < yGridSize - 1; y++)
            {
                dataLaxF outData = data[(x * yGridSize) + y];
                height[x, y] = outData.outputHeight;
                fU[x, y] = outData.outputFU;
                fV[x, y] = outData.outputFV;
                u[x, y] = outData.outputU;
                v[x, y] = outData.outputV;
            }
        }
    }
    finally
    {
        // Always close the profiler sample, even on failure.
        Profiler.EndSample();
    }
}
And this is the compute shader:
#pragma kernel CSMain
// Per-cell record shared with the C# side: 5 float4 values followed by 5 floats
// (25 floats in total). Member order must stay in step with the C# struct.
struct dataLaxF
{
    // Inputs: neighbour samples of each field, one float4 per field.
    float4 heightData, fUData, fVData, uData, vData;

    // Outputs: written by the kernel.
    float outputHeight, outputFU, outputFV, outputU, outputV;
};
// Data struct array: one dataLaxF per cell, read and updated in place by CSMain.
RWStructuredBuffer<dataLaxF> vDatas;

// Kernel for Lax Friedrichs.
// Each thread handles the element at index id.x: it reads the packed neighbour
// samples and writes the five output members of that same element.
// The C# side packs each float4 as (x+1, x-1, y+1, y-1), i.e. .x/.y are the
// x-axis neighbours and .z/.w the y-axis neighbours.
[numthreads(128,1,1)]
void CSMain(uint3 id : SV_DispatchThreadID)
{
    // Threads past the end of the buffer (last, partially filled group) do nothing.
    uint cellCount, cellStride;
    vDatas.GetDimensions(cellCount, cellStride);
    if (id.x >= cellCount)
    {
        return;
    }

    const float alpha = 0.02f;
    const float g = 9.8f;

    dataLaxF data = vDatas[id.x];
    float4 h = data.heightData;
    float4 fU = data.fUData;
    float4 fV = data.fVData;
    float4 u = data.uData;
    float4 v = data.vData;

    //Height
    float newHeight = 0.25f * (h.x + h.y + h.z + h.w) - alpha * (fU.x - fU.y) - alpha * (fV.z - fV.w);

    //FU
    float newFU = 0.25f * (fU.x + fU.y + fU.z + fU.w);
    newFU = newFU - alpha * ((h.x * (u.x * u.x) + (0.5f * g * (h.x * h.x))) - (h.y * (u.y * u.y) + (0.5f * g * (h.y * h.y))));
    newFU = newFU - alpha * (h.z * v.z * u.z - h.w * v.w * u.w);

    //FV
    float newFV = 0.25f * (fV.x + fV.y + fV.z + fV.w);
    newFV = newFV - alpha * (h.x * v.x * u.x - h.y * v.y * u.y);
    newFV = newFV - alpha * ((h.z * (v.z * v.z) + (0.5f * g * (h.z * h.z))) - (h.w * (v.w * v.w) + (0.5f * g * (h.w * h.w))));

    // Only the five outputs change, so write just those instead of the whole struct.
    // NOTE(review): newHeight == 0 makes the two divisions below produce inf/NaN,
    // exactly as before; confirm whether callers need a guard.
    vDatas[id.x].outputHeight = newHeight;
    vDatas[id.x].outputFU = newFU;
    vDatas[id.x].outputFV = newFV;
    //U
    vDatas[id.x].outputU = newFU / newHeight;
    //V
    vDatas[id.x].outputV = newFV / newHeight;
}
Thanks in advance.
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论