Difference in speed when moving many objects in Unity via CPU code vs a GPU compute shader
I have been testing moving many objects in Unity via plain C# code versus an HLSL compute shader. However, there is no difference in speed; the FPS stays the same. Perlin noise is used to vary the positions: the C# version uses the standard Mathf.PerlinNoise, while the HLSL version uses a custom noise function.
Scenario 1 - via C# code only
Object spawning and updating:
[SerializeField]
private GameObject prefab;

private void Start()
{
    for (int i = 0; i < 50; i++)
        for (int j = 0; j < 50; j++)
        {
            GameObject createdParticle = Instantiate(prefab);
            createdParticle.transform.position = new Vector3(i * 1f, Random.Range(-1f, 1f), j * 1f);
        }
}
Code that moves the objects via C#. This script is attached to every created object:
private Vector3 position = new Vector3();

private void Start()
{
    position = new Vector3(transform.position.x, Mathf.PerlinNoise(Time.time, Time.time), transform.position.z);
}

private void Update()
{
    position.y = Mathf.PerlinNoise(transform.position.x / 20f + Time.time, transform.position.z / 20f + Time.time) * 5f;
    transform.position = position;
}
Scenario 2 - via a compute kernel (GPGPU)
Part 1: C# client code
Spawns the objects, runs the computation on the compute shader, and assigns the resulting values back to the objects:
public struct Particle
{
    public Vector3 position;
    // Must match the HLSL struct layout (float3 position + float4 color = 7 floats),
    // which is why the ComputeBuffer is created with a stride of sizeof(float) * 7.
    public Vector4 color;
}
[SerializeField]
private GameObject prefab;
[SerializeField]
private ComputeShader computeShader;

private List<GameObject> particlesList = new List<GameObject>();
private Particle[] particlesDataArray;

private void Start()
{
    CreateParticles();
}

private void Update()
{
    UpdateParticlePosition();
}
private void CreateParticles()
{
    List<Particle> particlesDataList = new List<Particle>();
    for (int i = 0; i < 50; i++)
        for (int j = 0; j < 50; j++)
        {
            GameObject createdParticle = Instantiate(prefab);
            createdParticle.transform.position = new Vector3(i * 1f, Random.Range(-1f, 1f), j * 1f);
            particlesList.Add(createdParticle);

            Particle particle = new Particle();
            particle.position = createdParticle.transform.position;
            particlesDataList.Add(particle);
        }
    particlesDataArray = particlesDataList.ToArray();
    particlesDataList.Clear();

    computeBuffer = new ComputeBuffer(particlesDataArray.Length, sizeof(float) * 7);
    computeBuffer.SetData(particlesDataArray);
    computeShader.SetBuffer(0, "particles", computeBuffer);
}
private ComputeBuffer computeBuffer;

private void UpdateParticlePosition()
{
    computeShader.SetFloat("time", Time.time);
    computeShader.Dispatch(computeShader.FindKernel("CSMain"), particlesDataArray.Length / 10, 1, 1);
    computeBuffer.GetData(particlesDataArray);

    for (int i = 0; i < particlesDataArray.Length; i++)
    {
        Vector3 pos = particlesList[i].transform.position;
        pos.y = particlesDataArray[i].position.y;
        particlesList[i].transform.position = pos;
    }
}
Part 2: The compute kernel (GPGPU)
#pragma kernel CSMain

struct Particle {
    float3 position;
    float4 color;
};

RWStructuredBuffer<Particle> particles;
float time;

float mod(float x, float y)
{
    return x - y * floor(x / y);
}

float permute(float x) { return floor(mod(((x * 34.0) + 1.0) * x, 289.0)); }
float3 permute(float3 x) { return mod(((x * 34.0) + 1.0) * x, 289.0); }
float4 permute(float4 x) { return mod(((x * 34.0) + 1.0) * x, 289.0); }
float taylorInvSqrt(float r) { return 1.79284291400159 - 0.85373472095314 * r; }
float4 taylorInvSqrt(float4 r) { return float4(taylorInvSqrt(r.x), taylorInvSqrt(r.y), taylorInvSqrt(r.z), taylorInvSqrt(r.w)); }

float3 rand3(float3 c) {
    float j = 4096.0 * sin(dot(c, float3(17.0, 59.4, 15.0)));
    float3 r;
    r.z = frac(512.0 * j);
    j *= .125;
    r.x = frac(512.0 * j);
    j *= .125;
    r.y = frac(512.0 * j);
    return r - 0.5;
}

float _snoise(float3 p) {
    const float F3 = 0.3333333;
    const float G3 = 0.1666667;
    float3 s = floor(p + dot(p, float3(F3, F3, F3)));
    float3 x = p - s + dot(s, float3(G3, G3, G3));
    float3 e = step(float3(0.0, 0.0, 0.0), x - x.yzx);
    float3 i1 = e * (1.0 - e.zxy);
    float3 i2 = 1.0 - e.zxy * (1.0 - e);
    float3 x1 = x - i1 + G3;
    float3 x2 = x - i2 + 2.0 * G3;
    float3 x3 = x - 1.0 + 3.0 * G3;
    float4 w, d;
    w.x = dot(x, x);
    w.y = dot(x1, x1);
    w.z = dot(x2, x2);
    w.w = dot(x3, x3);
    w = max(0.6 - w, 0.0);
    d.x = dot(rand3(s), x);
    d.y = dot(rand3(s + i1), x1);
    d.z = dot(rand3(s + i2), x2);
    d.w = dot(rand3(s + 1.0), x3);
    w *= w;
    w *= w;
    d *= w;
    return dot(d, float4(52.0, 52.0, 52.0, 52.0));
}

[numthreads(10, 1, 1)]
void CSMain(uint3 id : SV_DispatchThreadID)
{
    Particle particle = particles[id.x];
    float modifyTime = time / 5.0;
    float positionY = _snoise(float3(particle.position.x / 20.0 + modifyTime, 0.0, particle.position.z / 20.0 + modifyTime)) * 5.0;
    particle.position = float3(particle.position.x, positionY, particle.position.z);
    particles[id.x] = particle;
}
What am I doing wrong, and why is there no increase in computation speed? :)
Thanks in advance!
2 Answers
TL;DR: your GPGPU (compute shader) scenario is unoptimized, thus skewing your results. Consider binding a material to the ComputeBuffer and rendering via Graphics.DrawProcedural. That way everything stays on the GPU.
OP:
Essentially, there are two parts to your problem.
(1) Reading from the GPU is slow
With most things GPU-related, you generally want to avoid reading from the GPU since it will block the CPU. This is true also for GPGPU scenarios.
If I were to hazard a guess, it would be the GPGPU (compute shader) call ComputeBuffer.GetData() shown in your code.
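If a readback is genuinely needed, Unity (2018.1+) offers AsyncGPUReadback, which queues the GPU-to-CPU copy and delivers it via callback instead of stalling the main thread. A minimal sketch, assuming the buffer is created elsewhere; the class, field, and callback names here are illustrative:

```csharp
using UnityEngine;
using UnityEngine.Rendering;

public class AsyncReadbackExample : MonoBehaviour
{
    private ComputeBuffer particleBuffer; // assumed to be created and filled elsewhere

    private void RequestPositions()
    {
        // Queues a GPU->CPU copy; the callback fires a few frames later
        // rather than blocking the main thread the way GetData() does.
        AsyncGPUReadback.Request(particleBuffer, request =>
        {
            if (request.hasError) return;
            var data = request.GetData<Vector3>(); // NativeArray view of the result
            // ... consume the positions here (they are a few frames old) ...
        });
    }
}
```

The trade-off is latency: the data you receive is a few frames stale, which is usually acceptable for visual effects.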
(2) Explicit GPU reading is not required in your scenario
I can see you are creating 2,500 "particles", each attached to a GameObject. If the intent is just to draw a simple quad, then it's more efficient to create an array of structs containing a Vector3 position and then perform a single batch render call to draw all the particles in one go. Proof: see the video below of an n-body simulation running at 60+ FPS on a 2014-era NVidia card.
e.g. for my GPGPU n-Body Galaxy Simulation I do just that. Pay attention to the StarMaterial.SetBuffer("stars", _starsBuffer) call during actual rendering. That tells the GPU to use the buffer that already exists on the GPU, the very same buffer that the compute shader used to move the star positions. There is no CPU reading from the GPU here.
n-Body galaxy simulation of 10,000 stars:

I think everyone can agree that Microsoft's GPGPU documentation is pretty sparse so your best bet is to check out examples scattered around the interwebs. One that comes to mind is the excellent "GPU Ray Tracing in Unity" series over at Three Eyed Games. See the link below.
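The stay-on-the-GPU rendering described above can be sketched roughly as follows. This is a minimal sketch, not the answerer's actual code: the material, buffer, and shader property names are illustrative, and the material's shader is assumed to fetch positions from the buffer itself (e.g. via SV_VertexID). Note that in recent Unity versions this two-argument overload has been renamed Graphics.DrawProceduralNow:

```csharp
using UnityEngine;

public class ProceduralParticleRenderer : MonoBehaviour
{
    [SerializeField] private Material particleMaterial; // its shader reads the StructuredBuffer
    private ComputeBuffer particleBuffer;               // the same buffer the compute kernel writes
    private const int ParticleCount = 2500;

    private void OnRenderObject()
    {
        // Bind the GPU-resident buffer to the material; no CPU readback occurs.
        particleMaterial.SetBuffer("particles", particleBuffer);
        particleMaterial.SetPass(0);
        // Draw one point per particle; the vertex shader fetches each
        // position from the buffer by vertex index.
        Graphics.DrawProcedural(MeshTopology.Points, ParticleCount);
    }
}
```

Because the dispatch, the buffer, and the draw all live on the GPU, the per-frame GetData() and transform loop disappear entirely.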
See also:
ComputeBuffer.GetData takes a long time: the CPU copies the data back from the GPU, which stalls the main thread.
Then you loop over all the transforms to change their positions; this is certainly faster than thousands of MonoBehaviours, but still slow.
There are two ways to optimize your code.
CPU
C# Job System + Burst
Detailed tutorial: https://github.com/stella3d/job-system-cookbook
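The Job System + Burst approach can be sketched like this. This is a hedged sketch, not code from the linked tutorial: it assumes the Burst, Collections, and Mathematics packages are installed, and the job and field names are illustrative:

```csharp
using Unity.Burst;
using Unity.Collections;
using Unity.Jobs;
using Unity.Mathematics;

// A Burst-compiled job that computes the new Y height of every particle in parallel.
[BurstCompile]
public struct NoiseHeightJob : IJobParallelFor
{
    [ReadOnly] public NativeArray<float3> positions;
    public NativeArray<float> heights;
    public float time;

    public void Execute(int i)
    {
        float3 p = positions[i];
        // Unity.Mathematics ships a Burst-friendly noise implementation,
        // mirroring the Mathf.PerlinNoise call from the question.
        heights[i] = noise.cnoise(new float2(p.x / 20f + time, p.z / 20f + time)) * 5f;
    }
}
```

Schedule it with something like `new NoiseHeightJob { ... }.Schedule(count, 64)` and call Complete() before applying the results; to avoid touching Transforms on the main thread at all, the same logic can be moved into an IJobParallelForTransform over a TransformAccessArray.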
GPU
Use the structured buffer calculated in the compute shader without copying it back to the CPU. Here is a detailed tutorial on how to do it:
https://catlikecoding.com/unity/tutorials/basics/compute-shaders/
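On the GPU route, the key idea is that the material's shader reads the same StructuredBuffer the compute kernel writes, so nothing ever crosses back to the CPU. A rough HLSL sketch of such a vertex/fragment pair; the struct mirrors the question's kernel, the rest is illustrative and assumes UnityCG.cginc is included:

```hlsl
#include "UnityCG.cginc"

struct Particle {
    float3 position;
    float4 color;
};

// The very same buffer the compute kernel writes, bound via material.SetBuffer.
StructuredBuffer<Particle> particles;

struct v2f {
    float4 vertex : SV_POSITION;
    float4 color  : COLOR0;
};

// One point per particle: fetch its data by vertex index.
v2f vert(uint id : SV_VertexID)
{
    v2f o;
    Particle p = particles[id];
    o.vertex = UnityObjectToClipPos(float4(p.position, 1.0));
    o.color = p.color;
    return o;
}

fixed4 frag(v2f i) : SV_Target
{
    return i.color;
}
```

Paired with a procedural draw call for 2,500 points, this replaces both the GetData() readback and the per-object Transform updates.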