CUDA.NET 中的上下文迁移

发布于 2024-08-30 09:30:40 字数 1727 浏览 5 评论 0原文

我目前正在使用 GASS 的 CUDA.NET 库。我需要在一个CPU线程中初始化cuda数组（实际上是cublas向量，但这并不重要）并在其他CPU线程中使用它们。但是，保存所有初始化数组和加载函数的 CUDA 上下文只能附加到一个 CPU 线程。

有一种称为上下文迁移 API 的机制可以将上下文从一个线程分离并将其附加到另一个线程。但我不知道如何在 CUDA.NET 中正确使用它。

我尝试了这样的操作：

class Program
{
    private static float[] vector1, vector2;
    private static CUDA cuda;
    private static CUBLAS cublas;

    private static CUdeviceptr ptr;

    static void Main(string[] args)
    {
        cuda = new CUDA(false);
        cublas = new CUBLAS(cuda);
        cuda.Init();
        cuda.CreateContext(0);
        AllocateVectors();
        cuda.DetachContext();
        CUcontext context = cuda.PopCurrentContext();
        GetVectorFromDeviceAsync(context);
    }

    private static void AllocateVectors()
    {
        vector1 = new float[]{1f, 2f, 3f, 4f, 5f};
        ptr = cublas.Allocate(vector1.Length, sizeof (float));
        cublas.SetVector(vector1, ptr);

        vector2 = new float[5];
    }


    private static void GetVectorFromDevice(object objContext)
    {
        CUcontext localContext = (CUcontext) objContext;
        cuda.PushCurrentContext(localContext);
        cuda.AttachContext(localContext);

        //change vector somehow
        vector1[0] = -1;
        //copy changed vector to device
        cublas.SetVector(vector1, ptr);
        cublas.GetVector(ptr, vector2);
        CUDADriver.cuCtxPopCurrent(ref localContext);
    }

    private static void GetVectorFromDeviceAsync(CUcontext cUcontext)
    {
        Thread thread = new Thread(GetVectorFromDevice);
        thread.IsBackground = false;
        thread.Start(cUcontext);
    }
}

但是尝试将更改的向量复制到设备时执行失败，因为未附加上下文。其他原因不太可能，因为它在单线程模式下工作得很好。有什么想法可以让它发挥作用吗？

原文

I'm currently using CUDA.NET library by GASS.
I need to initialize cuda arrays (actually cublas vectors, but it doesn't matters) in one CPU thread and use them in other CPU thread. But CUDA context which holding all initialized arrays and loaded functions, can be attached only to one CPU thread.

There is mechanism called context migration API to detach context from one thread and attach it to another. But i don't how to properly use it in CUDA.NET.

I tried something like this:

class Program
{
    private static float[] vector1, vector2;
    private static CUDA cuda;
    private static CUBLAS cublas;

    private static CUdeviceptr ptr;

    static void Main(string[] args)
    {
        cuda = new CUDA(false);
        cublas = new CUBLAS(cuda);
        cuda.Init();
        cuda.CreateContext(0);
        AllocateVectors();
        cuda.DetachContext();
        CUcontext context = cuda.PopCurrentContext();
        GetVectorFromDeviceAsync(context);
    }

    private static void AllocateVectors()
    {
        vector1 = new float[]{1f, 2f, 3f, 4f, 5f};
        ptr = cublas.Allocate(vector1.Length, sizeof (float));
        cublas.SetVector(vector1, ptr);

        vector2 = new float[5];
    }


    private static void GetVectorFromDevice(object objContext)
    {
        CUcontext localContext = (CUcontext) objContext;
        cuda.PushCurrentContext(localContext);
        cuda.AttachContext(localContext);

        //change vector somehow
        vector1[0] = -1;
        //copy changed vector to device
        cublas.SetVector(vector1, ptr);
        cublas.GetVector(ptr, vector2);
        CUDADriver.cuCtxPopCurrent(ref localContext);
    }

    private static void GetVectorFromDeviceAsync(CUcontext cUcontext)
    {
        Thread thread = new Thread(GetVectorFromDevice);
        thread.IsBackground = false;
        thread.Start(cUcontext);
    }
}

But execution fails on attempt to copy changed vector to device because context is not attached. Other reasons are unlikely, because it works fine in single threaded mode. Any ideas how i can get it work?

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

放血 2024-09-06 09:30:40

我仍然没有找到这个问题的解决方案，但我确实想出了一个解决方法。
重点是在一个 CPU 线程中执行所有与 CUDA 相关的函数。
例如，你可以这样做：

class Program
{
    private static float[] vector1, vector2;
    private static CUDA cuda;
    private static CUBLAS cublas;

    private static CUdeviceptr ptr;

    private static readonly AutoResetEvent autoResetEvent = new AutoResetEvent(false);

    static void Main()
    {
        cuda = new CUDA(true);
        cublas = new CUBLAS(cuda);

        //allocate vector on cuda device in main thread
        CudaManager.CallMethod(AllocateVectors);

        //changing first vector from other thread
        Thread changeThread = new Thread(ChangeVectorOnDevice_ThreadRun) { IsBackground = false };
        changeThread.Start();

        //wait for changeThread to finish
        autoResetEvent.WaitOne();

        //getting vector from device in another one thread
        Thread getThread = new Thread(GetVectorFromDevice_ThreadRun) { IsBackground = false };
        getThread.Start();

        //wait for getThread to finish
        autoResetEvent.WaitOne();

        Console.WriteLine("({0}, {1}, {2}, {3}, {4})", vector2[0], vector2[1], vector2[2], vector2[3], vector2[4]);

        Console.ReadKey(true);
    }

    private static void AllocateVectors()
    {
        vector1 = new[] { 1f, 2f, 3f, 4f, 5f };
        vector2 = new float[5];
        //allocate memory and copy first vector to device
        ptr = cublas.Allocate(vector1.Length, sizeof(float));
        cublas.SetVector(vector1, ptr);

    }

    private static void GetVectorFromDevice()
    {
        cublas.GetVector(ptr, vector2);
    }

    private static void ChangeVectorOnDevice()
    {
        //changing vector and copying it to device
        vector1 = new[] { -1f, -2f, -3f, -4f, -5f };
        cublas.SetVector(vector1, ptr);
    }

    private static void ChangeVectorOnDevice_ThreadRun()
    {
        CudaManager.CallMethod(ChangeVectorOnDevice);
        //releasing main thread
        autoResetEvent.Set();
    }

    private static void GetVectorFromDevice_ThreadRun()
    {
        CudaManager.CallMethod(GetVectorFromDevice);
        //releasing main thread
        autoResetEvent.Set();
    }
}

public static class CudaManager
{
    public static Action WorkMethod { get; private set; }

    private static readonly AutoResetEvent actionRecived = new AutoResetEvent(false);
    private static readonly AutoResetEvent callbackEvent = new AutoResetEvent(false);

    private static readonly object mutext = new object();
    private static bool isCudaThreadRunning;

    private static void ThreadRun()
    {
        //waiting for work method to execute
        while (actionRecived.WaitOne())
        {
            //invoking recived method
            WorkMethod.Invoke();
            //releasing caller thread
            callbackEvent.Set();
        }
    }

    static CudaManager()
    {
        Run();
    }

    public static void Run()
    {
        if (!isCudaThreadRunning)
        {
            Thread thread = new Thread(ThreadRun);
            thread.IsBackground = true;
            thread.Start();
            isCudaThreadRunning = true;
        }
    }

    public static void CallMethod(Action method)
    {
        lock (mutext)
        {
            WorkMethod = method;
            //releasing ThreadRun method
            actionRecived.Set();
            //blocking caller thread untill delegate invokation is complete
            callbackEvent.WaitOne();
        }
    }
}

我希望它能帮助某人。

I still have not found a solution for this problem but i did came up with a workaround.
The point is to execute all the functions which have something to deal with CUDA in one CPU thread.
For example, you can do it like this:

class Program
{
    private static float[] vector1, vector2;
    private static CUDA cuda;
    private static CUBLAS cublas;

    private static CUdeviceptr ptr;

    private static readonly AutoResetEvent autoResetEvent = new AutoResetEvent(false);

    static void Main()
    {
        cuda = new CUDA(true);
        cublas = new CUBLAS(cuda);

        //allocate vector on cuda device in main thread
        CudaManager.CallMethod(AllocateVectors);

        //changing first vector from other thread
        Thread changeThread = new Thread(ChangeVectorOnDevice_ThreadRun) { IsBackground = false };
        changeThread.Start();

        //wait for changeThread to finish
        autoResetEvent.WaitOne();

        //getting vector from device in another one thread
        Thread getThread = new Thread(GetVectorFromDevice_ThreadRun) { IsBackground = false };
        getThread.Start();

        //wait for getThread to finish
        autoResetEvent.WaitOne();

        Console.WriteLine("({0}, {1}, {2}, {3}, {4})", vector2[0], vector2[1], vector2[2], vector2[3], vector2[4]);

        Console.ReadKey(true);
    }

    private static void AllocateVectors()
    {
        vector1 = new[] { 1f, 2f, 3f, 4f, 5f };
        vector2 = new float[5];
        //allocate memory and copy first vector to device
        ptr = cublas.Allocate(vector1.Length, sizeof(float));
        cublas.SetVector(vector1, ptr);

    }

    private static void GetVectorFromDevice()
    {
        cublas.GetVector(ptr, vector2);
    }

    private static void ChangeVectorOnDevice()
    {
        //changing vector and copying it to device
        vector1 = new[] { -1f, -2f, -3f, -4f, -5f };
        cublas.SetVector(vector1, ptr);
    }

    private static void ChangeVectorOnDevice_ThreadRun()
    {
        CudaManager.CallMethod(ChangeVectorOnDevice);
        //releasing main thread
        autoResetEvent.Set();
    }

    private static void GetVectorFromDevice_ThreadRun()
    {
        CudaManager.CallMethod(GetVectorFromDevice);
        //releasing main thread
        autoResetEvent.Set();
    }
}

public static class CudaManager
{
    public static Action WorkMethod { get; private set; }

    private static readonly AutoResetEvent actionRecived = new AutoResetEvent(false);
    private static readonly AutoResetEvent callbackEvent = new AutoResetEvent(false);

    private static readonly object mutext = new object();
    private static bool isCudaThreadRunning;

    private static void ThreadRun()
    {
        //waiting for work method to execute
        while (actionRecived.WaitOne())
        {
            //invoking recived method
            WorkMethod.Invoke();
            //releasing caller thread
            callbackEvent.Set();
        }
    }

    static CudaManager()
    {
        Run();
    }

    public static void Run()
    {
        if (!isCudaThreadRunning)
        {
            Thread thread = new Thread(ThreadRun);
            thread.IsBackground = true;
            thread.Start();
            isCudaThreadRunning = true;
        }
    }

    public static void CallMethod(Action method)
    {
        lock (mutext)
        {
            WorkMethod = method;
            //releasing ThreadRun method
            actionRecived.Set();
            //blocking caller thread untill delegate invokation is complete
            callbackEvent.WaitOne();
        }
    }
}

I hope it's gonna help someone.

回复收藏 0 原文