如何将 CuPy 与 TensorRT 一起使用?

发布于 2025-02-02 23:12:24 字数 2051 浏览 4 评论 0原文

编辑:问题已解决,代码见回答。

我遇到一个问题:使用 NumPy 和 OpenCV 预处理数据比 torchvision 慢,导致基于 TensorRT 的整个流程比 PyTorch 还慢。
因此,我尝试使用 OpenCV-CUDA 和 CuPy 来加速数据预处理。但我找不到任何相关示例,也不清楚如何将 CuPy 数组复制到 TensorRT 缓冲区。例如:

import cupy as cp
import tensorrt as trt

# assume this is my input
# NOTE(review): cp.array([...]) builds a 1-D array holding these four values,
# not an array of shape (1, 3, 1080, 1920) -- confirm the intended input.
image = cp.array([1, 3, 1080, 1920])

# allocate tensorrt buffer, from my tensorrt class
def allocate_buffers(self):
    """Allocate a host/device buffer pair for every binding of the engine.

    Returns:
        A tuple ``(inputs, outputs, bindings, stream)``. ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem`` pairs, ``bindings`` holds
        the device addresses as ints in the order the engine is iterated,
        and ``stream`` is a newly created ``cp.cuda.Stream``.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cp.cuda.Stream()
    for binding in self.engine:
        binding_index = self.engine.get_binding_index(binding)
        # Element count at the binding's current shape, scaled by the
        # engine's maximum batch size.
        size = trt.volume(self.context.get_binding_shape(binding_index)) * self.engine.max_batch_size
        dtype = trt.nptype(self.engine.get_binding_dtype(binding))
        # NOTE(review): `cuda` is not imported in this snippet; presumably
        # pycuda.driver -- confirm.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if self.engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    # The return must sit at the function-body indent; the previous
    # five-space indent matched no enclosing block (IndentationError).
    return inputs, outputs, bindings, stream

def infer(self, input):
    """Run one inference pass and return the outputs as host arrays.

    Args:
        input: Sequence of arrays, one per input binding. Each must expose
            ``.shape``; ``input[0].shape[0]`` is taken as the batch size.

    Returns:
        A list with one array per output buffer, each reshaped to
        ``[batchsize, *shape]`` using the shapes in ``self.output_shape``.
    """
    self.cfx.push()
    try:
        for index, data in enumerate(input):
            self.context.set_binding_shape(index, data.shape)
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()
        self.stream.use()
        batchsize = input[0].shape[0]
        # for index, data in enumerate(input):
            # self.inputs[index].host = data

        # Here! How can I set the cupy array to buffer?

        # If I directly use cupy stream to infer, I get an error: cudnn_executation_failed
        # [cuda.memcpy_htod_async(inp.device, inp.host, self.stream) for inp in self.inputs]
        self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.ptr)
        # NOTE(review): self.stream comes from allocate_buffers(); confirm that
        # the `cuda.*` copy routines accept that stream object.
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
        self.stream.synchronize()
        return [o.host.reshape([batchsize, *shape]) for o, shape in zip(self.outputs, self.output_shape)]
    finally:
        # Always undo the push above, even when a step raises, so the
        # context is not left on the stack.
        self.cfx.pop()

感谢您的帮助!

Edit: I solved it; the code is in the answer.

I've run into a problem: preprocessing data with NumPy and OpenCV is slower than with torchvision, so the whole TensorRT-based pipeline ends up slower than PyTorch.
So I am trying to use OpenCV-CUDA and CuPy to accelerate my data preprocessing. But I can't find any examples of this, and I'm confused about how to copy a CuPy array into a TensorRT buffer. For example:

import cupy as cp
import tensorrt as trt

# assume this is my input
# NOTE(review): cp.array([...]) builds a 1-D array holding these four values,
# not an array of shape (1, 3, 1080, 1920) -- confirm the intended input.
image = cp.array([1, 3, 1080, 1920])

# allocate tensorrt buffer, from my tensorrt class
def allocate_buffers(self):
    """Allocate a host/device buffer pair for every binding of the engine.

    Returns:
        A tuple ``(inputs, outputs, bindings, stream)``. ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem`` pairs, ``bindings`` holds
        the device addresses as ints in the order the engine is iterated,
        and ``stream`` is a newly created ``cp.cuda.Stream``.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cp.cuda.Stream()
    for binding in self.engine:
        binding_index = self.engine.get_binding_index(binding)
        # Element count at the binding's current shape, scaled by the
        # engine's maximum batch size.
        size = trt.volume(self.context.get_binding_shape(binding_index)) * self.engine.max_batch_size
        dtype = trt.nptype(self.engine.get_binding_dtype(binding))
        # NOTE(review): `cuda` is not imported in this snippet; presumably
        # pycuda.driver -- confirm.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if self.engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    # The return must sit at the function-body indent; the previous
    # five-space indent matched no enclosing block (IndentationError).
    return inputs, outputs, bindings, stream

def infer(self, input):
    """Run one inference pass and return the outputs as host arrays.

    Args:
        input: Sequence of arrays, one per input binding. Each must expose
            ``.shape``; ``input[0].shape[0]`` is taken as the batch size.

    Returns:
        A list with one array per output buffer, each reshaped to
        ``[batchsize, *shape]`` using the shapes in ``self.output_shape``.
    """
    self.cfx.push()
    try:
        for index, data in enumerate(input):
            self.context.set_binding_shape(index, data.shape)
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()
        self.stream.use()
        batchsize = input[0].shape[0]
        # for index, data in enumerate(input):
            # self.inputs[index].host = data

        # Here! How can I set the cupy array to buffer?

        # If I directly use cupy stream to infer, I get an error: cudnn_executation_failed
        # [cuda.memcpy_htod_async(inp.device, inp.host, self.stream) for inp in self.inputs]
        self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.ptr)
        # NOTE(review): self.stream comes from allocate_buffers(); confirm that
        # the `cuda.*` copy routines accept that stream object.
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
        self.stream.synchronize()
        return [o.host.reshape([batchsize, *shape]) for o, shape in zip(self.outputs, self.output_shape)]
    finally:
        # Always undo the push above, even when a step raises, so the
        # context is not left on the stack.
        self.cfx.pop()

Thanks for your help!

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1)

一片旧的回忆 2025-02-09 23:12:24

https://forums.developer.nvidia.com/t/tensorrt-error-1-resize-cu-457-error-code-1-cuda-runtime-invalid-argument/216041
请参考这个链接

def infer(self, input):
    """Run inference on device arrays without host staging buffers.

    Args:
        input: Sequence of device arrays (presumably CuPy arrays), one per
            input binding; position ``i`` is bound to binding index ``i``.

    Returns:
        List of CuPy arrays holding the outputs, one per non-input binding,
        in the order the engine is iterated.
    """
    with cp.cuda.Stream(non_blocking=False) as stream:
        bindings = []
        for index, data in enumerate(input):
            self.context.set_binding_shape(index, data.shape)
            # int() of the array's memory pointer is passed as the binding address.
            bindings.append(int(data.data))

        # NOTE(review): outputs are appended after all inputs, so this
        # assumes input bindings precede output bindings -- confirm.
        outputs = []
        for binding in self.engine:
            if self.engine.binding_is_input(binding):
                continue
            binding_index = self.engine.get_binding_index(binding)
            # Allocate with the engine's declared dtype rather than a
            # hard-coded float32, so non-float32 outputs are read correctly.
            device_mem = cp.zeros(
                self.context.get_binding_shape(binding_index),
                dtype=trt.nptype(self.engine.get_binding_dtype(binding)),
            )
            bindings.append(int(device_mem.data))
            outputs.append(device_mem)

        self.context.execute_async(bindings=bindings, stream_handle=stream.ptr)
        stream.synchronize()

    # Previously the results were computed and then discarded.
    return outputs

https://forums.developer.nvidia.com/t/tensorrt-error-1-resize-cu-457-error-code-1-cuda-runtime-invalid-argument/216041
Look at this link

def infer(self, input):
    """Run inference on device arrays without host staging buffers.

    Args:
        input: Sequence of device arrays (presumably CuPy arrays), one per
            input binding; position ``i`` is bound to binding index ``i``.

    Returns:
        List of CuPy arrays holding the outputs, one per non-input binding,
        in the order the engine is iterated.
    """
    with cp.cuda.Stream(non_blocking=False) as stream:
        bindings = []
        for index, data in enumerate(input):
            self.context.set_binding_shape(index, data.shape)
            # int() of the array's memory pointer is passed as the binding address.
            bindings.append(int(data.data))

        # NOTE(review): outputs are appended after all inputs, so this
        # assumes input bindings precede output bindings -- confirm.
        outputs = []
        for binding in self.engine:
            if self.engine.binding_is_input(binding):
                continue
            binding_index = self.engine.get_binding_index(binding)
            # Allocate with the engine's declared dtype rather than a
            # hard-coded float32, so non-float32 outputs are read correctly.
            device_mem = cp.zeros(
                self.context.get_binding_shape(binding_index),
                dtype=trt.nptype(self.engine.get_binding_dtype(binding)),
            )
            bindings.append(int(device_mem.data))
            outputs.append(device_mem)

        self.context.execute_async(bindings=bindings, stream_handle=stream.ptr)
        stream.synchronize()

    # Previously the results were computed and then discarded.
    return outputs
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文