onnx模型转engine并进行推理全过程解析_专栏

文章目录

1、ONNX序列化为TensorRT Engine
- 1.1 创建builder
2、继承ILogger，实例化接口
2、读取序列化后TensorRT Engine 并进行推理

深度学习模型在训练好以后，下一步就是部署到不同的设备进行测试，不同设备之间的转换一般可以通过中间件ONNX进行转换，以达到不同平台的通用。本文以模型转为ONNX为起点，分析介绍ONNX转为TensorRT Engine并进行推理的整个流程链路。

1、ONNX序列化为TensorRT Engine

ONNX序列化为TRT模型的整个流程可以用下图表示

使用C++的API进行开发时，需要引入头文件NvInfer以及NvOnnxParser，C++的接口都是通过I开头的的接口类定义的，如ILogger、IBuilder等。

#include “NvInfer.h”
#include “NvOnnxParser.h”

using namespace nvonnxparser;
using namespace nvinfer1;

1.1 创建builder

创建构建器之前有两种方式实例化ILogger：
1、引用tensorrtx的logging.h，使用其中的Logger

    #include "logging.h"

    static Logger gLogger;
    IBuilder* builder = createInferBuilder(gLogger);

2、继承ILogger，实例化接口

    class Logger : public ILogger           
    {
        void log(Severity severity, const char* msg) noexcept override
        {
            if (severity <= Severity::kWARNING)
                std::cout << msg << std::endl;
        }
    } logger;
     IBuilder* builder = createInferBuilder(gLogger);

1.2 创建network

创建构建器后，需要创建网络定义来进行模型优化：

    INetworkDefinition *network = builder->createNetworkV2(0U); //是0U还是1u需视情况而定

1.3 创建parse解析器

创建onnx的解析器来进行网络定义的填充，并读取模型文件并处理是否存在错误。

    IParser* parser = createParser(*network, gLogger);
    parser->parseFromFile(onnx_path, static_cast<int32_t>(ILogger::Severity::kWARNING));
     for (int32_t i = 0; i < parser->getNbErrors(); ++i) 
    { 
        std::cout << parser->getError(i)->desc() << std::endl;
    }
    std::cout << "successfully parse the onnx model" << std::endl;

1.4 设置必要参数并创建Engine

    IBuilderConfig *config = builder->createBuilderConfig();
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);

    auto profile = builder->createOptimizationProfile();
    auto input_tensor = network->getInput(0);
    auto input_dims = input_tensor->getDimensions();

    input_dims.d[0] = 1;
    profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
    profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);
    input_dims.d[0] = batchSize;
    profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);
    config->addOptimizationProfile(profile);
#ifdef USE_FP16
    config->setFlag(BuilderFlag::kFP16);
#endif
#ifdef USE_INT8
    config->setFlag(BuilderFlag::kINT8);
#endif

1.5 创建Engine并序列化

    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    assert(engine != nullptr);
    (*modelStream) = engine->serialize();
    assert(modelStream != nullptr);
     std::ofstream p(engine_path, std::ios::binary);
    if (!p)
    {
        std::cerr << "could not open plan output file" << std::endl;
        return -1;
    }
    p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
    modelStream->destroy();

2、读取序列化后TensorRT Engine 并进行推理

onnx转换为engine并序列化后，可以减少构建和优化模型的时间，如下图所示，从序列化的engine读取开始完成整个推理过程。

2.1 反序列化engine

读取序列化的模型，存放在trtModelstream中。

    size_t size{ 0 };
    std::ifstream file(engine_path, std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();

2.2 创建runtime

通过logger创建runtime

     IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);

2.3 创建engine

通过runtime解析trtModelstream，创建engine

    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);

2.4 创建context

    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    runtime->destroy();

2.5 前处理+前向推理+后处理

前处理

float* input_data = (float*)malloc(3 * input_h * input_w * sizeof(float));
int ImgCount = InputImage.size();
    for (int b = 0; b < ImgCount; b++) {
        cv::Mat img = InputImage.at(b);
        int w = img.cols;
        int h = img.rows;
        int i = 0;
        for (int row = 0; row < h; ++row) {
            uchar* uc_pixel = img.data + row * img.step;
            for (int col = 0; col < input_w; ++col) {
                input_data[b * 3 * input_h * input_w + i] = (float)uc_pixel[2] / 255.0;
                input_data[b * 3 * input_h * input_w + i + input_h * input_w] = (float)uc_pixel[1] / 255.0;
                input_data[b * 3 * input_h * input_w + i + 2 * input_h * input_w] = (float)uc_pixel[0] / 255.0;
                uc_pixel += 3;
                ++i;
            }
        }

    }

前向推理

void doInference()
{
    const ICudaEngine& engine = context.getEngine();
    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    //assert(engine.getNbBindings() == 2);
    void* buffers[2];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    //const int inputIndex = 0;
    //const int outputIndex = 1;
    // Create GPU buffers on device
    cudaMalloc(&buffers[inputIndex], batchSize * 3 * input_h * input_w * sizeof(float));
    cudaMalloc(&buffers[outputIndex], batchSize * output_size * sizeof(float));
    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 *input_h * input_w * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * output_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));

后处理
以LPRNet为例

    std::vector<int> preds;
    std::cout << std::endl;
    for (int i = 0; i < 18; i++) {
        int maxj = 0;
        for (int j = 0; j < 68; j++) {
            if (prob[i + 18 * j] > prob[i + 18 * maxj]) maxj = j;
        }
        preds.push_back(maxj);
    }
    int pre_c = preds[0];
    std::vector<int> no_repeat_blank_label;
    for (auto c: preds) {
        if (c == pre_c || c == 68 - 1) {
            if (c == 68 - 1) pre_c = c;
            continue;
        }
        no_repeat_blank_label.push_back(c);
        pre_c = c;
    }
    std::string str;
    for (auto v: no_repeat_blank_label) {
        str += alphabet[v];
    }