Deploying a Caffe Model with TensorRT in Practice (Part 1)

无眠栀 · 2019-05-10 21:11:46
Category: Deep Learning
Reference code:
TensorRT安装包下的samples/sampleMNIST/sampleMNIST.cpp

1. Set the GPU id to use. If it is not set, device 0 is used by default.
cudaSetDevice(3); //set device id
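If the machine has several GPUs, it can be worth checking that the requested device actually exists before selecting it. A minimal sketch (device id 3 is just the example used above):

// check that the requested device exists before selecting it
int deviceCount = 0;
cudaGetDeviceCount(&deviceCount);
if (deviceCount > 3 && cudaSetDevice(3) == cudaSuccess)
    std::cout << "using GPU 3" << std::endl;
else
    std::cerr << "GPU 3 not available, falling back to device 0" << std::endl;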
2. Define the model's input/output dimensions and the logger
static const int INPUT_H = 299;       // input image height
static const int INPUT_W = 299;       // input image width
static const int CHANNELS = 3;        // input image channels
static const int OUTPUT_SIZE = 1536;  // output feature dimension
static Logger gLogger;

const char* INPUT_BLOB_NAME = "data";          // input layer name as defined in the deploy prototxt
const char* OUTPUT_BLOB_NAME = "pool_8x8_s2";  // output layer name as defined in the deploy prototxt
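Logger is not part of the TensorRT headers; the samples define it in their common header as a small implementation of nvinfer1::ILogger. If you are not building inside the samples tree, a sketch along these lines is sufficient (how much you filter by severity is up to you):

// minimal ILogger implementation, similar to the one used by the TensorRT samples:
// suppress INFO-level messages and print everything else to stderr
class Logger : public nvinfer1::ILogger
{
    void log(Severity severity, const char* msg) override
    {
        if (severity != Severity::kINFO)
            std::cerr << msg << std::endl;
    }
};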
3. Define the GIE (TensorRT) model and convert the trained Caffe model into it
// create a GIE model from the caffe model and serialize it to a stream
IHostMemory *gieModelStream{nullptr};
caffeToGIEModel("deploy.prototxt", "inceptionv4.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME }, 1, gieModelStream);
4. Prepare the input image. It can be read with OpenCV or by any other means; write the preprocessing to suit your model, and store the result in a float* buffer.
    float data[INPUT_H * INPUT_W * CHANNELS];

    cv::Mat im = cv::imread("gap.jpg");
    cv::resize(im, im, cv::Size(INPUT_W, INPUT_H));
    int mean_data[] = {104, 117, 123}; // per-channel means (BGR)
    // pack the image into CHW order, subtracting the mean from each channel
    float* pdata = data;
    for (int c = 0; c < CHANNELS; ++c)
    {
        for (int h = 0; h < INPUT_H; ++h)
        {
            for (int w = 0; w < INPUT_W; ++w)
            {
                *pdata++ = float(im.at<cv::Vec3b>(h, w)[c] - mean_data[c]);
            }
        }
    }
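The same CHW packing can also be written with OpenCV primitives, which avoids the per-pixel at<> calls. A sketch assuming the same BGR means as above (requires <cstring> and <vector>):

    // equivalent preprocessing with OpenCV primitives: convert to float,
    // subtract the per-channel means, then copy each plane into CHW order
    cv::Mat imFloat;
    im.convertTo(imFloat, CV_32FC3);
    imFloat -= cv::Scalar(104, 117, 123);
    std::vector<cv::Mat> planes(CHANNELS);
    cv::split(imFloat, planes);
    for (int c = 0; c < CHANNELS; ++c)
        std::memcpy(data + c * INPUT_H * INPUT_W, planes[c].ptr<float>(0),
                    INPUT_H * INPUT_W * sizeof(float));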
   
5. Deserialize the inference engine
    // deserialize the engine
    IRuntime* runtime = createInferRuntime(gLogger);
    ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), nullptr);
    if (gieModelStream) gieModelStream->destroy();
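If the engine was saved to disk as suggested in step 3, it can be read back here instead of rebuilding from Caffe on every run. A sketch using the same example file name, replacing the deserialization above (requires <fstream> and <vector>):

    // read a previously serialized engine from disk and deserialize it
    std::ifstream engineFile("inceptionv4.engine", std::ios::binary | std::ios::ate);
    std::size_t engineSize = engineFile.tellg();
    engineFile.seekg(0, std::ios::beg);
    std::vector<char> engineData(engineSize);
    engineFile.read(engineData.data(), engineSize);

    IRuntime* runtime = createInferRuntime(gLogger);
    ICudaEngine* engine = runtime->deserializeCudaEngine(engineData.data(), engineSize, nullptr);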
6. Run the forward inference
    IExecutionContext *context = engine->createExecutionContext();

    std::cout << "begin inference\n";
    // run inference
    CProTimer timet;
    float prob[OUTPUT_SIZE];
    doInference(*context, data, prob, 1);

    std::cout << "end inference " << timet.GetTime(true) << "\n";
7. Release resources and print the result
    // destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // print the output feature vector
    std::cout << "\n\n";
    for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    {
        std::cout << prob[i] << " ";
    }
    std::cout << std::endl;
caffeToGIEModel and doInference follow the sample cpp referenced at the beginning:
void caffeToGIEModel(const std::string& deployFile,             // name for caffe prototxt
                     const std::string& modelFile,              // name for model
                     const std::vector<std::string>& outputs,   // network outputs
                     unsigned int maxBatchSize,                 // batch size - NB must be at least as large as the batch we want to run with
                     IHostMemory *&gieModelStream)    // output buffer for the GIE model
{
    // create the builder
    IBuilder* builder = createInferBuilder(gLogger);

    // parse the caffe model to populate the network, then set the outputs
    INetworkDefinition* network = builder->createNetwork();
    ICaffeParser* parser = createCaffeParser();
    // locateFile() and the `directories` search-path vector come from the
    // TensorRT samples' common helpers; plain file paths also work here
    const IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(deployFile, directories).c_str(),
                                                              locateFile(modelFile, directories).c_str(),
                                                              *network,
                                                              nvinfer1::DataType::kFLOAT);

    // specify which tensors are outputs
    for (auto& s : outputs)
        network->markOutput(*blobNameToTensor->find(s.c_str()));

    // Build the engine
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1 << 20);

    ICudaEngine* engine = builder->buildCudaEngine(*network);
    assert(engine);

    // we don't need the network any more, and we can destroy the parser
    network->destroy();
    parser->destroy();

    // serialize the engine, then close everything down
    gieModelStream = engine->serialize();
    engine->destroy();
    builder->destroy();
    shutdownProtobufLibrary();
}
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
    // of these, but in this case we know that there is exactly one input and one output.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // note that indices are guaranteed to be less than IEngine::getNbBindings()
    int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
        outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // create GPU buffers and a stream
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * CHANNELS * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the GPU,  execute the batch asynchronously, and DMA it back:
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * CHANNELS * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // release the stream and the buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
8. Write the Makefile and compile
The code depends on the CUDA, cuDNN, and TensorRT libraries and requires gcc 5.3 or later; other libraries can be configured as needed.

OPENCV_INC_DIR="/user/3rdparty/opencv-3.1.0/include/"
OPENCV_LIB_DIR="/user/3rdparty/opencv-3.1.0/lib/"
CUDA_INC_DIR="/user/3rdparty/cuda/include/"
CUDA_LIB_DIR="/user/3rdparty/cuda/lib64/"
CUDNN_INC_DIR="/user/3rdparty/cudnn_7.0.5/include/"
CUDNN_LIB_DIR="/user/3rdparty/cudnn_7.0.5/lib64/"
TENSORRT_INC_DIR="/user/3rdparty/TensorRT-4.0.0.3/include/"
TENSORRT_LIB_DIR="/user/3rdparty/TensorRT-4.0.0.3/lib/"

export PATH=/user/3rdparty/gcc-5.3.0/bin:$(PATH)

INCLUFLAGS = -I${OPENCV_INC_DIR} \
             -I${CUDA_INC_DIR} -I${CUDNN_INC_DIR}\
             -I../common/ \
             -I${TENSORRT_INC_DIR}

LIBFLAGS = -L${OPENCV_LIB_DIR} -lopencv_imgcodecs -lopencv_imgproc -lopencv_core -lopencv_highgui \
           -L${CUDA_LIB_DIR} -L${CUDNN_LIB_DIR} -lcudnn -lcublas -lcudart_static -lnvToolsExt -lcudart \
           -L${TENSORRT_LIB_DIR} -lnvinfer -lnvparsers -lnvinfer_plugin

LIBFLAGS += -lrt -ldl -lpthread

SOURCES = main.cpp  

CXXFLAGS = -Wall -std=c++11 

EXE = inceptionv4_tensorrt

OBJECTS = $(subst .c,.o,$(SOURCES:%.cpp=%.o))

all:
    g++ -o $(EXE) $(SOURCES) $(CXXFLAGS) $(INCLUFLAGS) $(LIBFLAGS)
clean:
    rm -f $(OBJECTS) $(EXE)
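After make succeeds, the resulting inceptionv4_tensorrt binary links OpenCV, cuDNN, and TensorRT dynamically, so their lib directories need to be on LD_LIBRARY_PATH (or baked in via rpath) when running it.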
9. Accuracy and speed comparison
The TensorRT float32 engine is essentially identical in accuracy to the original Caffe model, but it is much faster: with batch size 1, the average GPU forward pass is roughly 4-5x faster than the original Caffe model, so the optimization is well worth it.
Original article by CSDN blogger 无眠栀, licensed under CC 4.0 BY-SA: https://blog.csdn.net/may0324/article/details/90083988
