opencv 场景文字识别

opencv 场景文字识别文章目录opencv 场景文字识别前言目标API场景文字检测：` cv::dnn::TextDetectionModel`场景文字识别：`cv::dnn::TextRecognitionModel `模型和数据准备TextDetectionModel:TextRecognitionModel:场景文字检测example场景文字识别example总结不足前言opencv4.

青锋断尘

7530人浏览 · 2021-08-30 11:20:00

青锋断尘 · 2021-08-30 11:20:00 发布

opencv 场景文字识别

文章目录

opencv 场景文字识别
前言

前言

OpenCV 4.5 及以上版本在 DNN 模块（cv::dnn）中加入了场景文字检测与识别的高级 API

目标

利用opencv OCR的高级API实现对场景中所有中英文进行检测和识别

API

场景文字检测：`cv::dnn::TextDetectionModel`

cv::dnn::TextDetectionModel支持以下算法：

DB模型：cv::dnn::TextDetectionModel_DB
EAST模型：cv::dnn::TextDetectionModel_EAST

场景文字识别：`cv::dnn::TextRecognitionModel`

cv::dnn::TextRecognitionModel支持CNN+RNN+CTC算法，也提供CTC的贪心解码算法

模型和数据准备

场景文字检测和文字识别均有现成的模型可以直接使用

TextDetectionModel:

可选模型

 DB_IC15_resnet50.onnx
 https://drive.google.com/uc?export=dowload&id=17_ABp79PlFt9yPCxSaarVc_DKTmrSGGf
 推荐参数： -inputHeight=736, -inputWidth=1280;

这个模型是在 ICDAR2015 上训练的，所以它只能检测英文文本实例。

DB_IC15_resnet18.onnx
https://drive.google.com/uc?export=dowload&id=1sZszH3pEt8hliyBlTmB-iulxHP1dCQWV
 推荐参数：-inputHeight=736, -inputWidth=1280

这个模型是在 ICDAR2015 上训练的，所以它只能检测英文文本实例。

DB_TD500_resnet50.onnx
https://drive.google.com/uc?export=dowload&id=19YWhArrNccaoSza0CfkXlA8im4-lAGsR
 推荐参数： -inputHeight=736, -inputWidth=736;

该模型在 MSRA-TD500 上训练，可以检测英文和中文文本实例。

DB_TD500_resnet18.onnx
https://drive.google.com/uc?export=dowload&id=1vY_KsDZZZb_svd5RT6pjyI8BS1nPbBSX
 推荐参数： -inputHeight=736, -inputWidth=736;

该模型在 MSRA-TD500 上训练，可以检测英文和中文文本实例。

测试数据

url: https://drive.google.com/uc?export=dowload&id=149tAhIcvfCYeyufRoZ9tmc2mZDKE_XrF

TextRecognitionModel:

可选模型

1. crnn.onnx:
网址：https://drive.google.com/uc?export=dowload&id=1ooaLR-rkTl8jdpGy1DoQs0-X0lQsB6Fj
字典集：alphabet_36.txt
网址：https://drive.google.com/uc?export=dowload&id=1oPOYx5rQRp8L6XQciUwmwhMCfX0KyO4b
推荐参数：rgb=0

该模型采用的训练集是MJSynth，类别数量为36（0~9 + a~z）

2. crnn_cs.onnx:
https://drive.google.com/uc?export=dowload&id=12diBsVJrS9ZEl6BNUiRp9s0xPALBS7kt
字典集:alphabet_94.txt
https://drive.google.com/uc?export=dowload&id=1oKXxXKusquimp7XY1mFvj9nwLzldVgBR
推荐参数：rgb=1

该模型在 MJSynth 和 SynthText 数据集上训练，类别数量为94（0~9 + a~z + A~Z + 标点符号）

3. crnn_cs_CN.onnx:
https://drive.google.com/uc?export=dowload&id=1is4eYEUKH7HR7Gl37Sw4WPXx6Ir8oQEG
字典集:alphabet_3944.txt
https://drive.google.com/uc?export=dowload&id=18IZUUdNzJ44heWTndDO6NNfIpJMmN-ul
推荐参数：rgb=1

训练数据集：ReCTS (https://rrc.cvc.uab.es/?ch=12)，识别类别数量3944（0~9 + a~z + A~Z + 中文字符 + 特殊字符）

测试数据

https://drive.google.com/uc?export=dowload&id=1nMcEy68zDNpIlqAn6xCk_kYcUTIeSOtN

场景文字检测example

该example可以实现对输入图片进行文字检测，将图片中所有文字框出来。从实用性出发则不使用测试数据进行测试，直接使用现实中的图片

目标：框选出图片中所有中英文

#include <iostream>
#include <fstream>

#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/dnn/dnn.hpp>

using namespace cv;
using namespace cv::dnn;

int text_detection()
{
    // 初始化参数
    float binThresh = 0.3;                                      //二值图的置信度阈值
    float polyThresh  = 0.5 ;                                   //文本多边形阈值
    double unclipRatio = 2.0;                               //检测到的文本区域的未压缩比率，gai比率确定输出大小
    uint maxCandidates = 200;                           //输出结果的最大数量
    int height = 736;                                                   //输出图片长宽
    int width = 736;

    cv::String modelPath = "/home/haijun/code_study/opencv/chapter1/modle/DB_TD500_resnet50.onnx";  //模型权重文件

    // Load the network
    TextDetectionModel_DB detector(modelPath);
    detector.setBinaryThreshold(binThresh)
            .setPolygonThreshold(polyThresh)
            .setUnclipRatio(unclipRatio)
            .setMaxCandidates(maxCandidates);

    double scale = 1.0 / 255.0;
    Size inputSize = Size(width, height);
    Scalar mean = Scalar(122.67891434, 116.66876762, 104.00698793);
    detector.setInputParams(scale, inputSize, mean);

    // Create a window
    static const std::string winName = "TextDetectionModel";

  
               //检测单张图片
        // Open an image file
        Mat frame = imread("/home/haijun/code_study/opencv/chapter1/picture/3.png");
        CV_Assert(!frame.empty());
        Mat frame1;
        cv::resize(frame , frame1 , Size(frame.rows ,  frame.cols) , 0 , 0 , 1);
        std::cout <<  "height:" <<frame1.rows << "         width" << frame1.cols <<std::endl;

        // 推理
        std::vector<std::vector<Point>> results;
        detector.detect(frame1, results);

        polylines(frame1, results, true, Scalar(255, 0, 0), 2);
        imshow(winName, frame1);
        waitKey();


    return 0;
}

检测结果如下：

在这里插入图片描述

场景文字识别example

该example是对文字检测的方框逐个进行文字识别

由于opencv不能进行中文标注，所以中文文字标注失败，因此只能打印出识别结果

#include <iostream>
#include <fstream>

#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/dnn/dnn.hpp>

using namespace cv;
using namespace cv::dnn;

// Preprocessing of a detected text region: warps the box given by `vertices`
// so that every text box becomes a horizontal patch written to `result`.
void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result);
// Comparator that orders two points by ascending x coordinate.
bool sortPts(const Point& p1, const Point& p2);

int text_spotting()
{
    // 初始化参数
    // DB文本检测模型
    float binThresh = 0.3;                                      //二值图的置信度阈值
    float polyThresh  = 0.5 ;                                   //文本多边形阈值
    double unclipRatio = 2.0;                               //检测到的文本区域的未压缩比率，gai比率确定输出大小
    uint maxCandidates = 200;                           //输出结果的最大数量
    int height = 736;                                                   //输出图片长宽
    int width = 736;
    cv::String detModelPath = "/home/haijun/code_study/opencv/chapter1/modle/DB_TD500_resnet50.onnx";  //DB模型权重文件

    // CRNN文本识别模型
    String recModelPath = "/home/haijun/code_study/opencv/chapter1/modle/crnn_cs_CN.onnx";         //文字识别模型文件
    String vocPath = "/home/haijun/code_study/opencv/chapter1/modle/alphabet_3944.txt";              //字典文件
   int imreadRGB = 1;         //0：以灰度图读取图像   1：以彩色图读取图像

    // 载入模型
    if (detModelPath.empty())
    {
        std::cout << "DB模型文件加载失败"  <<std::endl;
        return -1;
    }
    TextDetectionModel_DB detector(detModelPath);
    detector.setBinaryThreshold(binThresh)
            .setPolygonThreshold(polyThresh)
            .setUnclipRatio(unclipRatio)
            .setMaxCandidates(maxCandidates);

    if (recModelPath.empty())
    {
         std::cout << "CRNN模型文件加载失败"  <<std::endl;
        return -1;       
    }
    TextRecognitionModel recognizer(recModelPath);

    // 载入字典
     if (vocPath.empty())
    {
         std::cout << "字典模型文件加载失败"  <<std::endl;
        return -1;       
    }

    std::ifstream vocFile;
    vocFile.open(samples::findFile(vocPath));
    CV_Assert(vocFile.is_open());
    String vocLine;
    std::vector<String> vocabulary;
    while (std::getline(vocFile, vocLine)) {
        vocabulary.push_back(vocLine);
    }
    recognizer.setVocabulary(vocabulary);
    recognizer.setDecodeType("CTC-greedy");

    // 设置检测参数
    double detScale = 1.0 / 255.0;
    Size detInputSize = Size(width, height);
    Scalar detMean = Scalar(122.67891434, 116.66876762, 104.00698793);
    detector.setInputParams(detScale, detInputSize, detMean);

    // 设置识别参数
    double recScale = 1.0 / 127.5;
    Scalar recMean = Scalar(127.5);
    Size recInputSize = Size(100, 32);
    recognizer.setInputParams(recScale, recInputSize, recMean);

    // Create a window
    static const std::string winName = "Text_Spotting";

    // 载入图像
    Mat frame = imread("/home/haijun/code_study/opencv/chapter1/picture/11.jpg");
    if (frame.empty())
    {
        std::cout << "图像加载失败"  <<std::endl;
        return -1;            
    }
    std::cout << "图像大小："<< frame.size << std::endl;

    // 推理
    std::vector< std::vector<Point> > detResults;
    detector.detect(frame, detResults);

    if (detResults.size() > 0) {
        //文本识别
        Mat recInput;
        if (!imreadRGB) {
            cvtColor(frame, recInput, cv::COLOR_BGR2GRAY);
        } else {
            recInput = frame;
        }
        std::vector< std::vector<Point> > contours;
        for (uint i = 0; i < detResults.size(); i++)
        {
            const auto& quadrangle = detResults[i];
            CV_CheckEQ(quadrangle.size(), (size_t)4, "");       //j检测Mat是否为Vector

            contours.emplace_back(quadrangle);                      //插入数据到向量

            std::vector<Point2f> quadrangle_2f;
            for (int j = 0; j < 4; j++)
                quadrangle_2f.emplace_back(quadrangle[j]);

            // 转换和裁剪图像
            Mat cropped;
            fourPointsTransform(recInput, &quadrangle_2f[0], cropped);

            std::string recognitionResult = recognizer.recognize(cropped);
            std::cout << i << ": '" << recognitionResult << "'" << std::endl;

            putText(frame, recognitionResult, quadrangle[3], FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2);
        }
        polylines(frame, contours, true, Scalar(0, 255, 0), 2);
    } else {
        std::cout << "No Text Detected." << std::endl;
    }
    imshow(winName, frame);
    waitKey();

    return 0;
}

void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result)
{
    const Size outputSize = Size(100, 32);

    Point2f targetVertices[4] = {
        Point(0, outputSize.height - 1),
        Point(0, 0),
        Point(outputSize.width - 1, 0),
        Point(outputSize.width - 1, outputSize.height - 1)
    };
    Mat rotationMatrix = getPerspectiveTransform(vertices, targetVertices);

    warpPerspective(frame, result, rotationMatrix, outputSize);

#if 1
    imshow("roi", result);
    waitKey();
#endif
}

// Comparator: true when p1 has a strictly smaller x coordinate than p2.
bool sortPts(const Point& p1, const Point& p2)
{
    const bool firstIsLeftOfSecond = p1.x < p2.x;
    return firstIsLeftOfSecond;
}

识别结果：
在这里插入图片描述