Python自动化脚本图像识别与OpenCV实战

weixin_44118318

538人浏览 · 2026-05-11 11:30:46

weixin_44118318 · 2026-05-11 11:30:46 发布

Python在图像处理和计算机视觉领域有着强大的能力。从简单的图像处理到复杂的人脸识别、物体检测，OpenCV和多种AI模型让这些变得触手可及。本文详细介绍OpenCV的基础操作、图像识别技术以及在实际自动化场景中的应用。

一、环境准备

1.1 安装依赖

pip install opencv-python opencv-contrib-python
pip install pillow numpy matplotlib
pip install pytesseract  # OCR识别（还需单独安装Tesseract引擎及chi_sim中文语言包）
pip install face-recognition  # 人脸识别

1.2 基础图像操作

import cv2
import numpy as np
from PIL import Image

# Load the image. cv2.imread does not raise on failure, it returns None,
# so check explicitly instead of failing later with an opaque cv2.error.
img = cv2.imread('image.jpg')
if img is None:
    raise FileNotFoundError("cannot read image: image.jpg")
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV stores pixels as BGR

# Save the image
cv2.imwrite('output.jpg', img)

# Inspect the image (shape is rows, columns, channels)
height, width, channels = img.shape
print(f"尺寸: {width}x{height}")
print(f"通道数: {channels}")

# Basic transforms
resized = cv2.resize(img, (800, 600))  # target size is given as (width, height)
rotated = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)  # rotate 90 degrees clockwise
flipped = cv2.flip(img, 1)  # flip code 1 mirrors horizontally

二、图像处理基础

2.1 灰度化和二值化

import cv2
import numpy as np

def preprocess_for_ocr(image_path):
    """Load an image file and produce a cleaned binary image for OCR.

    Pipeline: grayscale -> 5x5 Gaussian blur -> Otsu threshold -> 3x3
    morphological closing.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The single-channel binary image produced by the closing step.

    Raises:
        FileNotFoundError: If OpenCV cannot read the file.
    """
    img = cv2.imread(str(image_path))
    if img is None:
        # cv2.imread signals failure with None rather than an exception.
        raise FileNotFoundError(f"cannot read image: {image_path}")

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Gaussian blur to suppress noise before thresholding
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    # Binarise; with THRESH_OTSU the threshold argument (0) is ignored and
    # the threshold is chosen automatically.
    _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Morphological closing with a 3x3 kernel
    kernel = np.ones((3, 3), np.uint8)
    return cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

# Adaptive thresholding (suited to unevenly lit pictures).
# A grayscale image has to exist at module level for this call, so load one
# directly instead of relying on a variable local to another function.
gray = cv2.imread('image.jpg', cv2.IMREAD_GRAYSCALE)
if gray is None:
    raise FileNotFoundError("cannot read image: image.jpg")
adaptive = cv2.adaptiveThreshold(
    gray, 255,
    cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    cv2.THRESH_BINARY, 11, 2
)

2.2 边缘检测和轮廓

import cv2

def detect_edges(image_path, low_threshold=50, high_threshold=150, min_area=1000):
    """Run Canny edge detection and extract contours from an image file.

    Args:
        image_path: Path of the image file to read.
        low_threshold: Lower hysteresis threshold passed to cv2.Canny.
        high_threshold: Upper hysteresis threshold passed to cv2.Canny.
        min_area: Contours whose cv2.contourArea is not above this value are
            left out of the returned contour list.

    Returns:
        (result, large_contours): a copy of the input with *all* contours
        drawn in green, and the list of contours larger than ``min_area``.

    Raises:
        FileNotFoundError: If OpenCV cannot read the file.
    """
    img = cv2.imread(str(image_path))
    if img is None:
        # cv2.imread signals failure with None rather than an exception.
        raise FileNotFoundError(f"cannot read image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Canny edge detection
    edges = cv2.Canny(gray, low_threshold, high_threshold)

    # Find contours on the edge map (hierarchy is not needed here)
    contours, _ = cv2.findContours(
        edges,
        cv2.RETR_TREE,
        cv2.CHAIN_APPROX_SIMPLE
    )

    # Draw every contour on a copy so the input image stays untouched
    result = img.copy()
    cv2.drawContours(result, contours, -1, (0, 255, 0), 2)

    # Keep only the large contours for the caller
    large_contours = [c for c in contours if cv2.contourArea(c) > min_area]

    return result, large_contours

def get_contour_info(contour):
    """Collect basic geometric measurements of a contour.

    Args:
        contour: A contour as returned by cv2.findContours.

    Returns:
        dict with the keys
        'bbox': (x, y, w, h) of the axis-aligned bounding rectangle,
        'area': value of cv2.contourArea,
        'perimeter': arc length of the contour treated as closed,
        'min_area_box': the four integer corner points of the rotated
            minimum-area rectangle as a nested list.
    """
    # Axis-aligned bounding rectangle
    x, y, w, h = cv2.boundingRect(contour)

    # Rotated minimum-area rectangle. np.int0 was removed in NumPy 2.0;
    # np.intp is the integer type it used to alias.
    rect = cv2.minAreaRect(contour)
    box = cv2.boxPoints(rect).astype(np.intp)

    # Area and perimeter
    area = cv2.contourArea(contour)
    perimeter = cv2.arcLength(contour, True)

    return {
        'bbox': (x, y, w, h),
        'area': area,
        'perimeter': perimeter,
        # Previously computed but dropped; a plain list keeps the dict
        # free of numpy objects.
        'min_area_box': box.tolist(),
    }

三、OCR文字识别

3.1 Tesseract OCR

import pytesseract
from PIL import Image
import cv2

class OCRProcessor:
    """Text recognition helper built on pytesseract."""

    def __init__(self, lang='eng+chi_sim'):
        # Tesseract language string used by every recognition call.
        self.lang = lang

    @staticmethod
    def _to_gray(image):
        """Return a single-channel image for a file path or an image array.

        Args:
            image: A numpy image array (2-D grayscale or BGR) or a path.

        Raises:
            FileNotFoundError: If a path is given and OpenCV cannot read it.
        """
        if isinstance(image, np.ndarray):
            img = image
        else:
            img = cv2.imread(str(image))
            if img is None:
                # cv2.imread signals failure with None, not an exception.
                raise FileNotFoundError(f"cannot read image: {image}")
        if img.ndim == 2:
            # Already single-channel; cvtColor(BGR2GRAY) would reject it.
            return img
        return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    def extract_text(self, image_path, config=''):
        """Return the recognised text of an image file, stripped.

        Args:
            image_path: Path of the image to open with PIL.
            config: Extra Tesseract command-line options.
        """
        # The context manager closes the file handle PIL keeps open.
        with Image.open(image_path) as img:
            text = pytesseract.image_to_string(
                img,
                lang=self.lang,
                config=config
            )
        return text.strip()

    def extract_text_detailed(self, image_path):
        """Return recognised words together with their positions.

        Args:
            image_path: Path of an image file, or an already loaded image
                array (grayscale or BGR).

        Returns:
            A list of dicts with the keys 'text', 'x', 'y', 'width',
            'height' and 'confidence' (float), one per non-blank entry.

        Raises:
            FileNotFoundError: If a path is given and it cannot be read.
        """
        gray = self._to_gray(image_path)

        # Use the configured language, consistent with extract_text.
        data = pytesseract.image_to_data(
            gray,
            lang=self.lang,
            output_type=pytesseract.Output.DICT
        )

        results = []
        for i, text in enumerate(data['text']):
            if not text.strip():  # skip blank entries
                continue
            results.append({
                'text': text,
                'x': data['left'][i],
                'y': data['top'][i],
                'width': data['width'][i],
                'height': data['height'][i],
                # Normalised to float so numeric comparisons always work.
                'confidence': float(data['conf'][i])
            })

        return results

    def preprocess_for_ocr(self, img):
        """Denoise, sharpen and binarise an image for OCR.

        Args:
            img: Image array, BGR or already grayscale.

        Returns:
            The Otsu-thresholded binary image.
        """
        # Convert to grayscale (no-op for single-channel input)
        gray = self._to_gray(img)

        # Non-local-means denoising
        denoised = cv2.fastNlMeansDenoising(gray)

        # Sharpen with a 3x3 kernel whose weights sum to 1
        kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
        sharpened = cv2.filter2D(denoised, -1, kernel)

        # Binarise with an automatically chosen (Otsu) threshold
        _, binary = cv2.threshold(sharpened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        return binary

ocr = OCRProcessor()

# Plain extraction: print the whole recognised text
text = ocr.extract_text('screenshot.png')
print(text)

# Extraction with positions: print only entries above the confidence cut-off
detailed = ocr.extract_text_detailed('document.png')
for item in detailed:
    if not item['confidence'] > 60:
        continue
    print(f"{item['text']} @ ({item['x']}, {item['y']})")

3.2 验证码识别

import pytesseract
import cv2
import numpy as np

class CaptchaSolver:
    """Recognise simple captchas by splitting them into glyphs and OCR-ing each."""

    def __init__(self):
        # Characters a captcha may contain; also used as the OCR whitelist.
        self.characters = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    def remove_noise(self, img, kernel_size=2):
        """Remove small specks with a morphological opening.

        Args:
            img: Binary image.
            kernel_size: Side length of the square structuring element.
        """
        kernel = np.ones((kernel_size, kernel_size), np.uint8)
        return cv2.morphologyEx(img, cv2.MORPH_OPEN, kernel)

    def segment_characters(self, img, min_pixels=5, min_width=10):
        """Split a binary image into character column ranges.

        Uses a vertical projection: a column belongs to a character when it
        holds more than ``min_pixels`` foreground (255) pixels.

        Args:
            img: 2-D binary image with foreground pixels equal to 255.
            min_pixels: Foreground count a column must exceed.
            min_width: A run must be wider than this to be kept.

        Returns:
            List of (start, end) column indices, ``end`` exclusive.
        """
        # Foreground pixel count per column
        column_counts = np.sum(img, axis=0) / 255
        total_columns = len(column_counts)

        char_bounds = []
        start = None  # first column of the run in progress, if any

        for i, count in enumerate(column_counts):
            if count > min_pixels:
                if start is None:
                    start = i
            elif start is not None:
                if i - start > min_width:
                    char_bounds.append((start, i))
                start = None

        # A glyph touching the right border never sees a closing column,
        # so flush the run that is still open.
        if start is not None and total_columns - start > min_width:
            char_bounds.append((start, total_columns))

        return char_bounds

    def solve(self, image_path):
        """Recognise the captcha stored at ``image_path``.

        Returns:
            The recognised characters joined into one string.

        Raises:
            FileNotFoundError: If OpenCV cannot read the file.
        """
        img = cv2.imread(str(image_path))
        if img is None:
            # cv2.imread signals failure with None, not an exception.
            raise FileNotFoundError(f"cannot read image: {image_path}")
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Inverted binarisation. Otsu picks the threshold; a fixed threshold
        # of 0 would keep only pure-black pixels as foreground.
        _, binary = cv2.threshold(
            gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
        )
        cleaned = self.remove_noise(binary)

        # Restrict OCR to the configured alphabet, not just digits.
        ocr_config = f'--psm 10 -c tessedit_char_whitelist={self.characters}'

        result = []
        for start, end in self.segment_characters(cleaned):
            # Cut the glyph out and scale it to a fixed size
            char_img = cv2.resize(cleaned[:, start:end], (20, 20))

            # --psm 10 treats the image as a single character
            char = pytesseract.image_to_string(char_img, config=ocr_config).strip()
            result.append(char)

        return ''.join(result)

四、自动化场景实战

4.1 截图对比和差异检测

import cv2
import numpy as np
from pathlib import Path

class ScreenComparator:
    """Compare screenshots and locate the regions where they differ."""

    def __init__(self, threshold=0.95):
        # Minimum histogram correlation for two images to count as the same.
        self.threshold = threshold

    @staticmethod
    def _read_pair(first_path, second_path):
        """Read two image files; the second is resized to the first's size.

        Raises:
            FileNotFoundError: If either file cannot be read by OpenCV.
        """
        first = cv2.imread(str(first_path))
        second = cv2.imread(str(second_path))
        # cv2.imread signals failure with None, not an exception.
        if first is None:
            raise FileNotFoundError(f"cannot read image: {first_path}")
        if second is None:
            raise FileNotFoundError(f"cannot read image: {second_path}")
        second = cv2.resize(second, (first.shape[1], first.shape[0]))
        return first, second

    @staticmethod
    def _scale_rect(rect, factor):
        """Multiply every component of an (x, y, w, h) rectangle by ``factor``."""
        return tuple(int(round(value * factor)) for value in rect)

    def compare_images(self, img1_path, img2_path):
        """Decide whether two images look the same by histogram correlation.

        Returns:
            (is_same, similarity). ``similarity`` is the lowest per-channel
            histogram correlation; ``(False, 0)`` if a file is unreadable.
        """
        img1 = cv2.imread(str(img1_path))
        img2 = cv2.imread(str(img2_path))

        if img1 is None or img2 is None:
            return False, 0

        # Bring both images to the same size
        img2 = cv2.resize(img2, (img1.shape[1], img1.shape[0]))

        # Compare every colour channel; looking at channel 0 alone would
        # miss changes confined to the other channels.
        scores = []
        for channel in range(img1.shape[2]):
            hist1 = cv2.calcHist([img1], [channel], None, [256], [0, 256])
            hist2 = cv2.calcHist([img2], [channel], None, [256], [0, 256])
            scores.append(cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL))

        similarity = min(scores)

        return similarity >= self.threshold, similarity

    def find_differences(self, img1_path, img2_path, output_path=None):
        """Mark the regions where two images differ.

        Args:
            img1_path: Reference image; the result is drawn on a copy of it.
            img2_path: Image to compare; resized to the reference size.
            output_path: If given, the marked image is written there.

        Returns:
            (result, count): the marked image and the number of marked
            regions (contours with area above 100).

        Raises:
            FileNotFoundError: If either file cannot be read.
        """
        img1, img2 = self._read_pair(img1_path, img2_path)

        # Per-pixel absolute difference, reduced to grayscale
        diff = cv2.absdiff(img1, img2)
        gray = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)

        # Binarise the difference
        _, thresh = cv2.threshold(gray, 30, 255, cv2.THRESH_BINARY)

        # Dilate so neighbouring difference pixels merge into one region
        kernel = np.ones((5, 5), np.uint8)
        dilated = cv2.dilate(thresh, kernel, iterations=2)

        contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        # Keep only regions above the noise cut-off so the returned count
        # matches the rectangles that are actually drawn.
        significant = [c for c in contours if cv2.contourArea(c) > 100]

        result = img1.copy()
        for contour in significant:
            x, y, w, h = cv2.boundingRect(contour)
            cv2.rectangle(result, (x, y), (x + w, y + h), (0, 0, 255), 2)

        if output_path:
            cv2.imwrite(str(output_path), result)

        return result, len(significant)

    def detect_changes(self, baseline_path, current_path, min_area=500):
        """Return bounding boxes of the regions that changed.

        Detection runs on half-size copies for speed.

        Args:
            baseline_path: Reference image file.
            current_path: Image file to compare; resized to the baseline size.
            min_area: Minimum contour area, measured on the half-size
                working image, for a change to be reported.

        Returns:
            List of (x, y, w, h) rectangles in baseline-image coordinates.

        Raises:
            FileNotFoundError: If either file cannot be read.
        """
        baseline, current = self._read_pair(baseline_path, current_path)

        # Downscale to speed up the comparison
        scale = 0.5
        baseline = cv2.resize(baseline, None, fx=scale, fy=scale)
        current = cv2.resize(current, None, fx=scale, fy=scale)

        diff = cv2.absdiff(baseline, current)
        gray = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)

        # Binarise and dilate to get connected change regions
        thresh = cv2.threshold(gray, 25, 255, cv2.THRESH_BINARY)[1]
        thresh = cv2.dilate(thresh, None, iterations=3)

        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        # Drop small changes and map the boxes back to full resolution
        return [
            self._scale_rect(cv2.boundingRect(c), 1 / scale)
            for c in contours
            if cv2.contourArea(c) > min_area
        ]

comparator = ScreenComparator()

# Overall similarity of two screenshots
is_same, similarity = comparator.compare_images(
    img1_path='before.png',
    img2_path='after.png',
)
print(f"相似度: {similarity:.2%}")

# Regions that changed between baseline and current
changes = comparator.detect_changes(
    baseline_path='baseline.png',
    current_path='current.png',
)
print(f"发现 {len(changes)} 个变化区域")

4.2 自动化测试截图验证

import cv2
import numpy as np
from PIL import ImageGrab
import time

class UITestHelper:
    """Helpers for screenshot-driven UI automation tests."""

    @staticmethod
    def _region_origin(region):
        """Return the (x, y) screen offset of ``region``, or (0, 0) if unset."""
        if not region:
            return (0, 0)
        return (region[0], region[1])

    @staticmethod
    def screenshot(region=None):
        """Grab the screen and return it as a 3-channel BGR array.

        Args:
            region: Optional (x, y, w, h) area; the whole screen if omitted.
        """
        if region:
            x, y, w, h = region
            img = ImageGrab.grab(bbox=(x, y, x + w, y + h))
        else:
            img = ImageGrab.grab()

        # Force RGB first: COLOR_RGB2BGR needs exactly that layout, and the
        # grabbed image may come back in another mode (e.g. RGBA).
        return cv2.cvtColor(np.array(img.convert('RGB')), cv2.COLOR_RGB2BGR)

    @staticmethod
    def find_image_on_screen(target_path, threshold=0.8, region=None):
        """Locate a template image on the screen.

        Args:
            target_path: Path of the template image.
            threshold: Minimum normalised match score to accept.
            region: Optional (x, y, w, h) area to search in.

        Returns:
            dict with 'x', 'y' (top-left corner in screen coordinates),
            'width', 'height' and 'confidence', or None when the template
            is unreadable, larger than the searched area, or not found.
        """
        screenshot = UITestHelper.screenshot(region)
        target = cv2.imread(str(target_path))

        if target is None:
            return None

        h, w = target.shape[:2]
        # matchTemplate requires the template to fit inside the image.
        if h > screenshot.shape[0] or w > screenshot.shape[1]:
            return None

        # Template matching; the best score is the maximum for this method
        result = cv2.matchTemplate(screenshot, target, cv2.TM_CCOEFF_NORMED)
        _, max_val, _, max_loc = cv2.minMaxLoc(result)

        if max_val < threshold:
            return None

        # The match position is relative to the grabbed region; shift it so
        # callers (e.g. click_on_image) get real screen coordinates.
        offset_x, offset_y = UITestHelper._region_origin(region)
        return {
            'x': max_loc[0] + offset_x,
            'y': max_loc[1] + offset_y,
            'width': w,
            'height': h,
            'confidence': max_val,
        }

    @staticmethod
    def wait_for_image(target_path, timeout=10, interval=0.5, region=None, threshold=0.8):
        """Poll the screen until the template shows up.

        Args:
            target_path: Path of the template image.
            timeout: Seconds to keep polling.
            interval: Seconds to sleep between attempts.
            region: Optional (x, y, w, h) area to search in.
            threshold: Minimum match score, forwarded to find_image_on_screen.

        Returns:
            The match dict from find_image_on_screen, or None on timeout.
        """
        # monotonic() is not affected by system clock adjustments.
        deadline = time.monotonic() + timeout

        while time.monotonic() < deadline:
            result = UITestHelper.find_image_on_screen(
                target_path, threshold=threshold, region=region
            )
            if result:
                return result
            time.sleep(interval)

        return None

    @staticmethod
    def click_on_image(target_path, offset=(0, 0), region=None):
        """Click the centre of the template once it appears.

        Args:
            target_path: Path of the template image.
            offset: (dx, dy) added to the centre before clicking.
            region: Optional (x, y, w, h) area to search in.

        Returns:
            True if the image was found and clicked, otherwise False.
        """
        import pyautogui

        result = UITestHelper.wait_for_image(target_path, region=region)
        if not result:
            return False

        x = result['x'] + result['width'] // 2 + offset[0]
        y = result['y'] + result['height'] // 2 + offset[1]
        pyautogui.click(x, y)
        return True

    @staticmethod
    def wait_for_change(initial_path, timeout=30, region=None):
        """Wait until two consecutive screenshots differ noticeably.

        Args:
            initial_path: Not read by this method; the comparison baseline
                is a live screenshot. Kept so existing calls still work.
            timeout: Seconds to keep watching.
            region: Optional (x, y, w, h) area to watch.

        Returns:
            True as soon as a change is seen, False on timeout.
        """
        previous = UITestHelper.screenshot(region)

        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            current = UITestHelper.screenshot(region)

            diff = cv2.absdiff(previous, current)
            # sumElems returns one sum per channel; add them all up so a
            # change in any colour channel counts.
            if sum(cv2.sumElems(diff)) > 10000:
                return True

            time.sleep(0.5)
            # Compare against the latest frame on the next round
            previous = current

        return False

4.3 文档扫描和矫正

import cv2
import numpy as np

class DocumentScanner:
    """Find a four-cornered document in a photo and flatten it."""

    # Width in pixels of the downscaled copy used for contour detection.
    WORKING_WIDTH = 500

    def __init__(self):
        # Scale ratio; not read by any method of this class.
        self.ratio = 500 / 800

    @staticmethod
    def _working_size(height, width, target_width=500):
        """Plan the downscale of an image to ``target_width`` pixels wide.

        Returns:
            ((new_width, new_height), scale), where ``scale`` maps
            coordinates on the resized image back to the original.
        """
        new_height = int(height * target_width / width)
        # Width is the dimension resized to the target, so the
        # back-projection factor has to be derived from the width too.
        scale = width / float(target_width)
        return (target_width, new_height), scale

    def order_points(self, pts):
        """Order four corner points as top-left, top-right, bottom-right, bottom-left.

        Args:
            pts: Array of shape (4, 2) holding (x, y) points.

        Returns:
            float32 array of shape (4, 2) in the order described above.
        """
        rect = np.zeros((4, 2), dtype='float32')

        # x + y is smallest at the top-left and largest at the bottom-right
        s = pts.sum(axis=1)
        rect[0] = pts[np.argmin(s)]
        rect[2] = pts[np.argmax(s)]

        # y - x is smallest at the top-right and largest at the bottom-left
        diff = np.diff(pts, axis=1)
        rect[1] = pts[np.argmin(diff)]
        rect[3] = pts[np.argmax(diff)]

        return rect

    def perspective_transform(self, image, pts):
        """Warp the quadrilateral ``pts`` of ``image`` into an upright rectangle.

        Args:
            image: Source image.
            pts: Array of shape (4, 2) with the quadrilateral corners.

        Returns:
            The warped image; its size is the longer of each pair of
            opposite edges.
        """
        rect = self.order_points(pts)
        (tl, tr, br, bl) = rect

        # Output width: the longer of the bottom and top edges
        max_width = max(int(np.linalg.norm(br - bl)), int(np.linalg.norm(tr - tl)))

        # Output height: the longer of the right and left edges
        max_height = max(int(np.linalg.norm(tr - br)), int(np.linalg.norm(tl - bl)))

        dst = np.array([
            [0, 0],
            [max_width - 1, 0],
            [max_width - 1, max_height - 1],
            [0, max_height - 1]
        ], dtype='float32')

        M = cv2.getPerspectiveTransform(rect, dst)
        return cv2.warpPerspective(image, M, (max_width, max_height))

    def scan_document(self, image_path, output_path=None):
        """Detect a document in an image file and return it flattened.

        Args:
            image_path: Path of the photo.
            output_path: If given and a document is found, the warped image
                is written there.

        Returns:
            The warped document, or the unmodified original image when no
            four-cornered contour is found.

        Raises:
            FileNotFoundError: If OpenCV cannot read the file.
        """
        image = cv2.imread(str(image_path))
        if image is None:
            # cv2.imread signals failure with None, not an exception.
            raise FileNotFoundError(f"cannot read image: {image_path}")
        orig = image.copy()

        new_size, ratio = self._working_size(
            image.shape[0], image.shape[1], self.WORKING_WIDTH
        )
        image = cv2.resize(image, new_size)

        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        gray = cv2.GaussianBlur(gray, (5, 5), 0)
        edged = cv2.Canny(gray, 75, 200)

        # Look at the five largest contours only
        contours, _ = cv2.findContours(edged.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
        contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]

        screen_cnt = None
        for c in contours:
            peri = cv2.arcLength(c, True)
            approx = cv2.approxPolyDP(c, 0.02 * peri, True)

            # The first contour that simplifies to four points is taken
            if len(approx) == 4:
                screen_cnt = approx
                break

        if screen_cnt is None:
            return orig

        # Corners were found on the small copy; scale them back up
        warped = self.perspective_transform(orig, screen_cnt.reshape(4, 2) * ratio)

        if output_path:
            cv2.imwrite(str(output_path), warped)

        return warped

# Scan one receipt photo and write the flattened result next to it
scanner = DocumentScanner()
scanner.scan_document(
    image_path='receipt.jpg',
    output_path='receipt_scanned.jpg',
)

五、实战案例：自动化票据识别系统

class ReceiptRecognitionSystem:
    """Receipt pipeline: flatten the photo, OCR it, pick out key fields."""

    def __init__(self):
        self.ocr = OCRProcessor()
        self.scanner = DocumentScanner()

    def process_receipt(self, image_path):
        """Run the full pipeline on one receipt image.

        Args:
            image_path: Path of the receipt photo.

        Returns:
            The dict produced by parse_receipt.
        """
        import os
        import tempfile

        # 1. Flatten the document
        scanned = self.scanner.scan_document(image_path)

        # 2. Clean the image up for OCR
        processed = self.ocr.preprocess_for_ocr(scanned)

        # 3. OCR. extract_text_detailed takes an image path, so hand the
        #    in-memory result over through a temporary file.
        fd, tmp_path = tempfile.mkstemp(suffix='.png')
        os.close(fd)  # cv2 writes by path; release our handle first
        try:
            if not cv2.imwrite(tmp_path, processed):
                raise OSError(f"cannot write temporary image: {tmp_path}")
            text = self.ocr.extract_text_detailed(tmp_path)
        finally:
            os.remove(tmp_path)

        # 4. Extract the key fields
        return self.parse_receipt(text)

    def parse_receipt(self, text_data):
        """Pick total, date and amounts out of OCR results.

        Args:
            text_data: Iterable of dicts that each have a 'text' entry.

        Returns:
            dict with 'items' (every amount-like text, in order), 'total',
            'date' and 'merchant' (never filled in here; stays None).
        """
        receipt = {
            'items': [],
            'total': None,
            'date': None,
            'merchant': None
        }

        # True after a "total" label until the next amount is seen
        awaiting_total = False

        for item in text_data:
            text = item['text']

            # A "total" label: remember it as a fallback, but prefer the
            # amount that follows it.
            if 'total' in text.lower():
                receipt['total'] = text
                awaiting_total = True

            # Short text containing a slash is taken as the date
            if '/' in text and len(text) <= 12:
                receipt['date'] = text

            # Amounts
            if self.is_amount(text):
                receipt['items'].append(text)
                if awaiting_total:
                    receipt['total'] = text
                    awaiting_total = False

        return receipt

    @staticmethod
    def is_amount(text):
        """Return True if ``text`` is an amount with exactly two decimals.

        Accepts an optional leading ``$``, ``¥`` or ``￥`` and optional
        thousands separators, e.g. ``12.50``, ``$3.00``, ``1,234.56``.
        """
        import re
        # fullmatch: a plain "$" anchor would also accept a trailing newline.
        pattern = r'[$¥￥]?(?:\d{1,3}(?:,\d{3})+|\d+)\.\d{2}'
        return re.fullmatch(pattern, text) is not None

    def batch_process(self, image_folder, output_folder):
        """Process every ``*.jpg`` in a folder and save one JSON per image.

        Args:
            image_folder: Folder searched (non-recursively) for ``*.jpg``.
            output_folder: Folder for ``<stem>_result.json`` files; created
                together with missing parents.

        Returns:
            List of the results of all receipts that were processed.
            A failure on one image is printed and does not stop the batch.
        """
        import json
        from pathlib import Path

        input_dir = Path(image_folder)
        output_dir = Path(output_folder)
        output_dir.mkdir(parents=True, exist_ok=True)

        results = []
        # Sorted so the order of results is the same on every run
        for img_path in sorted(input_dir.glob('*.jpg')):
            try:
                result = self.process_receipt(img_path)
                results.append(result)

                # Save the result
                output_file = output_dir / f"{img_path.stem}_result.json"
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(result, f, ensure_ascii=False, indent=2)
            except Exception as e:
                # Batch boundary: report and carry on with the next file
                print(f"处理失败 {img_path}: {e}")

        return results

总结

Python图像识别与OpenCV为自动化脚本带来无限可能：

图像预处理：灰度化、二值化、去噪、锐化
边缘检测和轮廓：找形状、定位物体
OCR识别：从图片中提取文字
截图对比：UI自动化测试、变更检测
文档扫描：透视矫正、自动扫描

关键应用场景：

验证码识别和自动填写
票据/发票识别和录入
UI自动化测试
监控画面变化检测
文档数字化处理

掌握这些技术，让你的自动化脚本"看得见"！

亚马逊云科技技术品牌专区

更多推荐

Kiro Editor 开发实战：使用 Cargo 构建、测试与性能优化指南

欢迎来到这篇终极指南，我们将深入探索如何使用Rust构建高性能的终端文本编辑器Kiro Editor。无论你是Rust新手还是经验丰富的开发者，这篇完整教程将带你了解如何利用Cargo工具链进行高效的开发、测试和性能优化，打造一款快速、轻量且功能强大的UTF-8文本编辑器。

## 什么是Kiro Editor？

Kiro Editor是一款使用Rust编写的极简终端文本编辑器，它最初是著名编辑