Python自动化脚本:图像识别与OpenCV实战
·
Python在图像处理和计算机视觉领域有着强大的能力。从简单的图像处理到复杂的人脸识别、物体检测,OpenCV和多种AI模型让这些变得触手可及。本文详细介绍OpenCV的基础操作、图像识别技术以及在实际自动化场景中的应用。
一、环境准备
1.1 安装依赖
pip install opencv-python opencv-contrib-python
pip install pillow numpy matplotlib
pip install pytesseract # OCR识别
pip install face-recognition # 人脸识别
1.2 基础图像操作
import cv2
import numpy as np
from PIL import Image
# Load the image. cv2.imread does not raise on a missing or undecodable file;
# it returns None, so fail early with a clear message instead of crashing
# later inside cvtColor.
img = cv2.imread('image.jpg')
if img is None:
    raise FileNotFoundError("cannot read image: image.jpg")
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV stores pixels as BGR

# Save the image
cv2.imwrite('output.jpg', img)

# Image information: shape is (rows, columns, channels) for a colour image
height, width, channels = img.shape
print(f"尺寸: {width}x{height}")
print(f"通道数: {channels}")

# Basic transforms
resized = cv2.resize(img, (800, 600))  # target size is given as (width, height)
rotated = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)  # rotate 90 degrees clockwise
flipped = cv2.flip(img, 1)  # flip code 1 = horizontal mirror
二、图像处理基础
2.1 灰度化和二值化
import cv2
import numpy as np
def preprocess_for_ocr(image_path):
    """Load an image and turn it into a clean binary image for OCR.

    Pipeline: grayscale -> Gaussian blur -> Otsu threshold -> morphological
    closing with a 3x3 kernel.

    Args:
        image_path: Path of the image file (str or path-like).

    Returns:
        The processed single-channel binary image.

    Raises:
        FileNotFoundError: If the file is missing or cannot be decoded.
    """
    img = cv2.imread(str(image_path))
    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"cannot read image: {image_path}")

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Gaussian blur to suppress noise before thresholding
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    # Binarise; with THRESH_OTSU the threshold argument (0) is ignored and
    # chosen automatically from the histogram
    _, binary = cv2.threshold(
        blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    # Morphological closing fills small gaps
    kernel = np.ones((3, 3), np.uint8)
    processed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
    return processed
# Adaptive threshold (suited to unevenly lit images).
# The grayscale image is loaded here so that this statement runs on its own:
# the only other `gray` in this snippet is a local variable of
# preprocess_for_ocr and is not visible at module level.
gray = cv2.imread('image.jpg', cv2.IMREAD_GRAYSCALE)
if gray is None:
    raise FileNotFoundError("cannot read image: image.jpg")
adaptive = cv2.adaptiveThreshold(
    gray, 255,
    cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    cv2.THRESH_BINARY, 11, 2  # 11 = neighbourhood size, 2 = constant subtracted
)
2.2 边缘检测和轮廓
import cv2
def detect_edges(image_path, min_area=1000):
    """Run Canny edge detection and extract contours from an image file.

    Args:
        image_path: Path of the image file (str or path-like).
        min_area: Contours whose area is not larger than this are left out of
            the returned list. Defaults to 1000, the previously fixed value.

    Returns:
        A tuple ``(result, large_contours)``: a copy of the image with every
        contour drawn in green, and the list of contours above ``min_area``.

    Raises:
        FileNotFoundError: If the file is missing or cannot be decoded.
    """
    img = cv2.imread(str(image_path))
    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"cannot read image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Canny edge detection (hysteresis thresholds 50 / 150)
    edges = cv2.Canny(gray, 50, 150)

    # Find contours with the full hierarchy
    contours, hierarchy = cv2.findContours(
        edges,
        cv2.RETR_TREE,
        cv2.CHAIN_APPROX_SIMPLE
    )

    # Draw all contours (index -1) on a copy so the input image is untouched
    result = img.copy()
    cv2.drawContours(result, contours, -1, (0, 255, 0), 2)

    # Keep only the large contours
    large_contours = [c for c in contours if cv2.contourArea(c) > min_area]
    return result, large_contours
def get_contour_info(contour):
    """Collect basic geometric measurements of a contour.

    Args:
        contour: A contour as returned by ``cv2.findContours``.

    Returns:
        A dict with:
            'bbox': axis-aligned bounding rectangle ``(x, y, w, h)``,
            'area': contour area,
            'perimeter': length of the closed contour,
            'min_area_box': the four integer corner points of the minimum-area
                (rotated) bounding rectangle.
    """
    # Local import: this snippet's import block only brings in cv2.
    import numpy as np

    # Axis-aligned bounding rectangle
    x, y, w, h = cv2.boundingRect(contour)

    # Minimum-area bounding rectangle (may be rotated)
    rect = cv2.minAreaRect(contour)
    # np.int0 was an alias of np.intp and no longer exists in NumPy 2.x;
    # astype(np.intp) performs the same conversion.
    box = cv2.boxPoints(rect).astype(np.intp)

    # Area and perimeter (True = treat the contour as closed)
    area = cv2.contourArea(contour)
    perimeter = cv2.arcLength(contour, True)

    return {
        'bbox': (x, y, w, h),
        'area': area,
        'perimeter': perimeter,
        'min_area_box': box,
    }
三、OCR文字识别
3.1 Tesseract OCR
import pytesseract
from PIL import Image
import cv2
class OCRProcessor:
    """Text extraction helper built on Tesseract (through pytesseract)."""

    def __init__(self, lang='eng+chi_sim'):
        """Store the Tesseract language string used by every OCR call."""
        self.lang = lang

    def extract_text(self, image_path, config=''):
        """Return the text recognised in an image file.

        Args:
            image_path: Path of the image to read.
            config: Extra Tesseract command-line options.

        Returns:
            The recognised text with surrounding whitespace removed.
        """
        # The context manager closes the underlying file once OCR is done.
        with Image.open(image_path) as img:
            text = pytesseract.image_to_string(
                img,
                lang=self.lang,
                config=config
            )
        return text.strip()

    def extract_text_detailed(self, image_path):
        """Return recognised words together with position and confidence.

        Args:
            image_path: Path of an image file, or an already loaded image
                array (BGR or single-channel).

        Returns:
            A list of dicts with keys 'text', 'x', 'y', 'width', 'height'
            and 'confidence' (float), one per non-blank entry.

        Raises:
            FileNotFoundError: If a path is given and it cannot be read.
        """
        if hasattr(image_path, 'shape'):
            # Already an image array (e.g. the output of preprocess_for_ocr).
            img = image_path
        else:
            img = cv2.imread(str(image_path))
            if img is None:
                raise FileNotFoundError(f"cannot read image: {image_path}")

        # A 2-D array is already single-channel; converting it would fail.
        gray = img if img.ndim == 2 else cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Use the configured language, consistent with extract_text.
        data = pytesseract.image_to_data(
            gray,
            lang=self.lang,
            output_type=pytesseract.Output.DICT
        )
        return self._collect_words(data)

    @staticmethod
    def _collect_words(data):
        """Turn pytesseract's column-oriented dict into one dict per word."""
        results = []
        for i, text in enumerate(data['text']):
            if not text.strip():  # skip blank entries
                continue
            results.append({
                'text': text,
                'x': data['left'][i],
                'y': data['top'][i],
                'width': data['width'][i],
                'height': data['height'][i],
                # NOTE(review): the 'conf' column appears to be str in some
                # pytesseract versions and numeric in others - confirm.
                # Normalised to float so numeric comparisons always work.
                'confidence': float(data['conf'][i]),
            })
        return results

    def preprocess_for_ocr(self, img):
        """Prepare an image array for OCR.

        Pipeline: grayscale -> non-local-means denoising -> sharpening ->
        Otsu threshold.

        Args:
            img: BGR image array, or an already single-channel image.

        Returns:
            The single-channel binary image.
        """
        # Local import: this snippet's import block does not bring in numpy.
        import numpy as np

        # Convert to grayscale unless the input already is
        gray = img if img.ndim == 2 else cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Denoise
        denoised = cv2.fastNlMeansDenoising(gray)

        # Sharpen; the kernel sums to 1, so flat regions keep their brightness
        kernel = np.array(
            [[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]], dtype=np.float32
        )
        sharpened = cv2.filter2D(denoised, -1, kernel)

        # Binarise with an automatically chosen (Otsu) threshold
        _, binary = cv2.threshold(
            sharpened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )
        return binary
# Processor with the default language setting
ocr = OCRProcessor()

# Plain text extraction
text = ocr.extract_text('screenshot.png')
print(text)

# Extraction with position data; report only words above confidence 60
detailed = ocr.extract_text_detailed('document.png')
for item in filter(lambda word: word['confidence'] > 60, detailed):
    print(f"{item['text']} @ ({item['x']}, {item['y']})")
3.2 验证码识别
import pytesseract
import cv2
import numpy as np
class CaptchaSolver:
    """Simple captcha reader: binarise, split into characters, OCR each one."""

    def __init__(self):
        """Define the set of characters the solver may output."""
        self.characters = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    def remove_noise(self, img, kernel_size=2):
        """Remove small specks with a morphological opening.

        Args:
            img: Binary image.
            kernel_size: Side length of the square structuring element.

        Returns:
            The opened image.
        """
        kernel = np.ones((kernel_size, kernel_size), np.uint8)
        return cv2.morphologyEx(img, cv2.MORPH_OPEN, kernel)

    def segment_characters(self, img, min_pixels=5, min_width=10):
        """Split a binary image into character column ranges by projection.

        Args:
            img: 2-D binary image whose foreground pixels are 255.
            min_pixels: A column belongs to a character when it has more than
                this many foreground pixels.
            min_width: Runs that are not wider than this are discarded.

        Returns:
            A list of ``(start, end)`` column ranges, ``end`` exclusive.
        """
        _, width = img.shape

        # Vertical projection: number of foreground pixels in each column
        column_counts = np.sum(img, axis=0) / 255

        char_bounds = []
        in_char = False
        start = 0
        for i, val in enumerate(column_counts):
            if val > min_pixels and not in_char:
                start = i
                in_char = True
            elif val <= min_pixels and in_char:
                if i - start > min_width:
                    char_bounds.append((start, i))
                in_char = False

        # A character that reaches the right edge is never closed inside the
        # loop; close it here so the last character is not lost.
        if in_char and width - start > min_width:
            char_bounds.append((start, width))
        return char_bounds

    def solve(self, image_path):
        """Recognise the characters of a captcha image.

        Args:
            image_path: Path of the captcha image.

        Returns:
            The recognised characters joined into one string.

        Raises:
            FileNotFoundError: If the image cannot be read.
        """
        img = cv2.imread(str(image_path))
        if img is None:
            raise FileNotFoundError(f"cannot read image: {image_path}")
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Inverted binarisation with an Otsu threshold. A fixed threshold of 0
        # would mark only pure-black pixels as foreground.
        _, binary = cv2.threshold(
            gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
        )
        cleaned = self.remove_noise(binary)

        # Restrict Tesseract to the configured character set
        config = f'--psm 10 -c tessedit_char_whitelist={self.characters}'

        result = []
        for start, end in self.segment_characters(cleaned):
            char_img = cleaned[:, start:end]
            # Normalise every character to a fixed size
            char_resized = cv2.resize(char_img, (20, 20))
            # --psm 10 treats the image as a single character
            char = pytesseract.image_to_string(
                char_resized, config=config
            ).strip()
            result.append(char)
        return ''.join(result)
四、自动化场景实战
4.1 截图对比和差异检测
import cv2
import numpy as np
from pathlib import Path
class ScreenComparator:
    """Compare screenshots and locate the regions where they differ."""

    def __init__(self, threshold=0.95):
        """Store the similarity at or above which two images count as equal."""
        self.threshold = threshold

    @staticmethod
    def _read_image(path):
        """Read an image file, raising FileNotFoundError when it is unreadable."""
        img = cv2.imread(str(path))
        if img is None:
            raise FileNotFoundError(f"cannot read image: {path}")
        return img

    @staticmethod
    def _rescale_rect(rect, scale):
        """Map an (x, y, w, h) rectangle from a scaled image back to full size."""
        return tuple(int(round(value / scale)) for value in rect)

    def compare_images(self, img1_path, img2_path):
        """Check whether two images are similar by histogram correlation.

        Args:
            img1_path: Path of the first image.
            img2_path: Path of the second image.

        Returns:
            ``(is_same, similarity)``; ``(False, 0)`` if either image cannot
            be read.
        """
        img1 = cv2.imread(str(img1_path))
        img2 = cv2.imread(str(img2_path))
        if img1 is None or img2 is None:
            return False, 0

        # Bring the second image to the size of the first
        img2 = cv2.resize(img2, (img1.shape[1], img1.shape[0]))

        # Histogram of channel 0 only, 256 bins
        hist1 = cv2.calcHist([img1], [0], None, [256], [0, 256])
        hist2 = cv2.calcHist([img2], [0], None, [256], [0, 256])
        similarity = cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
        return similarity >= self.threshold, similarity

    def find_differences(self, img1_path, img2_path, output_path=None):
        """Mark the regions in which two images differ.

        Args:
            img1_path: Path of the reference image.
            img2_path: Path of the image to compare; resized to the reference.
            output_path: If given, the annotated image is written there.

        Returns:
            ``(result, count)``: a copy of the first image with red
            rectangles around the differing regions, and the number of
            rectangles drawn.

        Raises:
            FileNotFoundError: If either image cannot be read.
        """
        img1 = self._read_image(img1_path)
        img2 = self._read_image(img2_path)
        img2 = cv2.resize(img2, (img1.shape[1], img1.shape[0]))

        # Per-pixel absolute difference, reduced to one channel
        diff = cv2.absdiff(img1, img2)
        gray = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)

        # Keep only differences above 30
        _, thresh = cv2.threshold(gray, 30, 255, cv2.THRESH_BINARY)

        # Dilate so neighbouring difference pixels merge into one region
        kernel = np.ones((5, 5), np.uint8)
        dilated = cv2.dilate(thresh, kernel, iterations=2)

        contours, _ = cv2.findContours(
            dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )

        # Mark every region that is not small noise; count only those so the
        # returned number matches the rectangles actually drawn
        result = img1.copy()
        marked = 0
        for contour in contours:
            if cv2.contourArea(contour) > 100:
                x, y, w, h = cv2.boundingRect(contour)
                cv2.rectangle(result, (x, y), (x + w, y + h), (0, 0, 255), 2)
                marked += 1

        if output_path:
            cv2.imwrite(str(output_path), result)
        return result, marked

    def detect_changes(self, baseline_path, current_path, min_area=500):
        """Find the regions that changed between a baseline and a current image.

        Args:
            baseline_path: Path of the baseline image.
            current_path: Path of the current image; resized to the baseline
                size when the sizes differ.
            min_area: Minimum contour area for a change to be reported,
                measured on the half-size working image.

        Returns:
            A list of ``(x, y, w, h)`` rectangles in baseline-image pixel
            coordinates.

        Raises:
            FileNotFoundError: If either image cannot be read.
        """
        baseline = self._read_image(baseline_path)
        current = self._read_image(current_path)

        # absdiff needs equally sized inputs
        if current.shape[:2] != baseline.shape[:2]:
            current = cv2.resize(
                current, (baseline.shape[1], baseline.shape[0])
            )

        # Work at half resolution for speed
        scale = 0.5
        baseline = cv2.resize(baseline, None, fx=scale, fy=scale)
        current = cv2.resize(current, None, fx=scale, fy=scale)

        diff = cv2.absdiff(baseline, current)
        gray = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)

        thresh = cv2.threshold(gray, 25, 255, cv2.THRESH_BINARY)[1]
        thresh = cv2.dilate(thresh, None, iterations=3)
        contours, _ = cv2.findContours(
            thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )

        # Drop small changes and map the rectangles back to full-size
        # coordinates, since callers never see the internal scale factor
        significant_changes = [
            self._rescale_rect(cv2.boundingRect(c), scale)
            for c in contours
            if cv2.contourArea(c) > min_area
        ]
        return significant_changes
# Comparator with the default similarity threshold
comparator = ScreenComparator()

# Histogram-based similarity of two screenshots
is_same, similarity = comparator.compare_images(
    img1_path='before.png',
    img2_path='after.png',
)
print(f"相似度: {similarity:.2%}")

# Regions that changed relative to the baseline
changes = comparator.detect_changes(
    baseline_path='baseline.png',
    current_path='current.png',
)
print(f"发现 {len(changes)} 个变化区域")
4.2 自动化测试截图验证
import cv2
import numpy as np
from PIL import ImageGrab
import time
class UITestHelper:
    """Helpers for image-driven UI automation: capture, search, wait, click."""

    @staticmethod
    def screenshot(region=None):
        """Capture the screen as a BGR image array.

        Args:
            region: Optional ``(x, y, w, h)`` rectangle; the whole screen is
                captured when omitted.

        Returns:
            The captured image in OpenCV's BGR channel order.
        """
        if region:
            x, y, w, h = region
            img = ImageGrab.grab(bbox=(x, y, x + w, y + h))
        else:
            img = ImageGrab.grab()
        # NOTE(review): grab() may return an image with an alpha channel on
        # some platforms - confirm. Normalise to RGB so the result always has
        # three channels, matching templates loaded by cv2.imread.
        return cv2.cvtColor(np.array(img.convert('RGB')), cv2.COLOR_RGB2BGR)

    @staticmethod
    def find_image_on_screen(target_path, threshold=0.8, region=None):
        """Locate a template image on the screen.

        Args:
            target_path: Path of the template image.
            threshold: Minimum normalised match score to accept.
            region: Optional ``(x, y, w, h)`` area to search in.

        Returns:
            A dict with 'x', 'y' (top-left corner in screen coordinates),
            'width', 'height' and 'confidence', or None when the template
            cannot be read, does not fit, or is not found.
        """
        screenshot = UITestHelper.screenshot(region)
        target = cv2.imread(str(target_path))
        if target is None:
            return None

        h, w = target.shape[:2]
        # matchTemplate raises when the template is larger than the image
        if h > screenshot.shape[0] or w > screenshot.shape[1]:
            return None

        result = cv2.matchTemplate(screenshot, target, cv2.TM_CCOEFF_NORMED)
        _, max_val, _, max_loc = cv2.minMaxLoc(result)
        if max_val < threshold:
            return None

        x, y = max_loc
        if region:
            # The match position is relative to the captured region; shift it
            # so callers (e.g. click_on_image) get real screen coordinates
            x += region[0]
            y += region[1]
        return {'x': x, 'y': y, 'width': w, 'height': h, 'confidence': max_val}

    @staticmethod
    def wait_for_image(target_path, timeout=10, interval=0.5, region=None):
        """Poll the screen until a template appears.

        Args:
            target_path: Path of the template image.
            timeout: Maximum number of seconds to wait.
            interval: Seconds to sleep between attempts.
            region: Optional ``(x, y, w, h)`` area to search in.

        Returns:
            The match dict from find_image_on_screen, or None on timeout.
        """
        start_time = time.time()
        while time.time() - start_time < timeout:
            result = UITestHelper.find_image_on_screen(target_path, region=region)
            if result:
                return result
            time.sleep(interval)
        return None

    @staticmethod
    def _match_center(match, offset=(0, 0)):
        """Return the centre point of a match dict, shifted by ``offset``."""
        x = match['x'] + match['width'] // 2 + offset[0]
        y = match['y'] + match['height'] // 2 + offset[1]
        return x, y

    @staticmethod
    def click_on_image(target_path, offset=(0, 0), region=None):
        """Wait for a template and click its centre.

        Args:
            target_path: Path of the template image.
            offset: ``(dx, dy)`` added to the centre before clicking.
            region: Optional ``(x, y, w, h)`` area to search in.

        Returns:
            True if the template was found and clicked, otherwise False.
        """
        import pyautogui

        result = UITestHelper.wait_for_image(target_path, region=region)
        if not result:
            return False
        x, y = UITestHelper._match_center(result, offset)
        pyautogui.click(x, y)
        return True

    @staticmethod
    def wait_for_change(initial_path, timeout=30, region=None):
        """Wait until the screen content changes noticeably.

        Each capture is compared with the previous one, taken 0.5 s earlier.

        Args:
            initial_path: Not used by this method; kept so existing calls
                keep working.
            timeout: Maximum number of seconds to wait.
            region: Optional ``(x, y, w, h)`` area to watch.

        Returns:
            True as soon as a change is detected, False on timeout.
        """
        initial = UITestHelper.screenshot(region)
        start_time = time.time()
        while time.time() - start_time < timeout:
            current = UITestHelper.screenshot(region)
            diff = cv2.absdiff(initial, current)
            # sumElems returns one sum per channel; add the three colour
            # channels so a change in any of them is noticed
            if sum(cv2.sumElems(diff)[:3]) > 10000:
                return True
            time.sleep(0.5)
            initial = current
        return False
4.3 文档扫描和矫正
import cv2
import numpy as np
class DocumentScanner:
    """Find a four-cornered document in a photo and straighten it."""

    def __init__(self):
        """Initialise the scanner."""
        self.ratio = 500 / 800  # scale ratio (not read by the methods below)

    def order_points(self, pts):
        """Order four corner points as top-left, top-right, bottom-right, bottom-left.

        Args:
            pts: Array of four ``(x, y)`` points in any order.

        Returns:
            A ``(4, 2)`` float32 array in the order described above.
        """
        rect = np.zeros((4, 2), dtype='float32')

        # x + y is smallest at the top-left and largest at the bottom-right
        s = pts.sum(axis=1)
        rect[0] = pts[np.argmin(s)]
        rect[2] = pts[np.argmax(s)]

        # y - x is smallest at the top-right and largest at the bottom-left
        diff = np.diff(pts, axis=1)
        rect[1] = pts[np.argmin(diff)]
        rect[3] = pts[np.argmax(diff)]
        return rect

    def perspective_transform(self, image, pts):
        """Warp the quadrilateral given by ``pts`` to an upright rectangle.

        Args:
            image: Source image.
            pts: Four corner points of the region, in any order.

        Returns:
            The warped image; its size is taken from the longer of each pair
            of opposite edges.
        """
        rect = self.order_points(pts)
        (tl, tr, br, bl) = rect

        # Output width: the longer of the bottom and top edges
        width_a = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
        width_b = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
        max_width = max(int(width_a), int(width_b))

        # Output height: the longer of the right and left edges
        height_a = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
        height_b = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
        max_height = max(int(height_a), int(height_b))

        dst = np.array([
            [0, 0],
            [max_width - 1, 0],
            [max_width - 1, max_height - 1],
            [0, max_height - 1],
        ], dtype='float32')

        M = cv2.getPerspectiveTransform(rect, dst)
        return cv2.warpPerspective(image, M, (max_width, max_height))

    def scan_document(self, image_path, output_path=None):
        """Detect a document in an image file and return it straightened.

        Args:
            image_path: Path of the photo.
            output_path: If given and a document outline is found, the
                straightened image is written there.

        Returns:
            The straightened image, or the unmodified original when no
            four-cornered outline is found.

        Raises:
            FileNotFoundError: If the image cannot be read.
        """
        image = cv2.imread(str(image_path))
        if image is None:
            raise FileNotFoundError(f"cannot read image: {image_path}")
        orig = image.copy()

        # The working copy is resized to a WIDTH of 500 (cv2.resize takes the
        # size as (width, height)), so the factor that maps detected points
        # back to the original must be based on the width, not the height.
        work_width = 500
        ratio = image.shape[1] / float(work_width)
        work_height = int(image.shape[0] * work_width / image.shape[1])
        image = cv2.resize(image, (work_width, work_height))

        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        gray = cv2.GaussianBlur(gray, (5, 5), 0)
        edged = cv2.Canny(gray, 75, 200)

        # Examine the five largest contours, biggest first
        contours, _ = cv2.findContours(
            edged, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
        )
        contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]

        screen_cnt = None
        for c in contours:
            peri = cv2.arcLength(c, True)
            approx = cv2.approxPolyDP(c, 0.02 * peri, True)
            if len(approx) == 4:  # first outline that reduces to 4 corners
                screen_cnt = approx
                break

        if screen_cnt is None:
            return orig

        # Scale the corners to original-image coordinates and warp
        warped = self.perspective_transform(
            orig, screen_cnt.reshape(4, 2) * ratio
        )
        if output_path:
            cv2.imwrite(str(output_path), warped)
        return warped
# Straighten a photographed receipt and write the result to disk
scanner = DocumentScanner()
scanner.scan_document(
    'receipt.jpg',
    output_path='receipt_scanned.jpg',
)
五、实战案例:自动化票据识别系统
class ReceiptRecognitionSystem:
    """Receipt pipeline: straighten the photo, run OCR, pick out key fields."""

    def __init__(self):
        """Create the OCR processor and document scanner used by the pipeline."""
        self.ocr = OCRProcessor()
        self.scanner = DocumentScanner()

    def process_receipt(self, image_path):
        """Run the full pipeline on one receipt image.

        Args:
            image_path: Path of the receipt photo.

        Returns:
            The dict produced by parse_receipt.
        """
        import os
        import tempfile

        import cv2

        # 1. Straighten the document
        scanned = self.scanner.scan_document(image_path)

        # 2. Pre-process for OCR
        processed = self.ocr.preprocess_for_ocr(scanned)

        # 3. OCR. extract_text_detailed takes a file path, so the processed
        #    array is handed over through a temporary lossless PNG.
        fd, tmp_path = tempfile.mkstemp(suffix='.png')
        os.close(fd)
        try:
            if not cv2.imwrite(tmp_path, processed):
                raise OSError(f"cannot write temporary image: {tmp_path}")
            text_data = self.ocr.extract_text_detailed(tmp_path)
        finally:
            os.remove(tmp_path)

        # 4. Extract the key fields
        return self.parse_receipt(text_data)

    def parse_receipt(self, text_data):
        """Extract total, date and amounts from OCR word entries.

        Args:
            text_data: Iterable of dicts that each contain a 'text' key.

        Returns:
            A dict with:
                'items': every amount-like token, in reading order,
                'total': the first amount following a token that contains
                    "total"; the label text itself if no amount follows;
                    None if there is no such token,
                'date': the last token of at most 12 characters containing '/',
                'merchant': always None (not detected).
        """
        receipt = {
            'items': [],
            'total': None,
            'date': None,
            'merchant': None,
        }
        awaiting_total = False  # a "total" label was seen, amount still pending

        for item in text_data:
            text = item['text']
            amount = self.is_amount(text)

            # Total: OCR entries are single tokens, so the label and the
            # amount arrive separately; take the amount that follows the label
            if 'total' in text.lower():
                receipt['total'] = text
                awaiting_total = True
            elif amount and awaiting_total:
                receipt['total'] = text
                awaiting_total = False

            # Date (simple heuristic)
            if '/' in text and len(text) <= 12:
                receipt['date'] = text

            # Amounts
            if amount:
                receipt['items'].append(text)
        return receipt

    @staticmethod
    def is_amount(text):
        """Return True if ``text`` looks like a money amount.

        Accepts an optional currency sign ($, ¥ or ￥), digits with optional
        thousands separators, and exactly two decimals.
        """
        import re

        pattern = r'[$¥￥]?(?:\d{1,3}(?:,\d{3})+|\d+)\.\d{2}'
        return bool(re.fullmatch(pattern, text))

    def batch_process(self, image_folder, output_folder):
        """Process every ``*.jpg`` in a folder and save one JSON file per image.

        Images that fail are reported on stdout and skipped.

        Args:
            image_folder: Folder containing the receipt photos.
            output_folder: Folder for the ``<name>_result.json`` files;
                created (including parents) if missing.

        Returns:
            The list of results that were parsed successfully.
        """
        import json
        from pathlib import Path

        input_dir = Path(image_folder)
        output_dir = Path(output_folder)
        output_dir.mkdir(parents=True, exist_ok=True)

        results = []
        # sorted() gives a stable processing order; glob order is arbitrary
        for img_path in sorted(input_dir.glob('*.jpg')):
            try:
                result = self.process_receipt(img_path)
                results.append(result)

                # Save the result
                output_file = output_dir / f"{img_path.stem}_result.json"
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(result, f, ensure_ascii=False, indent=2)
            except Exception as e:  # best effort: one bad image must not stop the batch
                print(f"处理失败 {img_path}: {e}")
        return results
总结
Python图像识别与OpenCV为自动化脚本带来无限可能:
- 图像预处理:灰度化、二值化、去噪、锐化
- 边缘检测和轮廓:找形状、定位物体
- OCR识别:从图片中提取文字
- 截图对比:UI自动化测试、变更检测
- 文档扫描:透视矫正、自动扫描
关键应用场景:
- 验证码识别和自动填写
- 票据/发票识别和录入
- UI自动化测试
- 监控画面变化检测
- 文档数字化处理
掌握这些技术,让你的自动化脚本"看得见"!
更多推荐
所有评论(0)