优化图片分类和图片方向判断

This commit is contained in:
2024-10-16 17:01:56 +08:00
parent 947b4f20f3
commit d374e0743a
2 changed files with 197 additions and 99 deletions

View File

@@ -1,6 +1,7 @@
import json import json
import logging import logging
import os import os
import re
import shutil import shutil
import time import time
import uuid import uuid
@@ -8,7 +9,6 @@ from collections import defaultdict
from time import sleep from time import sleep
import cv2 import cv2
import fitz
import jieba import jieba
import numpy as np import numpy as np
import zxingcpp import zxingcpp
@@ -21,44 +21,48 @@ from log import HOSTNAME
from photo_review import PHHD_BATCH_SIZE, SLEEP_MINUTES, HOSPITAL_ALIAS, HOSPITAL_FILTER, DEPARTMENT_ALIAS, \ from photo_review import PHHD_BATCH_SIZE, SLEEP_MINUTES, HOSPITAL_ALIAS, HOSPITAL_FILTER, DEPARTMENT_ALIAS, \
DEPARTMENT_FILTER, DISCHARGE_KEY, set_batch_id, get_batch_id DEPARTMENT_FILTER, DISCHARGE_KEY, set_batch_id, get_batch_id
from services.paddle_services import IE_KEY from services.paddle_services import IE_KEY
from ucloud import ufile from ucloud import ufile, BUCKET
from util import image_util, common_util, html_util, model_util from util import image_util, common_util, html_util, model_util
from util.data_util import handle_date, handle_decimal, parse_department, handle_name, handle_insurance_type, \ from util.data_util import handle_date, handle_decimal, parse_department, handle_name, handle_insurance_type, \
handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, parse_hospital, \ handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, parse_hospital, \
parse_page_num, handle_tiny_int parse_page_num, handle_tiny_int
# 尝试从二维码中获取高清图片 def parse_qrcode(img_path, image_id):
def get_better_image_from_qrcode(img_path, image_id, dpi=150): """
解析二维码,尝试从中获取高清图片
:param img_path: 待解析图片
:param image_id: 图片id
:return: 解析结果
"""
def _parse_pdf_url(pdf_url_to_parse): def _parse_pdf_url(pdf_url_to_parse):
pdf_file = None
local_pdf_path = None local_pdf_path = None
img_name, img_ext = common_util.parse_save_path(img_path)
try: try:
local_pdf_path = html_util.download_pdf(pdf_url_to_parse) local_pdf_path = html_util.download_pdf(pdf_url_to_parse)
# 打开PDF文件 pdf_imgs = image_util.pdf_to_imgs(local_pdf_path)
pdf_file = fitz.open(local_pdf_path) # 结算单部分
# 选择第一页 better_settlement_path = common_util.get_processed_img_path(f'{img_name}.better_settlement.jpg')
page = pdf_file[0] cv2.imwrite(better_settlement_path, pdf_imgs[0][0])
# 定义缩放系数DPI # 费用清单部分
default_dpi = 72 better_cost_path = common_util.get_processed_img_path(f'{img_name}.better_cost.jpg')
zoom = dpi / default_dpi total_height = sum([p[0].shape[0] for p in pdf_imgs[1:]])
# 设置矩阵变换参数 common_width = pdf_imgs[1][0].shape[1]
mat = fitz.Matrix(zoom, zoom) better_cost_img = np.zeros((total_height, common_width, 3), dtype=np.uint8)
# 渲染页面 current_y = 0
pix = page.get_pixmap(matrix=mat) for pdf in pdf_imgs[1:]:
# 将渲染结果转换为OpenCV兼容的格式 height = pdf[0].shape[0]
img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, -1)) better_cost_img[current_y:current_y + height, :, :] = pdf[0]
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) current_y += height
img_name, img_ext = common_util.parse_save_path(img_path) # cost_text += pdf[1] # 费用清单文本暂时没用到
better_img_path = common_util.get_processed_img_path(f'{img_name}.better.{img_ext}') cv2.imwrite(better_cost_path, better_cost_img)
cv2.imwrite(better_img_path, img)
return better_img_path, page.get_text() return better_settlement_path, pdf_imgs[0][1], better_cost_path
except Exception as ex: except Exception as ex:
logging.getLogger('error').error('解析pdf失败', exc_info=ex) logging.getLogger('error').error('解析pdf失败', exc_info=ex)
return None, None return None, None, None
finally: finally:
if pdf_file:
pdf_file.close()
if local_pdf_path: if local_pdf_path:
common_util.delete_temp_file(local_pdf_path) common_util.delete_temp_file(local_pdf_path)
@@ -99,7 +103,7 @@ def get_better_image_from_qrcode(img_path, image_id, dpi=150):
logging.getLogger('error').error('从二维码中获取高清图片时出错', exc_info=e) logging.getLogger('error').error('从二维码中获取高清图片时出错', exc_info=e)
continue continue
return None, None return None, None, None
# 关键信息提取 # 关键信息提取
@@ -110,61 +114,48 @@ def information_extraction(phrec, pk_phhd):
:param pk_phhd:案子主键 :param pk_phhd:案子主键
:return:记录类型,信息抽取结果 :return:记录类型,信息抽取结果
""" """
original_img_path = common_util.get_img_path(phrec.cfjaddress)
if not original_img_path:
img_url = ufile.get_private_url(phrec.cfjaddress)
if not img_url:
return None, None, None
original_img_path = common_util.save_to_local(img_url)
img_path = common_util.get_processed_img_path(phrec.cfjaddress) img_path = common_util.get_processed_img_path(phrec.cfjaddress)
shutil.copy2(original_img_path, img_path) if image_util.is_photo(img_path):
book_img_path = model_util.det_book(img_path) # 识别文档区域并裁剪
dewarped_img_path = model_util.dewarp(book_img_path) # 去扭曲
else: # todo:也可能是图片,后续添加细分逻辑
dewarped_img_path = img_path
angles = model_util.clas_orientation(dewarped_img_path)
ocr_text = ''
info_extract = []
rec_type = None
for angle in angles:
ocr_result = []
rotated_img = image_util.rotate(dewarped_img_path, int(angle))
split_results = image_util.split(rotated_img)
for split_result in split_results:
if split_result['img'] is None:
continue
a4_img = image_util.expand_to_a4_size(split_result['img'])
ocr_result += model_util.ocr(a4_img)
tmp_ocr_text = common_util.ocr_result_to_text(ocr_result)
# 尝试从二维码中获取高清图片 tmp_rec_type = model_util.clas_text(tmp_ocr_text) if ocr_text else None
better_img_path, text = get_better_image_from_qrcode(img_path, phrec.cfjaddress) if not tmp_rec_type:
if phrec.cRectype != '1': rec_dict = {
better_img_path = None # 非结算单暂时不进行替换 '1': '基本医保结算单',
if better_img_path is not None: '3': '出院记录',
rec_type = '基本医保结算单' '4': '费用清单',
if text: }
info_extract = model_util.ie_settlement_text(text) tmp_rec_type = rec_dict.get(phrec.cRectype)
if tmp_rec_type == '基本医保结算单':
tmp_info_extract = model_util.ie_settlement(rotated_img, common_util.ocr_result_to_layout(ocr_result))
elif tmp_rec_type == '出院记录':
tmp_info_extract = model_util.ie_discharge(rotated_img, common_util.ocr_result_to_layout(ocr_result))
elif tmp_rec_type == '费用清单':
tmp_info_extract = model_util.ie_cost(rotated_img, common_util.ocr_result_to_layout(ocr_result))
else: else:
info_extract = model_util.ie_settlement( tmp_info_extract = []
better_img_path, common_util.ocr_result_to_layout(model_util.ocr(better_img_path))
)
ocr_text = None # 此处肯定不是出院记录,后续用不到
else:
if image_util.is_photo(img_path):
book_img_path = model_util.det_book(img_path) # 识别文档区域并裁剪
dewarped_img_path = model_util.dewarp(book_img_path) # 去扭曲
else: # todo:也可能是图片,后续添加细分逻辑
dewarped_img_path = img_path
angles = model_util.clas_orientation(dewarped_img_path)
ocr_text = ''
info_extract = []
for angle in angles:
ocr_result = []
rotated_img = image_util.rotate(dewarped_img_path, int(angle))
split_results = image_util.split(rotated_img)
for split_result in split_results:
if split_result['img'] is None:
continue
a4_img = image_util.expand_to_a4_size(split_result['img'])
ocr_result += model_util.ocr(a4_img)
tmp_ocr_text = common_util.ocr_result_to_text(ocr_result)
rec_type = model_util.clas_text(tmp_ocr_text) if ocr_text else None if len(tmp_info_extract) > len(info_extract):
if rec_type == '基本医保结算单': info_extract = tmp_info_extract
tmp_info_extract = model_util.ie_settlement(rotated_img, common_util.ocr_result_to_layout(ocr_result)) ocr_text = tmp_ocr_text
elif rec_type == '出院记录': rec_type = tmp_rec_type
tmp_info_extract = model_util.ie_discharge(rotated_img, common_util.ocr_result_to_layout(ocr_result))
elif rec_type == '费用清单':
tmp_info_extract = model_util.ie_cost(rotated_img, common_util.ocr_result_to_layout(ocr_result))
else:
tmp_info_extract = None
if len(tmp_info_extract) > len(info_extract):
info_extract = tmp_info_extract
ocr_text = tmp_ocr_text
if info_extract: if info_extract:
result_json = json.dumps(info_extract, ensure_ascii=False) result_json = json.dumps(info_extract, ensure_ascii=False)
@@ -425,6 +416,29 @@ def cost_task(pk_phhd, cost_list_ie_result):
return cost_data return cost_data
def parse_pdf_text(settlement_text):
    """
    Pull labelled fields out of the text of a settlement PDF.

    Each supported label (e.g. ``交款人:``, ``住院号:``) is searched for in
    ``settlement_text``; the rest of the line after the label is taken as the
    value. If a label occurs more than once, the last occurrence wins.

    :param settlement_text: text to search; ``None`` or an empty string yields
        three empty dicts
    :return: tuple ``(settlement_result, discharge_result, cost_result)``.
        Each is a dict mapping a field name to ``[[{'text': value, 'probability': 1}]]``
        and only contains the fields that were actually found.
        NOTE(review): the nested list/dict shape presumably mirrors the
        information-extraction model output consumed downstream -- confirm.
    """
    if not settlement_text:
        return {}, {}, {}

    # One capture group per entry in `keys`, in the same order. `.` does not
    # match a newline, so a greedy `(.*)` captures up to the end of the line,
    # and also works for a final line that has no trailing newline.
    pattern = re.compile(
        r'交款人:(.*)'
        r'|住院时间:(.*?)至(.*)'
        r'|\(小写\)(.*)'
        r'|个人现金支付:(.*)'
        r'|个人账户支付:(.*)'
        r'|个人自费:(.*)'
        r'|医保类型:(.*)'
        r'|住院科别:(.*)'
        r'|住院号:(.*)'
        r'|票据号码:(.*)'
    )
    keys = ['患者姓名', '入院日期', '出院日期', '费用总额', '个人现金支付', '个人账户支付', '个人自费', '医保类型',
            '科室', '住院号', '医保结算单号码']

    results = {}
    for match in pattern.finditer(settlement_text):
        # Groups of alternatives that did not take part in the match are None.
        for key, value in zip(keys, match.groups(default='')):
            # Drop surrounding whitespace (including '\r' from CRLF text).
            value = value.strip()
            if value:
                results[key] = [[{'text': value, 'probability': 1}]]

    settlement_key = ['患者姓名', '入院日期', '出院日期', '费用总额', '个人现金支付', '个人账户支付', '个人自费',
                      '医保类型', '住院号', '医保结算单号码']
    discharge_key = ['科室', '患者姓名', '入院日期', '出院日期', '住院号']
    cost_key = ['患者姓名', '入院日期', '出院日期', '费用总额']

    def _pick(wanted):
        # Keep only the requested fields that were found in the text.
        return {key: results[key] for key in wanted if key in results}

    return _pick(settlement_key), _pick(discharge_key), _pick(cost_key)
def photo_review(pk_phhd, name): def photo_review(pk_phhd, name):
""" """
处理单个报销案子 处理单个报销案子
@@ -438,7 +452,7 @@ def photo_review(pk_phhd, name):
session = MysqlSession() session = MysqlSession()
phrecs = session.query(ZxPhrec.pk_phrec, ZxPhrec.cRectype, ZxPhrec.cfjaddress).filter( phrecs = session.query(ZxPhrec.pk_phrec, ZxPhrec.cRectype, ZxPhrec.cfjaddress).filter(
ZxPhrec.pk_phhd == pk_phhd ZxPhrec.pk_phhd == pk_phhd
).all() ).order_by(ZxPhrec.cRectype).all()
session.close() session.close()
# 同一批图的标识 # 同一批图的标识
@@ -446,25 +460,76 @@ def photo_review(pk_phhd, name):
processed_img_dir = common_util.get_processed_img_path('') processed_img_dir = common_util.get_processed_img_path('')
os.makedirs(processed_img_dir, exist_ok=True) os.makedirs(processed_img_dir, exist_ok=True)
discharge_text = '' has_pdf = False # 是否获取到了pdf获取到可以直接利用pdf更快的获取信息
better_settlement_path = None
better_cost_path = None
settlement_text = ''
for phrec in phrecs: for phrec in phrecs:
rec_type, ie_result, ocr_text = information_extraction(phrec, pk_phhd) original_img_path = common_util.get_img_path(phrec.cfjaddress)
if rec_type == '基本医保结算单': if not original_img_path:
rec_result = settlement_result img_url = ufile.get_private_url(phrec.cfjaddress)
elif rec_type == '出院记录': if not img_url:
rec_result = discharge_result continue
discharge_text += ocr_text original_img_path = common_util.save_to_local(img_url)
elif rec_type == '费用清单': img_path = common_util.get_processed_img_path(phrec.cfjaddress)
rec_result = cost_result shutil.copy2(original_img_path, img_path)
else: # 尝试从二维码中获取高清图片
rec_result = None better_settlement_path, settlement_text, better_cost_path = parse_qrcode(img_path, phrec.cfjaddress)
if rec_result is not None: if better_settlement_path:
for key, value in ie_result.items(): has_pdf = True
rec_result[key].append(value) break
# 删除多余图片 discharge_text = ''
if os.path.exists(processed_img_dir) and os.path.isdir(processed_img_dir): if has_pdf:
shutil.rmtree(processed_img_dir) settlement_result, discharge_result, cost_result = parse_pdf_text(settlement_text)
discharge_result = defaultdict(list, discharge_result)
is_settlement_updated = False
is_cost_updated = False
for phrec in phrecs:
if phrec.cRectype == '1':
if not is_settlement_updated:
try:
ufile.copy_file(BUCKET, phrec.cfjaddress, "drg2015", phrec.cfjaddress)
ufile.upload_file(phrec.cfjaddress, better_settlement_path)
except Exception as e:
logging.error("更新结算单pdf图片出错", exc_info=e)
finally:
is_settlement_updated = True
elif phrec.cRectype == '3':
rec_type, ie_result, ocr_text = information_extraction(phrec, pk_phhd)
if rec_type == '出院记录':
discharge_text += ocr_text
for key, value in ie_result.items():
discharge_result[key].append(value)
elif phrec.cRectype == '4':
if not is_cost_updated:
try:
ufile.copy_file(BUCKET, phrec.cfjaddress, "drg2015", phrec.cfjaddress)
ufile.upload_file(phrec.cfjaddress, better_cost_path)
except Exception as e:
logging.error("更新费用清单pdf图片出错", exc_info=e)
finally:
is_cost_updated = True
else:
for phrec in phrecs:
rec_type, ie_result, ocr_text = information_extraction(phrec, pk_phhd)
if rec_type == '基本医保结算单':
rec_result = settlement_result
elif rec_type == '出院记录':
rec_result = discharge_result
discharge_text += ocr_text
elif rec_type == '费用清单':
rec_result = cost_result
else:
rec_result = None
if rec_result is not None:
for key, value in ie_result.items():
rec_result[key].append(value)
# 删除多余图片
if os.path.exists(processed_img_dir) and os.path.isdir(processed_img_dir):
shutil.rmtree(processed_img_dir)
settlement_data = settlement_task(pk_phhd, settlement_result) settlement_data = settlement_task(pk_phhd, settlement_result)
discharge_data = discharge_task(pk_phhd, discharge_result) discharge_data = discharge_task(pk_phhd, discharge_result)
@@ -560,8 +625,9 @@ def photo_review(pk_phhd, name):
else: else:
review_result['discharge_date_match'] = '0' review_result['discharge_date_match'] = '0'
review_result['consistency'] = (review_result['name_match'] == '1' and review_result['admission_date_match'] == '1' review_result['consistency'] = (
and review_result['discharge_date_match'] == '1') review_result['name_match'] == '1' and review_result['admission_date_match'] == '1'
and review_result['discharge_date_match'] == '1')
review_result['success'] = review_result['integrity'] and review_result['consistency'] review_result['success'] = review_result['integrity'] and review_result['consistency']
save_or_update_ie(ZxIeReview, pk_phhd, review_result) save_or_update_ie(ZxIeReview, pk_phhd, review_result)

View File

@@ -1,6 +1,8 @@
import logging
import math import math
import cv2 import cv2
import fitz
import numpy import numpy
from PIL import Image from PIL import Image
@@ -27,7 +29,7 @@ def capture(image, rectangle):
def split(img_path, ratio=1.414, overlap=0.05, x_compensation=3): def split(img_path, ratio=1.414, overlap=0.05, x_compensation=3):
""" """
分割图片 分割图片
:param img_path:图片路径 :param img_path: 图片路径
:param ratio: 分割后的比例 :param ratio: 分割后的比例
:param overlap: 图片之间的覆盖比例 :param overlap: 图片之间的覆盖比例
:param x_compensation: 横向补偿倍率 :param x_compensation: 横向补偿倍率
@@ -251,3 +253,33 @@ def is_photo(img_path):
if any(tag in exif for tag in (271, 272)): if any(tag in exif for tag in (271, 272)):
return True return True
return False return False
def pdf_to_imgs(pdf_path, dpi=150):
    """
    Render every page of a PDF into an OpenCV image.

    :param pdf_path: path of the PDF file to open
    :param dpi: rendering resolution; the zoom factor is ``dpi / 72``
    :return: list with one ``[image, page_text]`` pair per page, where
        ``image`` is the rendered page converted from RGB to BGR and
        ``page_text`` is the result of ``page.get_text()``; ``None`` if
        opening or rendering fails (the error is logged to the 'error' logger)
    """
    pdf_file = None
    # Zoom factor relative to the 72 dpi base resolution
    default_dpi = 72
    zoom = dpi / default_dpi
    try:
        # The transform is identical for every page, so build it once.
        mat = fitz.Matrix(zoom, zoom)
        # Open the PDF file
        pdf_file = fitz.open(pdf_path)
        pdf_imgs = []
        for page in pdf_file:
            # Render the page
            pix = page.get_pixmap(matrix=mat)
            # Convert the rendered samples into an OpenCV-compatible array
            # NOTE(review): assumes the pixmap is 3-channel RGB (no alpha) -- confirm
            img = numpy.frombuffer(pix.samples, dtype=numpy.uint8).reshape((pix.height, pix.width, -1))
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            pdf_imgs.append([img, page.get_text()])
        return pdf_imgs
    except Exception as ex:
        logging.getLogger('error').error('pdf转图片失败', exc_info=ex)
        return None
    finally:
        # Compare against None explicitly: a document's truthiness follows its
        # page count, so a zero-page document would otherwise never be closed.
        if pdf_file is not None:
            pdf_file.close()