diff --git a/photo_review/auto_photo_review.py b/photo_review/auto_photo_review.py index 89cbd2f..a0ee475 100644 --- a/photo_review/auto_photo_review.py +++ b/photo_review/auto_photo_review.py @@ -1,6 +1,7 @@ import json import logging import os +import re import shutil import time import uuid @@ -8,7 +9,6 @@ from collections import defaultdict from time import sleep import cv2 -import fitz import jieba import numpy as np import zxingcpp @@ -21,44 +21,48 @@ from log import HOSTNAME from photo_review import PHHD_BATCH_SIZE, SLEEP_MINUTES, HOSPITAL_ALIAS, HOSPITAL_FILTER, DEPARTMENT_ALIAS, \ DEPARTMENT_FILTER, DISCHARGE_KEY, set_batch_id, get_batch_id from services.paddle_services import IE_KEY -from ucloud import ufile +from ucloud import ufile, BUCKET from util import image_util, common_util, html_util, model_util from util.data_util import handle_date, handle_decimal, parse_department, handle_name, handle_insurance_type, \ handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, parse_hospital, \ parse_page_num, handle_tiny_int -# 尝试从二维码中获取高清图片 -def get_better_image_from_qrcode(img_path, image_id, dpi=150): +def parse_qrcode(img_path, image_id): + """ + 解析二维码,尝试从中获取高清图片 + :param img_path: 待解析图片 + :param image_id: 图片id + :return: 解析结果 + """ + def _parse_pdf_url(pdf_url_to_parse): - pdf_file = None local_pdf_path = None + img_name, img_ext = common_util.parse_save_path(img_path) try: local_pdf_path = html_util.download_pdf(pdf_url_to_parse) - # 打开PDF文件 - pdf_file = fitz.open(local_pdf_path) - # 选择第一页 - page = pdf_file[0] - # 定义缩放系数(DPI) - default_dpi = 72 - zoom = dpi / default_dpi - # 设置矩阵变换参数 - mat = fitz.Matrix(zoom, zoom) - # 渲染页面 - pix = page.get_pixmap(matrix=mat) - # 将渲染结果转换为OpenCV兼容的格式 - img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, -1)) - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - img_name, img_ext = common_util.parse_save_path(img_path) - better_img_path = common_util.get_processed_img_path(f'{img_name}.better.{img_ext}') - cv2.imwrite(better_img_path, img) - return better_img_path, page.get_text() + pdf_imgs = image_util.pdf_to_imgs(local_pdf_path) + # 结算单部分 + better_settlement_path = common_util.get_processed_img_path(f'{img_name}.better_settlement.jpg') + cv2.imwrite(better_settlement_path, pdf_imgs[0][0]) + # 费用清单部分 + better_cost_path = common_util.get_processed_img_path(f'{img_name}.better_cost.jpg') + total_height = sum([p[0].shape[0] for p in pdf_imgs[1:]]) + common_width = pdf_imgs[1][0].shape[1] + better_cost_img = np.zeros((total_height, common_width, 3), dtype=np.uint8) + current_y = 0 + for pdf in pdf_imgs[1:]: + height = pdf[0].shape[0] + better_cost_img[current_y:current_y + height, :, :] = pdf[0] + current_y += height + # cost_text += pdf[1] # 费用清单文本暂时没用到 + cv2.imwrite(better_cost_path, better_cost_img) + + return better_settlement_path, pdf_imgs[0][1], better_cost_path except Exception as ex: logging.getLogger('error').error('解析pdf失败!', exc_info=ex) - return None, None + return None, None, None finally: - if pdf_file: - pdf_file.close() if local_pdf_path: common_util.delete_temp_file(local_pdf_path) @@ -99,7 +103,7 @@ def get_better_image_from_qrcode(img_path, image_id, dpi=150): logging.getLogger('error').error('从二维码中获取高清图片时出错', exc_info=e) continue - return None, None + return None, None, None # 关键信息提取 @@ -110,61 +114,48 @@ def information_extraction(phrec, pk_phhd): :param pk_phhd:案子主键 :return:记录类型,信息抽取结果 """ - original_img_path = common_util.get_img_path(phrec.cfjaddress) - if not original_img_path: - img_url = ufile.get_private_url(phrec.cfjaddress) - if not img_url: - return None, None, None - original_img_path = common_util.save_to_local(img_url) img_path = common_util.get_processed_img_path(phrec.cfjaddress) - shutil.copy2(original_img_path, img_path) + if image_util.is_photo(img_path): + book_img_path = model_util.det_book(img_path) # 识别文档区域并裁剪 + dewarped_img_path = model_util.dewarp(book_img_path) # 去扭曲 + else: # todo:也可能是图片,后续添加细分逻辑 + dewarped_img_path = img_path + angles = model_util.clas_orientation(dewarped_img_path) + ocr_text = '' + info_extract = [] + rec_type = None + for angle in angles: + ocr_result = [] + rotated_img = image_util.rotate(dewarped_img_path, int(angle)) + split_results = image_util.split(rotated_img) + for split_result in split_results: + if split_result['img'] is None: + continue + a4_img = image_util.expand_to_a4_size(split_result['img']) + ocr_result += model_util.ocr(a4_img) + tmp_ocr_text = common_util.ocr_result_to_text(ocr_result) - # 尝试从二维码中获取高清图片 - better_img_path, text = get_better_image_from_qrcode(img_path, phrec.cfjaddress) - if phrec.cRectype != '1': - better_img_path = None # 非结算单暂时不进行替换 - if better_img_path is not None: - rec_type = '基本医保结算单' - if text: - info_extract = model_util.ie_settlement_text(text) + tmp_rec_type = model_util.clas_text(tmp_ocr_text) if ocr_text else None + if not tmp_rec_type: + rec_dict = { + '1': '基本医保结算单', + '3': '出院记录', + '4': '费用清单', + } + tmp_rec_type = rec_dict.get(phrec.cRectype) + if tmp_rec_type == '基本医保结算单': + tmp_info_extract = model_util.ie_settlement(rotated_img, common_util.ocr_result_to_layout(ocr_result)) + elif tmp_rec_type == '出院记录': + tmp_info_extract = model_util.ie_discharge(rotated_img, common_util.ocr_result_to_layout(ocr_result)) + elif tmp_rec_type == '费用清单': + tmp_info_extract = model_util.ie_cost(rotated_img, common_util.ocr_result_to_layout(ocr_result)) else: - info_extract = model_util.ie_settlement( - better_img_path, common_util.ocr_result_to_layout(model_util.ocr(better_img_path)) - ) - ocr_text = None # 此处肯定不是出院记录,后续用不到 - else: - if image_util.is_photo(img_path): - book_img_path = model_util.det_book(img_path) # 识别文档区域并裁剪 - dewarped_img_path = model_util.dewarp(book_img_path) # 去扭曲 - else: # todo:也可能是图片,后续添加细分逻辑 - dewarped_img_path = img_path - angles = model_util.clas_orientation(dewarped_img_path) - ocr_text = '' - info_extract = [] - for angle in angles: - ocr_result = [] - rotated_img = image_util.rotate(dewarped_img_path, int(angle)) - split_results = image_util.split(rotated_img) - for split_result in split_results: - if split_result['img'] is None: - continue - a4_img = image_util.expand_to_a4_size(split_result['img']) - ocr_result += model_util.ocr(a4_img) - tmp_ocr_text = common_util.ocr_result_to_text(ocr_result) + tmp_info_extract = [] - rec_type = model_util.clas_text(tmp_ocr_text) if ocr_text else None - if rec_type == '基本医保结算单': - tmp_info_extract = model_util.ie_settlement(rotated_img, common_util.ocr_result_to_layout(ocr_result)) - elif rec_type == '出院记录': - tmp_info_extract = model_util.ie_discharge(rotated_img, common_util.ocr_result_to_layout(ocr_result)) - elif rec_type == '费用清单': - tmp_info_extract = model_util.ie_cost(rotated_img, common_util.ocr_result_to_layout(ocr_result)) - else: - tmp_info_extract = None - - if len(tmp_info_extract) > len(info_extract): - info_extract = tmp_info_extract - ocr_text = tmp_ocr_text + if len(tmp_info_extract) > len(info_extract): + info_extract = tmp_info_extract + ocr_text = tmp_ocr_text + rec_type = tmp_rec_type if info_extract: result_json = json.dumps(info_extract, ensure_ascii=False) @@ -425,6 +416,29 @@ def cost_task(pk_phhd, cost_list_ie_result): return cost_data +def parse_pdf_text(settlement_text): + pattern = (r'(?:交款人:(.*?)\n|住院时间:(.*?)至(.*?)\n|\(小写\)(.*?)\n|个人现金支付:(.*?)\n|个人账户支付:(.*?)\n' + r'|个人自费:(.*?)\n|医保类型:(.*?)\n|住院科别:(.*?)\n|住院号:(.*?)\n|票据号码:(.*?)\n|)') + # 查找所有匹配项 + matches = re.findall(pattern, settlement_text) + results = {} + keys = ['患者姓名', '入院日期', '出院日期', '费用总额', '个人现金支付', '个人账户支付', '个人自费', '医保类型', + '科室', '住院号', '医保结算单号码'] + + for match in matches: + for key, value in zip(keys, match): + if value: + results[key] = [[{'text': value, 'probability': 1}]] + settlement_key = ['患者姓名', '入院日期', '出院日期', '费用总额', '个人现金支付', '个人账户支付', '个人自费', + '医保类型', '住院号', '医保结算单号码'] + discharge_key = ['科室', '患者姓名', '入院日期', '出院日期', '住院号'] + cost_key = ['患者姓名', '入院日期', '出院日期', '费用总额'] + settlement_result = {key: results[key] for key in settlement_key if key in results} + discharge_result = {key: results[key] for key in discharge_key if key in results} + cost_result = {key: results[key] for key in cost_key if key in results} + return settlement_result, discharge_result, cost_result + + def photo_review(pk_phhd, name): """ 处理单个报销案子 @@ -438,7 +452,7 @@ def photo_review(pk_phhd, name): session = MysqlSession() phrecs = session.query(ZxPhrec.pk_phrec, ZxPhrec.cRectype, ZxPhrec.cfjaddress).filter( ZxPhrec.pk_phhd == pk_phhd - ).all() + ).order_by(ZxPhrec.cRectype).all() session.close() # 同一批图的标识 @@ -446,25 +460,76 @@ def photo_review(pk_phhd, name): processed_img_dir = common_util.get_processed_img_path('') os.makedirs(processed_img_dir, exist_ok=True) - discharge_text = '' + has_pdf = False # 是否获取到了pdf,获取到可以直接利用pdf更快的获取信息 + better_settlement_path = None + better_cost_path = None + settlement_text = '' for phrec in phrecs: - rec_type, ie_result, ocr_text = information_extraction(phrec, pk_phhd) - if rec_type == '基本医保结算单': - rec_result = settlement_result - elif rec_type == '出院记录': - rec_result = discharge_result - discharge_text += ocr_text - elif rec_type == '费用清单': - rec_result = cost_result - else: - rec_result = None - if rec_result is not None: - for key, value in ie_result.items(): - rec_result[key].append(value) + original_img_path = common_util.get_img_path(phrec.cfjaddress) + if not original_img_path: + img_url = ufile.get_private_url(phrec.cfjaddress) + if not img_url: + continue + original_img_path = common_util.save_to_local(img_url) + img_path = common_util.get_processed_img_path(phrec.cfjaddress) + shutil.copy2(original_img_path, img_path) + # 尝试从二维码中获取高清图片 + better_settlement_path, settlement_text, better_cost_path = parse_qrcode(img_path, phrec.cfjaddress) + if better_settlement_path: + has_pdf = True + break - # 删除多余图片 - if os.path.exists(processed_img_dir) and os.path.isdir(processed_img_dir): - shutil.rmtree(processed_img_dir) + discharge_text = '' + if has_pdf: + settlement_result, discharge_result, cost_result = parse_pdf_text(settlement_text) + discharge_result = defaultdict(list, discharge_result) + + is_settlement_updated = False + is_cost_updated = False + for phrec in phrecs: + if phrec.cRectype == '1': + if not is_settlement_updated: + try: + ufile.copy_file(BUCKET, phrec.cfjaddress, "drg2015", phrec.cfjaddress) + ufile.upload_file(phrec.cfjaddress, better_settlement_path) + except Exception as e: + logging.error("更新结算单pdf图片出错", exc_info=e) + finally: + is_settlement_updated = True + elif phrec.cRectype == '3': + rec_type, ie_result, ocr_text = information_extraction(phrec, pk_phhd) + if rec_type == '出院记录': + discharge_text += ocr_text + for key, value in ie_result.items(): + discharge_result[key].append(value) + elif phrec.cRectype == '4': + if not is_cost_updated: + try: + ufile.copy_file(BUCKET, phrec.cfjaddress, "drg2015", phrec.cfjaddress) + ufile.upload_file(phrec.cfjaddress, better_cost_path) + except Exception as e: + logging.error("更新费用清单pdf图片出错", exc_info=e) + finally: + is_cost_updated = True + else: + for phrec in phrecs: + rec_type, ie_result, ocr_text = information_extraction(phrec, pk_phhd) + if rec_type == '基本医保结算单': + rec_result = settlement_result + elif rec_type == '出院记录': + rec_result = discharge_result + discharge_text += ocr_text + elif rec_type == '费用清单': + rec_result = cost_result + else: + rec_result = None + if rec_result is not None: + for key, value in ie_result.items(): + rec_result[key].append(value) + + # 删除多余图片 + if os.path.exists(processed_img_dir) and os.path.isdir(processed_img_dir): + shutil.rmtree(processed_img_dir) settlement_data = settlement_task(pk_phhd, settlement_result) discharge_data = discharge_task(pk_phhd, discharge_result) @@ -560,8 +625,9 @@ def photo_review(pk_phhd, name): else: review_result['discharge_date_match'] = '0' - review_result['consistency'] = (review_result['name_match'] == '1' and review_result['admission_date_match'] == '1' - and review_result['discharge_date_match'] == '1') + review_result['consistency'] = ( + review_result['name_match'] == '1' and review_result['admission_date_match'] == '1' + and review_result['discharge_date_match'] == '1') review_result['success'] = review_result['integrity'] and review_result['consistency'] save_or_update_ie(ZxIeReview, pk_phhd, review_result) diff --git a/util/image_util.py b/util/image_util.py index 3eee353..4388dc1 100644 --- a/util/image_util.py +++ b/util/image_util.py @@ -1,6 +1,8 @@ +import logging import math import cv2 +import fitz import numpy from PIL import Image @@ -27,7 +29,7 @@ def capture(image, rectangle): def split(img_path, ratio=1.414, overlap=0.05, x_compensation=3): """ 分割图片 - :param img_path:图片路径 + :param img_path: 图片路径 :param ratio: 分割后的比例 :param overlap: 图片之间的覆盖比例 :param x_compensation: 横向补偿倍率 @@ -251,3 +253,33 @@ def is_photo(img_path): if any(tag in exif for tag in (271, 272)): return True return False + + +def pdf_to_imgs(pdf_path, dpi=150): + pdf_file = None + # 定义缩放系数(DPI) + default_dpi = 72 + zoom = dpi / default_dpi + try: + # 打开PDF文件 + pdf_file = fitz.open(pdf_path) + + pdf_imgs = [] + for page in pdf_file: + # 设置矩阵变换参数 + mat = fitz.Matrix(zoom, zoom) + # 渲染页面 + pix = page.get_pixmap(matrix=mat) + # 将渲染结果转换为OpenCV兼容的格式 + img = numpy.frombuffer(pix.samples, dtype=numpy.uint8).reshape((pix.height, pix.width, -1)) + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + pdf_imgs.append([img, page.get_text()]) + + return pdf_imgs + + except Exception as ex: + logging.getLogger('error').error('pdf转图片失败!', exc_info=ex) + return None + finally: + if pdf_file: + pdf_file.close()