diff --git a/photo_review.py b/photo_review.py index 235644f..e7490c4 100644 --- a/photo_review.py +++ b/photo_review.py @@ -5,12 +5,13 @@ from time import sleep from sqlalchemy import update -from my_email.error_email import send_error_email from db import MysqlSession from db.mysql import ZxPhhd from log import LOGGING_CONFIG +from my_email.error_email import send_error_email from photo_review import auto_photo_review, SEND_ERROR_EMAIL +# 照片审核自动识别脚本入口 if __name__ == '__main__': program_name = '照片审核自动识别脚本' logging.config.dictConfig(LOGGING_CONFIG) @@ -19,7 +20,7 @@ if __name__ == '__main__': parser.add_argument('--clean', default=False, type=bool, help='是否将识别中的案子改为待识别状态') args = parser.parse_args() if args.clean: - # 主要用于启动时,清除仍在涂抹中的案子 + # 启动时清除仍在识别中的案子 session = MysqlSession() update_flag = (update(ZxPhhd).where(ZxPhhd.exsuccess_flag == '2').values(exsuccess_flag='1')) session.execute(update_flag) @@ -33,7 +34,6 @@ if __name__ == '__main__': logging.info(f'【{program_name}】开始运行') auto_photo_review.main() except Exception as e: - error_logger = logging.getLogger('error') - error_logger.error(traceback.format_exc()) + logging.getLogger('error').error(traceback.format_exc()) if SEND_ERROR_EMAIL: send_error_email(program_name, repr(e), traceback.format_exc()) diff --git a/photo_review/auto_photo_review.py b/photo_review/auto_photo_review.py index 49615cf..4c8eddc 100644 --- a/photo_review/auto_photo_review.py +++ b/photo_review/auto_photo_review.py @@ -1,7 +1,4 @@ -import json import logging -import os -import tempfile import time from collections import defaultdict from time import sleep @@ -10,72 +7,24 @@ import cv2 import fitz import jieba import numpy as np -import requests import zxingcpp from rapidfuzz import process, fuzz from sqlalchemy import update from db import MysqlSession -from db.mysql import BdYljg, BdYlks, ZxIeResult, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec, ZxIeReview +from db.mysql import BdYljg, BdYlks, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec, ZxIeReview from log import HOSTNAME from photo_review import PHHD_BATCH_SIZE, SLEEP_MINUTES, HOSPITAL_ALIAS, HOSPITAL_FILTER, DEPARTMENT_ALIAS, \ DEPARTMENT_FILTER -from services.paddle_services import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, \ - PERSONAL_CASH_PAYMENT, PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, \ - DEPARTMENT, DOCTOR, ADMISSION_ID, SETTLEMENT_ID, AGE, UPPERCASE_MEDICAL_EXPENSES +from services.paddle_services import IE_KEY from ucloud import ufile from util import image_util, common_util, html_util, model_util from util.data_util import handle_date, handle_decimal, parse_department, handle_name, handle_insurance_type, \ handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, parse_hospital -# 合并信息抽取结果 -def merge_result(result1, result2): - for key in result2: - result1[key] = result1.get(key, []) + result2[key] - return result1 - - -def ie_temp_image(ie, ocr, image): - with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file: - cv2.imwrite(temp_file.name, image) - - ie_result = [] - try: - layout = common_util.get_ocr_layout(ocr, temp_file.name) - if not layout: - # 无识别结果 - ie_result = [] - else: - ie_result = ie({"doc": temp_file.name, "layout": layout})[0] - except Exception as e: - logging.error("信息抽取时出错", exc_info=e) - finally: - try: - os.remove(temp_file.name) - except Exception as e: - logging.info(f"删除临时文件 {temp_file.name} 时出错", exc_info=e) - return ie_result - - -# 关键信息提取 -def request_ie_result(task_enum, phrecs): - url = task_enum.request_url() - identity = int(time.time()) - images = [] - for phrec in phrecs: - images.append({"name": phrec.cfjaddress, "pk": phrec.pk_phrec}) - payload = {"images": images, "schema": task_enum.schema(), "pk_phhd": phrecs[0].pk_phhd, "identity": identity} - response = requests.post(url, json=payload) - - if response.status_code == 200: - return response.json()["data"] - else: - raise Exception(f"请求信息抽取结果失败,状态码:{response.status_code}") - - # 尝试从二维码中获取高清图片 -def get_better_image_from_qrcode(image, image_id, dpi=150): +def get_better_image_from_qrcode(img_path, image_id, dpi=150): def _parse_pdf_url(pdf_url_to_parse): pdf_file = None local_pdf_path = None @@ -95,7 +44,10 @@ def get_better_image_from_qrcode(image, image_id, dpi=150): # 将渲染结果转换为OpenCV兼容的格式 img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, -1)) img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - return img, page.get_text() + img_name, img_ext = image_util.parse_save_path(img_path) + better_img_path = image_util.get_save_path(f'{img_name}.better.{img_ext}') + cv2.imwrite(better_img_path, img) + return better_img_path, page.get_text() except Exception as ex: logging.getLogger('error').error('解析pdf失败!', exc_info=ex) return None, None @@ -107,7 +59,8 @@ def get_better_image_from_qrcode(image, image_id, dpi=150): jsczt_base_url = 'http://einvoice.jsczt.cn' try: - results = zxingcpp.read_barcodes(image) + img = cv2.imread(img_path) + results = zxingcpp.read_barcodes(img, text_mode=zxingcpp.TextMode.HRI) except Exception as e: logging.getLogger('error').info('二维码识别失败', exc_info=e) results = [] @@ -145,106 +98,52 @@ def get_better_image_from_qrcode(image, image_id, dpi=150): # 关键信息提取 -def information_extraction(ie, phrecs, identity): - result = {} - for phrec in phrecs: +def information_extraction(phrec, identity): + """ + 处理单张图片 + :param phrec: + :return: + """ + img_path = image_util.get_img_path(phrec.cfjaddress) + if not img_path: img_url = ufile.get_private_url(phrec.cfjaddress) - if not img_url: - continue - img_path = image_util.save_to_local(img_url) - image = cv2.imread(img_path) - # 尝试从二维码中获取高清图片 - better_image, text = get_better_image_from_qrcode(image, phrec.cfjaddress) - if phrec.cRectype != '1': - better_image = None # 非结算单暂时不进行替换 - zx_ie_results = [] - if better_image is not None: - img_angle = '0' - image = better_image - if text: - info_extract = ie(text)[0] - else: - info_extract = ie_temp_image(ie, OCR, image) - if not info_extract: - continue - ie_result = {'result': info_extract, 'angle': img_angle} - now = common_util.get_default_datetime() - result_json = json.dumps(ie_result['result'], ensure_ascii=False) - if len(result_json) > 5000: - result_json = result_json[:5000] - zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity, - cfjaddress=phrec.cfjaddress, content=result_json, - rotation_angle=int(ie_result['angle']), - x_offset=0, y_offset=0, create_time=now, - creator=HOSTNAME, update_time=now, updater=HOSTNAME)) - - result = merge_result(result, ie_result['result']) + # 尝试从二维码中获取高清图片 + better_img_path, text = get_better_image_from_qrcode(img_path, phrec.cfjaddress) + if phrec.cRectype != '1': + better_img_path = None # 非结算单暂时不进行替换 + if better_img_path is not None: + if text: + info_extract = model_util.ie_settlement_text(text)[0] else: - target_images = model_util.request_book_areas(img_path) # 识别文档区域并裁剪 - angle_count = defaultdict(int, {'0': 0}) # 分割后图片的最优角度统计 - for target_image in target_images: - dewarped_image = model_util.dewarp(target_image) # 去扭曲 - angles = model_util.clas_orientation(dewarped_image) + info_extract = model_util.ie_settlement(better_img_path, + common_util.ocr_result_to_layout(model_util.ocr(better_img_path))) - split_results = image_util.split(dewarped_image) - for split_result in split_results: - if split_result['img'] is None or split_result['img'].size == 0: - continue - rotated_img = image_util.rotate(split_result['img'], int(angles[0])) - ie_results = [{'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[0]}] - if not ie_results[0]['result'] or len(ie_results[0]['result']) < len(ie.kwargs.get('schema')): - rotated_img = image_util.rotate(split_result['img'], int(angles[1])) - ie_results.append({'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[1]}) - now = common_util.get_default_datetime() - best_angle = ['0', 0] - for ie_result in ie_results: - if not ie_result['result']: - continue + return '基本医保结算单', info_extract + else: + target_image = model_util.det_book(img_path) # 识别文档区域并裁剪 + dewarped_image = model_util.dewarp(target_image) # 去扭曲 + angles = model_util.clas_orientation(dewarped_image) + rotated_img = image_util.rotate(dewarped_image, int(angles[0])) + split_results = image_util.split(rotated_img) + ocr_result = [] + for split_result in split_results: + if split_result['img'] is None: + continue + ocr_result += model_util.ocr(rotated_img) + ocr_text = common_util.ocr_result_to_text(ocr_result) + rec_type = model_util.clas_text(ocr_text) if ocr_text else None + if rec_type == '基本医保结算单': + info_extract = model_util.ie_settlement(rotated_img, common_util.ocr_result_to_layout(ocr_result)) + elif rec_type == '出院记录': + info_extract = model_util.ie_discharge(rotated_img, common_util.ocr_result_to_layout(ocr_result)) + elif rec_type == '费用清单': + info_extract = model_util.ie_cost(rotated_img, common_util.ocr_result_to_layout(ocr_result)) + else: + info_extract = None - result_json = json.dumps(ie_result['result'], ensure_ascii=False) - if len(result_json) > 5000: - result_json = result_json[:5000] - zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity, - cfjaddress=phrec.cfjaddress, content=result_json, - rotation_angle=int(ie_result['angle']), - x_offset=split_result['x_offset'], - y_offset=split_result['y_offset'], create_time=now, - creator=HOSTNAME, update_time=now, updater=HOSTNAME)) - - result = merge_result(result, ie_result['result']) - - if len(ie_result['result']) > best_angle[1]: - best_angle = [ie_result['angle'], len(ie_result['result'])] - - angle_count[best_angle[0]] += 1 - img_angle = max(angle_count, key=angle_count.get) - - if img_angle != '0' or better_image is not None: - image = image_util.rotate(image, int(img_angle)) - with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file: - cv2.imwrite(temp_file.name, image) - try: - ufile.upload_file(phrec.cfjaddress, temp_file.name) - if img_angle != '0': - logging.info(f'旋转图片[{phrec.cfjaddress}]替换成功,已旋转{img_angle}度。') - # 修正旋转角度 - for zx_ie_result in zx_ie_results: - zx_ie_result.rotation_angle -= int(img_angle) - else: - logging.info(f'高清图片[{phrec.cfjaddress}]替换成功!') - except Exception as e: - logging.error(f'上传图片({phrec.cfjaddress})失败', exc_info=e) - finally: - common_util.delete_temp_file(temp_file.name) - - session = MysqlSession() - session.add_all(zx_ie_results) - session.commit() - session.close() - - return result + return rec_type, info_extract # 从keys中获取准确率最高的value @@ -359,23 +258,24 @@ def search_department(department): return best_match -def settlement_task(pk_phhd, settlement_list, identity): - settlement_list_ie_result = information_extraction(SETTLEMENT_IE, settlement_list, identity) +def settlement_task(pk_phhd, settlement_list_ie_result): settlement_data = { "pk_phhd": pk_phhd, - "name": handle_name(get_best_value_in_keys(settlement_list_ie_result, PATIENT_NAME)), - "admission_date_str": handle_original_data(get_best_value_in_keys(settlement_list_ie_result, ADMISSION_DATE)), - "discharge_date_str": handle_original_data(get_best_value_in_keys(settlement_list_ie_result, DISCHARGE_DATE)), + "name": handle_name(get_best_value_in_keys(settlement_list_ie_result, IE_KEY['name'])), + "admission_date_str": handle_original_data( + get_best_value_in_keys(settlement_list_ie_result, IE_KEY['admission_date'])), + "discharge_date_str": handle_original_data( + get_best_value_in_keys(settlement_list_ie_result, IE_KEY['discharge_date'])), "personal_cash_payment_str": handle_original_data( - get_best_value_in_keys(settlement_list_ie_result, PERSONAL_CASH_PAYMENT)), + get_best_value_in_keys(settlement_list_ie_result, IE_KEY['personal_cash_payment'])), "personal_account_payment_str": handle_original_data( - get_best_value_in_keys(settlement_list_ie_result, PERSONAL_ACCOUNT_PAYMENT)), + get_best_value_in_keys(settlement_list_ie_result, IE_KEY['personal_account_payment'])), "personal_funded_amount_str": handle_original_data( - get_best_value_in_keys(settlement_list_ie_result, PERSONAL_FUNDED_AMOUNT)), + get_best_value_in_keys(settlement_list_ie_result, IE_KEY['personal_funded_amount'])), "medical_insurance_type_str": handle_original_data( - get_best_value_in_keys(settlement_list_ie_result, MEDICAL_INSURANCE_TYPE)), - "admission_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, ADMISSION_ID)), - "settlement_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, SETTLEMENT_ID)), + get_best_value_in_keys(settlement_list_ie_result, IE_KEY['medical_insurance_type'])), + "admission_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, IE_KEY['admission_id'])), + "settlement_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, IE_KEY['settlement_id'])), } settlement_data["admission_date"] = handle_date(settlement_data["admission_date_str"]) settlement_data["admission_date"] = handle_date(settlement_data["admission_date_str"]) @@ -385,28 +285,30 @@ def settlement_task(pk_phhd, settlement_list, identity): settlement_data["personal_funded_amount"] = handle_decimal(settlement_data["personal_funded_amount_str"]) settlement_data["medical_insurance_type"] = handle_insurance_type(settlement_data["medical_insurance_type_str"]) - parse_money_result = parse_money(get_best_value_in_keys(settlement_list_ie_result, UPPERCASE_MEDICAL_EXPENSES), - get_best_value_in_keys(settlement_list_ie_result, MEDICAL_EXPENSES)) + parse_money_result = parse_money( + get_best_value_in_keys(settlement_list_ie_result, IE_KEY['upper_case_medical_expenses']), + get_best_value_in_keys(settlement_list_ie_result, IE_KEY['medical_expenses'])) settlement_data["medical_expenses_str"] = handle_original_data(parse_money_result[0]) settlement_data["medical_expenses"] = parse_money_result[1] save_or_update_ie(ZxIeSettlement, pk_phhd, settlement_data) return settlement_data -def discharge_task(pk_phhd, discharge_record, identity): - discharge_record_ie_result = information_extraction(DISCHARGE_IE, discharge_record, identity) - hospitals = get_values_of_keys(discharge_record_ie_result, HOSPITAL) - departments = get_values_of_keys(discharge_record_ie_result, DEPARTMENT) +def discharge_task(pk_phhd, discharge_record_ie_result): + hospitals = get_values_of_keys(discharge_record_ie_result, IE_KEY['hospital']) + departments = get_values_of_keys(discharge_record_ie_result, IE_KEY['department']) discharge_data = { "pk_phhd": pk_phhd, "hospital": handle_hospital(",".join(hospitals)), "department": handle_department(",".join(departments)), - "name": handle_name(get_best_value_in_keys(discharge_record_ie_result, PATIENT_NAME)), - "admission_date_str": handle_original_data(get_best_value_in_keys(discharge_record_ie_result, ADMISSION_DATE)), - "discharge_date_str": handle_original_data(get_best_value_in_keys(discharge_record_ie_result, DISCHARGE_DATE)), - "doctor": handle_name(get_best_value_in_keys(discharge_record_ie_result, DOCTOR)), - "admission_id": handle_id(get_best_value_in_keys(discharge_record_ie_result, ADMISSION_ID)), - "age": handle_age(get_best_value_in_keys(discharge_record_ie_result, AGE)), + "name": handle_name(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['name'])), + "admission_date_str": handle_original_data( + get_best_value_in_keys(discharge_record_ie_result, IE_KEY['admission_date'])), + "discharge_date_str": handle_original_data( + get_best_value_in_keys(discharge_record_ie_result, IE_KEY['discharge_date'])), + "doctor": handle_name(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['doctor'])), + "admission_id": handle_id(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['admission_id'])), + "age": handle_age(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['age'])), } discharge_data["admission_date"] = handle_date(discharge_data["admission_date_str"]) discharge_data["discharge_date"] = handle_date(discharge_data["discharge_date_str"]) @@ -466,14 +368,16 @@ def discharge_task(pk_phhd, discharge_record, identity): return discharge_data -def cost_task(pk_phhd, cost_list, identity): - cost_list_ie_result = information_extraction(COST_IE, cost_list, identity) +def cost_task(pk_phhd, cost_list_ie_result): cost_data = { "pk_phhd": pk_phhd, - "name": handle_name(get_best_value_in_keys(cost_list_ie_result, PATIENT_NAME)), - "admission_date_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, ADMISSION_DATE)), - "discharge_date_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, DISCHARGE_DATE)), - "medical_expenses_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, MEDICAL_EXPENSES)) + "name": handle_name(get_best_value_in_keys(cost_list_ie_result, IE_KEY['name'])), + "admission_date_str": handle_original_data( + get_best_value_in_keys(cost_list_ie_result, IE_KEY['admission_date'])), + "discharge_date_str": handle_original_data( + get_best_value_in_keys(cost_list_ie_result, IE_KEY['discharge_date'])), + "medical_expenses_str": handle_original_data( + get_best_value_in_keys(cost_list_ie_result, IE_KEY['medical_expenses'])) } cost_data["admission_date"] = handle_date(cost_data["admission_date_str"]) cost_data["discharge_date"] = handle_date(cost_data["discharge_date_str"]) @@ -483,28 +387,39 @@ def cost_task(pk_phhd, cost_list, identity): def photo_review(pk_phhd, name): - settlement_list = [] - discharge_record = [] - cost_list = [] + """ + 处理单个报销案子 + :param pk_phhd: 报销单主键 + :param name: 报销人姓名 + """ + settlement_result = defaultdict(list) + discharge_result = defaultdict(list) + cost_result = defaultdict(list) session = MysqlSession() - phrecs = session.query(ZxPhrec.pk_phrec, ZxPhrec.pk_phhd, ZxPhrec.cRectype, ZxPhrec.cfjaddress).filter( + phrecs = session.query(ZxPhrec.pk_phrec, ZxPhrec.cRectype, ZxPhrec.cfjaddress).filter( ZxPhrec.pk_phhd == pk_phhd ).all() session.close() - for phrec in phrecs: - if phrec.cRectype == "1": - settlement_list.append(phrec) - elif phrec.cRectype == "3": - discharge_record.append(phrec) - elif phrec.cRectype == "4": - cost_list.append(phrec) - # 同一批图的标识 identity = int(time.time()) - settlement_data = settlement_task(pk_phhd, settlement_list, identity) - discharge_data = discharge_task(pk_phhd, discharge_record, identity) - cost_data = cost_task(pk_phhd, cost_list, identity) + for phrec in phrecs: + rec_type, ie_result = information_extraction(phrec, identity) + if rec_type == '基本医保结算单': + rec_result = settlement_result + elif rec_type == '出院记录': + rec_result = discharge_result + elif rec_type == '费用清单': + rec_result = cost_result + else: + rec_result = None + if rec_result: + for key, value in ie_result.items(): + rec_result[key].append(value) + + settlement_data = settlement_task(pk_phhd, settlement_result) + discharge_data = discharge_task(pk_phhd, discharge_result) + cost_data = cost_task(pk_phhd, cost_result) review_result = { 'pk_phhd': pk_phhd, @@ -573,6 +488,9 @@ def photo_review(pk_phhd, name): def main(): + """ + 照片审核批量控制 + """ while 1: session = MysqlSession() phhds = (session.query(ZxPhhd.pk_phhd, ZxPhhd.cXm) diff --git a/services/paddle_services/__init__.py b/services/paddle_services/__init__.py index e012747..4818be2 100644 --- a/services/paddle_services/__init__.py +++ b/services/paddle_services/__init__.py @@ -1,34 +1,20 @@ """ 信息抽取关键词配置 """ - -# 患者姓名 -PATIENT_NAME = ['患者姓名'] -# 入院日期 -ADMISSION_DATE = ['入院日期'] -# 出院日期 -DISCHARGE_DATE = ['出院日期'] -# 发生医疗费 -MEDICAL_EXPENSES = ['费用总额'] -# 个人现金支付 -PERSONAL_CASH_PAYMENT = ['个人现金支付'] -# 个人账户支付 -PERSONAL_ACCOUNT_PAYMENT = ['个人账户支付'] -# 个人自费金额 -PERSONAL_FUNDED_AMOUNT = ['自费金额', '个人自费'] -# 医保类别 -MEDICAL_INSURANCE_TYPE = ['医保类型'] -# 就诊医院 -HOSPITAL = ['医院'] -# 就诊科室 -DEPARTMENT = ['科室'] -# 主治医生 -DOCTOR = ['主治医生'] -# 住院号 -ADMISSION_ID = ['住院号'] -# 医保结算单号码 -SETTLEMENT_ID = ['医保结算单号码'] -# 年龄 -AGE = ['年龄'] -# 大写总额 -UPPERCASE_MEDICAL_EXPENSES = ['大写总额'] +IE_KEY = { + 'name': '患者姓名', + 'admission_date': '入院日期', + 'discharge_date': '出院日期', + 'medical_expenses': '费用总额', + 'personal_cash_payment': '个人现金支付', + 'personal_account_payment': '个人账户支付', + 'personal_funded_amount': '自费金额', + 'medical_insurance_type': '医保类型', + 'hospital': '医院', + 'department': '科室', + 'doctor': '主治医生', + 'admission_id': '住院号', + 'settlement_id': '医保结算单号码', + 'age': '年龄', + 'upper_case_medical_expenses': '大写总额' +} diff --git a/services/paddle_services/clas_text.py b/services/paddle_services/clas_text.py index c202cb8..22ab330 100644 --- a/services/paddle_services/clas_text.py +++ b/services/paddle_services/clas_text.py @@ -19,7 +19,8 @@ def main(): cls_result = CLAS(text) cls_result = cls_result[0].get('predictions')[0] if cls_result['score'] < 0.8: - raise Exception(f'识别结果置信度过低!text: {text}') + logging.info(f"识别结果置信度{cls_result['score']}过低!text: {text}") + return None return cls_result['label'] diff --git a/services/paddle_services/ie_cost.py b/services/paddle_services/ie_cost.py index 5032e2e..9da5764 100644 --- a/services/paddle_services/ie_cost.py +++ b/services/paddle_services/ie_cost.py @@ -4,12 +4,14 @@ import logging.config from flask import Flask, request from paddlenlp import Taskflow -from __init__ import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES +from __init__ import IE_KEY from log import LOGGING_CONFIG from utils import process_request app = Flask(__name__) -COST_LIST_SCHEMA = PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES +COST_LIST_SCHEMA = tuple(IE_KEY[key] for key in [ + 'name', 'admission_date', 'discharge_date', 'medical_expenses' +]) COST = Taskflow('information_extraction', schema=COST_LIST_SCHEMA, model='uie-x-base', task_path='model/cost_list_model', layout_analysis=False, precision='fp16') diff --git a/services/paddle_services/ie_discharge.py b/services/paddle_services/ie_discharge.py index 28702ba..6768d83 100644 --- a/services/paddle_services/ie_discharge.py +++ b/services/paddle_services/ie_discharge.py @@ -4,14 +4,14 @@ import logging.config from flask import Flask, request from paddlenlp import Taskflow -from __init__ import HOSPITAL, DEPARTMENT, PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, DOCTOR, ADMISSION_ID, AGE +from __init__ import IE_KEY from log import LOGGING_CONFIG from utils import process_request app = Flask(__name__) -DISCHARGE_RECORD_SCHEMA = ( - HOSPITAL + DEPARTMENT + PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + DOCTOR + ADMISSION_ID + AGE -) +DISCHARGE_RECORD_SCHEMA = tuple(IE_KEY[key] for key in [ + 'hospital', 'department', 'name', 'admission_date', 'discharge_date', 'doctor', 'admission_id', 'age' +]) DISCHARGE = Taskflow('information_extraction', schema=DISCHARGE_RECORD_SCHEMA, model='uie-x-base', task_path='model/discharge_record_model', layout_analysis=False, precision='fp16') diff --git a/services/paddle_services/ie_settlement.py b/services/paddle_services/ie_settlement.py index a160ff9..f800c08 100644 --- a/services/paddle_services/ie_settlement.py +++ b/services/paddle_services/ie_settlement.py @@ -4,18 +4,16 @@ import logging.config from flask import Flask, request from paddlenlp import Taskflow -from __init__ import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, PERSONAL_CASH_PAYMENT, \ - PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, ADMISSION_ID, SETTLEMENT_ID, \ - UPPERCASE_MEDICAL_EXPENSES +from __init__ import IE_KEY from log import LOGGING_CONFIG from utils import process_request app = Flask(__name__) -SETTLEMENT_LIST_SCHEMA = ( - PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES + PERSONAL_CASH_PAYMENT - + PERSONAL_ACCOUNT_PAYMENT + PERSONAL_FUNDED_AMOUNT + MEDICAL_INSURANCE_TYPE + ADMISSION_ID + SETTLEMENT_ID - + UPPERCASE_MEDICAL_EXPENSES -) +SETTLEMENT_LIST_SCHEMA = tuple(IE_KEY[key] for key in [ + 'name', 'admission_date', 'discharge_date', 'medical_expenses', 'personal_cash_payment', + 'personal_account_payment', 'personal_funded_amount', 'medical_insurance_type', 'admission_id', 'settlement_id', + 'uppercase_medical_expenses' +]) SETTLEMENT_IE = Taskflow('information_extraction', schema=SETTLEMENT_LIST_SCHEMA, model='uie-x-base', task_path='model/settlement_list_model', layout_analysis=False, precision='fp16') diff --git a/util/common_util.py b/util/common_util.py index ffeb4b0..13b134c 100644 --- a/util/common_util.py +++ b/util/common_util.py @@ -12,6 +12,44 @@ def get_default_datetime(): return datetime.now().strftime('%Y-%m-%d %H:%M:%S') +def ocr_result_to_layout(ocr_result): + def _get_box(old_box): + new_box = [ + min(old_box[0][0], old_box[3][0]), # x1 + min(old_box[0][1], old_box[1][1]), # y1 + max(old_box[1][0], old_box[2][0]), # x2 + max(old_box[2][1], old_box[3][1]), # y2 + ] + return new_box + + def _normal_box(box_data): + # Ensure the height and width of bbox are greater than zero + if box_data[3] - box_data[1] < 0 or box_data[2] - box_data[0] < 0: + return False + return True + + layout = [] + if not ocr_result: + return layout + for segment in ocr_result: + box = segment[0] + box = _get_box(box) + if not _normal_box(box): + continue + text = segment[1][0] + layout.append((box, text)) + return layout + + +def ocr_result_to_text(ocr_results): + text = '' + for ocr_result in ocr_results: + text += ocr_result[1][0] + if len(text) >= 2048: + break + return text[:2048] + + def get_ocr_layout(ocr, img_path): """ 获取ocr识别的结果,转为合适的layout形式 diff --git a/util/image_util.py b/util/image_util.py index d86fb61..4010b59 100644 --- a/util/image_util.py +++ b/util/image_util.py @@ -1,7 +1,6 @@ import logging import math import os -import urllib.request import cv2 import numpy @@ -12,80 +11,59 @@ from tenacity import retry, stop_after_attempt, wait_random from log import PROJECT_ROOT -@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True, - after=lambda x: logging.warning('获取图片失败!')) -def read(image_path): - """ - 从网络或本地读取图片 - :param image_path: 网络或本地路径 - :return: NumPy数组形式的图片 - """ - if image_path.startswith('http'): - # 发送HTTP请求并获取图像数据 - resp = urllib.request.urlopen(image_path, timeout=60) - # 将数据读取为字节流 - image_data = resp.read() - # 将字节流转换为NumPy数组 - image_np = numpy.frombuffer(image_data, numpy.uint8) - # 解码NumPy数组为OpenCV图像格式 - image = cv2.imdecode(image_np, cv2.IMREAD_COLOR) - else: - image = cv2.imread(image_path) - return image - - def capture(image, rectangle): """ 截取图片 - :param image: 图片NumPy数组 + :param image: ndarray :param rectangle: 要截取的矩形 - :return: 截取之后的图片NumPy + :return: 截取之后的ndarray图片 """ x1, y1, x2, y2 = rectangle height, width = image.shape[:2] - if x1 < 0: - x1 = 0 - if y1 < 0: - y1 = 0 - if x2 > width: - x2 = width - if y2 > height: - y2 = height + # 确保坐标值在图片范围内 + x1 = max(0, x1) + y1 = max(0, y1) + x2 = min(width, x2) + y2 = min(height, y2) return image[int(y1):int(y2), int(x1):int(x2)] -def split(image, ratio=1.414, overlap=0.05, x_compensation=3): +def split(img_path, ratio=1.414, overlap=0.05, x_compensation=3): """ 分割图片 - :param image:图片,可以是NumPy数组或文件路径 + :param img_path:图片路径 :param ratio: 分割后的比例 :param overlap: 图片之间的覆盖比例 :param x_compensation: 横向补偿倍率 :return: 分割后的图片组(NumPy数组形式) """ split_result = [] - if isinstance(image, str): - image = read(image) + image = cv2.imread(img_path) height, width = image.shape[:2] hw_ratio = height / width wh_ratio = width / height + img_name, img_ext = parse_save_path(img_path) if hw_ratio > ratio: # 纵向过长 new_img_height = width * ratio step = width * (ratio - overlap) # 偏移步长 for i in range(math.ceil(height / step)): offset = round(step * i) cropped_img = capture(image, [0, offset, width, offset + new_img_height]) - split_result.append({'img': cropped_img, 'x_offset': 0, 'y_offset': offset}) + split_path = get_save_path(f'{img_name}.split_{i}.{img_ext}') + cv2.imwrite(split_path, cropped_img) + split_result.append({'img': split_path, 'x_offset': 0, 'y_offset': offset}) elif wh_ratio > ratio: # 横向过长 new_img_width = height * ratio step = height * (ratio - overlap * x_compensation) # 一般文字是横向的,所以横向截取时增大重叠部分 for i in range(math.ceil(width / step)): offset = round(step * i) cropped_img = capture(image, [offset, 0, offset + new_img_width, width]) - split_result.append({'img': cropped_img, 'x_offset': offset, 'y_offset': 0}) + split_path = get_save_path(f'{img_name}.split_{i}.{img_ext}') + cv2.imwrite(split_path, cropped_img) + split_result.append({'img': split_path, 'x_offset': offset, 'y_offset': 0}) else: - split_result.append({'img': image, 'x_offset': 0, 'y_offset': 0}) + split_result.append({'img': img_path, 'x_offset': 0, 'y_offset': 0}) return split_result @@ -108,15 +86,16 @@ def parse_rotation_angles(image): return angles -def rotate(image, angle): +def rotate(img_path, angle): """ 旋转图片 - :param image: 图片NumPy数组 + :param img_path: 图片NumPy数组 :param angle: 逆时针旋转角度 :return: 旋转后的图片NumPy数组 """ if angle == 0: - return image + return img_path + image = cv2.imread(img_path) height, width = image.shape[:2] if angle == 180: new_width = width @@ -132,7 +111,11 @@ def rotate(image, angle): matrix[1, 2] += (new_height - height) / 2 # 参数:原始图像 旋转参数 元素图像宽高 rotated = cv2.warpAffine(image, matrix, (new_width, new_height)) - return rotated + + img_name, img_ext = parse_save_path(img_path) + rotated_path = get_save_path(f'{img_name}.rotate_{angle}.{img_ext}') + cv2.imwrite(rotated_path, rotated) + return rotated_path def invert_rotate_point(point, center, angle): @@ -260,26 +243,38 @@ def parse_img_url(url): :return: 图片名称和图片后缀 """ url = url.split('?')[0] - return os.path.basename(url).rsplit('.', 1) + return os.path.basename(url) @retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True, after=lambda x: logging.warning('保存图片失败!')) -def save_to_local(img_url, save_path=None): +def save_to_local(img_url): """ 保存图片到本地 :param img_url: 图片url - :param save_path: 本地保存地址,精确到文件名 :return: 本地保存地址 """ response = requests.get(img_url) response.raise_for_status() # 检查响应状态码是否正常 - if save_path is None: - img_name, img_ext = parse_img_url(img_url) - save_path = os.path.join(PROJECT_ROOT, 'tmp_img', img_name + '.' + img_ext) - + save_path = get_save_path(parse_img_url(img_url)) with open(save_path, 'wb') as file: file.write(response.content) - return save_path + + +def get_img_path(img_full_name): + save_path = get_save_path(img_full_name) + if os.path.exists(save_path): + return save_path + return None + + +def get_save_path(img_full_name): + return os.path.join(PROJECT_ROOT, 'tmp_img', img_full_name) + + +def parse_save_path(img_path): + img_full_name = os.path.basename(img_path) + img_name, img_ext = img_full_name.rsplit('.', 1) + return img_name, img_ext diff --git a/util/model_util.py b/util/model_util.py index 20ad581..db65d1b 100644 --- a/util/model_util.py +++ b/util/model_util.py @@ -1,5 +1,6 @@ import json import logging +import os.path import requests from tenacity import retry, stop_after_attempt, wait_random @@ -16,9 +17,10 @@ def ocr(img_path): url = 'http://ocr:5001' response = requests.post(url, {'img_path': img_path}) if response.status_code == 200: - return response.json() - else: - return None + ocr_result = response.json() + if ocr_result: + return ocr_result[0] + return None @retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True, @@ -40,7 +42,7 @@ def ie_settlement(img_path, layout): @retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True, after=lambda x: logging.warning('从文本抽取基本医保结算单失败!')) -def ie_settlement(text): +def ie_settlement_text(text): """ 请求基本医保结算单信息抽取接口 :param text: 待抽取文本 @@ -73,7 +75,7 @@ def ie_discharge(img_path, layout): @retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True, after=lambda x: logging.warning('从文本抽取出院记录失败!')) -def ie_discharge(text): +def ie_discharge_text(text): """ 请求出院记录信息抽取接口 :param text: 待抽取文本 @@ -106,7 +108,7 @@ def ie_cost(img_path, layout): @retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True, after=lambda x: logging.warning('从文本抽取费用清单失败!')) -def ie_cost(text): +def ie_cost_text(text): """ 请求费用清单信息抽取接口 :param text: 待抽取文本 @@ -147,9 +149,22 @@ def det_book(img_path): url = 'http://det_book:5006' response = requests.post(url, {'img_path': img_path}) if response.status_code == 200: - return response.json() + book_path_list = response.json() + if len(book_path_list) == 0: + return img_path + elif len(book_path_list) == 1: + return book_path_list[0] + else: + max_book = img_path + max_size = 0 + for book_path in book_path_list: + book_size = os.path.getsize(book_path) + if book_size > max_size: + max_book = book_path + max_size = book_size + return max_book else: - return [img_path] + return img_path @retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,