优化案子处理逻辑

2024-10-09 09:39:29 +08:00
parent a3fa1e502e
commit 795134f566
10 changed files with 257 additions and 304 deletions
--- a/photo_review.py
+++ b/photo_review.py
@@ -5,12 +5,13 @@ from time import sleep

 from sqlalchemy import update

-from my_email.error_email import send_error_email
 from db import MysqlSession
 from db.mysql import ZxPhhd
 from log import LOGGING_CONFIG
+from my_email.error_email import send_error_email
 from photo_review import auto_photo_review, SEND_ERROR_EMAIL

+# 照片审核自动识别脚本入口
 if __name__ == '__main__':
    program_name = '照片审核自动识别脚本'
    logging.config.dictConfig(LOGGING_CONFIG)
@@ -19,7 +20,7 @@ if __name__ == '__main__':
    parser.add_argument('--clean', default=False, type=bool, help='是否将识别中的案子改为待识别状态')
    args = parser.parse_args()
    if args.clean:
-        # 主要用于启动时，清除仍在涂抹中的案子
+        # 启动时清除仍在识别中的案子
        session = MysqlSession()
        update_flag = (update(ZxPhhd).where(ZxPhhd.exsuccess_flag == '2').values(exsuccess_flag='1'))
        session.execute(update_flag)
@@ -33,7 +34,6 @@ if __name__ == '__main__':
        logging.info(f'【{program_name}】开始运行')
        auto_photo_review.main()
    except Exception as e:
-        error_logger = logging.getLogger('error')
-        error_logger.error(traceback.format_exc())
+        logging.getLogger('error').error(traceback.format_exc())
        if SEND_ERROR_EMAIL:
            send_error_email(program_name, repr(e), traceback.format_exc())
--- a/photo_review/auto_photo_review.py
+++ b/photo_review/auto_photo_review.py
@@ -1,7 +1,4 @@
-import json
 import logging
-import os
-import tempfile
 import time
 from collections import defaultdict
 from time import sleep
@@ -10,72 +7,24 @@ import cv2
 import fitz
 import jieba
 import numpy as np
-import requests
 import zxingcpp
 from rapidfuzz import process, fuzz
 from sqlalchemy import update

 from db import MysqlSession
-from db.mysql import BdYljg, BdYlks, ZxIeResult, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec, ZxIeReview
+from db.mysql import BdYljg, BdYlks, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec, ZxIeReview
 from log import HOSTNAME
 from photo_review import PHHD_BATCH_SIZE, SLEEP_MINUTES, HOSPITAL_ALIAS, HOSPITAL_FILTER, DEPARTMENT_ALIAS, \
    DEPARTMENT_FILTER
-from services.paddle_services import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, \
-    PERSONAL_CASH_PAYMENT, PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, \
-    DEPARTMENT, DOCTOR, ADMISSION_ID, SETTLEMENT_ID, AGE, UPPERCASE_MEDICAL_EXPENSES
+from services.paddle_services import IE_KEY
 from ucloud import ufile
 from util import image_util, common_util, html_util, model_util
 from util.data_util import handle_date, handle_decimal, parse_department, handle_name, handle_insurance_type, \
    handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, parse_hospital


-# 合并信息抽取结果
-def merge_result(result1, result2):
-    for key in result2:
-        result1[key] = result1.get(key, []) + result2[key]
-    return result1
-
-
-def ie_temp_image(ie, ocr, image):
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
-        cv2.imwrite(temp_file.name, image)
-
-    ie_result = []
-    try:
-        layout = common_util.get_ocr_layout(ocr, temp_file.name)
-        if not layout:
-            # 无识别结果
-            ie_result = []
-        else:
-            ie_result = ie({"doc": temp_file.name, "layout": layout})[0]
-    except Exception as e:
-        logging.error("信息抽取时出错", exc_info=e)
-    finally:
-        try:
-            os.remove(temp_file.name)
-        except Exception as e:
-            logging.info(f"删除临时文件 {temp_file.name} 时出错", exc_info=e)
-    return ie_result
-
-
-# 关键信息提取
-def request_ie_result(task_enum, phrecs):
-    url = task_enum.request_url()
-    identity = int(time.time())
-    images = []
-    for phrec in phrecs:
-        images.append({"name": phrec.cfjaddress, "pk": phrec.pk_phrec})
-    payload = {"images": images, "schema": task_enum.schema(), "pk_phhd": phrecs[0].pk_phhd, "identity": identity}
-    response = requests.post(url, json=payload)
-
-    if response.status_code == 200:
-        return response.json()["data"]
-    else:
-        raise Exception(f"请求信息抽取结果失败，状态码：{response.status_code}")
-
-
 # 尝试从二维码中获取高清图片
-def get_better_image_from_qrcode(image, image_id, dpi=150):
+def get_better_image_from_qrcode(img_path, image_id, dpi=150):
    def _parse_pdf_url(pdf_url_to_parse):
        pdf_file = None
        local_pdf_path = None
@@ -95,7 +44,10 @@ def get_better_image_from_qrcode(image, image_id, dpi=150):
            # 将渲染结果转换为OpenCV兼容的格式
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, -1))
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
-            return img, page.get_text()
+            img_name, img_ext = image_util.parse_save_path(img_path)
+            better_img_path = image_util.get_save_path(f'{img_name}.better.{img_ext}')
+            cv2.imwrite(better_img_path, img)
+            return better_img_path, page.get_text()
        except Exception as ex:
            logging.getLogger('error').error('解析pdf失败！', exc_info=ex)
            return None, None
@@ -107,7 +59,8 @@ def get_better_image_from_qrcode(image, image_id, dpi=150):

    jsczt_base_url = 'http://einvoice.jsczt.cn'
    try:
-        results = zxingcpp.read_barcodes(image)
+        img = cv2.imread(img_path)
+        results = zxingcpp.read_barcodes(img, text_mode=zxingcpp.TextMode.HRI)
    except Exception as e:
        logging.getLogger('error').info('二维码识别失败', exc_info=e)
        results = []
@@ -145,106 +98,52 @@ def get_better_image_from_qrcode(image, image_id, dpi=150):


 # 关键信息提取
-def information_extraction(ie, phrecs, identity):
-    result = {}
-    for phrec in phrecs:
+def information_extraction(phrec, identity):
+    """
+    Process a single image. `identity` (the batch marker) is accepted but not used here.
+    :param phrec: photo record; its cfjaddress and cRectype attributes are read
+    :return: (record type label, extraction result); both are None when no type is recognised
+    """
+    img_path = image_util.get_img_path(phrec.cfjaddress)
+    if not img_path:
        img_url = ufile.get_private_url(phrec.cfjaddress)
-        if not img_url:
-            continue
-
        img_path = image_util.save_to_local(img_url)
-        image = cv2.imread(img_path)
-        # 尝试从二维码中获取高清图片
-        better_image, text = get_better_image_from_qrcode(image, phrec.cfjaddress)
-        if phrec.cRectype != '1':
-            better_image = None  # 非结算单暂时不进行替换
-        zx_ie_results = []
-        if better_image is not None:
-            img_angle = '0'
-            image = better_image
-            if text:
-                info_extract = ie(text)[0]
-            else:
-                info_extract = ie_temp_image(ie, OCR, image)
-            if not info_extract:
-                continue

-            ie_result = {'result': info_extract, 'angle': img_angle}
-            now = common_util.get_default_datetime()
-            result_json = json.dumps(ie_result['result'], ensure_ascii=False)
-            if len(result_json) > 5000:
-                result_json = result_json[:5000]
-            zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity,
-                                            cfjaddress=phrec.cfjaddress, content=result_json,
-                                            rotation_angle=int(ie_result['angle']),
-                                            x_offset=0, y_offset=0, create_time=now,
-                                            creator=HOSTNAME, update_time=now, updater=HOSTNAME))
-
-            result = merge_result(result, ie_result['result'])
+    # 尝试从二维码中获取高清图片
+    better_img_path, text = get_better_image_from_qrcode(img_path, phrec.cfjaddress)
+    if phrec.cRectype != '1':
+        better_img_path = None  # 非结算单暂时不进行替换
+    if better_img_path is not None:
+        if text:
+            info_extract = model_util.ie_settlement_text(text)[0]
        else:
-            target_images = model_util.request_book_areas(img_path)  # 识别文档区域并裁剪
-            angle_count = defaultdict(int, {'0': 0})  # 分割后图片的最优角度统计
-            for target_image in target_images:
-                dewarped_image = model_util.dewarp(target_image)  # 去扭曲
-                angles = model_util.clas_orientation(dewarped_image)
+            info_extract = model_util.ie_settlement(better_img_path,
+                                                    common_util.ocr_result_to_layout(model_util.ocr(better_img_path)))

-                split_results = image_util.split(dewarped_image)
-                for split_result in split_results:
-                    if split_result['img'] is None or split_result['img'].size == 0:
-                        continue
-                    rotated_img = image_util.rotate(split_result['img'], int(angles[0]))
-                    ie_results = [{'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[0]}]
-                    if not ie_results[0]['result'] or len(ie_results[0]['result']) < len(ie.kwargs.get('schema')):
-                        rotated_img = image_util.rotate(split_result['img'], int(angles[1]))
-                        ie_results.append({'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[1]})
-                    now = common_util.get_default_datetime()
-                    best_angle = ['0', 0]
-                    for ie_result in ie_results:
-                        if not ie_result['result']:
-                            continue
+        return '基本医保结算单', info_extract
+    else:
+        target_image = model_util.det_book(img_path)  # detect the document area and crop it
+        dewarped_image = model_util.dewarp(target_image)  # remove warping
+        angles = model_util.clas_orientation(dewarped_image)
+        rotated_img = image_util.rotate(dewarped_image, int(angles[0]))
+        ocr_result = []
+        for split_result in image_util.split(rotated_img):
+            # OCR each piece (not the whole image again) and shift its boxes back into rotated_img coordinates
+            dx, dy = split_result['x_offset'], split_result['y_offset']
+            for seg in model_util.ocr(split_result['img']) or []:  # NOTE(review): assumes [corner points, rec]
+                ocr_result.append([[[x + dx, y + dy] for x, y in seg[0]], seg[1]])
+        ocr_text = common_util.ocr_result_to_text(ocr_result)
+        rec_type = model_util.clas_text(ocr_text) if ocr_text else None
+        layout = common_util.ocr_result_to_layout(ocr_result)
+        info_extract = None  # stays None when the document type is not recognised
+        if rec_type == '基本医保结算单':
+            info_extract = model_util.ie_settlement(rotated_img, layout)
+        elif rec_type == '出院记录':
+            info_extract = model_util.ie_discharge(rotated_img, layout)
+        elif rec_type == '费用清单':
+            info_extract = model_util.ie_cost(rotated_img, layout)

-                        result_json = json.dumps(ie_result['result'], ensure_ascii=False)
-                        if len(result_json) > 5000:
-                            result_json = result_json[:5000]
-                        zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity,
-                                                        cfjaddress=phrec.cfjaddress, content=result_json,
-                                                        rotation_angle=int(ie_result['angle']),
-                                                        x_offset=split_result['x_offset'],
-                                                        y_offset=split_result['y_offset'], create_time=now,
-                                                        creator=HOSTNAME, update_time=now, updater=HOSTNAME))
-
-                        result = merge_result(result, ie_result['result'])
-
-                        if len(ie_result['result']) > best_angle[1]:
-                            best_angle = [ie_result['angle'], len(ie_result['result'])]
-
-                    angle_count[best_angle[0]] += 1
-            img_angle = max(angle_count, key=angle_count.get)
-
-        if img_angle != '0' or better_image is not None:
-            image = image_util.rotate(image, int(img_angle))
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
-                cv2.imwrite(temp_file.name, image)
-            try:
-                ufile.upload_file(phrec.cfjaddress, temp_file.name)
-                if img_angle != '0':
-                    logging.info(f'旋转图片[{phrec.cfjaddress}]替换成功，已旋转{img_angle}度。')
-                    # 修正旋转角度
-                    for zx_ie_result in zx_ie_results:
-                        zx_ie_result.rotation_angle -= int(img_angle)
-                else:
-                    logging.info(f'高清图片[{phrec.cfjaddress}]替换成功！')
-            except Exception as e:
-                logging.error(f'上传图片({phrec.cfjaddress})失败', exc_info=e)
-            finally:
-                common_util.delete_temp_file(temp_file.name)
-
-        session = MysqlSession()
-        session.add_all(zx_ie_results)
-        session.commit()
-        session.close()
-
-    return result
+        return rec_type, info_extract


 # 从keys中获取准确率最高的value
@@ -359,23 +258,24 @@ def search_department(department):
    return best_match


-def settlement_task(pk_phhd, settlement_list, identity):
-    settlement_list_ie_result = information_extraction(SETTLEMENT_IE, settlement_list, identity)
+def settlement_task(pk_phhd, settlement_list_ie_result):
    settlement_data = {
        "pk_phhd": pk_phhd,
-        "name": handle_name(get_best_value_in_keys(settlement_list_ie_result, PATIENT_NAME)),
-        "admission_date_str": handle_original_data(get_best_value_in_keys(settlement_list_ie_result, ADMISSION_DATE)),
-        "discharge_date_str": handle_original_data(get_best_value_in_keys(settlement_list_ie_result, DISCHARGE_DATE)),
+        "name": handle_name(get_best_value_in_keys(settlement_list_ie_result, IE_KEY['name'])),
+        "admission_date_str": handle_original_data(
+            get_best_value_in_keys(settlement_list_ie_result, IE_KEY['admission_date'])),
+        "discharge_date_str": handle_original_data(
+            get_best_value_in_keys(settlement_list_ie_result, IE_KEY['discharge_date'])),
        "personal_cash_payment_str": handle_original_data(
-            get_best_value_in_keys(settlement_list_ie_result, PERSONAL_CASH_PAYMENT)),
+            get_best_value_in_keys(settlement_list_ie_result, IE_KEY['personal_cash_payment'])),
        "personal_account_payment_str": handle_original_data(
-            get_best_value_in_keys(settlement_list_ie_result, PERSONAL_ACCOUNT_PAYMENT)),
+            get_best_value_in_keys(settlement_list_ie_result, IE_KEY['personal_account_payment'])),
        "personal_funded_amount_str": handle_original_data(
-            get_best_value_in_keys(settlement_list_ie_result, PERSONAL_FUNDED_AMOUNT)),
+            get_best_value_in_keys(settlement_list_ie_result, IE_KEY['personal_funded_amount'])),
        "medical_insurance_type_str": handle_original_data(
-            get_best_value_in_keys(settlement_list_ie_result, MEDICAL_INSURANCE_TYPE)),
-        "admission_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, ADMISSION_ID)),
-        "settlement_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, SETTLEMENT_ID)),
+            get_best_value_in_keys(settlement_list_ie_result, IE_KEY['medical_insurance_type'])),
+        "admission_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, IE_KEY['admission_id'])),
+        "settlement_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, IE_KEY['settlement_id'])),
    }
    settlement_data["admission_date"] = handle_date(settlement_data["admission_date_str"])
    settlement_data["admission_date"] = handle_date(settlement_data["admission_date_str"])
@@ -385,28 +285,30 @@ def settlement_task(pk_phhd, settlement_list, identity):
    settlement_data["personal_funded_amount"] = handle_decimal(settlement_data["personal_funded_amount_str"])
    settlement_data["medical_insurance_type"] = handle_insurance_type(settlement_data["medical_insurance_type_str"])

-    parse_money_result = parse_money(get_best_value_in_keys(settlement_list_ie_result, UPPERCASE_MEDICAL_EXPENSES),
-                                     get_best_value_in_keys(settlement_list_ie_result, MEDICAL_EXPENSES))
+    parse_money_result = parse_money(
+        get_best_value_in_keys(settlement_list_ie_result, IE_KEY['upper_case_medical_expenses']),
+        get_best_value_in_keys(settlement_list_ie_result, IE_KEY['medical_expenses']))
    settlement_data["medical_expenses_str"] = handle_original_data(parse_money_result[0])
    settlement_data["medical_expenses"] = parse_money_result[1]
    save_or_update_ie(ZxIeSettlement, pk_phhd, settlement_data)
    return settlement_data


-def discharge_task(pk_phhd, discharge_record, identity):
-    discharge_record_ie_result = information_extraction(DISCHARGE_IE, discharge_record, identity)
-    hospitals = get_values_of_keys(discharge_record_ie_result, HOSPITAL)
-    departments = get_values_of_keys(discharge_record_ie_result, DEPARTMENT)
+def discharge_task(pk_phhd, discharge_record_ie_result):
+    hospitals = get_values_of_keys(discharge_record_ie_result, IE_KEY['hospital'])
+    departments = get_values_of_keys(discharge_record_ie_result, IE_KEY['department'])
    discharge_data = {
        "pk_phhd": pk_phhd,
        "hospital": handle_hospital(",".join(hospitals)),
        "department": handle_department(",".join(departments)),
-        "name": handle_name(get_best_value_in_keys(discharge_record_ie_result, PATIENT_NAME)),
-        "admission_date_str": handle_original_data(get_best_value_in_keys(discharge_record_ie_result, ADMISSION_DATE)),
-        "discharge_date_str": handle_original_data(get_best_value_in_keys(discharge_record_ie_result, DISCHARGE_DATE)),
-        "doctor": handle_name(get_best_value_in_keys(discharge_record_ie_result, DOCTOR)),
-        "admission_id": handle_id(get_best_value_in_keys(discharge_record_ie_result, ADMISSION_ID)),
-        "age": handle_age(get_best_value_in_keys(discharge_record_ie_result, AGE)),
+        "name": handle_name(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['name'])),
+        "admission_date_str": handle_original_data(
+            get_best_value_in_keys(discharge_record_ie_result, IE_KEY['admission_date'])),
+        "discharge_date_str": handle_original_data(
+            get_best_value_in_keys(discharge_record_ie_result, IE_KEY['discharge_date'])),
+        "doctor": handle_name(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['doctor'])),
+        "admission_id": handle_id(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['admission_id'])),
+        "age": handle_age(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['age'])),
    }
    discharge_data["admission_date"] = handle_date(discharge_data["admission_date_str"])
    discharge_data["discharge_date"] = handle_date(discharge_data["discharge_date_str"])
@@ -466,14 +368,16 @@ def discharge_task(pk_phhd, discharge_record, identity):
    return discharge_data


-def cost_task(pk_phhd, cost_list, identity):
-    cost_list_ie_result = information_extraction(COST_IE, cost_list, identity)
+def cost_task(pk_phhd, cost_list_ie_result):
    cost_data = {
        "pk_phhd": pk_phhd,
-        "name": handle_name(get_best_value_in_keys(cost_list_ie_result, PATIENT_NAME)),
-        "admission_date_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, ADMISSION_DATE)),
-        "discharge_date_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, DISCHARGE_DATE)),
-        "medical_expenses_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, MEDICAL_EXPENSES))
+        "name": handle_name(get_best_value_in_keys(cost_list_ie_result, IE_KEY['name'])),
+        "admission_date_str": handle_original_data(
+            get_best_value_in_keys(cost_list_ie_result, IE_KEY['admission_date'])),
+        "discharge_date_str": handle_original_data(
+            get_best_value_in_keys(cost_list_ie_result, IE_KEY['discharge_date'])),
+        "medical_expenses_str": handle_original_data(
+            get_best_value_in_keys(cost_list_ie_result, IE_KEY['medical_expenses']))
    }
    cost_data["admission_date"] = handle_date(cost_data["admission_date_str"])
    cost_data["discharge_date"] = handle_date(cost_data["discharge_date_str"])
@@ -483,28 +387,39 @@ def cost_task(pk_phhd, cost_list, identity):


 def photo_review(pk_phhd, name):
-    settlement_list = []
-    discharge_record = []
-    cost_list = []
+    """
+    处理单个报销案子
+    :param pk_phhd: 报销单主键
+    :param name: 报销人姓名
+    """
+    settlement_result = defaultdict(list)
+    discharge_result = defaultdict(list)
+    cost_result = defaultdict(list)

    session = MysqlSession()
-    phrecs = session.query(ZxPhrec.pk_phrec, ZxPhrec.pk_phhd, ZxPhrec.cRectype, ZxPhrec.cfjaddress).filter(
+    phrecs = session.query(ZxPhrec.pk_phrec, ZxPhrec.cRectype, ZxPhrec.cfjaddress).filter(
        ZxPhrec.pk_phhd == pk_phhd
    ).all()
    session.close()
-    for phrec in phrecs:
-        if phrec.cRectype == "1":
-            settlement_list.append(phrec)
-        elif phrec.cRectype == "3":
-            discharge_record.append(phrec)
-        elif phrec.cRectype == "4":
-            cost_list.append(phrec)
-
    # 同一批图的标识
    identity = int(time.time())
-    settlement_data = settlement_task(pk_phhd, settlement_list, identity)
-    discharge_data = discharge_task(pk_phhd, discharge_record, identity)
-    cost_data = cost_task(pk_phhd, cost_list, identity)
+    # Accumulators keyed by the document type returned from information_extraction
+    results_by_type = {'基本医保结算单': settlement_result, '出院记录': discharge_result, '费用清单': cost_result}
+    for phrec in phrecs:
+        rec_type, ie_result = information_extraction(phrec, identity)
+        rec_result = results_by_type.get(rec_type)
+        # An empty defaultdict is falsy, so compare against None instead of testing truthiness;
+        # a truthiness test would skip every image and nothing would ever be accumulated.
+        if rec_result is None or not ie_result:
+            continue
+        for key, value in ie_result.items():
+            # NOTE(review): the removed merge_result concatenated lists; confirm that
+            # appending (which nests each image's value) is what downstream code expects.
+            rec_result[key].append(value)
+
+    settlement_data = settlement_task(pk_phhd, settlement_result)
+    discharge_data = discharge_task(pk_phhd, discharge_result)
+    cost_data = cost_task(pk_phhd, cost_result)

    review_result = {
        'pk_phhd': pk_phhd,
@@ -573,6 +488,9 @@ def photo_review(pk_phhd, name):


 def main():
+    """
+    照片审核批量控制
+    """
    while 1:
        session = MysqlSession()
        phhds = (session.query(ZxPhhd.pk_phhd, ZxPhhd.cXm)
--- a/services/paddle_services/init.py
+++ b/services/paddle_services/init.py
@@ -1,34 +1,20 @@
 """
 信息抽取关键词配置
 """
-
-# 患者姓名
-PATIENT_NAME = ['患者姓名']
-# 入院日期
-ADMISSION_DATE = ['入院日期']
-# 出院日期
-DISCHARGE_DATE = ['出院日期']
-# 发生医疗费
-MEDICAL_EXPENSES = ['费用总额']
-# 个人现金支付
-PERSONAL_CASH_PAYMENT = ['个人现金支付']
-# 个人账户支付
-PERSONAL_ACCOUNT_PAYMENT = ['个人账户支付']
-# 个人自费金额
-PERSONAL_FUNDED_AMOUNT = ['自费金额', '个人自费']
-# 医保类别
-MEDICAL_INSURANCE_TYPE = ['医保类型']
-# 就诊医院
-HOSPITAL = ['医院']
-# 就诊科室
-DEPARTMENT = ['科室']
-# 主治医生
-DOCTOR = ['主治医生']
-# 住院号
-ADMISSION_ID = ['住院号']
-# 医保结算单号码
-SETTLEMENT_ID = ['医保结算单号码']
-# 年龄
-AGE = ['年龄']
-# 大写总额
-UPPERCASE_MEDICAL_EXPENSES = ['大写总额']
+IE_KEY = {  # logical field name -> Chinese label used in the extraction schemas and result lookups
+    'name': '患者姓名',
+    'admission_date': '入院日期',
+    'discharge_date': '出院日期',
+    'medical_expenses': '费用总额',
+    'personal_cash_payment': '个人现金支付',
+    'personal_account_payment': '个人账户支付',
+    'personal_funded_amount': '自费金额',  # NOTE(review): old constant also listed '个人自费'; confirm the drop
+    'medical_insurance_type': '医保类型',
+    'hospital': '医院',
+    'department': '科室',
+    'doctor': '主治医生',
+    'admission_id': '住院号',
+    'settlement_id': '医保结算单号码',
+    'age': '年龄',
+    'upper_case_medical_expenses': '大写总额'
+}
--- a/services/paddle_services/clas_text.py
+++ b/services/paddle_services/clas_text.py
@@ -19,7 +19,8 @@ def main():
    cls_result = CLAS(text)
    cls_result = cls_result[0].get('predictions')[0]
    if cls_result['score'] < 0.8:
-        raise Exception(f'识别结果置信度过低！text: {text}')
+        logging.info(f"识别结果置信度{cls_result['score']}过低！text: {text}")
+        return None
    return cls_result['label']


--- a/services/paddle_services/ie_cost.py
+++ b/services/paddle_services/ie_cost.py
@@ -4,12 +4,14 @@ import logging.config
 from flask import Flask, request
 from paddlenlp import Taskflow

-from __init__ import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES
+from __init__ import IE_KEY
 from log import LOGGING_CONFIG
 from utils import process_request

 app = Flask(__name__)
-COST_LIST_SCHEMA = PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES
+COST_LIST_SCHEMA = tuple(  # extraction fields for the cost list, looked up in IE_KEY
+    IE_KEY[field] for field in ['name', 'admission_date', 'discharge_date', 'medical_expenses']
+)
 COST = Taskflow('information_extraction', schema=COST_LIST_SCHEMA, model='uie-x-base',
                task_path='model/cost_list_model', layout_analysis=False, precision='fp16')

--- a/services/paddle_services/ie_discharge.py
+++ b/services/paddle_services/ie_discharge.py
@@ -4,14 +4,14 @@ import logging.config
 from flask import Flask, request
 from paddlenlp import Taskflow

-from __init__ import HOSPITAL, DEPARTMENT, PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, DOCTOR, ADMISSION_ID, AGE
+from __init__ import IE_KEY
 from log import LOGGING_CONFIG
 from utils import process_request

 app = Flask(__name__)
-DISCHARGE_RECORD_SCHEMA = (
-        HOSPITAL + DEPARTMENT + PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + DOCTOR + ADMISSION_ID + AGE
-)
+DISCHARGE_RECORD_SCHEMA = tuple(IE_KEY[field] for field in (  # extraction fields, looked up in IE_KEY
+    'hospital', 'department', 'name', 'admission_date', 'discharge_date', 'doctor', 'admission_id', 'age',
+))
 DISCHARGE = Taskflow('information_extraction', schema=DISCHARGE_RECORD_SCHEMA, model='uie-x-base',
                     task_path='model/discharge_record_model', layout_analysis=False, precision='fp16')

--- a/services/paddle_services/ie_settlement.py
+++ b/services/paddle_services/ie_settlement.py
@@ -4,18 +4,16 @@ import logging.config
 from flask import Flask, request
 from paddlenlp import Taskflow

-from __init__ import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, PERSONAL_CASH_PAYMENT, \
-    PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, ADMISSION_ID, SETTLEMENT_ID, \
-    UPPERCASE_MEDICAL_EXPENSES
+from __init__ import IE_KEY
 from log import LOGGING_CONFIG
 from utils import process_request

 app = Flask(__name__)
-SETTLEMENT_LIST_SCHEMA = (
-        PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES + PERSONAL_CASH_PAYMENT
-        + PERSONAL_ACCOUNT_PAYMENT + PERSONAL_FUNDED_AMOUNT + MEDICAL_INSURANCE_TYPE + ADMISSION_ID + SETTLEMENT_ID
-        + UPPERCASE_MEDICAL_EXPENSES
-)
+SETTLEMENT_LIST_SCHEMA = tuple(IE_KEY[key] for key in [
+    'name', 'admission_date', 'discharge_date', 'medical_expenses', 'personal_cash_payment',
+    'personal_account_payment', 'personal_funded_amount', 'medical_insurance_type', 'admission_id', 'settlement_id',
+    'upper_case_medical_expenses'
+])  # names must match the IE_KEY keys exactly; a mismatch raises KeyError when this module is imported
 SETTLEMENT_IE = Taskflow('information_extraction', schema=SETTLEMENT_LIST_SCHEMA, model='uie-x-base',
                         task_path='model/settlement_list_model', layout_analysis=False, precision='fp16')

--- a/util/common_util.py
+++ b/util/common_util.py
@@ -12,6 +12,44 @@ def get_default_datetime():
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')


+def ocr_result_to_layout(ocr_result):
+    """
+    Convert OCR segments into (box, text) layout tuples.
+    :param ocr_result: sequence of segments indexed as [corner points, (text, ...)]; may be empty or None
+    :return: list of ([x1, y1, x2, y2], text); segments whose box has negative width or height are skipped
+    """
+
+    def _to_rect(corners):
+        # Reduce the four corner points to [x1, y1, x2, y2]
+        left = min(corners[0][0], corners[3][0])
+        top = min(corners[0][1], corners[1][1])
+        right = max(corners[1][0], corners[2][0])
+        bottom = max(corners[2][1], corners[3][1])
+        return [left, top, right, bottom]
+
+    def _is_usable(rect):
+        # Only negative extents are rejected; zero-sized boxes pass
+        return not (rect[3] - rect[1] < 0 or rect[2] - rect[0] < 0)
+
+    if not ocr_result:
+        return []
+    layout = []
+    for segment in ocr_result:
+        rect = _to_rect(segment[0])
+        if _is_usable(rect):
+            layout.append((rect, segment[1][0]))
+    return layout
+
+
+def ocr_result_to_text(ocr_results):  # concatenate segment texts in order, capped at 2048 characters
+    text = ''
+    for ocr_result in ocr_results:
+        text += ocr_result[1][0]  # index [1][0] of a segment holds its text
+        if len(text) >= 2048:
+            break  # enough text collected; the remaining segments are not read
+    return text[:2048]
+
+
 def get_ocr_layout(ocr, img_path):
    """
    获取ocr识别的结果，转为合适的layout形式
--- a/util/image_util.py
+++ b/util/image_util.py
@@ -1,7 +1,6 @@
 import logging
 import math
 import os
-import urllib.request

 import cv2
 import numpy
@@ -12,80 +11,59 @@ from tenacity import retry, stop_after_attempt, wait_random
 from log import PROJECT_ROOT


-@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
-       after=lambda x: logging.warning('获取图片失败！'))
-def read(image_path):
-    """
-    从网络或本地读取图片
-    :param image_path: 网络或本地路径
-    :return: NumPy数组形式的图片
-    """
-    if image_path.startswith('http'):
-        # 发送HTTP请求并获取图像数据
-        resp = urllib.request.urlopen(image_path, timeout=60)
-        # 将数据读取为字节流
-        image_data = resp.read()
-        # 将字节流转换为NumPy数组
-        image_np = numpy.frombuffer(image_data, numpy.uint8)
-        # 解码NumPy数组为OpenCV图像格式
-        image = cv2.imdecode(image_np, cv2.IMREAD_COLOR)
-    else:
-        image = cv2.imread(image_path)
-    return image
-
-
 def capture(image, rectangle):
    """
    截取图片
-    :param image: 图片NumPy数组
+    :param image: ndarray
    :param rectangle: 要截取的矩形
-    :return: 截取之后的图片NumPy
+    :return: 截取之后的ndarray图片
    """
    x1, y1, x2, y2 = rectangle
    height, width = image.shape[:2]
-    if x1 < 0:
-        x1 = 0
-    if y1 < 0:
-        y1 = 0
-    if x2 > width:
-        x2 = width
-    if y2 > height:
-        y2 = height
+    # 确保坐标值在图片范围内
+    x1 = max(0, x1)
+    y1 = max(0, y1)
+    x2 = min(width, x2)
+    y2 = min(height, y2)
    return image[int(y1):int(y2), int(x1):int(x2)]


-def split(image, ratio=1.414, overlap=0.05, x_compensation=3):
+def split(img_path, ratio=1.414, overlap=0.05, x_compensation=3):
    """
    分割图片
-    :param image:图片，可以是NumPy数组或文件路径
+    :param img_path:图片路径
    :param ratio: 分割后的比例
    :param overlap: 图片之间的覆盖比例
    :param x_compensation: 横向补偿倍率
    :return: 分割后的图片组(NumPy数组形式)
    """
    split_result = []
-    if isinstance(image, str):
-        image = read(image)
+    image = cv2.imread(img_path)
    height, width = image.shape[:2]
    hw_ratio = height / width
    wh_ratio = width / height

+    img_name, img_ext = parse_save_path(img_path)
    if hw_ratio > ratio:  # 纵向过长
        new_img_height = width * ratio
        step = width * (ratio - overlap)  # 偏移步长
        for i in range(math.ceil(height / step)):
            offset = round(step * i)
            cropped_img = capture(image, [0, offset, width, offset + new_img_height])
-            split_result.append({'img': cropped_img, 'x_offset': 0, 'y_offset': offset})
+            split_path = get_save_path(f'{img_name}.split_{i}.{img_ext}')
+            cv2.imwrite(split_path, cropped_img)
+            split_result.append({'img': split_path, 'x_offset': 0, 'y_offset': offset})
    elif wh_ratio > ratio:  # 横向过长
        new_img_width = height * ratio
        step = height * (ratio - overlap * x_compensation)  # 一般文字是横向的，所以横向截取时增大重叠部分
        for i in range(math.ceil(width / step)):
            offset = round(step * i)
            cropped_img = capture(image, [offset, 0, offset + new_img_width, width])
-            split_result.append({'img': cropped_img, 'x_offset': offset, 'y_offset': 0})
+            split_path = get_save_path(f'{img_name}.split_{i}.{img_ext}')
+            cv2.imwrite(split_path, cropped_img)
+            split_result.append({'img': split_path, 'x_offset': offset, 'y_offset': 0})
    else:
-        split_result.append({'img': image, 'x_offset': 0, 'y_offset': 0})
+        split_result.append({'img': img_path, 'x_offset': 0, 'y_offset': 0})
    return split_result


@@ -108,15 +86,16 @@ def parse_rotation_angles(image):
    return angles


-def rotate(image, angle):
+def rotate(img_path, angle):
    """
    旋转图片
-    :param image: 图片NumPy数组
+    :param img_path: 图片路径
    :param angle: 逆时针旋转角度
    :return: 旋转后的图片NumPy数组
    """
    if angle == 0:
-        return image
+        return img_path
+    image = cv2.imread(img_path)
    height, width = image.shape[:2]
    if angle == 180:
        new_width = width
@@ -132,7 +111,11 @@ def rotate(image, angle):
    matrix[1, 2] += (new_height - height) / 2
    # 参数：原始图像 旋转参数 元素图像宽高
    rotated = cv2.warpAffine(image, matrix, (new_width, new_height))
-    return rotated
+
+    img_name, img_ext = parse_save_path(img_path)
+    rotated_path = get_save_path(f'{img_name}.rotate_{angle}.{img_ext}')
+    cv2.imwrite(rotated_path, rotated)
+    return rotated_path


 def invert_rotate_point(point, center, angle):
@@ -260,26 +243,38 @@ def parse_img_url(url):
    :return: 图片名称和图片后缀
    """
    url = url.split('?')[0]
-    return os.path.basename(url).rsplit('.', 1)
+    return os.path.basename(url)


@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
       after=lambda x: logging.warning('保存图片失败！'))
-def save_to_local(img_url, save_path=None):
+def save_to_local(img_url):
    """
    保存图片到本地
    :param img_url: 图片url
-    :param save_path: 本地保存地址，精确到文件名
    :return: 本地保存地址
    """
    response = requests.get(img_url)
    response.raise_for_status()  # 检查响应状态码是否正常

-    if save_path is None:
-        img_name, img_ext = parse_img_url(img_url)
-        save_path = os.path.join(PROJECT_ROOT, 'tmp_img', img_name + '.' + img_ext)
-
+    save_path = get_save_path(parse_img_url(img_url))
    with open(save_path, 'wb') as file:
        file.write(response.content)
-
    return save_path
+
+
+def get_img_path(img_full_name):
+    save_path = get_save_path(img_full_name)
+    if os.path.exists(save_path):
+        return save_path
+    return None
+
+
+def get_save_path(img_full_name):
+    return os.path.join(PROJECT_ROOT, 'tmp_img', img_full_name)
+
+
+def parse_save_path(img_path):
+    img_full_name = os.path.basename(img_path)
+    img_name, img_ext = img_full_name.rsplit('.', 1)
+    return img_name, img_ext
--- a/util/model_util.py
+++ b/util/model_util.py
@@ -1,5 +1,6 @@
 import json
 import logging
+import os.path

 import requests
 from tenacity import retry, stop_after_attempt, wait_random
@@ -16,9 +17,10 @@ def ocr(img_path):
    url = 'http://ocr:5001'
    response = requests.post(url, {'img_path': img_path})
    if response.status_code == 200:
-        return response.json()
-    else:
-        return None
+        ocr_result = response.json()
+        if ocr_result:
+            return ocr_result[0]
+    return None


@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
@@ -40,7 +42,7 @@ def ie_settlement(img_path, layout):

@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
       after=lambda x: logging.warning('从文本抽取基本医保结算单失败！'))
-def ie_settlement(text):
+def ie_settlement_text(text):
    """
    请求基本医保结算单信息抽取接口
    :param text: 待抽取文本
@@ -73,7 +75,7 @@ def ie_discharge(img_path, layout):

@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
       after=lambda x: logging.warning('从文本抽取出院记录失败！'))
-def ie_discharge(text):
+def ie_discharge_text(text):
    """
    请求出院记录信息抽取接口
    :param text: 待抽取文本
@@ -106,7 +108,7 @@ def ie_cost(img_path, layout):

@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
       after=lambda x: logging.warning('从文本抽取费用清单失败！'))
-def ie_cost(text):
+def ie_cost_text(text):
    """
    请求费用清单信息抽取接口
    :param text: 待抽取文本
@@ -147,9 +149,22 @@ def det_book(img_path):
    url = 'http://det_book:5006'
    response = requests.post(url, {'img_path': img_path})
    if response.status_code == 200:
-        return response.json()
+        book_path_list = response.json()
+        if len(book_path_list) == 0:
+            return img_path
+        elif len(book_path_list) == 1:
+            return book_path_list[0]
+        else:
+            max_book = img_path
+            max_size = 0
+            for book_path in book_path_list:
+                book_size = os.path.getsize(book_path)
+                if book_size > max_size:
+                    max_book = book_path
+                    max_size = book_size
+            return max_book
    else:
-        return [img_path]
+        return img_path


@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,