优化案子处理逻辑

This commit is contained in:
2024-10-09 09:39:29 +08:00
parent a3fa1e502e
commit 795134f566
10 changed files with 257 additions and 304 deletions

View File

@@ -1,7 +1,4 @@
import json
import logging
import os
import tempfile
import time
from collections import defaultdict
from time import sleep
@@ -10,72 +7,24 @@ import cv2
import fitz
import jieba
import numpy as np
import requests
import zxingcpp
from rapidfuzz import process, fuzz
from sqlalchemy import update
from db import MysqlSession
from db.mysql import BdYljg, BdYlks, ZxIeResult, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec, ZxIeReview
from db.mysql import BdYljg, BdYlks, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec, ZxIeReview
from log import HOSTNAME
from photo_review import PHHD_BATCH_SIZE, SLEEP_MINUTES, HOSPITAL_ALIAS, HOSPITAL_FILTER, DEPARTMENT_ALIAS, \
DEPARTMENT_FILTER
from services.paddle_services import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, \
PERSONAL_CASH_PAYMENT, PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, \
DEPARTMENT, DOCTOR, ADMISSION_ID, SETTLEMENT_ID, AGE, UPPERCASE_MEDICAL_EXPENSES
from services.paddle_services import IE_KEY
from ucloud import ufile
from util import image_util, common_util, html_util, model_util
from util.data_util import handle_date, handle_decimal, parse_department, handle_name, handle_insurance_type, \
handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, parse_hospital
# 合并信息抽取结果
def merge_result(result1, result2):
    """Merge the extraction results of ``result2`` into ``result1``.

    For every key of ``result2`` its value is concatenated (with ``+``) onto
    the value ``result1`` already holds under that key, or onto an empty list
    when the key is absent.

    :param result1: mapping that is updated in place.
    :param result2: mapping whose values are appended; it is not modified.
    :return: ``result1`` (the same object that was passed in).
    """
    for field, extra_values in result2.items():
        current_values = result1.get(field, [])
        # ``+`` builds a new list, so the lists of ``result2`` are never aliased.
        result1[field] = current_values + extra_values
    return result1
def ie_temp_image(ie, ocr, image):
    """Run information extraction on an in-memory image via a temporary file.

    The image is written to a temporary ``.jpg`` file, an OCR layout is
    obtained for that file with ``common_util.get_ocr_layout``, and ``ie`` is
    called with the file path and the layout. The temporary file is removed
    afterwards in every case.

    :param ie: callable invoked as ``ie({"doc": path, "layout": layout})``;
        the first element of its return value is used.
    :param ocr: OCR object handed through to ``common_util.get_ocr_layout``.
    :param image: image passed to ``cv2.imwrite``.
    :return: the extraction result, or ``[]`` when there is no layout or the
        extraction raised (the error is logged, not propagated).
    """
    # delete=False: the file must outlive the ``with`` block so that it can be
    # read by path below; it is removed explicitly in ``finally``.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
        cv2.imwrite(temp_file.name, image)

    extracted = []
    try:
        layout = common_util.get_ocr_layout(ocr, temp_file.name)
        if layout:
            extracted = ie({"doc": temp_file.name, "layout": layout})[0]
        # An empty layout means nothing was recognised: keep the empty result.
    except Exception as e:
        logging.error("信息抽取时出错", exc_info=e)
    finally:
        try:
            os.remove(temp_file.name)
        except Exception as e:
            # Best-effort cleanup: a leftover temp file must not fail the call.
            logging.info(f"删除临时文件 {temp_file.name} 时出错", exc_info=e)
    return extracted
# 关键信息提取
def request_ie_result(task_enum, phrecs, timeout=None):
    """Request key-information extraction for a batch of images over HTTP.

    :param task_enum: object providing ``request_url()`` and ``schema()`` for
        the extraction task.
    :param phrecs: non-empty sequence of records exposing ``cfjaddress``,
        ``pk_phrec`` and ``pk_phhd``. The ``pk_phhd`` of the first record is
        sent for the whole batch, so an empty sequence raises ``IndexError``.
    :param timeout: forwarded to ``requests.post``. The default ``None`` keeps
        the previous behaviour of waiting indefinitely; pass a number of
        seconds (or a ``(connect, read)`` tuple) to bound the wait so a
        stalled service cannot block the caller forever.
    :return: the ``"data"`` member of the JSON response body.
    :raises Exception: when the response status code is not 200.
    """
    url = task_enum.request_url()
    # Second-resolution timestamp sent as the "identity" field of the request.
    identity = int(time.time())
    images = [{"name": phrec.cfjaddress, "pk": phrec.pk_phrec} for phrec in phrecs]
    payload = {
        "images": images,
        "schema": task_enum.schema(),
        "pk_phhd": phrecs[0].pk_phhd,
        "identity": identity,
    }
    response = requests.post(url, json=payload, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"请求信息抽取结果失败,状态码:{response.status_code}")
    return response.json()["data"]
# 尝试从二维码中获取高清图片
def get_better_image_from_qrcode(image, image_id, dpi=150):
def get_better_image_from_qrcode(img_path, image_id, dpi=150):
def _parse_pdf_url(pdf_url_to_parse):
pdf_file = None
local_pdf_path = None
@@ -95,7 +44,10 @@ def get_better_image_from_qrcode(image, image_id, dpi=150):
# 将渲染结果转换为OpenCV兼容的格式
img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, -1))
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
return img, page.get_text()
img_name, img_ext = image_util.parse_save_path(img_path)
better_img_path = image_util.get_save_path(f'{img_name}.better.{img_ext}')
cv2.imwrite(better_img_path, img)
return better_img_path, page.get_text()
except Exception as ex:
logging.getLogger('error').error('解析pdf失败', exc_info=ex)
return None, None
@@ -107,7 +59,8 @@ def get_better_image_from_qrcode(image, image_id, dpi=150):
jsczt_base_url = 'http://einvoice.jsczt.cn'
try:
results = zxingcpp.read_barcodes(image)
img = cv2.imread(img_path)
results = zxingcpp.read_barcodes(img, text_mode=zxingcpp.TextMode.HRI)
except Exception as e:
logging.getLogger('error').info('二维码识别失败', exc_info=e)
results = []
@@ -145,106 +98,52 @@ def get_better_image_from_qrcode(image, image_id, dpi=150):
# 关键信息提取
def information_extraction(ie, phrecs, identity):
result = {}
for phrec in phrecs:
def information_extraction(phrec, identity):
"""
处理单张图片
:param phrec:
:return:
"""
img_path = image_util.get_img_path(phrec.cfjaddress)
if not img_path:
img_url = ufile.get_private_url(phrec.cfjaddress)
if not img_url:
continue
img_path = image_util.save_to_local(img_url)
image = cv2.imread(img_path)
# 尝试从二维码中获取高清图片
better_image, text = get_better_image_from_qrcode(image, phrec.cfjaddress)
if phrec.cRectype != '1':
better_image = None # 非结算单暂时不进行替换
zx_ie_results = []
if better_image is not None:
img_angle = '0'
image = better_image
if text:
info_extract = ie(text)[0]
else:
info_extract = ie_temp_image(ie, OCR, image)
if not info_extract:
continue
ie_result = {'result': info_extract, 'angle': img_angle}
now = common_util.get_default_datetime()
result_json = json.dumps(ie_result['result'], ensure_ascii=False)
if len(result_json) > 5000:
result_json = result_json[:5000]
zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity,
cfjaddress=phrec.cfjaddress, content=result_json,
rotation_angle=int(ie_result['angle']),
x_offset=0, y_offset=0, create_time=now,
creator=HOSTNAME, update_time=now, updater=HOSTNAME))
result = merge_result(result, ie_result['result'])
# 尝试从二维码中获取高清图片
better_img_path, text = get_better_image_from_qrcode(img_path, phrec.cfjaddress)
if phrec.cRectype != '1':
better_img_path = None # 非结算单暂时不进行替换
if better_img_path is not None:
if text:
info_extract = model_util.ie_settlement_text(text)[0]
else:
target_images = model_util.request_book_areas(img_path) # 识别文档区域并裁剪
angle_count = defaultdict(int, {'0': 0}) # 分割后图片的最优角度统计
for target_image in target_images:
dewarped_image = model_util.dewarp(target_image) # 去扭曲
angles = model_util.clas_orientation(dewarped_image)
info_extract = model_util.ie_settlement(better_img_path,
common_util.ocr_result_to_layout(model_util.ocr(better_img_path)))
split_results = image_util.split(dewarped_image)
for split_result in split_results:
if split_result['img'] is None or split_result['img'].size == 0:
continue
rotated_img = image_util.rotate(split_result['img'], int(angles[0]))
ie_results = [{'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[0]}]
if not ie_results[0]['result'] or len(ie_results[0]['result']) < len(ie.kwargs.get('schema')):
rotated_img = image_util.rotate(split_result['img'], int(angles[1]))
ie_results.append({'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[1]})
now = common_util.get_default_datetime()
best_angle = ['0', 0]
for ie_result in ie_results:
if not ie_result['result']:
continue
return '基本医保结算单', info_extract
else:
target_image = model_util.det_book(img_path) # 识别文档区域并裁剪
dewarped_image = model_util.dewarp(target_image) # 去扭曲
angles = model_util.clas_orientation(dewarped_image)
rotated_img = image_util.rotate(dewarped_image, int(angles[0]))
split_results = image_util.split(rotated_img)
ocr_result = []
for split_result in split_results:
if split_result['img'] is None:
continue
ocr_result += model_util.ocr(rotated_img)
ocr_text = common_util.ocr_result_to_text(ocr_result)
rec_type = model_util.clas_text(ocr_text) if ocr_text else None
if rec_type == '基本医保结算单':
info_extract = model_util.ie_settlement(rotated_img, common_util.ocr_result_to_layout(ocr_result))
elif rec_type == '出院记录':
info_extract = model_util.ie_discharge(rotated_img, common_util.ocr_result_to_layout(ocr_result))
elif rec_type == '费用清单':
info_extract = model_util.ie_cost(rotated_img, common_util.ocr_result_to_layout(ocr_result))
else:
info_extract = None
result_json = json.dumps(ie_result['result'], ensure_ascii=False)
if len(result_json) > 5000:
result_json = result_json[:5000]
zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity,
cfjaddress=phrec.cfjaddress, content=result_json,
rotation_angle=int(ie_result['angle']),
x_offset=split_result['x_offset'],
y_offset=split_result['y_offset'], create_time=now,
creator=HOSTNAME, update_time=now, updater=HOSTNAME))
result = merge_result(result, ie_result['result'])
if len(ie_result['result']) > best_angle[1]:
best_angle = [ie_result['angle'], len(ie_result['result'])]
angle_count[best_angle[0]] += 1
img_angle = max(angle_count, key=angle_count.get)
if img_angle != '0' or better_image is not None:
image = image_util.rotate(image, int(img_angle))
with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
cv2.imwrite(temp_file.name, image)
try:
ufile.upload_file(phrec.cfjaddress, temp_file.name)
if img_angle != '0':
logging.info(f'旋转图片[{phrec.cfjaddress}]替换成功,已旋转{img_angle}度。')
# 修正旋转角度
for zx_ie_result in zx_ie_results:
zx_ie_result.rotation_angle -= int(img_angle)
else:
logging.info(f'高清图片[{phrec.cfjaddress}]替换成功!')
except Exception as e:
logging.error(f'上传图片({phrec.cfjaddress})失败', exc_info=e)
finally:
common_util.delete_temp_file(temp_file.name)
session = MysqlSession()
session.add_all(zx_ie_results)
session.commit()
session.close()
return result
return rec_type, info_extract
# 从keys中获取准确率最高的value
@@ -359,23 +258,24 @@ def search_department(department):
return best_match
def settlement_task(pk_phhd, settlement_list, identity):
settlement_list_ie_result = information_extraction(SETTLEMENT_IE, settlement_list, identity)
def settlement_task(pk_phhd, settlement_list_ie_result):
settlement_data = {
"pk_phhd": pk_phhd,
"name": handle_name(get_best_value_in_keys(settlement_list_ie_result, PATIENT_NAME)),
"admission_date_str": handle_original_data(get_best_value_in_keys(settlement_list_ie_result, ADMISSION_DATE)),
"discharge_date_str": handle_original_data(get_best_value_in_keys(settlement_list_ie_result, DISCHARGE_DATE)),
"name": handle_name(get_best_value_in_keys(settlement_list_ie_result, IE_KEY['name'])),
"admission_date_str": handle_original_data(
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['admission_date'])),
"discharge_date_str": handle_original_data(
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['discharge_date'])),
"personal_cash_payment_str": handle_original_data(
get_best_value_in_keys(settlement_list_ie_result, PERSONAL_CASH_PAYMENT)),
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['personal_cash_payment'])),
"personal_account_payment_str": handle_original_data(
get_best_value_in_keys(settlement_list_ie_result, PERSONAL_ACCOUNT_PAYMENT)),
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['personal_account_payment'])),
"personal_funded_amount_str": handle_original_data(
get_best_value_in_keys(settlement_list_ie_result, PERSONAL_FUNDED_AMOUNT)),
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['personal_funded_amount'])),
"medical_insurance_type_str": handle_original_data(
get_best_value_in_keys(settlement_list_ie_result, MEDICAL_INSURANCE_TYPE)),
"admission_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, ADMISSION_ID)),
"settlement_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, SETTLEMENT_ID)),
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['medical_insurance_type'])),
"admission_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, IE_KEY['admission_id'])),
"settlement_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, IE_KEY['settlement_id'])),
}
settlement_data["admission_date"] = handle_date(settlement_data["admission_date_str"])
settlement_data["admission_date"] = handle_date(settlement_data["admission_date_str"])
@@ -385,28 +285,30 @@ def settlement_task(pk_phhd, settlement_list, identity):
settlement_data["personal_funded_amount"] = handle_decimal(settlement_data["personal_funded_amount_str"])
settlement_data["medical_insurance_type"] = handle_insurance_type(settlement_data["medical_insurance_type_str"])
parse_money_result = parse_money(get_best_value_in_keys(settlement_list_ie_result, UPPERCASE_MEDICAL_EXPENSES),
get_best_value_in_keys(settlement_list_ie_result, MEDICAL_EXPENSES))
parse_money_result = parse_money(
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['upper_case_medical_expenses']),
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['medical_expenses']))
settlement_data["medical_expenses_str"] = handle_original_data(parse_money_result[0])
settlement_data["medical_expenses"] = parse_money_result[1]
save_or_update_ie(ZxIeSettlement, pk_phhd, settlement_data)
return settlement_data
def discharge_task(pk_phhd, discharge_record, identity):
discharge_record_ie_result = information_extraction(DISCHARGE_IE, discharge_record, identity)
hospitals = get_values_of_keys(discharge_record_ie_result, HOSPITAL)
departments = get_values_of_keys(discharge_record_ie_result, DEPARTMENT)
def discharge_task(pk_phhd, discharge_record_ie_result):
hospitals = get_values_of_keys(discharge_record_ie_result, IE_KEY['hospital'])
departments = get_values_of_keys(discharge_record_ie_result, IE_KEY['department'])
discharge_data = {
"pk_phhd": pk_phhd,
"hospital": handle_hospital(",".join(hospitals)),
"department": handle_department(",".join(departments)),
"name": handle_name(get_best_value_in_keys(discharge_record_ie_result, PATIENT_NAME)),
"admission_date_str": handle_original_data(get_best_value_in_keys(discharge_record_ie_result, ADMISSION_DATE)),
"discharge_date_str": handle_original_data(get_best_value_in_keys(discharge_record_ie_result, DISCHARGE_DATE)),
"doctor": handle_name(get_best_value_in_keys(discharge_record_ie_result, DOCTOR)),
"admission_id": handle_id(get_best_value_in_keys(discharge_record_ie_result, ADMISSION_ID)),
"age": handle_age(get_best_value_in_keys(discharge_record_ie_result, AGE)),
"name": handle_name(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['name'])),
"admission_date_str": handle_original_data(
get_best_value_in_keys(discharge_record_ie_result, IE_KEY['admission_date'])),
"discharge_date_str": handle_original_data(
get_best_value_in_keys(discharge_record_ie_result, IE_KEY['discharge_date'])),
"doctor": handle_name(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['doctor'])),
"admission_id": handle_id(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['admission_id'])),
"age": handle_age(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['age'])),
}
discharge_data["admission_date"] = handle_date(discharge_data["admission_date_str"])
discharge_data["discharge_date"] = handle_date(discharge_data["discharge_date_str"])
@@ -466,14 +368,16 @@ def discharge_task(pk_phhd, discharge_record, identity):
return discharge_data
def cost_task(pk_phhd, cost_list, identity):
cost_list_ie_result = information_extraction(COST_IE, cost_list, identity)
def cost_task(pk_phhd, cost_list_ie_result):
cost_data = {
"pk_phhd": pk_phhd,
"name": handle_name(get_best_value_in_keys(cost_list_ie_result, PATIENT_NAME)),
"admission_date_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, ADMISSION_DATE)),
"discharge_date_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, DISCHARGE_DATE)),
"medical_expenses_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, MEDICAL_EXPENSES))
"name": handle_name(get_best_value_in_keys(cost_list_ie_result, IE_KEY['name'])),
"admission_date_str": handle_original_data(
get_best_value_in_keys(cost_list_ie_result, IE_KEY['admission_date'])),
"discharge_date_str": handle_original_data(
get_best_value_in_keys(cost_list_ie_result, IE_KEY['discharge_date'])),
"medical_expenses_str": handle_original_data(
get_best_value_in_keys(cost_list_ie_result, IE_KEY['medical_expenses']))
}
cost_data["admission_date"] = handle_date(cost_data["admission_date_str"])
cost_data["discharge_date"] = handle_date(cost_data["discharge_date_str"])
@@ -483,28 +387,39 @@ def cost_task(pk_phhd, cost_list, identity):
def photo_review(pk_phhd, name):
settlement_list = []
discharge_record = []
cost_list = []
"""
处理单个报销案子
:param pk_phhd: 报销单主键
:param name: 报销人姓名
"""
settlement_result = defaultdict(list)
discharge_result = defaultdict(list)
cost_result = defaultdict(list)
session = MysqlSession()
phrecs = session.query(ZxPhrec.pk_phrec, ZxPhrec.pk_phhd, ZxPhrec.cRectype, ZxPhrec.cfjaddress).filter(
phrecs = session.query(ZxPhrec.pk_phrec, ZxPhrec.cRectype, ZxPhrec.cfjaddress).filter(
ZxPhrec.pk_phhd == pk_phhd
).all()
session.close()
for phrec in phrecs:
if phrec.cRectype == "1":
settlement_list.append(phrec)
elif phrec.cRectype == "3":
discharge_record.append(phrec)
elif phrec.cRectype == "4":
cost_list.append(phrec)
# 同一批图的标识
identity = int(time.time())
settlement_data = settlement_task(pk_phhd, settlement_list, identity)
discharge_data = discharge_task(pk_phhd, discharge_record, identity)
cost_data = cost_task(pk_phhd, cost_list, identity)
for phrec in phrecs:
rec_type, ie_result = information_extraction(phrec, identity)
if rec_type == '基本医保结算单':
rec_result = settlement_result
elif rec_type == '出院记录':
rec_result = discharge_result
elif rec_type == '费用清单':
rec_result = cost_result
else:
rec_result = None
if rec_result:
for key, value in ie_result.items():
rec_result[key].append(value)
settlement_data = settlement_task(pk_phhd, settlement_result)
discharge_data = discharge_task(pk_phhd, discharge_result)
cost_data = cost_task(pk_phhd, cost_result)
review_result = {
'pk_phhd': pk_phhd,
@@ -573,6 +488,9 @@ def photo_review(pk_phhd, name):
def main():
"""
照片审核批量控制
"""
while 1:
session = MysqlSession()
phhds = (session.query(ZxPhhd.pk_phhd, ZxPhhd.cXm)