优化案子处理逻辑

This commit is contained in:
2024-10-09 09:39:29 +08:00
parent a3fa1e502e
commit 795134f566
10 changed files with 257 additions and 304 deletions

View File

@@ -5,12 +5,13 @@ from time import sleep
from sqlalchemy import update from sqlalchemy import update
from my_email.error_email import send_error_email
from db import MysqlSession from db import MysqlSession
from db.mysql import ZxPhhd from db.mysql import ZxPhhd
from log import LOGGING_CONFIG from log import LOGGING_CONFIG
from my_email.error_email import send_error_email
from photo_review import auto_photo_review, SEND_ERROR_EMAIL from photo_review import auto_photo_review, SEND_ERROR_EMAIL
# 照片审核自动识别脚本入口
if __name__ == '__main__': if __name__ == '__main__':
program_name = '照片审核自动识别脚本' program_name = '照片审核自动识别脚本'
logging.config.dictConfig(LOGGING_CONFIG) logging.config.dictConfig(LOGGING_CONFIG)
@@ -19,7 +20,7 @@ if __name__ == '__main__':
parser.add_argument('--clean', default=False, type=bool, help='是否将识别中的案子改为待识别状态') parser.add_argument('--clean', default=False, type=bool, help='是否将识别中的案子改为待识别状态')
args = parser.parse_args() args = parser.parse_args()
if args.clean: if args.clean:
# 主要用于启动时清除仍在涂抹中的案子 # 启动时清除仍在识别中的案子
session = MysqlSession() session = MysqlSession()
update_flag = (update(ZxPhhd).where(ZxPhhd.exsuccess_flag == '2').values(exsuccess_flag='1')) update_flag = (update(ZxPhhd).where(ZxPhhd.exsuccess_flag == '2').values(exsuccess_flag='1'))
session.execute(update_flag) session.execute(update_flag)
@@ -33,7 +34,6 @@ if __name__ == '__main__':
logging.info(f'{program_name}】开始运行') logging.info(f'{program_name}】开始运行')
auto_photo_review.main() auto_photo_review.main()
except Exception as e: except Exception as e:
error_logger = logging.getLogger('error') logging.getLogger('error').error(traceback.format_exc())
error_logger.error(traceback.format_exc())
if SEND_ERROR_EMAIL: if SEND_ERROR_EMAIL:
send_error_email(program_name, repr(e), traceback.format_exc()) send_error_email(program_name, repr(e), traceback.format_exc())

View File

@@ -1,7 +1,4 @@
import json
import logging import logging
import os
import tempfile
import time import time
from collections import defaultdict from collections import defaultdict
from time import sleep from time import sleep
@@ -10,72 +7,24 @@ import cv2
import fitz import fitz
import jieba import jieba
import numpy as np import numpy as np
import requests
import zxingcpp import zxingcpp
from rapidfuzz import process, fuzz from rapidfuzz import process, fuzz
from sqlalchemy import update from sqlalchemy import update
from db import MysqlSession from db import MysqlSession
from db.mysql import BdYljg, BdYlks, ZxIeResult, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec, ZxIeReview from db.mysql import BdYljg, BdYlks, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec, ZxIeReview
from log import HOSTNAME from log import HOSTNAME
from photo_review import PHHD_BATCH_SIZE, SLEEP_MINUTES, HOSPITAL_ALIAS, HOSPITAL_FILTER, DEPARTMENT_ALIAS, \ from photo_review import PHHD_BATCH_SIZE, SLEEP_MINUTES, HOSPITAL_ALIAS, HOSPITAL_FILTER, DEPARTMENT_ALIAS, \
DEPARTMENT_FILTER DEPARTMENT_FILTER
from services.paddle_services import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, \ from services.paddle_services import IE_KEY
PERSONAL_CASH_PAYMENT, PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, \
DEPARTMENT, DOCTOR, ADMISSION_ID, SETTLEMENT_ID, AGE, UPPERCASE_MEDICAL_EXPENSES
from ucloud import ufile from ucloud import ufile
from util import image_util, common_util, html_util, model_util from util import image_util, common_util, html_util, model_util
from util.data_util import handle_date, handle_decimal, parse_department, handle_name, handle_insurance_type, \ from util.data_util import handle_date, handle_decimal, parse_department, handle_name, handle_insurance_type, \
handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, parse_hospital handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, parse_hospital
# 合并信息抽取结果
def merge_result(result1, result2):
for key in result2:
result1[key] = result1.get(key, []) + result2[key]
return result1
def ie_temp_image(ie, ocr, image):
with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
cv2.imwrite(temp_file.name, image)
ie_result = []
try:
layout = common_util.get_ocr_layout(ocr, temp_file.name)
if not layout:
# 无识别结果
ie_result = []
else:
ie_result = ie({"doc": temp_file.name, "layout": layout})[0]
except Exception as e:
logging.error("信息抽取时出错", exc_info=e)
finally:
try:
os.remove(temp_file.name)
except Exception as e:
logging.info(f"删除临时文件 {temp_file.name} 时出错", exc_info=e)
return ie_result
# 关键信息提取
def request_ie_result(task_enum, phrecs):
url = task_enum.request_url()
identity = int(time.time())
images = []
for phrec in phrecs:
images.append({"name": phrec.cfjaddress, "pk": phrec.pk_phrec})
payload = {"images": images, "schema": task_enum.schema(), "pk_phhd": phrecs[0].pk_phhd, "identity": identity}
response = requests.post(url, json=payload)
if response.status_code == 200:
return response.json()["data"]
else:
raise Exception(f"请求信息抽取结果失败,状态码:{response.status_code}")
# 尝试从二维码中获取高清图片 # 尝试从二维码中获取高清图片
def get_better_image_from_qrcode(image, image_id, dpi=150): def get_better_image_from_qrcode(img_path, image_id, dpi=150):
def _parse_pdf_url(pdf_url_to_parse): def _parse_pdf_url(pdf_url_to_parse):
pdf_file = None pdf_file = None
local_pdf_path = None local_pdf_path = None
@@ -95,7 +44,10 @@ def get_better_image_from_qrcode(image, image_id, dpi=150):
# 将渲染结果转换为OpenCV兼容的格式 # 将渲染结果转换为OpenCV兼容的格式
img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, -1)) img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, -1))
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
return img, page.get_text() img_name, img_ext = image_util.parse_save_path(img_path)
better_img_path = image_util.get_save_path(f'{img_name}.better.{img_ext}')
cv2.imwrite(better_img_path, img)
return better_img_path, page.get_text()
except Exception as ex: except Exception as ex:
logging.getLogger('error').error('解析pdf失败', exc_info=ex) logging.getLogger('error').error('解析pdf失败', exc_info=ex)
return None, None return None, None
@@ -107,7 +59,8 @@ def get_better_image_from_qrcode(image, image_id, dpi=150):
jsczt_base_url = 'http://einvoice.jsczt.cn' jsczt_base_url = 'http://einvoice.jsczt.cn'
try: try:
results = zxingcpp.read_barcodes(image) img = cv2.imread(img_path)
results = zxingcpp.read_barcodes(img, text_mode=zxingcpp.TextMode.HRI)
except Exception as e: except Exception as e:
logging.getLogger('error').info('二维码识别失败', exc_info=e) logging.getLogger('error').info('二维码识别失败', exc_info=e)
results = [] results = []
@@ -145,106 +98,52 @@ def get_better_image_from_qrcode(image, image_id, dpi=150):
# 关键信息提取 # 关键信息提取
def information_extraction(phrec, identity):
    """
    Classify one uploaded image and extract its key fields.

    The local path is looked up with ``image_util.get_img_path``; when that
    yields nothing the image is downloaded through a private ufile URL.
    For records whose ``cRectype`` is ``'1'`` a better image obtained through
    the QR code is used when available, and the record type is then fixed to
    '基本医保结算单'. Otherwise the image is cropped, dewarped, rotated, OCR-ed,
    classified from its text and handed to the matching extraction model.

    :param phrec: record row exposing ``cfjaddress`` and ``cRectype``
    :param identity: batch identifier; not used inside this function
    :return: ``(rec_type, info_extract)``. ``(None, None)`` when no download
        URL can be obtained; ``info_extract`` is ``None`` when the classified
        type is not one of the three handled record types.
    """
    img_path = image_util.get_img_path(phrec.cfjaddress)
    if not img_path:
        img_url = ufile.get_private_url(phrec.cfjaddress)
        if not img_url:
            # Without a URL there is nothing to download; skip this image
            # instead of letting save_to_local fail and abort the whole case.
            return None, None
        img_path = image_util.save_to_local(img_url)

    # Try to obtain a high-resolution image through the QR code.
    better_img_path, text = get_better_image_from_qrcode(img_path, phrec.cfjaddress)
    if phrec.cRectype != '1':
        better_img_path = None  # replacement is only applied to settlement lists for now

    if better_img_path is not None:
        if text:
            info_extract = model_util.ie_settlement_text(text)[0]
        else:
            layout = common_util.ocr_result_to_layout(model_util.ocr(better_img_path))
            info_extract = model_util.ie_settlement(better_img_path, layout)
        return '基本医保结算单', info_extract

    target_image = model_util.det_book(img_path)  # detect and crop the document region
    dewarped_image = model_util.dewarp(target_image)  # remove warping
    angles = model_util.clas_orientation(dewarped_image)
    rotated_img = image_util.rotate(dewarped_image, int(angles[0]))
    split_results = image_util.split(rotated_img)

    ocr_result = []
    for split_result in split_results:
        if split_result['img'] is None:
            continue
        # NOTE(review): every iteration OCRs the whole rotated image, not
        # split_result['img'], so identical segments are appended once per
        # split. Confirm whether per-split OCR (with x/y offset correction)
        # was intended before changing this.
        ocr_result += model_util.ocr(rotated_img)

    ocr_text = common_util.ocr_result_to_text(ocr_result)
    rec_type = model_util.clas_text(ocr_text) if ocr_text else None

    if rec_type == '基本医保结算单':
        info_extract = model_util.ie_settlement(rotated_img, common_util.ocr_result_to_layout(ocr_result))
    elif rec_type == '出院记录':
        info_extract = model_util.ie_discharge(rotated_img, common_util.ocr_result_to_layout(ocr_result))
    elif rec_type == '费用清单':
        info_extract = model_util.ie_cost(rotated_img, common_util.ocr_result_to_layout(ocr_result))
    else:
        info_extract = None

    return rec_type, info_extract
# 从keys中获取准确率最高的value # 从keys中获取准确率最高的value
@@ -359,23 +258,24 @@ def search_department(department):
return best_match return best_match
def settlement_task(pk_phhd, settlement_list, identity): def settlement_task(pk_phhd, settlement_list_ie_result):
settlement_list_ie_result = information_extraction(SETTLEMENT_IE, settlement_list, identity)
settlement_data = { settlement_data = {
"pk_phhd": pk_phhd, "pk_phhd": pk_phhd,
"name": handle_name(get_best_value_in_keys(settlement_list_ie_result, PATIENT_NAME)), "name": handle_name(get_best_value_in_keys(settlement_list_ie_result, IE_KEY['name'])),
"admission_date_str": handle_original_data(get_best_value_in_keys(settlement_list_ie_result, ADMISSION_DATE)), "admission_date_str": handle_original_data(
"discharge_date_str": handle_original_data(get_best_value_in_keys(settlement_list_ie_result, DISCHARGE_DATE)), get_best_value_in_keys(settlement_list_ie_result, IE_KEY['admission_date'])),
"discharge_date_str": handle_original_data(
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['discharge_date'])),
"personal_cash_payment_str": handle_original_data( "personal_cash_payment_str": handle_original_data(
get_best_value_in_keys(settlement_list_ie_result, PERSONAL_CASH_PAYMENT)), get_best_value_in_keys(settlement_list_ie_result, IE_KEY['personal_cash_payment'])),
"personal_account_payment_str": handle_original_data( "personal_account_payment_str": handle_original_data(
get_best_value_in_keys(settlement_list_ie_result, PERSONAL_ACCOUNT_PAYMENT)), get_best_value_in_keys(settlement_list_ie_result, IE_KEY['personal_account_payment'])),
"personal_funded_amount_str": handle_original_data( "personal_funded_amount_str": handle_original_data(
get_best_value_in_keys(settlement_list_ie_result, PERSONAL_FUNDED_AMOUNT)), get_best_value_in_keys(settlement_list_ie_result, IE_KEY['personal_funded_amount'])),
"medical_insurance_type_str": handle_original_data( "medical_insurance_type_str": handle_original_data(
get_best_value_in_keys(settlement_list_ie_result, MEDICAL_INSURANCE_TYPE)), get_best_value_in_keys(settlement_list_ie_result, IE_KEY['medical_insurance_type'])),
"admission_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, ADMISSION_ID)), "admission_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, IE_KEY['admission_id'])),
"settlement_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, SETTLEMENT_ID)), "settlement_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, IE_KEY['settlement_id'])),
} }
settlement_data["admission_date"] = handle_date(settlement_data["admission_date_str"]) settlement_data["admission_date"] = handle_date(settlement_data["admission_date_str"])
settlement_data["admission_date"] = handle_date(settlement_data["admission_date_str"]) settlement_data["admission_date"] = handle_date(settlement_data["admission_date_str"])
@@ -385,28 +285,30 @@ def settlement_task(pk_phhd, settlement_list, identity):
settlement_data["personal_funded_amount"] = handle_decimal(settlement_data["personal_funded_amount_str"]) settlement_data["personal_funded_amount"] = handle_decimal(settlement_data["personal_funded_amount_str"])
settlement_data["medical_insurance_type"] = handle_insurance_type(settlement_data["medical_insurance_type_str"]) settlement_data["medical_insurance_type"] = handle_insurance_type(settlement_data["medical_insurance_type_str"])
parse_money_result = parse_money(get_best_value_in_keys(settlement_list_ie_result, UPPERCASE_MEDICAL_EXPENSES), parse_money_result = parse_money(
get_best_value_in_keys(settlement_list_ie_result, MEDICAL_EXPENSES)) get_best_value_in_keys(settlement_list_ie_result, IE_KEY['upper_case_medical_expenses']),
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['medical_expenses']))
settlement_data["medical_expenses_str"] = handle_original_data(parse_money_result[0]) settlement_data["medical_expenses_str"] = handle_original_data(parse_money_result[0])
settlement_data["medical_expenses"] = parse_money_result[1] settlement_data["medical_expenses"] = parse_money_result[1]
save_or_update_ie(ZxIeSettlement, pk_phhd, settlement_data) save_or_update_ie(ZxIeSettlement, pk_phhd, settlement_data)
return settlement_data return settlement_data
def discharge_task(pk_phhd, discharge_record, identity): def discharge_task(pk_phhd, discharge_record_ie_result):
discharge_record_ie_result = information_extraction(DISCHARGE_IE, discharge_record, identity) hospitals = get_values_of_keys(discharge_record_ie_result, IE_KEY['hospital'])
hospitals = get_values_of_keys(discharge_record_ie_result, HOSPITAL) departments = get_values_of_keys(discharge_record_ie_result, IE_KEY['department'])
departments = get_values_of_keys(discharge_record_ie_result, DEPARTMENT)
discharge_data = { discharge_data = {
"pk_phhd": pk_phhd, "pk_phhd": pk_phhd,
"hospital": handle_hospital(",".join(hospitals)), "hospital": handle_hospital(",".join(hospitals)),
"department": handle_department(",".join(departments)), "department": handle_department(",".join(departments)),
"name": handle_name(get_best_value_in_keys(discharge_record_ie_result, PATIENT_NAME)), "name": handle_name(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['name'])),
"admission_date_str": handle_original_data(get_best_value_in_keys(discharge_record_ie_result, ADMISSION_DATE)), "admission_date_str": handle_original_data(
"discharge_date_str": handle_original_data(get_best_value_in_keys(discharge_record_ie_result, DISCHARGE_DATE)), get_best_value_in_keys(discharge_record_ie_result, IE_KEY['admission_date'])),
"doctor": handle_name(get_best_value_in_keys(discharge_record_ie_result, DOCTOR)), "discharge_date_str": handle_original_data(
"admission_id": handle_id(get_best_value_in_keys(discharge_record_ie_result, ADMISSION_ID)), get_best_value_in_keys(discharge_record_ie_result, IE_KEY['discharge_date'])),
"age": handle_age(get_best_value_in_keys(discharge_record_ie_result, AGE)), "doctor": handle_name(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['doctor'])),
"admission_id": handle_id(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['admission_id'])),
"age": handle_age(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['age'])),
} }
discharge_data["admission_date"] = handle_date(discharge_data["admission_date_str"]) discharge_data["admission_date"] = handle_date(discharge_data["admission_date_str"])
discharge_data["discharge_date"] = handle_date(discharge_data["discharge_date_str"]) discharge_data["discharge_date"] = handle_date(discharge_data["discharge_date_str"])
@@ -466,14 +368,16 @@ def discharge_task(pk_phhd, discharge_record, identity):
return discharge_data return discharge_data
def cost_task(pk_phhd, cost_list, identity): def cost_task(pk_phhd, cost_list_ie_result):
cost_list_ie_result = information_extraction(COST_IE, cost_list, identity)
cost_data = { cost_data = {
"pk_phhd": pk_phhd, "pk_phhd": pk_phhd,
"name": handle_name(get_best_value_in_keys(cost_list_ie_result, PATIENT_NAME)), "name": handle_name(get_best_value_in_keys(cost_list_ie_result, IE_KEY['name'])),
"admission_date_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, ADMISSION_DATE)), "admission_date_str": handle_original_data(
"discharge_date_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, DISCHARGE_DATE)), get_best_value_in_keys(cost_list_ie_result, IE_KEY['admission_date'])),
"medical_expenses_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, MEDICAL_EXPENSES)) "discharge_date_str": handle_original_data(
get_best_value_in_keys(cost_list_ie_result, IE_KEY['discharge_date'])),
"medical_expenses_str": handle_original_data(
get_best_value_in_keys(cost_list_ie_result, IE_KEY['medical_expenses']))
} }
cost_data["admission_date"] = handle_date(cost_data["admission_date_str"]) cost_data["admission_date"] = handle_date(cost_data["admission_date_str"])
cost_data["discharge_date"] = handle_date(cost_data["discharge_date_str"]) cost_data["discharge_date"] = handle_date(cost_data["discharge_date_str"])
@@ -483,28 +387,39 @@ def cost_task(pk_phhd, cost_list, identity):
def photo_review(pk_phhd, name): def photo_review(pk_phhd, name):
settlement_list = [] """
discharge_record = [] 处理单个报销案子
cost_list = [] :param pk_phhd: 报销单主键
:param name: 报销人姓名
"""
settlement_result = defaultdict(list)
discharge_result = defaultdict(list)
cost_result = defaultdict(list)
session = MysqlSession() session = MysqlSession()
phrecs = session.query(ZxPhrec.pk_phrec, ZxPhrec.pk_phhd, ZxPhrec.cRectype, ZxPhrec.cfjaddress).filter( phrecs = session.query(ZxPhrec.pk_phrec, ZxPhrec.cRectype, ZxPhrec.cfjaddress).filter(
ZxPhrec.pk_phhd == pk_phhd ZxPhrec.pk_phhd == pk_phhd
).all() ).all()
session.close() session.close()
for phrec in phrecs:
if phrec.cRectype == "1":
settlement_list.append(phrec)
elif phrec.cRectype == "3":
discharge_record.append(phrec)
elif phrec.cRectype == "4":
cost_list.append(phrec)
# 同一批图的标识 # 同一批图的标识
identity = int(time.time()) identity = int(time.time())
settlement_data = settlement_task(pk_phhd, settlement_list, identity) for phrec in phrecs:
discharge_data = discharge_task(pk_phhd, discharge_record, identity) rec_type, ie_result = information_extraction(phrec, identity)
cost_data = cost_task(pk_phhd, cost_list, identity) if rec_type == '基本医保结算单':
rec_result = settlement_result
elif rec_type == '出院记录':
rec_result = discharge_result
elif rec_type == '费用清单':
rec_result = cost_result
else:
rec_result = None
if rec_result:
for key, value in ie_result.items():
rec_result[key].append(value)
settlement_data = settlement_task(pk_phhd, settlement_result)
discharge_data = discharge_task(pk_phhd, discharge_result)
cost_data = cost_task(pk_phhd, cost_result)
review_result = { review_result = {
'pk_phhd': pk_phhd, 'pk_phhd': pk_phhd,
@@ -573,6 +488,9 @@ def photo_review(pk_phhd, name):
def main(): def main():
"""
照片审核批量控制
"""
while 1: while 1:
session = MysqlSession() session = MysqlSession()
phhds = (session.query(ZxPhhd.pk_phhd, ZxPhhd.cXm) phhds = (session.query(ZxPhhd.pk_phhd, ZxPhhd.cXm)

View File

@@ -1,34 +1,20 @@
""" """
信息抽取关键词配置 信息抽取关键词配置
""" """
# Maps each internal field name to the Chinese label used for that field
# in information extraction.
IE_KEY = dict(
    name='患者姓名',
    admission_date='入院日期',
    discharge_date='出院日期',
    medical_expenses='费用总额',
    personal_cash_payment='个人现金支付',
    personal_account_payment='个人账户支付',
    personal_funded_amount='自费金额',
    medical_insurance_type='医保类型',
    hospital='医院',
    department='科室',
    doctor='主治医生',
    admission_id='住院号',
    settlement_id='医保结算单号码',
    age='年龄',
    upper_case_medical_expenses='大写总额',
)
# 就诊医院
HOSPITAL = ['医院']
# 就诊科室
DEPARTMENT = ['科室']
# 主治医生
DOCTOR = ['主治医生']
# 住院号
ADMISSION_ID = ['住院号']
# 医保结算单号码
SETTLEMENT_ID = ['医保结算单号码']
# 年龄
AGE = ['年龄']
# 大写总额
UPPERCASE_MEDICAL_EXPENSES = ['大写总额']

View File

@@ -19,7 +19,8 @@ def main():
cls_result = CLAS(text) cls_result = CLAS(text)
cls_result = cls_result[0].get('predictions')[0] cls_result = cls_result[0].get('predictions')[0]
if cls_result['score'] < 0.8: if cls_result['score'] < 0.8:
raise Exception(f'识别结果置信度过低text: {text}') logging.info(f"识别结果置信度{cls_result['score']}过低text: {text}")
return None
return cls_result['label'] return cls_result['label']

View File

@@ -4,12 +4,14 @@ import logging.config
from flask import Flask, request from flask import Flask, request
from paddlenlp import Taskflow from paddlenlp import Taskflow
from __init__ import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES from __init__ import IE_KEY
from log import LOGGING_CONFIG from log import LOGGING_CONFIG
from utils import process_request from utils import process_request
app = Flask(__name__) app = Flask(__name__)
# Labels requested from the cost-list extraction model, resolved through IE_KEY.
COST_LIST_SCHEMA = tuple(
    IE_KEY[field]
    for field in ('name', 'admission_date', 'discharge_date', 'medical_expenses')
)
COST = Taskflow('information_extraction', schema=COST_LIST_SCHEMA, model='uie-x-base', COST = Taskflow('information_extraction', schema=COST_LIST_SCHEMA, model='uie-x-base',
task_path='model/cost_list_model', layout_analysis=False, precision='fp16') task_path='model/cost_list_model', layout_analysis=False, precision='fp16')

View File

@@ -4,14 +4,14 @@ import logging.config
from flask import Flask, request from flask import Flask, request
from paddlenlp import Taskflow from paddlenlp import Taskflow
from __init__ import HOSPITAL, DEPARTMENT, PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, DOCTOR, ADMISSION_ID, AGE from __init__ import IE_KEY
from log import LOGGING_CONFIG from log import LOGGING_CONFIG
from utils import process_request from utils import process_request
app = Flask(__name__) app = Flask(__name__)
# Labels requested from the discharge-record extraction model, resolved through IE_KEY.
DISCHARGE_RECORD_SCHEMA = tuple(
    IE_KEY[field]
    for field in (
        'hospital', 'department', 'name', 'admission_date',
        'discharge_date', 'doctor', 'admission_id', 'age',
    )
)
DISCHARGE = Taskflow('information_extraction', schema=DISCHARGE_RECORD_SCHEMA, model='uie-x-base', DISCHARGE = Taskflow('information_extraction', schema=DISCHARGE_RECORD_SCHEMA, model='uie-x-base',
task_path='model/discharge_record_model', layout_analysis=False, precision='fp16') task_path='model/discharge_record_model', layout_analysis=False, precision='fp16')

View File

@@ -4,18 +4,16 @@ import logging.config
from flask import Flask, request from flask import Flask, request
from paddlenlp import Taskflow from paddlenlp import Taskflow
from __init__ import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, PERSONAL_CASH_PAYMENT, \ from __init__ import IE_KEY
PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, ADMISSION_ID, SETTLEMENT_ID, \
UPPERCASE_MEDICAL_EXPENSES
from log import LOGGING_CONFIG from log import LOGGING_CONFIG
from utils import process_request from utils import process_request
app = Flask(__name__) app = Flask(__name__)
# Labels requested from the settlement-list extraction model, resolved through IE_KEY.
# The amount-in-words entry is keyed 'upper_case_medical_expenses' in IE_KEY;
# looking it up as 'uppercase_medical_expenses' raises KeyError at import time.
SETTLEMENT_LIST_SCHEMA = tuple(IE_KEY[key] for key in [
    'name', 'admission_date', 'discharge_date', 'medical_expenses', 'personal_cash_payment',
    'personal_account_payment', 'personal_funded_amount', 'medical_insurance_type', 'admission_id', 'settlement_id',
    'upper_case_medical_expenses'
])
SETTLEMENT_IE = Taskflow('information_extraction', schema=SETTLEMENT_LIST_SCHEMA, model='uie-x-base', SETTLEMENT_IE = Taskflow('information_extraction', schema=SETTLEMENT_LIST_SCHEMA, model='uie-x-base',
task_path='model/settlement_list_model', layout_analysis=False, precision='fp16') task_path='model/settlement_list_model', layout_analysis=False, precision='fp16')

View File

@@ -12,6 +12,44 @@ def get_default_datetime():
return datetime.now().strftime('%Y-%m-%d %H:%M:%S') return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
def ocr_result_to_layout(ocr_result):
    """
    Convert OCR segments into ``(box, text)`` layout entries.

    Each segment is read as ``segment[0]`` = four ``[x, y]`` points and
    ``segment[1][0]`` = recognized text. The four points are collapsed into
    ``[x1, y1, x2, y2]``; segments whose collapsed box has a negative width
    or height are dropped.

    :param ocr_result: iterable of OCR segments, may be empty or None
    :return: list of ``([x1, y1, x2, y2], text)`` tuples
    """

    def _to_rect(points):
        # Collapse the four corner points into an axis-aligned rectangle.
        p0, p1, p2, p3 = points[0], points[1], points[2], points[3]
        x1 = min(p0[0], p3[0])
        y1 = min(p0[1], p1[1])
        x2 = max(p1[0], p2[0])
        y2 = max(p2[1], p3[1])
        return [x1, y1, x2, y2]

    def _is_usable(rect):
        # Only boxes with a negative extent are rejected; zero extent passes.
        return not (rect[3] - rect[1] < 0 or rect[2] - rect[0] < 0)

    layout = []
    if not ocr_result:
        return layout

    for segment in ocr_result:
        rect = _to_rect(segment[0])
        if _is_usable(rect):
            layout.append((rect, segment[1][0]))
    return layout
def ocr_result_to_text(ocr_results, max_length=2048):
    """
    Concatenate the recognized text of OCR segments into one string.

    Segments are read in order (text taken from ``segment[1][0]``) and reading
    stops as soon as the accumulated length reaches ``max_length``.

    :param ocr_results: iterable of OCR segments; empty or None yields ''
    :param max_length: non-negative upper bound on the returned length
    :return: concatenated text, truncated to ``max_length`` characters
    """
    if not ocr_results:
        return ''

    pieces = []
    total = 0
    for ocr_result in ocr_results:
        piece = ocr_result[1][0]
        pieces.append(piece)
        total += len(piece)
        if total >= max_length:
            # Enough text collected; later segments are not needed.
            break
    return ''.join(pieces)[:max_length]
def get_ocr_layout(ocr, img_path): def get_ocr_layout(ocr, img_path):
""" """
获取ocr识别的结果转为合适的layout形式 获取ocr识别的结果转为合适的layout形式

View File

@@ -1,7 +1,6 @@
import logging import logging
import math import math
import os import os
import urllib.request
import cv2 import cv2
import numpy import numpy
@@ -12,80 +11,59 @@ from tenacity import retry, stop_after_attempt, wait_random
from log import PROJECT_ROOT from log import PROJECT_ROOT
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
after=lambda x: logging.warning('获取图片失败!'))
def read(image_path):
"""
从网络或本地读取图片
:param image_path: 网络或本地路径
:return: NumPy数组形式的图片
"""
if image_path.startswith('http'):
# 发送HTTP请求并获取图像数据
resp = urllib.request.urlopen(image_path, timeout=60)
# 将数据读取为字节流
image_data = resp.read()
# 将字节流转换为NumPy数组
image_np = numpy.frombuffer(image_data, numpy.uint8)
# 解码NumPy数组为OpenCV图像格式
image = cv2.imdecode(image_np, cv2.IMREAD_COLOR)
else:
image = cv2.imread(image_path)
return image
def capture(image, rectangle):
    """
    Crop a rectangular region out of an image.

    :param image: ndarray image
    :param rectangle: region to crop as ``(x1, y1, x2, y2)``
    :return: the cropped ndarray
    """
    left, top, right, bottom = rectangle
    img_height, img_width = image.shape[:2]
    # Clamp the rectangle to the image bounds before slicing.
    rows = slice(int(max(0, top)), int(min(img_height, bottom)))
    cols = slice(int(max(0, left)), int(min(img_width, right)))
    return image[rows, cols]
def split(img_path, ratio=1.414, overlap=0.05, x_compensation=3):
    """
    Split an over-long image into overlapping pieces saved as files.

    An image whose height/width exceeds ``ratio`` is cut vertically, one whose
    width/height exceeds ``ratio`` is cut horizontally; otherwise the original
    path is returned as the single piece. Each cut piece is written next to
    the other saved images as ``<name>.split_<i>.<ext>``.

    :param img_path: path of the image to split
    :param ratio: maximum long-side / short-side ratio of a piece
    :param overlap: overlap between neighbouring pieces, as a fraction of the short side
    :param x_compensation: multiplier applied to ``overlap`` for horizontal cuts
    :return: list of dicts ``{'img': <file path>, 'x_offset': int, 'y_offset': int}``
    """
    split_result = []
    image = cv2.imread(img_path)
    height, width = image.shape[:2]
    hw_ratio = height / width
    wh_ratio = width / height
    img_name, img_ext = parse_save_path(img_path)
    if hw_ratio > ratio:  # too tall
        new_img_height = width * ratio
        step = width * (ratio - overlap)  # offset step between pieces
        for i in range(math.ceil(height / step)):
            offset = round(step * i)
            cropped_img = capture(image, [0, offset, width, offset + new_img_height])
            split_path = get_save_path(f'{img_name}.split_{i}.{img_ext}')
            cv2.imwrite(split_path, cropped_img)
            split_result.append({'img': split_path, 'x_offset': 0, 'y_offset': offset})
    elif wh_ratio > ratio:  # too wide
        new_img_width = height * ratio
        # Text usually runs horizontally, so horizontal cuts use a larger overlap.
        step = height * (ratio - overlap * x_compensation)
        for i in range(math.ceil(width / step)):
            offset = round(step * i)
            # The bottom bound is the image height (it used to be `width`,
            # which only worked because capture() clamps it).
            cropped_img = capture(image, [offset, 0, offset + new_img_width, height])
            split_path = get_save_path(f'{img_name}.split_{i}.{img_ext}')
            cv2.imwrite(split_path, cropped_img)
            split_result.append({'img': split_path, 'x_offset': offset, 'y_offset': 0})
    else:
        split_result.append({'img': img_path, 'x_offset': 0, 'y_offset': 0})
    return split_result
@@ -108,15 +86,16 @@ def parse_rotation_angles(image):
return angles return angles
def rotate(image, angle): def rotate(img_path, angle):
""" """
旋转图片 旋转图片
:param image: 图片NumPy数组 :param img_path: 图片NumPy数组
:param angle: 逆时针旋转角度 :param angle: 逆时针旋转角度
:return: 旋转后的图片NumPy数组 :return: 旋转后的图片NumPy数组
""" """
if angle == 0: if angle == 0:
return image return img_path
image = cv2.imread(img_path)
height, width = image.shape[:2] height, width = image.shape[:2]
if angle == 180: if angle == 180:
new_width = width new_width = width
@@ -132,7 +111,11 @@ def rotate(image, angle):
matrix[1, 2] += (new_height - height) / 2 matrix[1, 2] += (new_height - height) / 2
# 参数:原始图像 旋转参数 元素图像宽高 # 参数:原始图像 旋转参数 元素图像宽高
rotated = cv2.warpAffine(image, matrix, (new_width, new_height)) rotated = cv2.warpAffine(image, matrix, (new_width, new_height))
return rotated
img_name, img_ext = parse_save_path(img_path)
rotated_path = get_save_path(f'{img_name}.rotate_{angle}.{img_ext}')
cv2.imwrite(rotated_path, rotated)
return rotated_path
def invert_rotate_point(point, center, angle): def invert_rotate_point(point, center, angle):
@@ -260,26 +243,38 @@ def parse_img_url(url):
:return: 图片名称和图片后缀 :return: 图片名称和图片后缀
""" """
url = url.split('?')[0] url = url.split('?')[0]
return os.path.basename(url).rsplit('.', 1) return os.path.basename(url)
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
       after=lambda x: logging.warning('保存图片失败!'))
def save_to_local(img_url, timeout=30):
    """
    Download an image and store it in the local temporary image directory.

    The call is attempted up to three times with a random 1-3 second wait
    between attempts; the last exception is re-raised if all attempts fail.

    :param img_url: URL of the image to download
    :param timeout: seconds to wait for the server before giving up; without
        a timeout a stalled connection would block forever and the retry
        logic would never get a chance to run
    :return: local path the image was written to
    :raises requests.HTTPError: if the server answers with an error status
    """
    response = requests.get(img_url, timeout=timeout)
    response.raise_for_status()  # fail on 4xx/5xx so the retry decorator kicks in
    # The local file name is derived from the URL (query string stripped).
    save_path = get_save_path(parse_img_url(img_url))
    with open(save_path, 'wb') as file:
        file.write(response.content)
    return save_path
def get_img_path(img_full_name):
    """
    Look up an image in the local temporary image directory.

    :param img_full_name: image file name including its extension
    :return: the local path if the file exists, otherwise None
    """
    candidate = get_save_path(img_full_name)
    return candidate if os.path.exists(candidate) else None
def get_save_path(img_full_name):
    """
    Build the local path for an image inside the project's ``tmp_img`` directory.

    :param img_full_name: image file name including its extension
    :return: PROJECT_ROOT/tmp_img/<img_full_name>
    """
    tmp_img_dir = os.path.join(PROJECT_ROOT, 'tmp_img')
    return os.path.join(tmp_img_dir, img_full_name)
def parse_save_path(img_path):
    """
    Split an image path into its base name and extension.

    Only the last dot separates the extension, so ``a.rotate_90.jpg`` yields
    ``('a.rotate_90', 'jpg')``.

    :param img_path: path (or bare file name) of the image
    :return: tuple ``(img_name, img_ext)``; ``img_ext`` has no leading dot and
        is an empty string when the file name contains no dot
    """
    img_full_name = os.path.basename(img_path)
    img_name, dot, img_ext = img_full_name.rpartition('.')
    if not dot:
        # No extension at all: rpartition puts everything in the last slot.
        # Return it as the name instead of failing on tuple unpacking.
        return img_full_name, ''
    return img_name, img_ext

View File

@@ -1,5 +1,6 @@
import json import json
import logging import logging
import os.path
import requests import requests
from tenacity import retry, stop_after_attempt, wait_random from tenacity import retry, stop_after_attempt, wait_random
@@ -16,9 +17,10 @@ def ocr(img_path):
url = 'http://ocr:5001' url = 'http://ocr:5001'
response = requests.post(url, {'img_path': img_path}) response = requests.post(url, {'img_path': img_path})
if response.status_code == 200: if response.status_code == 200:
return response.json() ocr_result = response.json()
else: if ocr_result:
return None return ocr_result[0]
return None
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True, @retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
@@ -40,7 +42,7 @@ def ie_settlement(img_path, layout):
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True, @retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
after=lambda x: logging.warning('从文本抽取基本医保结算单失败!')) after=lambda x: logging.warning('从文本抽取基本医保结算单失败!'))
def ie_settlement(text): def ie_settlement_text(text):
""" """
请求基本医保结算单信息抽取接口 请求基本医保结算单信息抽取接口
:param text: 待抽取文本 :param text: 待抽取文本
@@ -73,7 +75,7 @@ def ie_discharge(img_path, layout):
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True, @retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
after=lambda x: logging.warning('从文本抽取出院记录失败!')) after=lambda x: logging.warning('从文本抽取出院记录失败!'))
def ie_discharge(text): def ie_discharge_text(text):
""" """
请求出院记录信息抽取接口 请求出院记录信息抽取接口
:param text: 待抽取文本 :param text: 待抽取文本
@@ -106,7 +108,7 @@ def ie_cost(img_path, layout):
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True, @retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
after=lambda x: logging.warning('从文本抽取费用清单失败!')) after=lambda x: logging.warning('从文本抽取费用清单失败!'))
def ie_cost(text): def ie_cost_text(text):
""" """
请求费用清单信息抽取接口 请求费用清单信息抽取接口
:param text: 待抽取文本 :param text: 待抽取文本
@@ -147,9 +149,22 @@ def det_book(img_path):
url = 'http://det_book:5006' url = 'http://det_book:5006'
response = requests.post(url, {'img_path': img_path}) response = requests.post(url, {'img_path': img_path})
if response.status_code == 200: if response.status_code == 200:
return response.json() book_path_list = response.json()
if len(book_path_list) == 0:
return img_path
elif len(book_path_list) == 1:
return book_path_list[0]
else:
max_book = img_path
max_size = 0
for book_path in book_path_list:
book_size = os.path.getsize(book_path)
if book_size > max_size:
max_book = book_path
max_size = book_size
return max_book
else: else:
return [img_path] return img_path
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True, @retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,