优化案子处理逻辑
This commit is contained in:
@@ -5,12 +5,13 @@ from time import sleep
|
||||
|
||||
from sqlalchemy import update
|
||||
|
||||
from my_email.error_email import send_error_email
|
||||
from db import MysqlSession
|
||||
from db.mysql import ZxPhhd
|
||||
from log import LOGGING_CONFIG
|
||||
from my_email.error_email import send_error_email
|
||||
from photo_review import auto_photo_review, SEND_ERROR_EMAIL
|
||||
|
||||
# 照片审核自动识别脚本入口
|
||||
if __name__ == '__main__':
|
||||
program_name = '照片审核自动识别脚本'
|
||||
logging.config.dictConfig(LOGGING_CONFIG)
|
||||
@@ -19,7 +20,7 @@ if __name__ == '__main__':
|
||||
# argparse's ``type=bool`` applies bool() to the raw argument string, so any
# non-empty value (including "False") would enable the clean-up. Parse the
# usual textual spellings of "true" explicitly instead.
parser.add_argument('--clean', default=False,
                    type=lambda value: str(value).strip().lower() in ('1', 'true', 'yes', 'y', 'on'),
                    help='是否将识别中的案子改为待识别状态')
|
||||
args = parser.parse_args()
|
||||
if args.clean:
|
||||
# 主要用于启动时,清除仍在涂抹中的案子
|
||||
# 启动时清除仍在识别中的案子
|
||||
session = MysqlSession()
|
||||
update_flag = (update(ZxPhhd).where(ZxPhhd.exsuccess_flag == '2').values(exsuccess_flag='1'))
|
||||
session.execute(update_flag)
|
||||
@@ -33,7 +34,6 @@ if __name__ == '__main__':
|
||||
logging.info(f'【{program_name}】开始运行')
|
||||
auto_photo_review.main()
|
||||
except Exception as e:
|
||||
error_logger = logging.getLogger('error')
|
||||
error_logger.error(traceback.format_exc())
|
||||
logging.getLogger('error').error(traceback.format_exc())
|
||||
if SEND_ERROR_EMAIL:
|
||||
send_error_email(program_name, repr(e), traceback.format_exc())
|
||||
|
||||
@@ -1,7 +1,4 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from time import sleep
|
||||
@@ -10,72 +7,24 @@ import cv2
|
||||
import fitz
|
||||
import jieba
|
||||
import numpy as np
|
||||
import requests
|
||||
import zxingcpp
|
||||
from rapidfuzz import process, fuzz
|
||||
from sqlalchemy import update
|
||||
|
||||
from db import MysqlSession
|
||||
from db.mysql import BdYljg, BdYlks, ZxIeResult, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec, ZxIeReview
|
||||
from db.mysql import BdYljg, BdYlks, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec, ZxIeReview
|
||||
from log import HOSTNAME
|
||||
from photo_review import PHHD_BATCH_SIZE, SLEEP_MINUTES, HOSPITAL_ALIAS, HOSPITAL_FILTER, DEPARTMENT_ALIAS, \
|
||||
DEPARTMENT_FILTER
|
||||
from services.paddle_services import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, \
|
||||
PERSONAL_CASH_PAYMENT, PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, \
|
||||
DEPARTMENT, DOCTOR, ADMISSION_ID, SETTLEMENT_ID, AGE, UPPERCASE_MEDICAL_EXPENSES
|
||||
from services.paddle_services import IE_KEY
|
||||
from ucloud import ufile
|
||||
from util import image_util, common_util, html_util, model_util
|
||||
from util.data_util import handle_date, handle_decimal, parse_department, handle_name, handle_insurance_type, \
|
||||
handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, parse_hospital
|
||||
|
||||
|
||||
# 合并信息抽取结果
|
||||
def merge_result(result1, result2):
    """Merge one information-extraction result into another, key by key.

    For every key of ``result2`` its value is concatenated (``+``) after the
    value ``result1`` already holds for that key, or after an empty list when
    the key is missing.

    :param result1: mapping that receives the merged values (updated in place)
    :param result2: mapping whose values are appended
    :return: ``result1``
    """
    for field, extracted in result2.items():
        existing = result1[field] if field in result1 else []
        result1[field] = existing + extracted
    return result1
|
||||
|
||||
|
||||
def ie_temp_image(ie, ocr, image):
    """Run information extraction on an in-memory image through a temporary file.

    The image is written to a temporary ``.jpg``; its OCR layout is obtained
    with ``common_util.get_ocr_layout`` and, when that layout is non-empty,
    ``ie`` is called with the file path and the layout.

    :param ie: callable taking ``{"doc": path, "layout": layout}``; the first
        element of what it returns is used as the result
    :param ocr: OCR engine handed to ``common_util.get_ocr_layout``
    :param image: image accepted by ``cv2.imwrite``
    :return: the extraction result, or ``[]`` when nothing was recognised or
        an error occurred (errors are logged, not raised)
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as handle:
        tmp_path = handle.name
        cv2.imwrite(tmp_path, image)

    extracted = []
    try:
        layout = common_util.get_ocr_layout(ocr, tmp_path)
        if layout:
            extracted = ie({"doc": tmp_path, "layout": layout})[0]
    except Exception as err:
        logging.error("信息抽取时出错", exc_info=err)
    finally:
        # Best-effort clean-up: a failed delete is only logged.
        try:
            os.remove(tmp_path)
        except Exception as err:
            logging.info(f"删除临时文件 {tmp_path} 时出错", exc_info=err)
    return extracted
|
||||
|
||||
|
||||
# 关键信息提取
|
||||
def request_ie_result(task_enum, phrecs):
    """Ask the information-extraction service to process a batch of images.

    :param task_enum: object providing ``request_url()`` and ``schema()``
    :param phrecs: non-empty sequence of records exposing ``cfjaddress``,
        ``pk_phrec`` and ``pk_phhd``; the first record supplies ``pk_phhd``
    :return: the ``"data"`` member of the JSON response
    :raises Exception: when the service answers with a status other than 200
    """
    endpoint = task_enum.request_url()
    batch_id = int(time.time())  # timestamp used as the batch identity
    payload = {
        "images": [{"name": rec.cfjaddress, "pk": rec.pk_phrec} for rec in phrecs],
        "schema": task_enum.schema(),
        "pk_phhd": phrecs[0].pk_phhd,
        "identity": batch_id,
    }
    response = requests.post(endpoint, json=payload)

    if response.status_code != 200:
        raise Exception(f"请求信息抽取结果失败,状态码:{response.status_code}")
    return response.json()["data"]
|
||||
|
||||
|
||||
# 尝试从二维码中获取高清图片
|
||||
def get_better_image_from_qrcode(image, image_id, dpi=150):
|
||||
def get_better_image_from_qrcode(img_path, image_id, dpi=150):
|
||||
def _parse_pdf_url(pdf_url_to_parse):
|
||||
pdf_file = None
|
||||
local_pdf_path = None
|
||||
@@ -95,7 +44,10 @@ def get_better_image_from_qrcode(image, image_id, dpi=150):
|
||||
# 将渲染结果转换为OpenCV兼容的格式
|
||||
img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, -1))
|
||||
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
|
||||
return img, page.get_text()
|
||||
img_name, img_ext = image_util.parse_save_path(img_path)
|
||||
better_img_path = image_util.get_save_path(f'{img_name}.better.{img_ext}')
|
||||
cv2.imwrite(better_img_path, img)
|
||||
return better_img_path, page.get_text()
|
||||
except Exception as ex:
|
||||
logging.getLogger('error').error('解析pdf失败!', exc_info=ex)
|
||||
return None, None
|
||||
@@ -107,7 +59,8 @@ def get_better_image_from_qrcode(image, image_id, dpi=150):
|
||||
|
||||
jsczt_base_url = 'http://einvoice.jsczt.cn'
|
||||
try:
|
||||
results = zxingcpp.read_barcodes(image)
|
||||
img = cv2.imread(img_path)
|
||||
results = zxingcpp.read_barcodes(img, text_mode=zxingcpp.TextMode.HRI)
|
||||
except Exception as e:
|
||||
logging.getLogger('error').info('二维码识别失败', exc_info=e)
|
||||
results = []
|
||||
@@ -145,106 +98,52 @@ def get_better_image_from_qrcode(image, image_id, dpi=150):
|
||||
|
||||
|
||||
# 关键信息提取
|
||||
def information_extraction(ie, phrecs, identity):
|
||||
result = {}
|
||||
for phrec in phrecs:
|
||||
def information_extraction(phrec, identity):
|
||||
"""
|
||||
处理单张图片
|
||||
:param phrec:
|
||||
:return:
|
||||
"""
|
||||
img_path = image_util.get_img_path(phrec.cfjaddress)
|
||||
if not img_path:
|
||||
img_url = ufile.get_private_url(phrec.cfjaddress)
|
||||
if not img_url:
|
||||
continue
|
||||
|
||||
img_path = image_util.save_to_local(img_url)
|
||||
image = cv2.imread(img_path)
|
||||
# 尝试从二维码中获取高清图片
|
||||
better_image, text = get_better_image_from_qrcode(image, phrec.cfjaddress)
|
||||
if phrec.cRectype != '1':
|
||||
better_image = None # 非结算单暂时不进行替换
|
||||
zx_ie_results = []
|
||||
if better_image is not None:
|
||||
img_angle = '0'
|
||||
image = better_image
|
||||
if text:
|
||||
info_extract = ie(text)[0]
|
||||
else:
|
||||
info_extract = ie_temp_image(ie, OCR, image)
|
||||
if not info_extract:
|
||||
continue
|
||||
|
||||
ie_result = {'result': info_extract, 'angle': img_angle}
|
||||
now = common_util.get_default_datetime()
|
||||
result_json = json.dumps(ie_result['result'], ensure_ascii=False)
|
||||
if len(result_json) > 5000:
|
||||
result_json = result_json[:5000]
|
||||
zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity,
|
||||
cfjaddress=phrec.cfjaddress, content=result_json,
|
||||
rotation_angle=int(ie_result['angle']),
|
||||
x_offset=0, y_offset=0, create_time=now,
|
||||
creator=HOSTNAME, update_time=now, updater=HOSTNAME))
|
||||
|
||||
result = merge_result(result, ie_result['result'])
|
||||
# 尝试从二维码中获取高清图片
|
||||
better_img_path, text = get_better_image_from_qrcode(img_path, phrec.cfjaddress)
|
||||
if phrec.cRectype != '1':
|
||||
better_img_path = None # 非结算单暂时不进行替换
|
||||
if better_img_path is not None:
|
||||
if text:
|
||||
info_extract = model_util.ie_settlement_text(text)[0]
|
||||
else:
|
||||
target_images = model_util.request_book_areas(img_path) # 识别文档区域并裁剪
|
||||
angle_count = defaultdict(int, {'0': 0}) # 分割后图片的最优角度统计
|
||||
for target_image in target_images:
|
||||
dewarped_image = model_util.dewarp(target_image) # 去扭曲
|
||||
angles = model_util.clas_orientation(dewarped_image)
|
||||
info_extract = model_util.ie_settlement(better_img_path,
|
||||
common_util.ocr_result_to_layout(model_util.ocr(better_img_path)))
|
||||
|
||||
split_results = image_util.split(dewarped_image)
|
||||
for split_result in split_results:
|
||||
if split_result['img'] is None or split_result['img'].size == 0:
|
||||
continue
|
||||
rotated_img = image_util.rotate(split_result['img'], int(angles[0]))
|
||||
ie_results = [{'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[0]}]
|
||||
if not ie_results[0]['result'] or len(ie_results[0]['result']) < len(ie.kwargs.get('schema')):
|
||||
rotated_img = image_util.rotate(split_result['img'], int(angles[1]))
|
||||
ie_results.append({'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[1]})
|
||||
now = common_util.get_default_datetime()
|
||||
best_angle = ['0', 0]
|
||||
for ie_result in ie_results:
|
||||
if not ie_result['result']:
|
||||
continue
|
||||
return '基本医保结算单', info_extract
|
||||
else:
|
||||
target_image = model_util.det_book(img_path) # 识别文档区域并裁剪
|
||||
dewarped_image = model_util.dewarp(target_image) # 去扭曲
|
||||
angles = model_util.clas_orientation(dewarped_image)
|
||||
rotated_img = image_util.rotate(dewarped_image, int(angles[0]))
|
||||
split_results = image_util.split(rotated_img)
|
||||
ocr_result = []
|
||||
for split_result in split_results:
|
||||
if split_result['img'] is None:
|
||||
continue
|
||||
ocr_result += model_util.ocr(rotated_img)
|
||||
ocr_text = common_util.ocr_result_to_text(ocr_result)
|
||||
rec_type = model_util.clas_text(ocr_text) if ocr_text else None
|
||||
if rec_type == '基本医保结算单':
|
||||
info_extract = model_util.ie_settlement(rotated_img, common_util.ocr_result_to_layout(ocr_result))
|
||||
elif rec_type == '出院记录':
|
||||
info_extract = model_util.ie_discharge(rotated_img, common_util.ocr_result_to_layout(ocr_result))
|
||||
elif rec_type == '费用清单':
|
||||
info_extract = model_util.ie_cost(rotated_img, common_util.ocr_result_to_layout(ocr_result))
|
||||
else:
|
||||
info_extract = None
|
||||
|
||||
result_json = json.dumps(ie_result['result'], ensure_ascii=False)
|
||||
if len(result_json) > 5000:
|
||||
result_json = result_json[:5000]
|
||||
zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity,
|
||||
cfjaddress=phrec.cfjaddress, content=result_json,
|
||||
rotation_angle=int(ie_result['angle']),
|
||||
x_offset=split_result['x_offset'],
|
||||
y_offset=split_result['y_offset'], create_time=now,
|
||||
creator=HOSTNAME, update_time=now, updater=HOSTNAME))
|
||||
|
||||
result = merge_result(result, ie_result['result'])
|
||||
|
||||
if len(ie_result['result']) > best_angle[1]:
|
||||
best_angle = [ie_result['angle'], len(ie_result['result'])]
|
||||
|
||||
angle_count[best_angle[0]] += 1
|
||||
img_angle = max(angle_count, key=angle_count.get)
|
||||
|
||||
if img_angle != '0' or better_image is not None:
|
||||
image = image_util.rotate(image, int(img_angle))
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
|
||||
cv2.imwrite(temp_file.name, image)
|
||||
try:
|
||||
ufile.upload_file(phrec.cfjaddress, temp_file.name)
|
||||
if img_angle != '0':
|
||||
logging.info(f'旋转图片[{phrec.cfjaddress}]替换成功,已旋转{img_angle}度。')
|
||||
# 修正旋转角度
|
||||
for zx_ie_result in zx_ie_results:
|
||||
zx_ie_result.rotation_angle -= int(img_angle)
|
||||
else:
|
||||
logging.info(f'高清图片[{phrec.cfjaddress}]替换成功!')
|
||||
except Exception as e:
|
||||
logging.error(f'上传图片({phrec.cfjaddress})失败', exc_info=e)
|
||||
finally:
|
||||
common_util.delete_temp_file(temp_file.name)
|
||||
|
||||
session = MysqlSession()
|
||||
session.add_all(zx_ie_results)
|
||||
session.commit()
|
||||
session.close()
|
||||
|
||||
return result
|
||||
return rec_type, info_extract
|
||||
|
||||
|
||||
# 从keys中获取准确率最高的value
|
||||
@@ -359,23 +258,24 @@ def search_department(department):
|
||||
return best_match
|
||||
|
||||
|
||||
def settlement_task(pk_phhd, settlement_list, identity):
|
||||
settlement_list_ie_result = information_extraction(SETTLEMENT_IE, settlement_list, identity)
|
||||
def settlement_task(pk_phhd, settlement_list_ie_result):
|
||||
settlement_data = {
|
||||
"pk_phhd": pk_phhd,
|
||||
"name": handle_name(get_best_value_in_keys(settlement_list_ie_result, PATIENT_NAME)),
|
||||
"admission_date_str": handle_original_data(get_best_value_in_keys(settlement_list_ie_result, ADMISSION_DATE)),
|
||||
"discharge_date_str": handle_original_data(get_best_value_in_keys(settlement_list_ie_result, DISCHARGE_DATE)),
|
||||
"name": handle_name(get_best_value_in_keys(settlement_list_ie_result, IE_KEY['name'])),
|
||||
"admission_date_str": handle_original_data(
|
||||
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['admission_date'])),
|
||||
"discharge_date_str": handle_original_data(
|
||||
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['discharge_date'])),
|
||||
"personal_cash_payment_str": handle_original_data(
|
||||
get_best_value_in_keys(settlement_list_ie_result, PERSONAL_CASH_PAYMENT)),
|
||||
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['personal_cash_payment'])),
|
||||
"personal_account_payment_str": handle_original_data(
|
||||
get_best_value_in_keys(settlement_list_ie_result, PERSONAL_ACCOUNT_PAYMENT)),
|
||||
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['personal_account_payment'])),
|
||||
"personal_funded_amount_str": handle_original_data(
|
||||
get_best_value_in_keys(settlement_list_ie_result, PERSONAL_FUNDED_AMOUNT)),
|
||||
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['personal_funded_amount'])),
|
||||
"medical_insurance_type_str": handle_original_data(
|
||||
get_best_value_in_keys(settlement_list_ie_result, MEDICAL_INSURANCE_TYPE)),
|
||||
"admission_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, ADMISSION_ID)),
|
||||
"settlement_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, SETTLEMENT_ID)),
|
||||
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['medical_insurance_type'])),
|
||||
"admission_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, IE_KEY['admission_id'])),
|
||||
"settlement_id": handle_id(get_best_value_in_keys(settlement_list_ie_result, IE_KEY['settlement_id'])),
|
||||
}
|
||||
settlement_data["admission_date"] = handle_date(settlement_data["admission_date_str"])
|
||||
settlement_data["admission_date"] = handle_date(settlement_data["admission_date_str"])
|
||||
@@ -385,28 +285,30 @@ def settlement_task(pk_phhd, settlement_list, identity):
|
||||
settlement_data["personal_funded_amount"] = handle_decimal(settlement_data["personal_funded_amount_str"])
|
||||
settlement_data["medical_insurance_type"] = handle_insurance_type(settlement_data["medical_insurance_type_str"])
|
||||
|
||||
parse_money_result = parse_money(get_best_value_in_keys(settlement_list_ie_result, UPPERCASE_MEDICAL_EXPENSES),
|
||||
get_best_value_in_keys(settlement_list_ie_result, MEDICAL_EXPENSES))
|
||||
parse_money_result = parse_money(
|
||||
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['upper_case_medical_expenses']),
|
||||
get_best_value_in_keys(settlement_list_ie_result, IE_KEY['medical_expenses']))
|
||||
settlement_data["medical_expenses_str"] = handle_original_data(parse_money_result[0])
|
||||
settlement_data["medical_expenses"] = parse_money_result[1]
|
||||
save_or_update_ie(ZxIeSettlement, pk_phhd, settlement_data)
|
||||
return settlement_data
|
||||
|
||||
|
||||
def discharge_task(pk_phhd, discharge_record, identity):
|
||||
discharge_record_ie_result = information_extraction(DISCHARGE_IE, discharge_record, identity)
|
||||
hospitals = get_values_of_keys(discharge_record_ie_result, HOSPITAL)
|
||||
departments = get_values_of_keys(discharge_record_ie_result, DEPARTMENT)
|
||||
def discharge_task(pk_phhd, discharge_record_ie_result):
|
||||
hospitals = get_values_of_keys(discharge_record_ie_result, IE_KEY['hospital'])
|
||||
departments = get_values_of_keys(discharge_record_ie_result, IE_KEY['department'])
|
||||
discharge_data = {
|
||||
"pk_phhd": pk_phhd,
|
||||
"hospital": handle_hospital(",".join(hospitals)),
|
||||
"department": handle_department(",".join(departments)),
|
||||
"name": handle_name(get_best_value_in_keys(discharge_record_ie_result, PATIENT_NAME)),
|
||||
"admission_date_str": handle_original_data(get_best_value_in_keys(discharge_record_ie_result, ADMISSION_DATE)),
|
||||
"discharge_date_str": handle_original_data(get_best_value_in_keys(discharge_record_ie_result, DISCHARGE_DATE)),
|
||||
"doctor": handle_name(get_best_value_in_keys(discharge_record_ie_result, DOCTOR)),
|
||||
"admission_id": handle_id(get_best_value_in_keys(discharge_record_ie_result, ADMISSION_ID)),
|
||||
"age": handle_age(get_best_value_in_keys(discharge_record_ie_result, AGE)),
|
||||
"name": handle_name(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['name'])),
|
||||
"admission_date_str": handle_original_data(
|
||||
get_best_value_in_keys(discharge_record_ie_result, IE_KEY['admission_date'])),
|
||||
"discharge_date_str": handle_original_data(
|
||||
get_best_value_in_keys(discharge_record_ie_result, IE_KEY['discharge_date'])),
|
||||
"doctor": handle_name(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['doctor'])),
|
||||
"admission_id": handle_id(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['admission_id'])),
|
||||
"age": handle_age(get_best_value_in_keys(discharge_record_ie_result, IE_KEY['age'])),
|
||||
}
|
||||
discharge_data["admission_date"] = handle_date(discharge_data["admission_date_str"])
|
||||
discharge_data["discharge_date"] = handle_date(discharge_data["discharge_date_str"])
|
||||
@@ -466,14 +368,16 @@ def discharge_task(pk_phhd, discharge_record, identity):
|
||||
return discharge_data
|
||||
|
||||
|
||||
def cost_task(pk_phhd, cost_list, identity):
|
||||
cost_list_ie_result = information_extraction(COST_IE, cost_list, identity)
|
||||
def cost_task(pk_phhd, cost_list_ie_result):
|
||||
cost_data = {
|
||||
"pk_phhd": pk_phhd,
|
||||
"name": handle_name(get_best_value_in_keys(cost_list_ie_result, PATIENT_NAME)),
|
||||
"admission_date_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, ADMISSION_DATE)),
|
||||
"discharge_date_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, DISCHARGE_DATE)),
|
||||
"medical_expenses_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, MEDICAL_EXPENSES))
|
||||
"name": handle_name(get_best_value_in_keys(cost_list_ie_result, IE_KEY['name'])),
|
||||
"admission_date_str": handle_original_data(
|
||||
get_best_value_in_keys(cost_list_ie_result, IE_KEY['admission_date'])),
|
||||
"discharge_date_str": handle_original_data(
|
||||
get_best_value_in_keys(cost_list_ie_result, IE_KEY['discharge_date'])),
|
||||
"medical_expenses_str": handle_original_data(
|
||||
get_best_value_in_keys(cost_list_ie_result, IE_KEY['medical_expenses']))
|
||||
}
|
||||
cost_data["admission_date"] = handle_date(cost_data["admission_date_str"])
|
||||
cost_data["discharge_date"] = handle_date(cost_data["discharge_date_str"])
|
||||
@@ -483,28 +387,39 @@ def cost_task(pk_phhd, cost_list, identity):
|
||||
|
||||
|
||||
def photo_review(pk_phhd, name):
|
||||
settlement_list = []
|
||||
discharge_record = []
|
||||
cost_list = []
|
||||
"""
|
||||
处理单个报销案子
|
||||
:param pk_phhd: 报销单主键
|
||||
:param name: 报销人姓名
|
||||
"""
|
||||
settlement_result = defaultdict(list)
|
||||
discharge_result = defaultdict(list)
|
||||
cost_result = defaultdict(list)
|
||||
|
||||
session = MysqlSession()
|
||||
phrecs = session.query(ZxPhrec.pk_phrec, ZxPhrec.pk_phhd, ZxPhrec.cRectype, ZxPhrec.cfjaddress).filter(
|
||||
phrecs = session.query(ZxPhrec.pk_phrec, ZxPhrec.cRectype, ZxPhrec.cfjaddress).filter(
|
||||
ZxPhrec.pk_phhd == pk_phhd
|
||||
).all()
|
||||
session.close()
|
||||
for phrec in phrecs:
|
||||
if phrec.cRectype == "1":
|
||||
settlement_list.append(phrec)
|
||||
elif phrec.cRectype == "3":
|
||||
discharge_record.append(phrec)
|
||||
elif phrec.cRectype == "4":
|
||||
cost_list.append(phrec)
|
||||
|
||||
# 同一批图的标识
|
||||
identity = int(time.time())
|
||||
settlement_data = settlement_task(pk_phhd, settlement_list, identity)
|
||||
discharge_data = discharge_task(pk_phhd, discharge_record, identity)
|
||||
cost_data = cost_task(pk_phhd, cost_list, identity)
|
||||
for phrec in phrecs:
|
||||
rec_type, ie_result = information_extraction(phrec, identity)
|
||||
if rec_type == '基本医保结算单':
|
||||
rec_result = settlement_result
|
||||
elif rec_type == '出院记录':
|
||||
rec_result = discharge_result
|
||||
elif rec_type == '费用清单':
|
||||
rec_result = cost_result
|
||||
else:
|
||||
rec_result = None
|
||||
if rec_result:
|
||||
for key, value in ie_result.items():
|
||||
rec_result[key].append(value)
|
||||
|
||||
settlement_data = settlement_task(pk_phhd, settlement_result)
|
||||
discharge_data = discharge_task(pk_phhd, discharge_result)
|
||||
cost_data = cost_task(pk_phhd, cost_result)
|
||||
|
||||
review_result = {
|
||||
'pk_phhd': pk_phhd,
|
||||
@@ -573,6 +488,9 @@ def photo_review(pk_phhd, name):
|
||||
|
||||
|
||||
def main():
|
||||
"""
|
||||
照片审核批量控制
|
||||
"""
|
||||
while 1:
|
||||
session = MysqlSession()
|
||||
phhds = (session.query(ZxPhhd.pk_phhd, ZxPhhd.cXm)
|
||||
|
||||
@@ -1,34 +1,20 @@
|
||||
"""
|
||||
信息抽取关键词配置
|
||||
"""
|
||||
|
||||
# 患者姓名
|
||||
PATIENT_NAME = ['患者姓名']
|
||||
# 入院日期
|
||||
ADMISSION_DATE = ['入院日期']
|
||||
# 出院日期
|
||||
DISCHARGE_DATE = ['出院日期']
|
||||
# 发生医疗费
|
||||
MEDICAL_EXPENSES = ['费用总额']
|
||||
# 个人现金支付
|
||||
PERSONAL_CASH_PAYMENT = ['个人现金支付']
|
||||
# 个人账户支付
|
||||
PERSONAL_ACCOUNT_PAYMENT = ['个人账户支付']
|
||||
# 个人自费金额
|
||||
PERSONAL_FUNDED_AMOUNT = ['自费金额', '个人自费']
|
||||
# 医保类别
|
||||
MEDICAL_INSURANCE_TYPE = ['医保类型']
|
||||
# 就诊医院
|
||||
HOSPITAL = ['医院']
|
||||
# 就诊科室
|
||||
DEPARTMENT = ['科室']
|
||||
# 主治医生
|
||||
DOCTOR = ['主治医生']
|
||||
# 住院号
|
||||
ADMISSION_ID = ['住院号']
|
||||
# 医保结算单号码
|
||||
SETTLEMENT_ID = ['医保结算单号码']
|
||||
# 年龄
|
||||
AGE = ['年龄']
|
||||
# 大写总额
|
||||
UPPERCASE_MEDICAL_EXPENSES = ['大写总额']
|
||||
IE_KEY = {
|
||||
'name': '患者姓名',
|
||||
'admission_date': '入院日期',
|
||||
'discharge_date': '出院日期',
|
||||
'medical_expenses': '费用总额',
|
||||
'personal_cash_payment': '个人现金支付',
|
||||
'personal_account_payment': '个人账户支付',
|
||||
'personal_funded_amount': '自费金额',
|
||||
'medical_insurance_type': '医保类型',
|
||||
'hospital': '医院',
|
||||
'department': '科室',
|
||||
'doctor': '主治医生',
|
||||
'admission_id': '住院号',
|
||||
'settlement_id': '医保结算单号码',
|
||||
'age': '年龄',
|
||||
'upper_case_medical_expenses': '大写总额'
|
||||
}
|
||||
|
||||
@@ -19,7 +19,8 @@ def main():
|
||||
cls_result = CLAS(text)
|
||||
cls_result = cls_result[0].get('predictions')[0]
|
||||
if cls_result['score'] < 0.8:
|
||||
raise Exception(f'识别结果置信度过低!text: {text}')
|
||||
logging.info(f"识别结果置信度{cls_result['score']}过低!text: {text}")
|
||||
return None
|
||||
return cls_result['label']
|
||||
|
||||
|
||||
|
||||
@@ -4,12 +4,14 @@ import logging.config
|
||||
from flask import Flask, request
|
||||
from paddlenlp import Taskflow
|
||||
|
||||
from __init__ import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES
|
||||
from __init__ import IE_KEY
|
||||
from log import LOGGING_CONFIG
|
||||
from utils import process_request
|
||||
|
||||
app = Flask(__name__)
|
||||
COST_LIST_SCHEMA = PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES
|
||||
COST_LIST_SCHEMA = tuple(IE_KEY[key] for key in [
|
||||
'name', 'admission_date', 'discharge_date', 'medical_expenses'
|
||||
])
|
||||
COST = Taskflow('information_extraction', schema=COST_LIST_SCHEMA, model='uie-x-base',
|
||||
task_path='model/cost_list_model', layout_analysis=False, precision='fp16')
|
||||
|
||||
|
||||
@@ -4,14 +4,14 @@ import logging.config
|
||||
from flask import Flask, request
|
||||
from paddlenlp import Taskflow
|
||||
|
||||
from __init__ import HOSPITAL, DEPARTMENT, PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, DOCTOR, ADMISSION_ID, AGE
|
||||
from __init__ import IE_KEY
|
||||
from log import LOGGING_CONFIG
|
||||
from utils import process_request
|
||||
|
||||
app = Flask(__name__)
|
||||
DISCHARGE_RECORD_SCHEMA = (
|
||||
HOSPITAL + DEPARTMENT + PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + DOCTOR + ADMISSION_ID + AGE
|
||||
)
|
||||
DISCHARGE_RECORD_SCHEMA = tuple(IE_KEY[key] for key in [
|
||||
'hospital', 'department', 'name', 'admission_date', 'discharge_date', 'doctor', 'admission_id', 'age'
|
||||
])
|
||||
DISCHARGE = Taskflow('information_extraction', schema=DISCHARGE_RECORD_SCHEMA, model='uie-x-base',
|
||||
task_path='model/discharge_record_model', layout_analysis=False, precision='fp16')
|
||||
|
||||
|
||||
@@ -4,18 +4,16 @@ import logging.config
|
||||
from flask import Flask, request
|
||||
from paddlenlp import Taskflow
|
||||
|
||||
from __init__ import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, PERSONAL_CASH_PAYMENT, \
|
||||
PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, ADMISSION_ID, SETTLEMENT_ID, \
|
||||
UPPERCASE_MEDICAL_EXPENSES
|
||||
from __init__ import IE_KEY
|
||||
from log import LOGGING_CONFIG
|
||||
from utils import process_request
|
||||
|
||||
app = Flask(__name__)
|
||||
SETTLEMENT_LIST_SCHEMA = (
|
||||
PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES + PERSONAL_CASH_PAYMENT
|
||||
+ PERSONAL_ACCOUNT_PAYMENT + PERSONAL_FUNDED_AMOUNT + MEDICAL_INSURANCE_TYPE + ADMISSION_ID + SETTLEMENT_ID
|
||||
+ UPPERCASE_MEDICAL_EXPENSES
|
||||
)
|
||||
# Labels requested from the extraction model for settlement sheets.
# Every name listed here must be a key of IE_KEY; the capitalised-total entry
# is registered there as 'upper_case_medical_expenses'.
SETTLEMENT_LIST_SCHEMA = tuple(IE_KEY[key] for key in [
    'name', 'admission_date', 'discharge_date', 'medical_expenses', 'personal_cash_payment',
    'personal_account_payment', 'personal_funded_amount', 'medical_insurance_type', 'admission_id', 'settlement_id',
    'upper_case_medical_expenses'
])
|
||||
SETTLEMENT_IE = Taskflow('information_extraction', schema=SETTLEMENT_LIST_SCHEMA, model='uie-x-base',
|
||||
task_path='model/settlement_list_model', layout_analysis=False, precision='fp16')
|
||||
|
||||
|
||||
@@ -12,6 +12,44 @@ def get_default_datetime():
|
||||
return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
||||
|
||||
|
||||
def ocr_result_to_layout(ocr_result):
    """Convert OCR segments into ``(box, text)`` layout entries.

    Each segment is indexed as ``segment[0]`` (four corner points) and
    ``segment[1][0]`` (recognised text). The four points are reduced to an
    ``[x1, y1, x2, y2]`` box; segments whose box has a negative width or
    height are dropped.

    :param ocr_result: iterable of OCR segments, may be empty or ``None``
    :return: list of ``([x1, y1, x2, y2], text)`` tuples
    """

    def _bounding_rect(quad):
        # Collapse the four corner points into one axis-aligned box.
        p0, p1, p2, p3 = quad[0], quad[1], quad[2], quad[3]
        return [
            min(p0[0], p3[0]),  # x1
            min(p0[1], p1[1]),  # y1
            max(p1[0], p2[0]),  # x2
            max(p2[1], p3[1]),  # y2
        ]

    def _has_valid_extent(rect):
        # A box is rejected only when its height or width is negative.
        return not (rect[3] - rect[1] < 0 or rect[2] - rect[0] < 0)

    if not ocr_result:
        return []

    entries = []
    for segment in ocr_result:
        rect = _bounding_rect(segment[0])
        if _has_valid_extent(rect):
            entries.append((rect, segment[1][0]))
    return entries
|
||||
|
||||
|
||||
def ocr_result_to_text(ocr_results):
    """Concatenate the recognised text of OCR segments, capped at 2048 characters.

    Segments are read in order (text taken from ``segment[1][0]``) and reading
    stops as soon as at least 2048 characters have been collected.

    :param ocr_results: iterable of OCR segments
    :return: the joined text, truncated to 2048 characters
    """
    limit = 2048
    pieces = []
    collected = 0
    for segment in ocr_results:
        fragment = segment[1][0]
        pieces.append(fragment)
        collected += len(fragment)
        if collected >= limit:
            break
    return ''.join(pieces)[:limit]
|
||||
|
||||
|
||||
def get_ocr_layout(ocr, img_path):
|
||||
"""
|
||||
获取ocr识别的结果,转为合适的layout形式
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import urllib.request
|
||||
|
||||
import cv2
|
||||
import numpy
|
||||
@@ -12,80 +11,59 @@ from tenacity import retry, stop_after_attempt, wait_random
|
||||
from log import PROJECT_ROOT
|
||||
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
       after=lambda x: logging.warning('获取图片失败!'))
def read(image_path):
    """
    Read an image from the network or from the local disk.

    Paths starting with ``http`` are downloaded (60 s timeout) and decoded
    with OpenCV; anything else is passed to ``cv2.imread``. The call is
    retried up to three times, and the last error is re-raised.

    :param image_path: URL or local file path
    :return: the image as returned by OpenCV
    """
    if not image_path.startswith('http'):
        return cv2.imread(image_path)

    # Close the HTTP response deterministically instead of leaking the
    # connection until garbage collection.
    with urllib.request.urlopen(image_path, timeout=60) as resp:
        image_data = resp.read()
    # Wrap the raw bytes in a uint8 array so OpenCV can decode them.
    image_np = numpy.frombuffer(image_data, numpy.uint8)
    return cv2.imdecode(image_np, cv2.IMREAD_COLOR)
|
||||
|
||||
|
||||
def capture(image, rectangle):
    """
    Crop a rectangular region out of an image.

    Coordinates that fall outside the image are clamped to its borders and
    then truncated to integers before slicing.

    :param image: ndarray image (first two dimensions are height and width)
    :param rectangle: ``(x1, y1, x2, y2)`` region to crop
    :return: the cropped ndarray
    """
    x1, y1, x2, y2 = rectangle
    height, width = image.shape[:2]
    # Clamp once so the slice never reaches outside the image.
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(width, x2), min(height, y2)
    return image[int(y1):int(y2), int(x1):int(x2)]
|
||||
|
||||
|
||||
def split(image, ratio=1.414, overlap=0.05, x_compensation=3):
|
||||
def split(img_path, ratio=1.414, overlap=0.05, x_compensation=3):
|
||||
"""
|
||||
分割图片
|
||||
:param image:图片,可以是NumPy数组或文件路径
|
||||
:param img_path:图片路径
|
||||
:param ratio: 分割后的比例
|
||||
:param overlap: 图片之间的覆盖比例
|
||||
:param x_compensation: 横向补偿倍率
|
||||
:return: 分割后的图片组(NumPy数组形式)
|
||||
"""
|
||||
split_result = []
|
||||
if isinstance(image, str):
|
||||
image = read(image)
|
||||
image = cv2.imread(img_path)
|
||||
height, width = image.shape[:2]
|
||||
hw_ratio = height / width
|
||||
wh_ratio = width / height
|
||||
|
||||
img_name, img_ext = parse_save_path(img_path)
|
||||
if hw_ratio > ratio: # 纵向过长
|
||||
new_img_height = width * ratio
|
||||
step = width * (ratio - overlap) # 偏移步长
|
||||
for i in range(math.ceil(height / step)):
|
||||
offset = round(step * i)
|
||||
cropped_img = capture(image, [0, offset, width, offset + new_img_height])
|
||||
split_result.append({'img': cropped_img, 'x_offset': 0, 'y_offset': offset})
|
||||
split_path = get_save_path(f'{img_name}.split_{i}.{img_ext}')
|
||||
cv2.imwrite(split_path, cropped_img)
|
||||
split_result.append({'img': split_path, 'x_offset': 0, 'y_offset': offset})
|
||||
elif wh_ratio > ratio: # 横向过长
|
||||
new_img_width = height * ratio
|
||||
step = height * (ratio - overlap * x_compensation) # 一般文字是横向的,所以横向截取时增大重叠部分
|
||||
for i in range(math.ceil(width / step)):
|
||||
offset = round(step * i)
|
||||
cropped_img = capture(image, [offset, 0, offset + new_img_width, width])
|
||||
split_result.append({'img': cropped_img, 'x_offset': offset, 'y_offset': 0})
|
||||
split_path = get_save_path(f'{img_name}.split_{i}.{img_ext}')
|
||||
cv2.imwrite(split_path, cropped_img)
|
||||
split_result.append({'img': split_path, 'x_offset': offset, 'y_offset': 0})
|
||||
else:
|
||||
split_result.append({'img': image, 'x_offset': 0, 'y_offset': 0})
|
||||
split_result.append({'img': img_path, 'x_offset': 0, 'y_offset': 0})
|
||||
return split_result
|
||||
|
||||
|
||||
@@ -108,15 +86,16 @@ def parse_rotation_angles(image):
|
||||
return angles
|
||||
|
||||
|
||||
def rotate(image, angle):
|
||||
def rotate(img_path, angle):
|
||||
"""
|
||||
旋转图片
|
||||
:param image: 图片NumPy数组
|
||||
:param img_path: 图片NumPy数组
|
||||
:param angle: 逆时针旋转角度
|
||||
:return: 旋转后的图片NumPy数组
|
||||
"""
|
||||
if angle == 0:
|
||||
return image
|
||||
return img_path
|
||||
image = cv2.imread(img_path)
|
||||
height, width = image.shape[:2]
|
||||
if angle == 180:
|
||||
new_width = width
|
||||
@@ -132,7 +111,11 @@ def rotate(image, angle):
|
||||
matrix[1, 2] += (new_height - height) / 2
|
||||
# 参数:原始图像 旋转参数 元素图像宽高
|
||||
rotated = cv2.warpAffine(image, matrix, (new_width, new_height))
|
||||
return rotated
|
||||
|
||||
img_name, img_ext = parse_save_path(img_path)
|
||||
rotated_path = get_save_path(f'{img_name}.rotate_{angle}.{img_ext}')
|
||||
cv2.imwrite(rotated_path, rotated)
|
||||
return rotated_path
|
||||
|
||||
|
||||
def invert_rotate_point(point, center, angle):
|
||||
@@ -260,26 +243,38 @@ def parse_img_url(url):
|
||||
:return: 图片名称和图片后缀
|
||||
"""
|
||||
url = url.split('?')[0]
|
||||
return os.path.basename(url).rsplit('.', 1)
|
||||
return os.path.basename(url)
|
||||
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
|
||||
after=lambda x: logging.warning('保存图片失败!'))
|
||||
def save_to_local(img_url, save_path=None):
|
||||
def save_to_local(img_url):
|
||||
"""
|
||||
保存图片到本地
|
||||
:param img_url: 图片url
|
||||
:param save_path: 本地保存地址,精确到文件名
|
||||
:return: 本地保存地址
|
||||
"""
|
||||
response = requests.get(img_url)
|
||||
response.raise_for_status() # 检查响应状态码是否正常
|
||||
|
||||
if save_path is None:
|
||||
img_name, img_ext = parse_img_url(img_url)
|
||||
save_path = os.path.join(PROJECT_ROOT, 'tmp_img', img_name + '.' + img_ext)
|
||||
|
||||
save_path = get_save_path(parse_img_url(img_url))
|
||||
with open(save_path, 'wb') as file:
|
||||
file.write(response.content)
|
||||
|
||||
return save_path
|
||||
|
||||
|
||||
def get_img_path(img_full_name):
    """
    Look up an image in the local temporary image directory.

    :param img_full_name: image file name including its extension
    :return: the path built by get_save_path when that path exists, otherwise None
    """
    candidate = get_save_path(img_full_name)
    return candidate if os.path.exists(candidate) else None
|
||||
|
||||
|
||||
def get_save_path(img_full_name):
    """
    Build the local save path for an image file.

    :param img_full_name: image file name including its extension
    :return: img_full_name joined under the 'tmp_img' directory of PROJECT_ROOT
    """
    tmp_dir = os.path.join(PROJECT_ROOT, 'tmp_img')
    return os.path.join(tmp_dir, img_full_name)
|
||||
|
||||
|
||||
def parse_save_path(img_path):
    """
    Split an image path into its file name and extension.

    :param img_path: path of the image; the file name must contain at least one '.'
    :return: tuple (name, extension) split at the last '.' of the file name,
        with the extension returned without the dot
    :raises ValueError: if the file name contains no '.'
    """
    # Only the last dot separates the extension, so 'a.rotate_90.jpg' keeps
    # 'a.rotate_90' as its name.
    stem, suffix = os.path.basename(img_path).rsplit('.', 1)
    return stem, suffix
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import json
|
||||
import logging
|
||||
import os.path
|
||||
|
||||
import requests
|
||||
from tenacity import retry, stop_after_attempt, wait_random
|
||||
@@ -16,9 +17,10 @@ def ocr(img_path):
|
||||
url = 'http://ocr:5001'
|
||||
response = requests.post(url, {'img_path': img_path})
|
||||
if response.status_code == 200:
|
||||
return response.json()
|
||||
else:
|
||||
return None
|
||||
ocr_result = response.json()
|
||||
if ocr_result:
|
||||
return ocr_result[0]
|
||||
return None
|
||||
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
|
||||
@@ -40,7 +42,7 @@ def ie_settlement(img_path, layout):
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
|
||||
after=lambda x: logging.warning('从文本抽取基本医保结算单失败!'))
|
||||
def ie_settlement(text):
|
||||
def ie_settlement_text(text):
|
||||
"""
|
||||
请求基本医保结算单信息抽取接口
|
||||
:param text: 待抽取文本
|
||||
@@ -73,7 +75,7 @@ def ie_discharge(img_path, layout):
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
|
||||
after=lambda x: logging.warning('从文本抽取出院记录失败!'))
|
||||
def ie_discharge(text):
|
||||
def ie_discharge_text(text):
|
||||
"""
|
||||
请求出院记录信息抽取接口
|
||||
:param text: 待抽取文本
|
||||
@@ -106,7 +108,7 @@ def ie_cost(img_path, layout):
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
|
||||
after=lambda x: logging.warning('从文本抽取费用清单失败!'))
|
||||
def ie_cost(text):
|
||||
def ie_cost_text(text):
|
||||
"""
|
||||
请求费用清单信息抽取接口
|
||||
:param text: 待抽取文本
|
||||
@@ -147,9 +149,22 @@ def det_book(img_path):
|
||||
url = 'http://det_book:5006'
|
||||
response = requests.post(url, {'img_path': img_path})
|
||||
if response.status_code == 200:
|
||||
return response.json()
|
||||
book_path_list = response.json()
|
||||
if len(book_path_list) == 0:
|
||||
return img_path
|
||||
elif len(book_path_list) == 1:
|
||||
return book_path_list[0]
|
||||
else:
|
||||
max_book = img_path
|
||||
max_size = 0
|
||||
for book_path in book_path_list:
|
||||
book_size = os.path.getsize(book_path)
|
||||
if book_size > max_size:
|
||||
max_book = book_path
|
||||
max_size = book_size
|
||||
return max_book
|
||||
else:
|
||||
return [img_path]
|
||||
return img_path
|
||||
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
|
||||
|
||||
Reference in New Issue
Block a user