添加信息抽取结果存表，并根据抽取结果进行缺项判断

This commit is contained in:
2024-10-10 09:24:09 +08:00
parent fc69aa5b9d
commit 15fe5d4f0d

View File

@@ -1,3 +1,4 @@
import json
import logging
import time
from collections import defaultdict
@@ -12,7 +13,7 @@ from rapidfuzz import process, fuzz
from sqlalchemy import update
from db import MysqlSession
from db.mysql import BdYljg, BdYlks, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec, ZxIeReview
from db.mysql import BdYljg, BdYlks, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec, ZxIeReview, ZxIeResult
from log import HOSTNAME
from photo_review import PHHD_BATCH_SIZE, SLEEP_MINUTES, HOSPITAL_ALIAS, HOSPITAL_FILTER, DEPARTMENT_ALIAS, \
DEPARTMENT_FILTER
@@ -98,11 +99,13 @@ def get_better_image_from_qrcode(img_path, image_id, dpi=150):
# 关键信息提取
def information_extraction(phrec, identity):
def information_extraction(phrec, pk_phhd, identity):
"""
处理单张图片
:param phrec:
:return:
:param phrec:图片信息
:param pk_phhd:案子主键
:param identity:处理批次标识
:return:记录类型,信息抽取结果
"""
img_path = image_util.get_img_path(phrec.cfjaddress)
if not img_path:
@@ -114,13 +117,13 @@ def information_extraction(phrec, identity):
if phrec.cRectype != '1':
better_img_path = None # 非结算单暂时不进行替换
if better_img_path is not None:
rec_type = '基本医保结算单'
if text:
info_extract = model_util.ie_settlement_text(text)
else:
info_extract = model_util.ie_settlement(
better_img_path, common_util.ocr_result_to_layout(model_util.ocr(better_img_path))
)
return '基本医保结算单', info_extract
else:
target_image = model_util.det_book(img_path) # 识别文档区域并裁剪
dewarped_image = model_util.dewarp(target_image) # 去扭曲
@@ -143,7 +146,19 @@ def information_extraction(phrec, identity):
else:
info_extract = None
return rec_type, info_extract
if info_extract:
result_json = json.dumps(info_extract, ensure_ascii=False)
if len(result_json) > 5000:
result_json = result_json[:5000]
now = common_util.get_default_datetime()
session = MysqlSession()
session.add(ZxIeResult(pk_phhd=pk_phhd, pk_phrec=phrec.pk_phrec, id=identity,
cfjaddress=phrec.cfjaddress, content=result_json, create_time=now,
creator=HOSTNAME, update_time=now, updater=HOSTNAME))
session.commit()
session.close()
return rec_type, info_extract
# 从keys中获取准确率最高的value
@@ -402,7 +417,7 @@ def photo_review(pk_phhd, name):
# 同一批图的标识
identity = int(time.time())
for phrec in phrecs:
rec_type, ie_result = information_extraction(phrec, identity)
rec_type, ie_result = information_extraction(phrec, pk_phhd, identity)
if rec_type == '基本医保结算单':
rec_result = settlement_result
elif rec_type == '出院记录':
@@ -424,12 +439,13 @@ def photo_review(pk_phhd, name):
}
# 三项资料完整性判断
# 三项资料缺项判断
if (settlement_data['personal_account_payment'] + settlement_data['personal_cash_payment']
if (bool(settlement_data) and settlement_data['personal_account_payment']
and settlement_data['personal_cash_payment'] and settlement_data['medical_expenses']
and settlement_data['personal_account_payment'] + settlement_data['personal_cash_payment']
< settlement_data['medical_expenses']):
review_result['has_settlement'] = True
# TODO:出院记录和费用清单暂时没想好怎么判断
review_result['has_discharge'] = True
review_result['has_cost'] = True
review_result['has_discharge'] = bool(discharge_result)
review_result['has_cost'] = bool(cost_result)
# 三项资料缺页判断
# TODO:缺页需要对页码进行抽取,暂未训练相关模型
review_result['full_page'] = True