优化:将结果保存到zx_ocr;优化值的抽取

This commit is contained in:
2024-05-20 17:22:24 +08:00
parent 99614d6957
commit d64a71b2bb

View File

@@ -1,3 +1,4 @@
import json
from time import sleep from time import sleep
from paddlenlp import Taskflow from paddlenlp import Taskflow
@@ -8,6 +9,7 @@ from photo_review.entity.bd_ylks import BdYlks
from photo_review.entity.zx_ie_cost import ZxIeCost from photo_review.entity.zx_ie_cost import ZxIeCost
from photo_review.entity.zx_ie_discharge import ZxIeDischarge from photo_review.entity.zx_ie_discharge import ZxIeDischarge
from photo_review.entity.zx_ie_settlement import ZxIeSettlement from photo_review.entity.zx_ie_settlement import ZxIeSettlement
from photo_review.entity.zx_ocr import ZxOcr
from photo_review.entity.zx_phhd import ZxPhhd from photo_review.entity.zx_phhd import ZxPhhd
from photo_review.entity.zx_phrec import ZxPhrec from photo_review.entity.zx_phrec import ZxPhrec
from photo_review.util.data_util import handle_date, handle_decimal from photo_review.util.data_util import handle_date, handle_decimal
@@ -15,26 +17,41 @@ from photo_review.util.ucloud import get_private_url
# 关键信息提取 # 关键信息提取
def information_extraction(schema, phrecs, task_path):
    """Run key-information extraction over every picture record and merge the results.

    For each record the picture address is resolved to a private URL, the
    UIE-X model is run on it, and the raw model output is stored as a
    ``ZxOcr`` row (one row per picture) before being merged into the
    returned dict.

    Args:
        schema: Extraction schema passed straight to ``Taskflow``.
        phrecs: Iterable of records exposing ``pk_phhd``, ``pk_phrec`` and
            ``cfjaddress`` attributes.
        task_path: Path of the fine-tuned model directory passed to ``Taskflow``.

    Returns:
        A dict merged from the first element of each picture's model output.
        Merging uses ``dict.update``, so when several pictures yield the same
        key, the entry from the later picture replaces the earlier one.
    """
    results = {}
    # Building the Taskflow loads the model, so do it once and only when at
    # least one picture can actually be processed.
    ie = None
    for phrec in phrecs:
        pic_path = get_private_url(phrec.cfjaddress)
        if not pic_path:
            continue
        if ie is None:
            ie = Taskflow("information_extraction", schema=schema, model="uie-x-base", task_path=task_path)
        result = ie({"doc": pic_path})

        # Persist the raw result of every picture right after extraction.
        session = MysqlSession()
        try:
            zx_ocr = ZxOcr(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, cfjaddress=phrec.cfjaddress,
                           content=json.dumps(result, ensure_ascii=False))
            session.add(zx_ocr)
            session.commit()
        finally:
            # Always release the connection, even when the insert fails.
            session.close()

        # An empty prediction list carries nothing to merge.
        if result:
            results.update(result[0])
    return results
# 从keys中获取第一个不为空的value # 从keys中获取准确率最高的value
def get_best_value_in_keys(source, keys):
    """Return the ``text`` with the highest ``probability`` found under ``keys``.

    Args:
        source: Mapping from key to a list of candidate dicts, each read via
            ``.get("text")`` and ``.get("probability")``.
        keys: Keys to look up in ``source``; candidates of all keys compete
            against each other.

    Returns:
        The non-empty text whose probability is the largest one above 0, or
        ``None`` when no such candidate exists. Candidates without a
        probability are skipped. On equal probabilities the first candidate
        encountered wins.
    """
    # Best text seen so far.
    best_text = None
    # Probability of ``best_text``; candidates must strictly exceed it.
    best_probability = 0
    for key in keys:
        candidates = source.get(key)
        if not candidates:
            continue
        for candidate in candidates:
            text = candidate.get("text")
            probability = candidate.get("probability")
            if not text or probability is None:
                continue
            if probability > best_probability:
                best_text = text
                # Record the new maximum so weaker candidates seen later
                # cannot overwrite the best one.
                best_probability = probability
    return best_text
# 从keys中获取所有value组成list # 从keys中获取所有value组成list
@@ -69,17 +86,17 @@ def photo_review(pk_phhd):
cost_list = [] cost_list = []
session = MysqlSession() session = MysqlSession()
phrecs = session.query(ZxPhrec.pk_phrec, ZxPhrec.cRectype, ZxPhrec.cfjaddress) \ phrecs = session.query(ZxPhrec.pk_phrec, ZxPhrec.pk_phhd, ZxPhrec.cRectype, ZxPhrec.cfjaddress) \
.filter(ZxPhrec.pk_phhd == pk_phhd) \ .filter(ZxPhrec.pk_phhd == pk_phhd) \
.all() .all()
session.close() session.close()
for phrec in phrecs: for phrec in phrecs:
if phrec.cRectype == "1": if phrec.cRectype == "1":
settlement_list.append(phrec.cfjaddress) settlement_list.append(phrec)
elif phrec.cRectype == "3": elif phrec.cRectype == "3":
discharge_record.append(phrec.cfjaddress) discharge_record.append(phrec)
elif phrec.cRectype == "4": elif phrec.cRectype == "4":
cost_list.append(phrec.cfjaddress) cost_list.append(phrec)
name_key = ["姓名", "交款人"] name_key = ["姓名", "交款人"]
admission_date_key = ["入院日期", "住院时间", "开始日期", "费用发生时间", "入院时间", "住院日期"] admission_date_key = ["入院日期", "住院时间", "开始日期", "费用发生时间", "入院时间", "住院日期"]
@@ -110,14 +127,14 @@ def photo_review(pk_phhd):
"config/model/settlement_list_model") "config/model/settlement_list_model")
settlement_data = { settlement_data = {
"pk_phhd": pk_phhd, "pk_phhd": pk_phhd,
"name": get_value_in_keys(settlement_list_ie_result, name_key), "name": get_best_value_in_keys(settlement_list_ie_result, name_key),
"admission_date_str": get_value_in_keys(settlement_list_ie_result, admission_date_key), "admission_date_str": get_best_value_in_keys(settlement_list_ie_result, admission_date_key),
"discharge_date_str": get_value_in_keys(settlement_list_ie_result, discharge_date_key), "discharge_date_str": get_best_value_in_keys(settlement_list_ie_result, discharge_date_key),
"medical_expenses_str": get_value_in_keys(settlement_list_ie_result, medical_expenses_key), "medical_expenses_str": get_best_value_in_keys(settlement_list_ie_result, medical_expenses_key),
"personal_cash_payment_str": get_value_in_keys(settlement_list_ie_result, personal_cash_payment_key), "personal_cash_payment_str": get_best_value_in_keys(settlement_list_ie_result, personal_cash_payment_key),
"personal_account_payment_str": get_value_in_keys(settlement_list_ie_result, personal_account_payment_key), "personal_account_payment_str": get_best_value_in_keys(settlement_list_ie_result, personal_account_payment_key),
"personal_funded_amount_str": get_value_in_keys(settlement_list_ie_result, personal_funded_amount_key), "personal_funded_amount_str": get_best_value_in_keys(settlement_list_ie_result, personal_funded_amount_key),
"medical_insurance_type": get_value_in_keys(settlement_list_ie_result, medical_insurance_type_key) "medical_insurance_type": get_best_value_in_keys(settlement_list_ie_result, medical_insurance_type_key)
} }
settlement_data["admission_date"] = handle_date(settlement_data["admission_date_str"]) settlement_data["admission_date"] = handle_date(settlement_data["admission_date_str"])
settlement_data["admission_date"] = handle_date(settlement_data["admission_date_str"]) settlement_data["admission_date"] = handle_date(settlement_data["admission_date_str"])
@@ -132,12 +149,12 @@ def photo_review(pk_phhd):
"config/model/discharge_record_model") "config/model/discharge_record_model")
discharge_data = { discharge_data = {
"pk_phhd": pk_phhd, "pk_phhd": pk_phhd,
"hospital": get_value_in_keys(discharge_record_ie_result, hospital_key), "hospital": get_best_value_in_keys(discharge_record_ie_result, hospital_key),
"department": get_value_in_keys(discharge_record_ie_result, department_key), "department": get_best_value_in_keys(discharge_record_ie_result, department_key),
"name": get_value_in_keys(discharge_record_ie_result, name_key), "name": get_best_value_in_keys(discharge_record_ie_result, name_key),
"admission_date_str": get_value_in_keys(discharge_record_ie_result, admission_date_key), "admission_date_str": get_best_value_in_keys(discharge_record_ie_result, admission_date_key),
"discharge_date_str": get_value_in_keys(discharge_record_ie_result, discharge_date_key), "discharge_date_str": get_best_value_in_keys(discharge_record_ie_result, discharge_date_key),
"doctor": get_value_in_keys(discharge_record_ie_result, doctor_key) "doctor": get_best_value_in_keys(discharge_record_ie_result, doctor_key)
} }
discharge_data["admission_date"] = handle_date(discharge_data["admission_date_str"]) discharge_data["admission_date"] = handle_date(discharge_data["admission_date_str"])
discharge_data["discharge_date"] = handle_date(discharge_data["discharge_date_str"]) discharge_data["discharge_date"] = handle_date(discharge_data["discharge_date_str"])
@@ -164,10 +181,10 @@ def photo_review(pk_phhd):
cost_list_ie_result = information_extraction(cost_list_schema, cost_list, "config/model/cost_list_model") cost_list_ie_result = information_extraction(cost_list_schema, cost_list, "config/model/cost_list_model")
cost_data = { cost_data = {
"pk_phhd": pk_phhd, "pk_phhd": pk_phhd,
"name": get_value_in_keys(cost_list_ie_result, name_key), "name": get_best_value_in_keys(cost_list_ie_result, name_key),
"admission_date_str": get_value_in_keys(cost_list_ie_result, admission_date_key), "admission_date_str": get_best_value_in_keys(cost_list_ie_result, admission_date_key),
"discharge_date_str": get_value_in_keys(cost_list_ie_result, discharge_date_key), "discharge_date_str": get_best_value_in_keys(cost_list_ie_result, discharge_date_key),
"medical_expenses_str": get_value_in_keys(cost_list_ie_result, medical_expenses_key) "medical_expenses_str": get_best_value_in_keys(cost_list_ie_result, medical_expenses_key)
} }
cost_data["admission_date"] = handle_date(cost_data["admission_date_str"]) cost_data["admission_date"] = handle_date(cost_data["admission_date_str"])
cost_data["discharge_date"] = handle_date(cost_data["discharge_date_str"]) cost_data["discharge_date"] = handle_date(cost_data["discharge_date_str"])