增加ocr结果存表

This commit is contained in:
2024-12-24 14:55:43 +08:00
parent 96b8a06e6c
commit 5e6a471954
4 changed files with 27 additions and 9 deletions

View File

@@ -1,5 +1,5 @@
# coding: utf-8 # coding: utf-8
from sqlalchemy import Column, DECIMAL, Date, DateTime, Index, String, text, LargeBinary from sqlalchemy import Column, DECIMAL, Date, DateTime, Index, String, text, LargeBinary, Text
from sqlalchemy.dialects.mysql import BIT, CHAR, INTEGER, TINYINT, VARCHAR from sqlalchemy.dialects.mysql import BIT, CHAR, INTEGER, TINYINT, VARCHAR
from db import Base from db import Base
@@ -56,6 +56,7 @@ class ZxIeCost(Base):
pk_ie_cost = Column(INTEGER(11), primary_key=True, comment='费用明细信息抽取主键') pk_ie_cost = Column(INTEGER(11), primary_key=True, comment='费用明细信息抽取主键')
pk_phhd = Column(INTEGER(11), nullable=False, unique=True, comment='报销案子主键') pk_phhd = Column(INTEGER(11), nullable=False, unique=True, comment='报销案子主键')
content = Column(Text, comment='详细内容')
name = Column(String(30), comment='患者姓名') name = Column(String(30), comment='患者姓名')
admission_date_str = Column(String(255), comment='入院日期字符串') admission_date_str = Column(String(255), comment='入院日期字符串')
admission_date = Column(Date, comment='入院日期') admission_date = Column(Date, comment='入院日期')
@@ -63,6 +64,8 @@ class ZxIeCost(Base):
discharge_date = Column(Date, comment='出院日期') discharge_date = Column(Date, comment='出院日期')
medical_expenses_str = Column(String(255), comment='费用总额字符串') medical_expenses_str = Column(String(255), comment='费用总额字符串')
medical_expenses = Column(DECIMAL(18, 2), comment='费用总额') medical_expenses = Column(DECIMAL(18, 2), comment='费用总额')
page_nums = Column(String(255), comment='页码')
page_count = Column(TINYINT(4), comment='页数')
create_time = Column(DateTime, server_default=text("CURRENT_TIMESTAMP"), comment='创建时间') create_time = Column(DateTime, server_default=text("CURRENT_TIMESTAMP"), comment='创建时间')
creator = Column(String(255), comment='创建人') creator = Column(String(255), comment='创建人')
update_time = Column(DateTime, server_default=text("CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP"), update_time = Column(DateTime, server_default=text("CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP"),
@@ -94,7 +97,7 @@ class ZxIeDischarge(Base):
pk_ie_discharge = Column(INTEGER(11), primary_key=True, comment='出院记录信息抽取主键') pk_ie_discharge = Column(INTEGER(11), primary_key=True, comment='出院记录信息抽取主键')
pk_phhd = Column(INTEGER(11), nullable=False, unique=True, comment='报销案子主键') pk_phhd = Column(INTEGER(11), nullable=False, unique=True, comment='报销案子主键')
content = Column(String(5000), comment='详细内容') content = Column(Text, comment='详细内容')
hospital = Column(String(255), comment='医院') hospital = Column(String(255), comment='医院')
pk_yljg = Column(INTEGER(11), comment='医院主键') pk_yljg = Column(INTEGER(11), comment='医院主键')
department = Column(String(255), comment='科室') department = Column(String(255), comment='科室')

View File

@@ -1,6 +1,6 @@
x-env: x-env:
&template &template
image: fcb_photo_review:1.14.10 image: fcb_photo_review:1.14.11
restart: always restart: always
x-review: x-review:

View File

@@ -26,7 +26,7 @@ from ucloud import ufile
from util import image_util, util, html_util from util import image_util, util, html_util
from util.data_util import handle_date, handle_decimal, parse_department, handle_name, \ from util.data_util import handle_date, handle_decimal, parse_department, handle_name, \
handle_insurance_type, handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, \ handle_insurance_type, handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, \
parse_hospital, handle_doctor parse_hospital, handle_doctor, handle_text
# 合并信息抽取结果 # 合并信息抽取结果
@@ -41,6 +41,7 @@ def ie_temp_image(ie, ocr, image):
cv2.imwrite(temp_file.name, image) cv2.imwrite(temp_file.name, image)
ie_result = [] ie_result = []
ocr_pure_text = ''
try: try:
layout = util.get_ocr_layout(ocr, temp_file.name) layout = util.get_ocr_layout(ocr, temp_file.name)
if not layout: if not layout:
@@ -48,6 +49,8 @@ def ie_temp_image(ie, ocr, image):
ie_result = [] ie_result = []
else: else:
ie_result = ie({"doc": temp_file.name, "layout": layout})[0] ie_result = ie({"doc": temp_file.name, "layout": layout})[0]
for lay in layout:
ocr_pure_text += lay[1]
except MemoryError as e: except MemoryError as e:
# 显存不足时应该抛出错误,让程序重启,同时释放显存 # 显存不足时应该抛出错误,让程序重启,同时释放显存
raise e raise e
@@ -58,7 +61,7 @@ def ie_temp_image(ie, ocr, image):
os.remove(temp_file.name) os.remove(temp_file.name)
except Exception as e: except Exception as e:
logging.info(f"删除临时文件 {temp_file.name} 时出错", exc_info=e) logging.info(f"删除临时文件 {temp_file.name} 时出错", exc_info=e)
return ie_result return ie_result, ocr_pure_text
# 关键信息提取 # 关键信息提取
@@ -150,6 +153,7 @@ def get_better_image_from_qrcode(image, image_id, dpi=150):
# 关键信息提取 # 关键信息提取
def information_extraction(ie, phrecs, identity): def information_extraction(ie, phrecs, identity):
result = {} result = {}
ocr_text = ''
for phrec in phrecs: for phrec in phrecs:
img_path = ufile.get_private_url(phrec.cfjaddress) img_path = ufile.get_private_url(phrec.cfjaddress)
if not img_path: if not img_path:
@@ -168,7 +172,7 @@ def information_extraction(ie, phrecs, identity):
if text: if text:
info_extract = ie(text)[0] info_extract = ie(text)[0]
else: else:
info_extract = ie_temp_image(ie, OCR, image) info_extract = ie_temp_image(ie, OCR, image)[0]
ie_result = {'result': info_extract, 'angle': '0'} ie_result = {'result': info_extract, 'angle': '0'}
now = util.get_default_datetime() now = util.get_default_datetime()
@@ -201,10 +205,12 @@ def information_extraction(ie, phrecs, identity):
if split_result['img'] is None or split_result['img'].size == 0: if split_result['img'] is None or split_result['img'].size == 0:
continue continue
rotated_img = image_util.rotate(split_result['img'], int(angles[0])) rotated_img = image_util.rotate(split_result['img'], int(angles[0]))
ie_results = [{'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[0]}] ie_temp_result = ie_temp_image(ie, OCR, rotated_img)
ocr_text += ie_temp_result[1]
ie_results = [{'result': ie_temp_result[0], 'angle': angles[0]}]
if not ie_results[0]['result'] or len(ie_results[0]['result']) < len(ie.kwargs.get('schema')): if not ie_results[0]['result'] or len(ie_results[0]['result']) < len(ie.kwargs.get('schema')):
rotated_img = image_util.rotate(split_result['img'], int(angles[1])) rotated_img = image_util.rotate(split_result['img'], int(angles[1]))
ie_results.append({'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[1]}) ie_results.append({'result': ie_temp_image(ie, OCR, rotated_img)[0], 'angle': angles[1]})
now = util.get_default_datetime() now = util.get_default_datetime()
best_angle = ['0', 0] best_angle = ['0', 0]
for ie_result in ie_results: for ie_result in ie_results:
@@ -252,6 +258,7 @@ def information_extraction(ie, phrecs, identity):
session.commit() session.commit()
session.close() session.close()
result['ocr_text'] = ocr_text
return result return result
@@ -414,6 +421,7 @@ def discharge_task(pk_phhd, discharge_record, identity):
"doctor": handle_doctor(get_best_value_in_keys(discharge_record_ie_result, DOCTOR)), "doctor": handle_doctor(get_best_value_in_keys(discharge_record_ie_result, DOCTOR)),
"admission_id": handle_id(get_best_value_in_keys(discharge_record_ie_result, ADMISSION_ID)), "admission_id": handle_id(get_best_value_in_keys(discharge_record_ie_result, ADMISSION_ID)),
"age": handle_age(get_best_value_in_keys(discharge_record_ie_result, AGE)), "age": handle_age(get_best_value_in_keys(discharge_record_ie_result, AGE)),
"content": handle_text(discharge_record_ie_result['ocr_text']),
} }
discharge_data["admission_date"] = handle_date(discharge_data["admission_date_str"]) discharge_data["admission_date"] = handle_date(discharge_data["admission_date_str"])
discharge_data["discharge_date"] = handle_date(discharge_data["discharge_date_str"]) discharge_data["discharge_date"] = handle_date(discharge_data["discharge_date_str"])
@@ -479,7 +487,8 @@ def cost_task(pk_phhd, cost_list, identity):
"name": handle_name(get_best_value_in_keys(cost_list_ie_result, PATIENT_NAME)), "name": handle_name(get_best_value_in_keys(cost_list_ie_result, PATIENT_NAME)),
"admission_date_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, ADMISSION_DATE)), "admission_date_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, ADMISSION_DATE)),
"discharge_date_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, DISCHARGE_DATE)), "discharge_date_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, DISCHARGE_DATE)),
"medical_expenses_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, MEDICAL_EXPENSES)) "medical_expenses_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, MEDICAL_EXPENSES)),
"content": handle_text(cost_list_ie_result['ocr_text']),
} }
cost_data["admission_date"] = handle_date(cost_data["admission_date_str"]) cost_data["admission_date"] = handle_date(cost_data["admission_date_str"])
cost_data["discharge_date"] = handle_date(cost_data["discharge_date_str"]) cost_data["discharge_date"] = handle_date(cost_data["discharge_date_str"])

View File

@@ -190,3 +190,9 @@ def parse_hospital(string):
split_hospitals = string_without_company.replace("医院", "医院 ") split_hospitals = string_without_company.replace("医院", "医院 ")
result += split_hospitals.strip().split(" ") result += split_hospitals.strip().split(" ")
return result return result
def handle_text(string, max_length=16383):
    """Normalize free text before it is stored, truncating over-long values.

    Args:
        string: The text to store. Any falsy value (``None``, ``""``) is
            normalized to an empty string.
        max_length: Maximum number of characters to keep. Defaults to 16383,
            the limit this function has always applied.
            NOTE(review): 16383 is presumably 65535 // 4, i.e. the number of
            4-byte (utf8mb4) characters that fit a MySQL TEXT column -- confirm
            against the actual column charset.

    Returns:
        ``""`` for falsy input, otherwise the first ``max_length`` characters
        of ``string``.
    """
    if not string:
        return ""
    # Slicing never raises for short input; strings under the limit pass through unchanged.
    return string[:max_length]