From 5e6a471954dfa204998b1959e407651cd4815b91 Mon Sep 17 00:00:00 2001 From: liuyebo <1515783401@qq.com> Date: Tue, 24 Dec 2024 14:55:43 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0ocr=E7=BB=93=E6=9E=9C?= =?UTF-8?q?=E5=AD=98=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- db/mysql.py | 7 +++++-- docker-compose.yml | 2 +- photo_review/auto_photo_review.py | 21 +++++++++++++++------ util/data_util.py | 6 ++++++ 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/db/mysql.py b/db/mysql.py index 9fd738d..5a9e608 100644 --- a/db/mysql.py +++ b/db/mysql.py @@ -1,5 +1,5 @@ # coding: utf-8 -from sqlalchemy import Column, DECIMAL, Date, DateTime, Index, String, text, LargeBinary +from sqlalchemy import Column, DECIMAL, Date, DateTime, Index, String, text, LargeBinary, Text from sqlalchemy.dialects.mysql import BIT, CHAR, INTEGER, TINYINT, VARCHAR from db import Base @@ -56,6 +56,7 @@ class ZxIeCost(Base): pk_ie_cost = Column(INTEGER(11), primary_key=True, comment='费用明细信息抽取主键') pk_phhd = Column(INTEGER(11), nullable=False, unique=True, comment='报销案子主键') + content = Column(Text, comment='详细内容') name = Column(String(30), comment='患者姓名') admission_date_str = Column(String(255), comment='入院日期字符串') admission_date = Column(Date, comment='入院日期') @@ -63,6 +64,8 @@ class ZxIeCost(Base): discharge_date = Column(Date, comment='出院日期') medical_expenses_str = Column(String(255), comment='费用总额字符串') medical_expenses = Column(DECIMAL(18, 2), comment='费用总额') + page_nums = Column(String(255), comment='页码') + page_count = Column(TINYINT(4), comment='页数') create_time = Column(DateTime, server_default=text("CURRENT_TIMESTAMP"), comment='创建时间') creator = Column(String(255), comment='创建人') update_time = Column(DateTime, server_default=text("CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP"), @@ -94,7 +97,7 @@ class ZxIeDischarge(Base): pk_ie_discharge = Column(INTEGER(11), primary_key=True, comment='出院记录信息抽取主键') pk_phhd = Column(INTEGER(11), nullable=False, unique=True, comment='报销案子主键') - content = Column(String(5000), comment='详细内容') + content = Column(Text, comment='详细内容') hospital = Column(String(255), comment='医院') pk_yljg = Column(INTEGER(11), comment='医院主键') department = Column(String(255), comment='科室') diff --git a/docker-compose.yml b/docker-compose.yml index afde6a5..b199d85 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,6 @@ x-env: &template - image: fcb_photo_review:1.14.10 + image: fcb_photo_review:1.14.11 restart: always x-review: diff --git a/photo_review/auto_photo_review.py b/photo_review/auto_photo_review.py index 14d6635..5773081 100644 --- a/photo_review/auto_photo_review.py +++ b/photo_review/auto_photo_review.py @@ -26,7 +26,7 @@ from ucloud import ufile from util import image_util, util, html_util from util.data_util import handle_date, handle_decimal, parse_department, handle_name, \ handle_insurance_type, handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, \ - parse_hospital, handle_doctor + parse_hospital, handle_doctor, handle_text # 合并信息抽取结果 @@ -41,6 +41,7 @@ def ie_temp_image(ie, ocr, image): cv2.imwrite(temp_file.name, image) ie_result = [] + ocr_pure_text = '' try: layout = util.get_ocr_layout(ocr, temp_file.name) if not layout: @@ -48,6 +49,8 @@ def ie_temp_image(ie, ocr, image): ie_result = [] else: ie_result = ie({"doc": temp_file.name, "layout": layout})[0] + for lay in layout: + ocr_pure_text += lay[1] except MemoryError as e: # 显存不足时应该抛出错误,让程序重启,同时释放显存 raise e @@ -58,7 +61,7 @@ def ie_temp_image(ie, ocr, image): os.remove(temp_file.name) except Exception as e: logging.info(f"删除临时文件 {temp_file.name} 时出错", exc_info=e) - return ie_result + return ie_result, ocr_pure_text # 关键信息提取 @@ -150,6 +153,7 @@ def get_better_image_from_qrcode(image, image_id, dpi=150): # 关键信息提取 def information_extraction(ie, phrecs, identity): result = {} + ocr_text = '' for phrec in phrecs: img_path = ufile.get_private_url(phrec.cfjaddress) if not img_path: @@ -168,7 +172,7 @@ def information_extraction(ie, phrecs, identity): if text: info_extract = ie(text)[0] else: - info_extract = ie_temp_image(ie, OCR, image) + info_extract = ie_temp_image(ie, OCR, image)[0] ie_result = {'result': info_extract, 'angle': '0'} now = util.get_default_datetime() @@ -201,10 +205,12 @@ def information_extraction(ie, phrecs, identity): if split_result['img'] is None or split_result['img'].size == 0: continue rotated_img = image_util.rotate(split_result['img'], int(angles[0])) - ie_results = [{'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[0]}] + ie_temp_result = ie_temp_image(ie, OCR, rotated_img) + ocr_text += ie_temp_result[1] + ie_results = [{'result': ie_temp_result[0], 'angle': angles[0]}] if not ie_results[0]['result'] or len(ie_results[0]['result']) < len(ie.kwargs.get('schema')): rotated_img = image_util.rotate(split_result['img'], int(angles[1])) - ie_results.append({'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[1]}) + ie_results.append({'result': ie_temp_image(ie, OCR, rotated_img)[0], 'angle': angles[1]}) now = util.get_default_datetime() best_angle = ['0', 0] for ie_result in ie_results: @@ -252,6 +258,7 @@ def information_extraction(ie, phrecs, identity): session.commit() session.close() + result['ocr_text'] = ocr_text return result @@ -414,6 +421,7 @@ def discharge_task(pk_phhd, discharge_record, identity): "doctor": handle_doctor(get_best_value_in_keys(discharge_record_ie_result, DOCTOR)), "admission_id": handle_id(get_best_value_in_keys(discharge_record_ie_result, ADMISSION_ID)), "age": handle_age(get_best_value_in_keys(discharge_record_ie_result, AGE)), + "content": handle_text(discharge_record_ie_result['ocr_text']), } discharge_data["admission_date"] = handle_date(discharge_data["admission_date_str"]) discharge_data["discharge_date"] = handle_date(discharge_data["discharge_date_str"]) @@ -479,7 +487,8 @@ def cost_task(pk_phhd, cost_list, identity): "name": handle_name(get_best_value_in_keys(cost_list_ie_result, PATIENT_NAME)), "admission_date_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, ADMISSION_DATE)), "discharge_date_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, DISCHARGE_DATE)), - "medical_expenses_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, MEDICAL_EXPENSES)) + "medical_expenses_str": handle_original_data(get_best_value_in_keys(cost_list_ie_result, MEDICAL_EXPENSES)), + "content": handle_text(cost_list_ie_result['ocr_text']), } cost_data["admission_date"] = handle_date(cost_data["admission_date_str"]) cost_data["discharge_date"] = handle_date(cost_data["discharge_date_str"]) diff --git a/util/data_util.py b/util/data_util.py index ccebea6..bacd8b0 100644 --- a/util/data_util.py +++ b/util/data_util.py @@ -190,3 +190,9 @@ def parse_hospital(string): split_hospitals = string_without_company.replace("医院", "医院 ") result += split_hospitals.strip().split(" ") return result + + +def handle_text(string): + if not string: + return "" + return string[:16383]