From e4b58e30c038463024abc3ed9658ec944502d1e7 Mon Sep 17 00:00:00 2001 From: liuyebo <1515783401@qq.com> Date: Thu, 10 Oct 2024 11:24:16 +0800 Subject: [PATCH] =?UTF-8?q?=E8=A1=A5=E5=85=85=E7=BC=BA=E9=A1=B5=E5=88=A4?= =?UTF-8?q?=E6=96=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- photo_review/auto_photo_review.py | 40 +++++++++++++++++++++++----- services/paddle_services/__init__.py | 3 ++- services/paddle_services/ie_cost.py | 2 +- util/data_util.py | 19 +++++++++++++ 4 files changed, 56 insertions(+), 8 deletions(-) diff --git a/photo_review/auto_photo_review.py b/photo_review/auto_photo_review.py index 6abe6d2..9960999 100644 --- a/photo_review/auto_photo_review.py +++ b/photo_review/auto_photo_review.py @@ -21,7 +21,8 @@ from services.paddle_services import IE_KEY from ucloud import ufile from util import image_util, common_util, html_util, model_util from util.data_util import handle_date, handle_decimal, parse_department, handle_name, handle_insurance_type, \ - handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, parse_hospital + handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, parse_hospital, \ + parse_page_num, handle_tiny_int # 尝试从二维码中获取高清图片 @@ -124,6 +125,7 @@ def information_extraction(phrec, pk_phhd, identity): info_extract = model_util.ie_settlement( better_img_path, common_util.ocr_result_to_layout(model_util.ocr(better_img_path)) ) + ocr_text = None # 此处肯定不是出院记录,后续用不到 else: target_image = model_util.det_book(img_path) # 识别文档区域并裁剪 dewarped_image = model_util.dewarp(target_image) # 去扭曲 @@ -158,7 +160,7 @@ def information_extraction(phrec, pk_phhd, identity): creator=HOSTNAME, update_time=now, updater=HOSTNAME)) session.commit() session.close() - return rec_type, info_extract + return rec_type, info_extract, ocr_text # 从keys中获取准确率最高的value @@ -395,6 +397,9 @@ def cost_task(pk_phhd, cost_list_ie_result): cost_data["admission_date"] = handle_date(cost_data["admission_date_str"]) cost_data["discharge_date"] = handle_date(cost_data["discharge_date_str"]) cost_data["medical_expenses"] = handle_decimal(cost_data["medical_expenses_str"]) + page_nums, page_count = parse_page_num(cost_list_ie_result[IE_KEY['page']]) + cost_data['page_nums'] = handle_original_data(','.join(page_nums)) + cost_data['page_count'] = handle_tiny_int(page_count) save_or_update_ie(ZxIeCost, pk_phhd, cost_data) return cost_data @@ -416,19 +421,24 @@ def photo_review(pk_phhd, name): session.close() # 同一批图的标识 identity = int(time.time()) + discharge_text = '' for phrec in phrecs: - rec_type, ie_result = information_extraction(phrec, pk_phhd, identity) + rec_type, ie_result, ocr_text = information_extraction(phrec, pk_phhd, identity) if rec_type == '基本医保结算单': rec_result = settlement_result elif rec_type == '出院记录': rec_result = discharge_result + discharge_text += ocr_text elif rec_type == '费用清单': rec_result = cost_result else: rec_result = None if rec_result is not None: for key, value in ie_result.items(): - rec_result[key] += value + if key == '页码': + rec_result[key].append(value) # 页码要区分来源,所以多包一层 + else: + rec_result[key] += value settlement_data = settlement_task(pk_phhd, settlement_result) discharge_data = discharge_task(pk_phhd, discharge_result) @@ -447,8 +457,26 @@ def photo_review(pk_phhd, name): review_result['has_discharge'] = bool(discharge_result) review_result['has_cost'] = bool(cost_result) # 三项资料缺页判断 - # TODO:缺页需要对页码进行抽取,暂未训练相关模型 - review_result['full_page'] = True + page_description = [] + # todo:关键词需根据实际情况调整 + discharge_key = ['入院诊断', '入院日期', '出院日期', '出院诊断', '入院情况', '诊疗经过', '出院情况', '出院医嘱'] + if not all(key in discharge_text for key in discharge_key): + page_description.append('《出院记录》缺页') + + cost_missing_page = {} + if cost_data['page_nums']: + page_nums = cost_data['page_nums'].split(',') + required_set = set(range(1, cost_data['page_count'] + 1)) + page_set = set(page_nums) + cost_missing_page = required_set - page_set + if cost_missing_page: + page_description.append(f"《住院费用清单》,缺第{','.join(cost_missing_page)}页") + + if page_description: + review_result['full_page'] = False + review_result['page_description'] = ';'.join(page_description) + else: + review_result['full_page'] = True if (review_result['has_settlement'] and review_result['has_discharge'] and review_result['has_cost'] and review_result['full_page']): diff --git a/services/paddle_services/__init__.py b/services/paddle_services/__init__.py index 4818be2..d2e5148 100644 --- a/services/paddle_services/__init__.py +++ b/services/paddle_services/__init__.py @@ -16,5 +16,6 @@ IE_KEY = { 'admission_id': '住院号', 'settlement_id': '医保结算单号码', 'age': '年龄', - 'upper_case_medical_expenses': '大写总额' + 'upper_case_medical_expenses': '大写总额', + 'page': '页码', } diff --git a/services/paddle_services/ie_cost.py b/services/paddle_services/ie_cost.py index 9da5764..fce2ef8 100644 --- a/services/paddle_services/ie_cost.py +++ b/services/paddle_services/ie_cost.py @@ -10,7 +10,7 @@ from utils import process_request app = Flask(__name__) COST_LIST_SCHEMA = tuple(IE_KEY[key] for key in [ - 'name', 'admission_date', 'discharge_date', 'medical_expenses' + 'name', 'admission_date', 'discharge_date', 'medical_expenses', 'page' ]) COST = Taskflow('information_extraction', schema=COST_LIST_SCHEMA, model='uie-x-base', task_path='model/cost_list_model', layout_analysis=False, precision='fp16') diff --git a/util/data_util.py b/util/data_util.py index b1e7608..ce2fad0 100644 --- a/util/data_util.py +++ b/util/data_util.py @@ -178,3 +178,22 @@ def parse_hospital(string): split_hospitals = string_without_company.replace('医院', '医院 ') result += split_hospitals.strip().split(' ') return result + + +def parse_page_num(page_list): + if not page_list: + return None, None + pages = [] + total = [] + for page in page_list: + join = ''.join(page) + numbers = re.findall(r'\d+', join) + pages.append(min(numbers)) + total.append(max(numbers)) + return pages, max(total) + + +def handle_tiny_int(num): + if not num: + return None + return num if num <= 127 else 127