From de631bef2e3128f37940c58b3b2ce97baf3489a9 Mon Sep 17 00:00:00 2001 From: liuyebo <1515783401@qq.com> Date: Thu, 5 Sep 2024 13:29:17 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E4=BA=8C=E7=BB=B4=E7=A0=81?= =?UTF-8?q?=E8=AF=86=E5=88=AB=E6=9B=BF=E6=8D=A2=E9=AB=98=E6=B8=85=E5=9B=BE?= =?UTF-8?q?=E7=89=87=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 4 +- docker-compose.yml | 153 +++++++++------------------- photo_review/__init__.py | 90 ++++++++--------- photo_review/auto_photo_review.py | 160 +++++++++++++++++++++--------- requirements.txt | 3 +- util/html_util.py | 43 ++++++++ 6 files changed, 255 insertions(+), 198 deletions(-) create mode 100644 util/html_util.py diff --git a/README.md b/README.md index e3f7cd2..6cdf102 100644 --- a/README.md +++ b/README.md @@ -117,4 +117,6 @@ 4. 新增自动识别错误分析功能 20. 版本号:1.13.0 1. 新增文档检测功能 - 2. 新增扭曲矫正功能 \ No newline at end of file + 2. 新增扭曲矫正功能 +21. 版本号:1.14.0 + 1. 新增二维码识别替换高清图片功能 \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 543cf5a..68a3924 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,8 +1,35 @@ -x-env: +template: &template - image: fcb_photo_review:1.13.10 + image: fcb_photo_review:1.14.1 restart: always +review_template: + &review_template + <<: *template + volumes: + - ./log:/app/log + - ./model:/app/model + deploy: + resources: + reservations: + devices: + - device_ids: [ '0', '1' ] + capabilities: [ 'gpu' ] + driver: 'nvidia' + +mask_template: + &mask_template + <<: *template + volumes: + - ./log:/app/log + deploy: + resources: + reservations: + devices: + - device_ids: [ '1' ] + capabilities: [ 'gpu' ] + driver: 'nvidia' + services: det_api: <<: *template @@ -13,153 +40,67 @@ services: volumes: - ./log:/app/log - ./model:/app/model - command: [ "det_api.py" ] +# command: [ 'det_api.py' ] deploy: resources: reservations: devices: - - device_ids: [ "0" ] - capabilities: [ "gpu" ] - driver: "nvidia" + - device_ids: [ '0' ] + capabilities: [ 'gpu' ] + driver: 'nvidia' photo_review_1: - <<: *template + <<: *review_template container_name: photo_review_1 hostname: photo_review_1 - volumes: - - ./log:/app/log - - ./model:/app/model depends_on: - det_api - command: [ "photo_review.py", "--clean", "True" ] - deploy: - resources: - reservations: - devices: - - device_ids: [ "0", "1" ] - capabilities: [ "gpu" ] - driver: "nvidia" + command: [ 'photo_review.py', '--clean', 'True' ] photo_review_2: - <<: *template + <<: *review_template container_name: photo_review_2 hostname: photo_review_2 - volumes: - - ./log:/app/log - - ./model:/app/model depends_on: - photo_review_1 - command: [ "photo_review.py" ] - deploy: - resources: - reservations: - devices: - - device_ids: [ "0", "1" ] - capabilities: [ "gpu" ] - driver: "nvidia" + command: [ 'photo_review.py' ] photo_review_3: - <<: *template + <<: *review_template container_name: photo_review_3 hostname: photo_review_3 - volumes: - - ./log:/app/log - - ./model:/app/model depends_on: - photo_review_2 - command: [ "photo_review.py" ] - deploy: - resources: - reservations: - devices: - - device_ids: [ "0", "1" ] - capabilities: [ "gpu" ] - driver: "nvidia" + command: [ 'photo_review.py' ] photo_review_4: - <<: *template + <<: *review_template container_name: photo_review_4 hostname: photo_review_4 - volumes: - - ./log:/app/log - - ./model:/app/model depends_on: - photo_review_3 - command: [ "photo_review.py" ] - deploy: - resources: - reservations: - devices: - - device_ids: [ "0", "1" ] - capabilities: [ "gpu" ] - driver: "nvidia" + command: [ 'photo_review.py' ] photo_review_5: - <<: *template + <<: *review_template container_name: photo_review_5 hostname: photo_review_5 - volumes: - - ./log:/app/log - - ./model:/app/model depends_on: - photo_review_4 - command: [ "photo_review.py" ] - deploy: - resources: - reservations: - devices: - - device_ids: [ "0", "1" ] - capabilities: [ "gpu" ] - driver: "nvidia" + command: [ 'photo_review.py' ] photo_mask_1: - <<: *template + <<: *mask_template container_name: photo_mask_1 hostname: photo_mask_1 - volumes: - - ./log:/app/log depends_on: - photo_review_5 - command: [ "photo_mask.py", "--clean", "True" ] - deploy: - resources: - reservations: - devices: - - device_ids: [ "1" ] - capabilities: [ "gpu" ] - driver: "nvidia" + command: [ 'photo_mask.py', '--clean', 'True' ] photo_mask_2: - <<: *template + <<: *mask_template container_name: photo_mask_2 hostname: photo_mask_2 - volumes: - - ./log:/app/log depends_on: - photo_mask_1 - command: [ "photo_mask.py" ] - deploy: - resources: - reservations: - devices: - - device_ids: [ "1" ] - capabilities: [ "gpu" ] - driver: "nvidia" - -# photo_review_6: -# <<: *template -# container_name: photo_review_6 -# hostname: photo_review_6 -# volumes: -# - ./log:/app/log -# - ./model:/app/model -# depends_on: -# - photo_mask_2 -# command: [ "photo_review.py" ] -# deploy: -# resources: -# reservations: -# devices: -# - device_ids: [ "0", "1" ] -# capabilities: [ "gpu" ] -# driver: "nvidia" \ No newline at end of file + command: [ 'photo_mask.py' ] \ No newline at end of file diff --git a/photo_review/__init__.py b/photo_review/__init__.py index 4077044..23a4d76 100644 --- a/photo_review/__init__.py +++ b/photo_review/__init__.py @@ -2,9 +2,9 @@ import jieba from paddlenlp import Taskflow from paddleocr import PaddleOCR -""" +''' 项目配置 -""" +''' # 每次从数据库获取的案子数量 PHHD_BATCH_SIZE = 10 # 没有查询到案子的等待时间(分钟) @@ -18,35 +18,35 @@ LAYOUT_ANALYSIS = False 信息抽取关键词配置 """ # 患者姓名 -PATIENT_NAME = ["患者姓名"] +PATIENT_NAME = ['患者姓名'] # 入院日期 -ADMISSION_DATE = ["入院日期"] +ADMISSION_DATE = ['入院日期'] # 出院日期 -DISCHARGE_DATE = ["出院日期"] +DISCHARGE_DATE = ['出院日期'] # 发生医疗费 -MEDICAL_EXPENSES = ["费用总额"] +MEDICAL_EXPENSES = ['费用总额'] # 个人现金支付 -PERSONAL_CASH_PAYMENT = ["个人现金支付"] +PERSONAL_CASH_PAYMENT = ['个人现金支付'] # 个人账户支付 -PERSONAL_ACCOUNT_PAYMENT = ["个人账户支付"] +PERSONAL_ACCOUNT_PAYMENT = ['个人账户支付'] # 个人自费金额 -PERSONAL_FUNDED_AMOUNT = ["自费金额"] +PERSONAL_FUNDED_AMOUNT = ['自费金额', '个人自费'] # 医保类别 -MEDICAL_INSURANCE_TYPE = ["医保类型"] +MEDICAL_INSURANCE_TYPE = ['医保类型'] # 就诊医院 -HOSPITAL = ["医院"] +HOSPITAL = ['医院'] # 就诊科室 -DEPARTMENT = ["科室"] +DEPARTMENT = ['科室'] # 主治医生 -DOCTOR = ["主治医生"] +DOCTOR = ['主治医生'] # 住院号 -ADMISSION_ID = ["住院号"] +ADMISSION_ID = ['住院号'] # 医保结算单号码 -SETTLEMENT_ID = ["医保结算单号码"] +SETTLEMENT_ID = ['医保结算单号码'] # 年龄 -AGE = ["年龄"] +AGE = ['年龄'] # 大写总额 -UPPERCASE_MEDICAL_EXPENSES = ["大写总额"] +UPPERCASE_MEDICAL_EXPENSES = ['大写总额'] SETTLEMENT_LIST_SCHEMA = \ (PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES + PERSONAL_CASH_PAYMENT @@ -58,47 +58,47 @@ DISCHARGE_RECORD_SCHEMA = \ COST_LIST_SCHEMA = PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES -""" +''' 别名配置 -""" +''' # 使用别名中的value替换key。考虑到效率问题,只会替换第一个匹配到的key。 HOSPITAL_ALIAS = { - "沐阳": ["沭阳"], - "连水": ["涟水"], - "唯宁": ["睢宁"], - "九〇四": ["904"], - "漂水": ["溧水"], + '沐阳': ['沭阳'], + '连水': ['涟水'], + '唯宁': ['睢宁'], # 雕宁 + '九〇四': ['904'], + '漂水': ['溧水'], } DEPARTMENT_ALIAS = { - "耳鼻喉": ["耳鼻咽喉"], - "急症": ["急诊"], + '耳鼻喉': ['耳鼻咽喉'], + '急症': ['急诊'], } -""" +''' 搜索过滤配置 -""" +''' # 默认会过滤单字 -HOSPITAL_FILTER = ["医院", "人民", "第一", "第二", "第三", "大学", "附属"] +HOSPITAL_FILTER = ['医院', '人民', '第一', '第二', '第三', '大学', '附属'] -DEPARTMENT_FILTER = ["医", "伤", "西", "新"] +DEPARTMENT_FILTER = ['医', '伤', '西', '新'] -""" +''' 分词配置 -""" -jieba.suggest_freq(("肿瘤", "医院"), True) -jieba.suggest_freq(("骨", "伤"), True) -jieba.suggest_freq(("感染", "性"), True) -jieba.suggest_freq(("胆", "道"), True) -jieba.suggest_freq(("脾", "胃"), True) +''' +jieba.suggest_freq(('肿瘤', '医院'), True) +jieba.suggest_freq(('骨', '伤'), True) +jieba.suggest_freq(('感染', '性'), True) +jieba.suggest_freq(('胆', '道'), True) +jieba.suggest_freq(('脾', '胃'), True) -""" +''' 模型配置 -""" -SETTLEMENT_IE = Taskflow("information_extraction", schema=SETTLEMENT_LIST_SCHEMA, model="uie-x-base", - task_path="model/settlement_list_model", layout_analysis=LAYOUT_ANALYSIS, precision='fp16') -DISCHARGE_IE = Taskflow("information_extraction", schema=DISCHARGE_RECORD_SCHEMA, model="uie-x-base", - task_path="model/discharge_record_model", layout_analysis=LAYOUT_ANALYSIS, precision='fp16') -COST_IE = Taskflow("information_extraction", schema=COST_LIST_SCHEMA, model="uie-x-base", device_id=1, - task_path="model/cost_list_model", layout_analysis=LAYOUT_ANALYSIS, precision='fp16') +''' +SETTLEMENT_IE = Taskflow('information_extraction', schema=SETTLEMENT_LIST_SCHEMA, model='uie-x-base', + task_path='model/settlement_list_model', layout_analysis=LAYOUT_ANALYSIS, precision='fp16') +DISCHARGE_IE = Taskflow('information_extraction', schema=DISCHARGE_RECORD_SCHEMA, model='uie-x-base', + task_path='model/discharge_record_model', layout_analysis=LAYOUT_ANALYSIS, precision='fp16') +COST_IE = Taskflow('information_extraction', schema=COST_LIST_SCHEMA, model='uie-x-base', device_id=1, + task_path='model/cost_list_model', layout_analysis=LAYOUT_ANALYSIS, precision='fp16') OCR = PaddleOCR(use_angle_cls=False, show_log=False, gpu_id=1, det_db_box_thresh=0.3) diff --git a/photo_review/auto_photo_review.py b/photo_review/auto_photo_review.py index b39c0dc..feedcb2 100644 --- a/photo_review/auto_photo_review.py +++ b/photo_review/auto_photo_review.py @@ -7,22 +7,23 @@ from collections import defaultdict from time import sleep import cv2 +import fitz import jieba +import numpy as np import requests +import zxingcpp from rapidfuzz import process, fuzz from sqlalchemy import update from db import MysqlSession from db.mysql import BdYljg, BdYlks, ZxIeResult, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec -from doc_dewarp import dewarp from log import HOSTNAME -from paddle_detection import detector from photo_review import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, PERSONAL_CASH_PAYMENT, \ PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, DEPARTMENT, DOCTOR, \ ADMISSION_ID, SETTLEMENT_ID, AGE, OCR, SETTLEMENT_IE, DISCHARGE_IE, COST_IE, PHHD_BATCH_SIZE, SLEEP_MINUTES, \ UPPERCASE_MEDICAL_EXPENSES, HOSPITAL_ALIAS, HOSPITAL_FILTER, DEPARTMENT_ALIAS, DEPARTMENT_FILTER from ucloud import ufile -from util import image_util, util +from util import image_util, util, html_util from util.data_util import handle_date, handle_decimal, parse_department, handle_name, \ handle_insurance_type, handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, \ parse_hospital @@ -73,6 +74,47 @@ def request_ie_result(task_enum, phrecs): raise Exception(f"请求信息抽取结果失败,状态码:{response.status_code}") +# 尝试从二维码中获取高清图片 +def get_better_image_from_qrcode(image, dpi=150): + js_base_url = 'http://einvoice.jsczt.cn' + results = zxingcpp.read_barcodes(image) + for result in results: + pdf = None + pdf_path = None + try: + url = result.text + if url.startswith(js_base_url): + id_base = html_util.get_jsczt_id_base(url) + pdf_url = f'{js_base_url}/download?idBase={id_base}' + pdf_path = html_util.download_pdf(pdf_url) + # 打开PDF文件 + pdf = fitz.open(pdf_path) + # 选择第一页 + page = pdf[0] + # 定义缩放系数(DPI) + default_dpi = 72 + zoom = dpi / default_dpi + # 设置矩阵变换参数 + mat = fitz.Matrix(zoom, zoom) + # 渲染页面 + pix = page.get_pixmap(matrix=mat) + # 将渲染结果转换为OpenCV兼容的格式 + img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, -1)) + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + return img, page.get_text() + else: + logging.getLogger('qr').info(f'未知二维码内容:{url}') + except Exception as e: + logging.getLogger('error').error('从二维码中获取高清图片时出错', exc_info=e) + continue + finally: + if pdf: + pdf.close() + if pdf_path: + util.delete_temp_file(pdf_path) + return None, None + + # 关键信息提取 def information_extraction(ie, phrecs, identity): result = {} @@ -83,60 +125,88 @@ def information_extraction(ie, phrecs, identity): image = image_util.read(img_path) - target_images = [] - target_images += detector.request_book_areas(image) # 识别文档区域并裁剪 - if not target_images: - target_images.append(image) # 识别失败 - angle_count = defaultdict(int, {"0": 0}) # 分割后图片的最优角度统计 - for target_image in target_images: - dewarped_image = dewarp.dewarp_image(target_image) # 去扭曲 - angles = image_util.parse_rotation_angles(dewarped_image) - zx_ie_results = [] - split_results = image_util.split(dewarped_image) - for split_result in split_results: - if split_result["img"] is None or split_result["img"].size == 0: - continue - rotated_img = image_util.rotate(split_result["img"], int(angles[0])) - ie_results = [{"result": ie_temp_image(ie, OCR, rotated_img), "angle": angles[0]}] - if not ie_results[0]["result"] or len(ie_results[0]["result"]) < len(ie.kwargs.get("schema")): - rotated_img = image_util.rotate(split_result["img"], int(angles[1])) - ie_results.append({"result": ie_temp_image(ie, OCR, rotated_img), "angle": angles[1]}) + # 尝试从二维码中获取高清图片 + better_image, text = get_better_image_from_qrcode(image) + zx_ie_results = [] + if better_image is not None: + img_angle = '0' + image = better_image + if text: + info_extract = ie(text)[0] + else: + info_extract = ie_temp_image(ie, OCR, image) + ie_result = {'result': info_extract, 'angle': '0'} - now = util.get_default_datetime() - best_angle = ["0", 0] - for ie_result in ie_results: - if not ie_result["result"]: + now = util.get_default_datetime() + if not ie_result['result']: + continue + + result_json = json.dumps(ie_result['result'], ensure_ascii=False) + if len(result_json) > 5000: + result_json = result_json[:5000] + zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity, + cfjaddress=phrec.cfjaddress, content=result_json, + rotation_angle=int(ie_result['angle']), + x_offset=0, y_offset=0, create_time=now, + creator=HOSTNAME, update_time=now, updater=HOSTNAME)) + + result = merge_result(result, ie_result['result']) + else: + target_images = [] + # target_images += detector.request_book_areas(image) # 识别文档区域并裁剪 + if not target_images: + target_images.append(image) # 识别失败 + angle_count = defaultdict(int, {'0': 0}) # 分割后图片的最优角度统计 + for target_image in target_images: + # dewarped_image = dewarp.dewarp_image(target_image) # 去扭曲 + dewarped_image = target_image + angles = image_util.parse_rotation_angles(dewarped_image) + + split_results = image_util.split(dewarped_image) + for split_result in split_results: + if split_result['img'] is None or split_result['img'].size == 0: continue + rotated_img = image_util.rotate(split_result['img'], int(angles[0])) + ie_results = [{'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[0]}] + if not ie_results[0]['result'] or len(ie_results[0]['result']) < len(ie.kwargs.get('schema')): + rotated_img = image_util.rotate(split_result['img'], int(angles[1])) + ie_results.append({'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[1]}) + now = util.get_default_datetime() + best_angle = ['0', 0] + for ie_result in ie_results: + if not ie_result['result']: + continue - result_json = json.dumps(ie_result["result"], ensure_ascii=False) - if len(result_json) > 5000: - result_json = result_json[:5000] - zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity, - cfjaddress=phrec.cfjaddress, content=result_json, - rotation_angle=int(ie_result["angle"]), - x_offset=split_result["x_offset"], - y_offset=split_result["y_offset"], create_time=now, - creator=HOSTNAME, update_time=now, updater=HOSTNAME)) + result_json = json.dumps(ie_result['result'], ensure_ascii=False) + if len(result_json) > 5000: + result_json = result_json[:5000] + zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity, + cfjaddress=phrec.cfjaddress, content=result_json, + rotation_angle=int(ie_result['angle']), + x_offset=split_result['x_offset'], + y_offset=split_result['y_offset'], create_time=now, + creator=HOSTNAME, update_time=now, updater=HOSTNAME)) - result = merge_result(result, ie_result["result"]) + result = merge_result(result, ie_result['result']) - if len(ie_result["result"]) > best_angle[1]: - best_angle = [ie_result["angle"], len(ie_result["result"])] + if len(ie_result['result']) > best_angle[1]: + best_angle = [ie_result['angle'], len(ie_result['result'])] - angle_count[best_angle[0]] += 1 + angle_count[best_angle[0]] += 1 + img_angle = max(angle_count, key=angle_count.get) - img_angle = max(angle_count, key=angle_count.get) - if img_angle != "0": + if img_angle != '0' or better_image is not None: image = image_util.rotate(image, int(img_angle)) - with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file: + with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file: cv2.imwrite(temp_file.name, image) try: ufile.upload_file(phrec.cfjaddress, temp_file.name) - # 修正旋转角度 - for zx_ie_result in zx_ie_results: - zx_ie_result.rotation_angle -= int(img_angle) + if img_angle != '0': + # 修正旋转角度 + for zx_ie_result in zx_ie_results: + zx_ie_result.rotation_angle -= int(img_angle) except Exception as e: - logging.error(f"上传图片({phrec.cfjaddress})失败", exc_info=e) + logging.error(f'上传图片({phrec.cfjaddress})失败', exc_info=e) finally: util.delete_temp_file(temp_file.name) diff --git a/requirements.txt b/requirements.txt index 4d49848..6697c5e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,5 @@ requests==2.32.3 sqlacodegen==2.3.0.post1 sqlalchemy==1.4.52 tenacity==8.5.0 -ufile==3.2.9 \ No newline at end of file +ufile==3.2.9 +zxing-cpp==2.2.0 \ No newline at end of file diff --git a/util/html_util.py b/util/html_util.py new file mode 100644 index 0000000..caeeda2 --- /dev/null +++ b/util/html_util.py @@ -0,0 +1,43 @@ +import logging +import tempfile + +import requests +from bs4 import BeautifulSoup +from tenacity import retry, stop_after_attempt, wait_random + + +@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True, + after=lambda x: logging.warning('获取江苏省财政票据idBase失败!')) +def get_jsczt_id_base(url): + response = requests.get(url) + if response.status_code != 200: + raise Exception(f'请求江苏省财政票据失败!状态码: {response.status_code}') + soup = BeautifulSoup(response.text, 'html.parser') + hidden_input = soup.find('input', {'name': "idBase"}) + if hidden_input: + # 获取隐藏字段的值 + value = hidden_input.get('value') + return value + return None + + +@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True, + after=lambda x: logging.warning('下载pdf失败!')) +def download_pdf(url, local_filename=None): + # 如果没有提供文件名,则使用URL中的文件名 + if local_filename is None: + with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf: + local_filename = temp_pdf.name + # 发送HTTP GET请求 + response = requests.get(url, stream=True) + # 检查请求是否成功 + if response.status_code != 200: + raise Exception(f'下载pdf失败!状态码: {response.status_code}') + else: + # 打开一个文件用于写入二进制数据 + with open(local_filename, 'wb') as file: + # 迭代写入文件 + for chunk in response.iter_content(chunk_size=8192): + if chunk: # filter out keep-alive new chunks + file.write(chunk) + return local_filename