优化图片的存储，及时删除处理过程中产生的图片

2024-10-15 10:17:01 +08:00
parent 15ea3ff96f
commit 5af6256376
5 changed files with 141 additions and 94 deletions
--- a/photo_review/auto_photo_review.py
+++ b/photo_review/auto_photo_review.py
@@ -1,6 +1,9 @@
 import json
 import logging
+import os
+import shutil
 import time
+import uuid
 from collections import defaultdict
 from time import sleep

@@ -16,7 +19,7 @@ from db import MysqlSession
 from db.mysql import BdYljg, BdYlks, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec, ZxIeReview, ZxIeResult
 from log import HOSTNAME
 from photo_review import PHHD_BATCH_SIZE, SLEEP_MINUTES, HOSPITAL_ALIAS, HOSPITAL_FILTER, DEPARTMENT_ALIAS, \
-    DEPARTMENT_FILTER, DISCHARGE_KEY
+    DEPARTMENT_FILTER, DISCHARGE_KEY, modify_batch_id, BATCH_ID
 from services.paddle_services import IE_KEY
 from ucloud import ufile
 from util import image_util, common_util, html_util, model_util
@@ -46,8 +49,8 @@ def get_better_image_from_qrcode(img_path, image_id, dpi=150):
            # 将渲染结果转换为OpenCV兼容的格式
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, -1))
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
-            img_name, img_ext = image_util.parse_save_path(img_path)
-            better_img_path = image_util.get_save_path(f'{img_name}.better.{img_ext}')
+            img_name, img_ext = common_util.parse_save_path(img_path)
+            better_img_path = common_util.get_processed_img_path(f'{img_name}.better.{img_ext}')
            cv2.imwrite(better_img_path, img)
            return better_img_path, page.get_text()
        except Exception as ex:
@@ -100,20 +103,21 @@ def get_better_image_from_qrcode(img_path, image_id, dpi=150):


 # 关键信息提取
-def information_extraction(phrec, pk_phhd, identity):
+def information_extraction(phrec, pk_phhd):
    """
    处理单张图片
    :param phrec:图片信息
    :param pk_phhd:案子主键
-    :param identity:处理批次标识
    :return:记录类型，信息抽取结果
    """
-    img_path = image_util.get_img_path(phrec.cfjaddress)
-    if not img_path:
+    original_img_path = common_util.get_img_path(phrec.cfjaddress)
+    if not original_img_path:
        img_url = ufile.get_private_url(phrec.cfjaddress)
        if not img_url:
            return None, None, None
-        img_path = image_util.save_to_local(img_url)
+        original_img_path = common_util.save_to_local(img_url)
+    img_path = common_util.get_processed_img_path(phrec.cfjaddress)
+    shutil.copy2(original_img_path, img_path)

    # 尝试从二维码中获取高清图片
    better_img_path, text = get_better_image_from_qrcode(img_path, phrec.cfjaddress)
@@ -129,17 +133,26 @@ def information_extraction(phrec, pk_phhd, identity):
            )
        ocr_text = None  # 此处肯定不是出院记录，后续用不到
    else:
-        target_image = model_util.det_book(img_path)  # 识别文档区域并裁剪
-        dewarped_image = model_util.dewarp(target_image)  # 去扭曲
-        angles = model_util.clas_orientation(dewarped_image)
-        rotated_img = image_util.rotate(dewarped_image, int(angles[0]))
-        split_results = image_util.split(rotated_img)
+        if image_util.is_photo(img_path):
+            book_img_path = model_util.det_book(img_path)  # 识别文档区域并裁剪
+            dewarped_img_path = model_util.dewarp(book_img_path)  # 去扭曲
+        else:  # todo:也可能是图片，后续添加细分逻辑
+            dewarped_img_path = img_path
+        angles = model_util.clas_orientation(dewarped_img_path)
        ocr_result = []
-        for split_result in split_results:
-            if split_result['img'] is None:
-                continue
-            a4_img = image_util.expand_to_a4_size(split_result['img'])
-            ocr_result += model_util.ocr(a4_img)
+        rotated_img = None
+        for angle in angles:
+            tmp_ocr_result = []
+            tmp_rotated_img = image_util.rotate(dewarped_img_path, int(angle))
+            split_results = image_util.split(tmp_rotated_img)
+            for split_result in split_results:
+                if split_result['img'] is None:
+                    continue
+                a4_img = image_util.expand_to_a4_size(split_result['img'])
+                tmp_ocr_result += model_util.ocr(a4_img)
+            if len(tmp_ocr_result) > len(ocr_result):
+                ocr_result = tmp_ocr_result
+                rotated_img = tmp_rotated_img
        ocr_text = common_util.ocr_result_to_text(ocr_result)
        rec_type = model_util.clas_text(ocr_text) if ocr_text else None
        if rec_type == '基本医保结算单':
@@ -158,7 +171,7 @@ def information_extraction(phrec, pk_phhd, identity):

        now = common_util.get_default_datetime()
        session = MysqlSession()
-        session.add(ZxIeResult(pk_phhd=pk_phhd, pk_phrec=phrec.pk_phrec, id=identity,
+        session.add(ZxIeResult(pk_phhd=pk_phhd, pk_phrec=phrec.pk_phrec, id=BATCH_ID,
                               cfjaddress=phrec.cfjaddress, content=result_json, create_time=now,
                               creator=HOSTNAME, update_time=now, updater=HOSTNAME))
        session.commit()
@@ -175,23 +188,25 @@ def get_best_value_of_key(source, key):
    values = source.get(key)
    if values:
        for value in values:
-            text = value.get("text")
-            probability = value.get("probability")
-            if text and probability > best_probability:
-                result = text
-                best_probability = probability
+            for v in value:
+                text = v.get("text")
+                probability = v.get("probability")
+                if text and probability > best_probability:
+                    result = text
+                    best_probability = probability
    return result


 # 从keys中获取所有value组成list
 def get_values_of_key(source, key):
    result = []
-    value = source.get(key)
-    if value:
-        for v in value:
-            v = v.get("text")
-            if v:
-                result.append(v)
+    values = source.get(key)
+    if values:
+        for value in values:
+            for v in value:
+                v = v.get("text")
+                if v:
+                    result.append(v)
    # 去重
    return list(set(result))

@@ -424,10 +439,12 @@ def photo_review(pk_phhd, name):
    ).all()
    session.close()
    # 同一批图的标识
-    identity = int(time.time())
+    modify_batch_id(uuid.uuid4().hex)
    discharge_text = ''
    for phrec in phrecs:
-        rec_type, ie_result, ocr_text = information_extraction(phrec, pk_phhd, identity)
+        processed_img_dir = common_util.get_processed_img_path('')
+        os.makedirs(processed_img_dir, exist_ok=True)
+        rec_type, ie_result, ocr_text = information_extraction(phrec, pk_phhd)
        if rec_type == '基本医保结算单':
            rec_result = settlement_result
        elif rec_type == '出院记录':
@@ -439,10 +456,11 @@ def photo_review(pk_phhd, name):
            rec_result = None
        if rec_result is not None:
            for key, value in ie_result.items():
-                if key == '页码':
-                    rec_result[key].append(value)  # 页码要区分来源，所以多包一层
-                else:
-                    rec_result[key] += value
+                rec_result[key].append(value)
+
+        # 删除多余图片
+        if os.path.exists(processed_img_dir) and os.path.isdir(processed_img_dir):
+            shutil.rmtree(processed_img_dir)

    settlement_data = settlement_task(pk_phhd, settlement_result)
    discharge_data = discharge_task(pk_phhd, discharge_result)