From b9c72002349a23e0f006d3d1cfdc494befeb331e Mon Sep 17 00:00:00 2001
From: liuyebo <1515783401@qq.com>
Date: Thu, 20 Jun 2024 14:45:15 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E6=AF=94=E4=BE=8B=E5=A4=B8?=
 =?UTF-8?q?=E5=BC=A0=E5=9B=BE=E7=89=87=E7=9A=84=E5=A4=84=E7=90=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 main.py                      |   9 +--
 photo_review/photo_review.py | 109 ++++++++++++++++++++++++++++-------
 2 files changed, 91 insertions(+), 27 deletions(-)

diff --git a/main.py b/main.py
index d46c617..00a9d68 100644
--- a/main.py
+++ b/main.py
@@ -17,16 +17,13 @@ if __name__ == '__main__':
     log = logging.getLogger()
 
     # 崩溃后的重试次数
-    retry_time = RETRY_TIME + 1
-    for _ in range(retry_time):
+    for _ in range(RETRY_TIME + 1):
         try:
-            log.info("照片审核开始")
+            log.info("【照片审核关键信息抽取】开始")
             main()
         except Exception as e:
             log.error(traceback.format_exc())
             if SEND_ERROR_EMAIL:
-                send_an_error_email(program_name='照片审核关键信息抽取脚本', error_name=repr(e),
-                                    error_detail=traceback.format_exc())
+                send_an_error_email(program_name='照片审核关键信息抽取脚本', error_name=repr(e), error_detail=traceback.format_exc())
             # 释放显存
             paddle.device.cuda.empty_cache()
-            continue
diff --git a/photo_review/photo_review.py b/photo_review/photo_review.py
index aba63ca..7738906 100644
--- a/photo_review/photo_review.py
+++ b/photo_review/photo_review.py
@@ -1,12 +1,19 @@
 import json
 import logging
+import math
 import os
 import sys
-from time import sleep
+import tempfile
+from io import BytesIO
 
 import paddle
-from sqlalchemy import update
+import requests
 
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from PIL import Image
+from time import sleep
+from sqlalchemy import update
 from config.keys import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, PERSONAL_CASH_PAYMENT, \
     PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, DEPARTMENT, DOCTOR
 from config.mysql import MysqlSession
@@ -25,7 +32,55 @@ from photo_review.util.data_util import handle_date, handle_decimal, handle_depa
 from photo_review.util.util import get_default_datetime
 from ucloud import ucloud
 
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+# Fetch a remote image
+def open_image_from_url(url, timeout=30):
+    """Download the image at ``url`` (``timeout`` in seconds) and open it with PIL."""
+    # Without a timeout a stalled download would block the caller indefinitely.
+    response = requests.get(url, timeout=timeout)
+    # Raise on HTTP errors instead of handing an error page to PIL.
+    response.raise_for_status()
+    # Image.open needs a file-like object, so wrap the downloaded bytes.
+    return Image.open(BytesIO(response.content))
+
+
+# Split oversized images
+def split_image(img_path, max_ratio=2.82, best_ration=1.41, overlap=0.05):
+    """Split an image with an extreme aspect ratio into overlapping tiles.
+
+    Returns a list of {"img", "x_offset", "y_offset"} dicts, where the offsets
+    locate each tile in the source image. An image whose long/short side ratio
+    is at most ``max_ratio`` comes back as a single entry. Each tile spans
+    ``best_ration`` short sides and starts ``best_ration - overlap`` short
+    sides after the previous one. Every image is converted to RGB so that it
+    can be saved as JPEG, which cannot store alpha or palette modes.
+    """
+    # Convert once up front so the unsplit branch is JPEG-safe as well.
+    img = open_image_from_url(img_path).convert("RGB")
+    width, height = img.size
+    short_side, long_side = min(width, height), max(width, height)
+    if long_side / short_side <= max_ratio:
+        return [{"img": img, "x_offset": 0, "y_offset": 0}]
+
+    split_result = []
+    step = short_side * (best_ration - overlap)
+    tile_length = short_side * best_ration
+    for i in range(math.ceil(long_side / step)):
+        offset = round(step * i)
+        end = round(offset + tile_length)
+        if width < height:  # height is the long side: stack tiles vertically
+            box, x_offset, y_offset = (0, offset, width, end), 0, offset
+        else:  # width is the long side: lay tiles out horizontally
+            box, x_offset, y_offset = (offset, 0, end, height), offset, 0
+        split_result.append({"img": img.crop(box), "x_offset": x_offset, "y_offset": y_offset})
+    return split_result
+
+
+# Merge extraction results: extend each list in result1 with result2's list for the same key.
+def merge_result(result1, result2):
+    for field, extra in result2.items():
+        result1[field] = result1.get(field, []) + extra
+    return result1
 
 
 # 关键信息提取
@@ -36,16 +91,27 @@ def information_extraction(ie, phrecs):
     for phrec in phrecs:
         pic_path = ucloud.get_private_url(phrec.cfjaddress)
         if pic_path:
-            docs.append({"doc": pic_path})
-            doc_phrecs.append(phrec)
+            split_result = split_image(pic_path)
+            for img in split_result:
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
+                    img["img"].save(temp_file.name)
+                    docs.append({"doc": temp_file.name})
+                    doc_phrecs.append(phrec)
     if not docs:
         return result
 
+    ie_results = []
     try:
         ie_results = ie(docs)
     except Exception as e:
         logging.error(e)
         return result
+    finally:
+        for temp_file in docs:
+            try:
+                os.remove(temp_file["doc"])
+            except Exception as e:
+                logging.info(f"删除临时文件 {temp_file['doc']} 时出错: {e}")
 
     now = get_default_datetime()
     for i in range(len(ie_results)):
@@ -61,7 +127,7 @@ def information_extraction(ie, phrecs):
         session.commit()
         session.close()
 
-        result.update(ie_result)
+        result = merge_result(result, ie_result)
 
     return result
 
@@ -71,15 +137,16 @@ def get_best_value_in_keys(source, keys):
     # 最终结果
     result = None
     # 最大可能性
-    most_probability = 0
+    best_probability = 0
     for key in keys:
         values = source.get(key)
         if values:
             for value in values:
                 text = value.get("text")
                 probability = value.get("probability")
-                if text and probability > most_probability:
+                if text and probability > best_probability:
                     result = text
+                    best_probability = probability
     return result
 
 
@@ -89,10 +156,12 @@ def get_values_of_keys(source, keys):
     for key in keys:
         value = source.get(key)
         if value:
-            value = value[0].get("text")
-            if value:
-                result.append(value)
-    return result
+            for v in value:
+                v = v.get("text")
+                if v:
+                    result.append(v)
+    # 去重
+    return list(set(result))
 
 
 def save_or_update_ie(table, pk_phhd, data):
@@ -214,10 +283,8 @@ def main():
     # 持续检测新案子
     while 1:
         session = MysqlSession()
-        phhds = session.query(ZxPhhd.pk_phhd) \
-            .filter(ZxPhhd.exsuccess_flag == '1') \
-            .limit(PHHD_BATCH_SIZE) \
-            .all()
+        # 查询需要识别的案子
+        phhds = session.query(ZxPhhd.pk_phhd).filter(ZxPhhd.exsuccess_flag == '1').limit(PHHD_BATCH_SIZE).all()
         session.close()
         if phhds:
             for phhd in phhds:
@@ -226,14 +293,14 @@ def main():
 
                 # 识别完成更新标识
                 session = MysqlSession()
-                stmt = (update(ZxPhhd).where(ZxPhhd.pk_phhd == pk_phhd).values(exsuccess_flag=8))
-                session.execute(stmt)
+                update_flag = (update(ZxPhhd).where(ZxPhhd.pk_phhd == pk_phhd).values(exsuccess_flag=8))
+                session.execute(update_flag)
                 session.commit()
                 session.close()
+                # 完成一个案子释放显存
                 paddle.device.cuda.empty_cache()
         else:
             # 没有查询到新案子，等待一段时间后再查
-            sleep_minutes = SLEEP_MINUTES
             log = logging.getLogger()
-            log.info(f"暂未查询到新案子，等待{sleep_minutes}分钟...")
-            sleep(sleep_minutes * 60)
+            log.info(f"暂未查询到新案子，等待{SLEEP_MINUTES}分钟...")
+            sleep(SLEEP_MINUTES * 60)