Improve handling of images with extreme aspect ratios

2024-06-20 14:45:15 +08:00
parent a142a62774
commit b9c7200234
2 changed files with 91 additions and 27 deletions

View File

@@ -17,16 +17,13 @@ if __name__ == '__main__':
    log = logging.getLogger()
    # number of retries after a crash
    retry_time = RETRY_TIME + 1
    for _ in range(retry_time):
    for _ in range(RETRY_TIME + 1):
        try:
            log.info("Photo review started")
            log.info("Photo review [key information extraction] started")
            main()
        except Exception as e:
            log.error(traceback.format_exc())
            if SEND_ERROR_EMAIL:
                send_an_error_email(program_name='photo review key information extraction script', error_name=repr(e),
                                    error_detail=traceback.format_exc())
                send_an_error_email(program_name='photo review key information extraction script', error_name=repr(e), error_detail=traceback.format_exc())
            # release GPU memory
            paddle.device.cuda.empty_cache()
            continue

View File

@@ -1,12 +1,19 @@
import json
import logging
import math
import os
import sys
from time import sleep
import tempfile
from io import BytesIO
import paddle
from sqlalchemy import update
import requests
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from PIL import Image
from time import sleep
from sqlalchemy import update
from config.keys import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, PERSONAL_CASH_PAYMENT, \
    PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, DEPARTMENT, DOCTOR
from config.mysql import MysqlSession
@@ -25,7 +32,55 @@ from photo_review.util.data_util import handle_date, handle_decimal, handle_depa
from photo_review.util.util import get_default_datetime
from ucloud import ucloud
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# fetch an image from a URL
def open_image_from_url(url):
    # send an HTTP request to download the image data
    response = requests.get(url)
    # wrap the response content in a BytesIO object so PIL can read it
    image_stream = BytesIO(response.content)
    # open the image with PIL's Image.open
    image = Image.open(image_stream)
    return image
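For illustration, a minimal usage sketch of open_image_from_url (the URL is a placeholder, not from the commit; in the script the real URL comes from ucloud.get_private_url):

img = open_image_from_url("https://example.com/sample-receipt.jpg")
print(img.size, img.mode)  # e.g. (1080, 4500) and "RGB" for a tall receipt photo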
# split an oversized image into slices
def split_image(img_path, max_ratio=2.82, best_ration=1.41, overlap=0.05):
    split_result = []
    # open the image
    img = open_image_from_url(img_path)
    # get the image width and height
    width, height = img.size
    # compute the aspect ratio
    ratio = max(width, height) / min(width, height)
    # check whether cropping is needed
    if ratio > max_ratio:
        # determine the crop size, keeping the aspect ratio and using the shorter side as the base
        new_ratio = best_ration - overlap
        if width < height:  # height is the longer side
            for i in range(math.ceil(height / (width * new_ratio))):
                offset = round(width * new_ratio * i)
                cropped_img = img.crop((0, offset, width, round(offset + width * best_ration)))
                # convert to RGB so the slice can be saved correctly as jpg
                cropped_img = cropped_img.convert("RGB")
                split_result.append({"img": cropped_img, "x_offset": 0, "y_offset": offset})
        else:  # width is the longer side
            for i in range(math.ceil(width / (height * new_ratio))):
                offset = round(height * new_ratio * i)
                cropped_img = img.crop((offset, 0, round(offset + height * best_ration), height))
                # convert to RGB so the slice can be saved correctly as jpg
                cropped_img = cropped_img.convert("RGB")
                split_result.append({"img": cropped_img, "x_offset": offset, "y_offset": 0})
    else:
        split_result.append({"img": img, "x_offset": 0, "y_offset": 0})
    return split_result
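A quick sanity check of the slicing arithmetic, assuming a 1000×5000 px image (an in-memory Image.new stands in for the downloaded photo, so this sketch does not call split_image or touch the network):

from PIL import Image
import math

width, height = 1000, 5000                # ratio 5.0 > max_ratio 2.82, so this image would be split
best_ration, overlap = 1.41, 0.05
step = width * (best_ration - overlap)    # 1360 px between slice origins
img = Image.new("RGB", (width, height))
slices = [img.crop((0, round(step * i), width, round(step * i + width * best_ration)))
          for i in range(math.ceil(height / step))]
print(len(slices), slices[0].size)        # 4 slices of 1000 x 1410; consecutive slices overlap by ~50 px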
# merge information-extraction results
def merge_result(result1, result2):
    for key in result2:
        result1[key] = result1.get(key, []) + result2[key]
    return result1
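merge_result keeps every candidate instead of overwriting, which matters once one photo has been split into several slices. A tiny sketch with made-up keys and values:

r1 = {"hospital": [{"text": "A Hospital", "probability": 0.91}]}
r2 = {"hospital": [{"text": "A Hospita1", "probability": 0.55}],
      "doctor": [{"text": "Dr. Li", "probability": 0.88}]}
merged = merge_result(r1, r2)
# merged["hospital"] now holds both candidates; merged["doctor"] has only the one from r2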
# key information extraction
@@ -36,16 +91,27 @@ def information_extraction(ie, phrecs):
    for phrec in phrecs:
        pic_path = ucloud.get_private_url(phrec.cfjaddress)
        if pic_path:
            docs.append({"doc": pic_path})
            split_result = split_image(pic_path)
            for img in split_result:
                with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
                    img["img"].save(temp_file.name)
                    docs.append({"doc": temp_file.name})
            doc_phrecs.append(phrec)
    if not docs:
        return result
    ie_results = []
    try:
        ie_results = ie(docs)
    except Exception as e:
        logging.error(e)
        return result
    finally:
        for temp_file in docs:
            try:
                os.remove(temp_file["doc"])
            except Exception as e:
                logging.info(f"error while deleting temp file {temp_file['doc']}: {e}")
    now = get_default_datetime()
    for i in range(len(ie_results)):
@@ -61,7 +127,7 @@ def information_extraction(ie, phrecs):
        session.commit()
        session.close()
        result.update(ie_result)
        result = merge_result(result, ie_result)
    return result
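The temporary files created above rely on the usual delete=False pattern: the file has to outlive the with block so the extractor can read it by path, and it is then removed explicitly in the finally branch. The same pattern in isolation (made-up image, nothing from the commit):

import os
import tempfile
from PIL import Image

img = Image.new("RGB", (100, 100))
with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as f:
    img.save(f.name)
print(os.path.exists(f.name))  # True: the file survives the with block
os.remove(f.name)              # explicit cleanup, as in the finally branch above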
@@ -71,15 +137,16 @@ def get_best_value_in_keys(source, keys):
    # final result
    result = None
    # highest probability seen so far
    most_probability = 0
    best_probability = 0
    for key in keys:
        values = source.get(key)
        if values:
            for value in values:
                text = value.get("text")
                probability = value.get("probability")
                if text and probability > most_probability:
                if text and probability > best_probability:
                    result = text
                    best_probability = probability
    return result
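get_best_value_in_keys expects each key to map to a list of {"text", "probability"} candidates, the structure handled throughout this file. A small sketch with made-up keys and values:

source = {
    "admission_date": [{"text": "2024-01-02", "probability": 0.95}],
    "admit_date": [{"text": "2024-01-03", "probability": 0.60}],
}
print(get_best_value_in_keys(source, ["admission_date", "admit_date"]))  # 2024-01-02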
@@ -89,10 +156,12 @@ def get_values_of_keys(source, keys):
    for key in keys:
        value = source.get(key)
        if value:
            value = value[0].get("text")
            if value:
                result.append(value)
    return result
            for v in value:
                v = v.get("text")
                if v:
                    result.append(v)
    # de-duplicate
    return list(set(result))
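With the change above, every candidate under every key is collected instead of only the first one, and list(set(...)) drops duplicates at the cost of ordering. A made-up example:

source = {"doctor": [{"text": "Dr. Li", "probability": 0.90},
                     {"text": "Dr. Li", "probability": 0.72},
                     {"text": "Dr. Wang", "probability": 0.40}]}
print(get_values_of_keys(source, ["doctor"]))  # ["Dr. Li", "Dr. Wang"], in arbitrary order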
def save_or_update_ie(table, pk_phhd, data):
@@ -214,10 +283,8 @@ def main():
    # keep polling for new cases
    while 1:
        session = MysqlSession()
        phhds = session.query(ZxPhhd.pk_phhd) \
            .filter(ZxPhhd.exsuccess_flag == '1') \
            .limit(PHHD_BATCH_SIZE) \
            .all()
        # query the cases that still need recognition
        phhds = session.query(ZxPhhd.pk_phhd).filter(ZxPhhd.exsuccess_flag == '1').limit(PHHD_BATCH_SIZE).all()
        session.close()
        if phhds:
            for phhd in phhds:
@@ -226,14 +293,14 @@ def main():
                # recognition finished, update the flag
                session = MysqlSession()
                stmt = (update(ZxPhhd).where(ZxPhhd.pk_phhd == pk_phhd).values(exsuccess_flag=8))
                session.execute(stmt)
                update_flag = (update(ZxPhhd).where(ZxPhhd.pk_phhd == pk_phhd).values(exsuccess_flag=8))
                session.execute(update_flag)
                session.commit()
                session.close()
                # release GPU memory after finishing a case
                paddle.device.cuda.empty_cache()
        else:
            # no new cases found; wait a while before querying again
            sleep_minutes = SLEEP_MINUTES
            log = logging.getLogger()
            log.info(f"No new cases yet, waiting {sleep_minutes} minutes...")
            sleep(sleep_minutes * 60)
            log.info(f"No new cases yet, waiting {SLEEP_MINUTES} minutes...")
            sleep(SLEEP_MINUTES * 60)