添加可视化测试中的长图处理

This commit is contained in:
2024-06-20 17:03:18 +08:00
parent 41aadebcdc
commit 1f88688676

View File

@@ -2,13 +2,16 @@
import os import os
import re import re
import sys import sys
import tempfile
import time import time
from pprint import pprint from pprint import pprint
from paddlenlp import Taskflow from photo_review.photo_review import split_image
from paddlenlp.utils.doc_parser import DocParser
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from paddlenlp import Taskflow
from paddlenlp.utils.doc_parser import DocParser
from ucloud import ucloud from ucloud import ucloud
@@ -39,22 +42,55 @@ def write_visual_result(image, layout=None, result=None):
def _save_piece_to_temp_file(piece, temp_files_paths):
    """Save one split-image piece to a fresh temporary ``.jpg`` file.

    The path is appended to *temp_files_paths* before the image is written,
    so the caller can still delete the file if saving fails part-way.

    Args:
        piece: One element returned by ``split_image``; its ``"img"`` entry
            must provide a ``save(path)`` method.
        temp_files_paths: List collecting every temporary path created so far
            (mutated in place).

    Returns:
        The path of the temporary file that now holds the piece.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
        temp_files_paths.append(temp_file.name)
        piece["img"].save(temp_file.name)
    return temp_file.name


def _remove_temp_files(temp_files_paths):
    """Delete every path in *temp_files_paths*, reporting each outcome.

    Deletion is best-effort: a file that cannot be removed is reported and
    skipped so the remaining files are still cleaned up.
    """
    for path in temp_files_paths:
        try:
            os.remove(path)
            print(f"临时文件 {path} 已删除")
        except OSError as e:
            print(f"删除临时文件 {path} 时出错: {e}")


def visual_model_test(model_type, test_img, task_path, schema):
    """Run the OCR or information-extraction model on an image and render it.

    The image is first cut into pieces with ``split_image`` (long-image
    handling); each piece is written to a temporary file and fed to the model.

    Args:
        model_type: ``"ocr"`` runs layout parsing with ``DocParser``; any other
            value runs the ``information_extraction`` Taskflow.
        test_img: Path of the image under test; also passed to
            ``write_visual_result`` as the image to draw on.
        task_path: Model directory handed to ``Taskflow`` (unused for OCR).
        schema: Extraction schema handed to ``Taskflow`` (unused for OCR).
    """
    temp_files_paths = []
    try:
        if model_type == "ocr":
            pieces = split_image(test_img)
            layout = []
            doc_parser = DocParser(layout_analysis=False)
            for piece in pieces:
                temp_path = _save_piece_to_temp_file(piece, temp_files_paths)
                parsed_doc = doc_parser.parse({"doc": temp_path}, expand_to_a4_size=True)
                x_offset = piece["x_offset"]
                y_offset = piece["y_offset"]
                if x_offset or y_offset:
                    # Shift piece-local boxes back into the coordinates of the
                    # full image (presumably [x1, y1, x2, y2] - confirm
                    # against DocParser's layout format).
                    for item in parsed_doc["layout"]:
                        box = item[0]
                        box[0] += x_offset
                        box[1] += y_offset
                        box[2] += x_offset
                        box[3] += y_offset
                layout += parsed_doc["layout"]
            write_visual_result(test_img, layout=layout)
        else:
            docs = [
                {"doc": _save_piece_to_temp_file(piece, temp_files_paths)}
                for piece in split_image(test_img)
            ]
            my_ie = Taskflow("information_extraction", schema=schema, model="uie-x-base", task_path=task_path,
                             layout_analysis=False)
            my_results = my_ie(docs)
            # NOTE(review): only the first piece's result is rendered, although
            # one doc per piece is submitted - confirm this is intended.
            write_visual_result(test_img, result=my_results[0])
    finally:
        # Clean up even when parsing, extraction or rendering raised, so
        # failed runs do not leave temporary images behind.
        _remove_temp_files(temp_files_paths)
def batch_test(test_imgs, task_path, schema): def batch_test(test_imgs, task_path, schema):
docs = [] docs = []
for test_img in test_imgs: for test_img in test_imgs:
docs.append({"doc": test_img}) docs.append({"doc": test_img})
my_ie = Taskflow("information_extraction", schema=schema, model="uie-x-base", task_path=task_path, my_ie = Taskflow("information_extraction", schema=schema, model="uie-x-base", task_path=task_path,
layout_analysis=True, batch_size=16) layout_analysis=False, batch_size=16)
# 批量抽取写法:(ie([{"doc": "./data/6.jpg"}, {"doc": "./data/7.jpg"}]) # 批量抽取写法:(ie([{"doc": "./data/6.jpg"}, {"doc": "./data/7.jpg"}])
my_results = my_ie(docs) my_results = my_ie(docs)
pprint(my_results) pprint(my_results)