新增二维码识别替换高清图片功能
This commit is contained in:
@@ -118,3 +118,5 @@
|
|||||||
20. 版本号:1.13.0
|
20. 版本号:1.13.0
|
||||||
1. 新增文档检测功能
|
1. 新增文档检测功能
|
||||||
2. 新增扭曲矫正功能
|
2. 新增扭曲矫正功能
|
||||||
|
21. 版本号:1.14.0
|
||||||
|
1. 新增二维码识别替换高清图片功能
|
||||||
@@ -1,8 +1,35 @@
|
|||||||
x-env:
|
template:
|
||||||
&template
|
&template
|
||||||
image: fcb_photo_review:1.13.10
|
image: fcb_photo_review:1.14.1
|
||||||
restart: always
|
restart: always
|
||||||
|
|
||||||
|
review_template:
|
||||||
|
&review_template
|
||||||
|
<<: *template
|
||||||
|
volumes:
|
||||||
|
- ./log:/app/log
|
||||||
|
- ./model:/app/model
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices:
|
||||||
|
- device_ids: [ '0', '1' ]
|
||||||
|
capabilities: [ 'gpu' ]
|
||||||
|
driver: 'nvidia'
|
||||||
|
|
||||||
|
mask_template:
|
||||||
|
&mask_template
|
||||||
|
<<: *template
|
||||||
|
volumes:
|
||||||
|
- ./log:/app/log
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices:
|
||||||
|
- device_ids: [ '1' ]
|
||||||
|
capabilities: [ 'gpu' ]
|
||||||
|
driver: 'nvidia'
|
||||||
|
|
||||||
services:
|
services:
|
||||||
det_api:
|
det_api:
|
||||||
<<: *template
|
<<: *template
|
||||||
@@ -13,153 +40,67 @@ services:
|
|||||||
volumes:
|
volumes:
|
||||||
- ./log:/app/log
|
- ./log:/app/log
|
||||||
- ./model:/app/model
|
- ./model:/app/model
|
||||||
command: [ "det_api.py" ]
|
# command: [ 'det_api.py' ]
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
reservations:
|
reservations:
|
||||||
devices:
|
devices:
|
||||||
- device_ids: [ "0" ]
|
- device_ids: [ '0' ]
|
||||||
capabilities: [ "gpu" ]
|
capabilities: [ 'gpu' ]
|
||||||
driver: "nvidia"
|
driver: 'nvidia'
|
||||||
|
|
||||||
photo_review_1:
|
photo_review_1:
|
||||||
<<: *template
|
<<: *review_template
|
||||||
container_name: photo_review_1
|
container_name: photo_review_1
|
||||||
hostname: photo_review_1
|
hostname: photo_review_1
|
||||||
volumes:
|
|
||||||
- ./log:/app/log
|
|
||||||
- ./model:/app/model
|
|
||||||
depends_on:
|
depends_on:
|
||||||
- det_api
|
- det_api
|
||||||
command: [ "photo_review.py", "--clean", "True" ]
|
command: [ 'photo_review.py', '--clean', 'True' ]
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- device_ids: [ "0", "1" ]
|
|
||||||
capabilities: [ "gpu" ]
|
|
||||||
driver: "nvidia"
|
|
||||||
|
|
||||||
photo_review_2:
|
photo_review_2:
|
||||||
<<: *template
|
<<: *review_template
|
||||||
container_name: photo_review_2
|
container_name: photo_review_2
|
||||||
hostname: photo_review_2
|
hostname: photo_review_2
|
||||||
volumes:
|
|
||||||
- ./log:/app/log
|
|
||||||
- ./model:/app/model
|
|
||||||
depends_on:
|
depends_on:
|
||||||
- photo_review_1
|
- photo_review_1
|
||||||
command: [ "photo_review.py" ]
|
command: [ 'photo_review.py' ]
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- device_ids: [ "0", "1" ]
|
|
||||||
capabilities: [ "gpu" ]
|
|
||||||
driver: "nvidia"
|
|
||||||
|
|
||||||
photo_review_3:
|
photo_review_3:
|
||||||
<<: *template
|
<<: *review_template
|
||||||
container_name: photo_review_3
|
container_name: photo_review_3
|
||||||
hostname: photo_review_3
|
hostname: photo_review_3
|
||||||
volumes:
|
|
||||||
- ./log:/app/log
|
|
||||||
- ./model:/app/model
|
|
||||||
depends_on:
|
depends_on:
|
||||||
- photo_review_2
|
- photo_review_2
|
||||||
command: [ "photo_review.py" ]
|
command: [ 'photo_review.py' ]
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- device_ids: [ "0", "1" ]
|
|
||||||
capabilities: [ "gpu" ]
|
|
||||||
driver: "nvidia"
|
|
||||||
|
|
||||||
photo_review_4:
|
photo_review_4:
|
||||||
<<: *template
|
<<: *review_template
|
||||||
container_name: photo_review_4
|
container_name: photo_review_4
|
||||||
hostname: photo_review_4
|
hostname: photo_review_4
|
||||||
volumes:
|
|
||||||
- ./log:/app/log
|
|
||||||
- ./model:/app/model
|
|
||||||
depends_on:
|
depends_on:
|
||||||
- photo_review_3
|
- photo_review_3
|
||||||
command: [ "photo_review.py" ]
|
command: [ 'photo_review.py' ]
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- device_ids: [ "0", "1" ]
|
|
||||||
capabilities: [ "gpu" ]
|
|
||||||
driver: "nvidia"
|
|
||||||
|
|
||||||
photo_review_5:
|
photo_review_5:
|
||||||
<<: *template
|
<<: *review_template
|
||||||
container_name: photo_review_5
|
container_name: photo_review_5
|
||||||
hostname: photo_review_5
|
hostname: photo_review_5
|
||||||
volumes:
|
|
||||||
- ./log:/app/log
|
|
||||||
- ./model:/app/model
|
|
||||||
depends_on:
|
depends_on:
|
||||||
- photo_review_4
|
- photo_review_4
|
||||||
command: [ "photo_review.py" ]
|
command: [ 'photo_review.py' ]
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- device_ids: [ "0", "1" ]
|
|
||||||
capabilities: [ "gpu" ]
|
|
||||||
driver: "nvidia"
|
|
||||||
|
|
||||||
photo_mask_1:
|
photo_mask_1:
|
||||||
<<: *template
|
<<: *mask_template
|
||||||
container_name: photo_mask_1
|
container_name: photo_mask_1
|
||||||
hostname: photo_mask_1
|
hostname: photo_mask_1
|
||||||
volumes:
|
|
||||||
- ./log:/app/log
|
|
||||||
depends_on:
|
depends_on:
|
||||||
- photo_review_5
|
- photo_review_5
|
||||||
command: [ "photo_mask.py", "--clean", "True" ]
|
command: [ 'photo_mask.py', '--clean', 'True' ]
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- device_ids: [ "1" ]
|
|
||||||
capabilities: [ "gpu" ]
|
|
||||||
driver: "nvidia"
|
|
||||||
|
|
||||||
photo_mask_2:
|
photo_mask_2:
|
||||||
<<: *template
|
<<: *mask_template
|
||||||
container_name: photo_mask_2
|
container_name: photo_mask_2
|
||||||
hostname: photo_mask_2
|
hostname: photo_mask_2
|
||||||
volumes:
|
|
||||||
- ./log:/app/log
|
|
||||||
depends_on:
|
depends_on:
|
||||||
- photo_mask_1
|
- photo_mask_1
|
||||||
command: [ "photo_mask.py" ]
|
command: [ 'photo_mask.py' ]
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- device_ids: [ "1" ]
|
|
||||||
capabilities: [ "gpu" ]
|
|
||||||
driver: "nvidia"
|
|
||||||
|
|
||||||
# photo_review_6:
|
|
||||||
# <<: *template
|
|
||||||
# container_name: photo_review_6
|
|
||||||
# hostname: photo_review_6
|
|
||||||
# volumes:
|
|
||||||
# - ./log:/app/log
|
|
||||||
# - ./model:/app/model
|
|
||||||
# depends_on:
|
|
||||||
# - photo_mask_2
|
|
||||||
# command: [ "photo_review.py" ]
|
|
||||||
# deploy:
|
|
||||||
# resources:
|
|
||||||
# reservations:
|
|
||||||
# devices:
|
|
||||||
# - device_ids: [ "0", "1" ]
|
|
||||||
# capabilities: [ "gpu" ]
|
|
||||||
# driver: "nvidia"
|
|
||||||
@@ -2,9 +2,9 @@ import jieba
|
|||||||
from paddlenlp import Taskflow
|
from paddlenlp import Taskflow
|
||||||
from paddleocr import PaddleOCR
|
from paddleocr import PaddleOCR
|
||||||
|
|
||||||
"""
|
'''
|
||||||
项目配置
|
项目配置
|
||||||
"""
|
'''
|
||||||
# 每次从数据库获取的案子数量
|
# 每次从数据库获取的案子数量
|
||||||
PHHD_BATCH_SIZE = 10
|
PHHD_BATCH_SIZE = 10
|
||||||
# 没有查询到案子的等待时间(分钟)
|
# 没有查询到案子的等待时间(分钟)
|
||||||
@@ -18,35 +18,35 @@ LAYOUT_ANALYSIS = False
|
|||||||
信息抽取关键词配置
|
信息抽取关键词配置
|
||||||
"""
|
"""
|
||||||
# 患者姓名
|
# 患者姓名
|
||||||
PATIENT_NAME = ["患者姓名"]
|
PATIENT_NAME = ['患者姓名']
|
||||||
# 入院日期
|
# 入院日期
|
||||||
ADMISSION_DATE = ["入院日期"]
|
ADMISSION_DATE = ['入院日期']
|
||||||
# 出院日期
|
# 出院日期
|
||||||
DISCHARGE_DATE = ["出院日期"]
|
DISCHARGE_DATE = ['出院日期']
|
||||||
# 发生医疗费
|
# 发生医疗费
|
||||||
MEDICAL_EXPENSES = ["费用总额"]
|
MEDICAL_EXPENSES = ['费用总额']
|
||||||
# 个人现金支付
|
# 个人现金支付
|
||||||
PERSONAL_CASH_PAYMENT = ["个人现金支付"]
|
PERSONAL_CASH_PAYMENT = ['个人现金支付']
|
||||||
# 个人账户支付
|
# 个人账户支付
|
||||||
PERSONAL_ACCOUNT_PAYMENT = ["个人账户支付"]
|
PERSONAL_ACCOUNT_PAYMENT = ['个人账户支付']
|
||||||
# 个人自费金额
|
# 个人自费金额
|
||||||
PERSONAL_FUNDED_AMOUNT = ["自费金额"]
|
PERSONAL_FUNDED_AMOUNT = ['自费金额', '个人自费']
|
||||||
# 医保类别
|
# 医保类别
|
||||||
MEDICAL_INSURANCE_TYPE = ["医保类型"]
|
MEDICAL_INSURANCE_TYPE = ['医保类型']
|
||||||
# 就诊医院
|
# 就诊医院
|
||||||
HOSPITAL = ["医院"]
|
HOSPITAL = ['医院']
|
||||||
# 就诊科室
|
# 就诊科室
|
||||||
DEPARTMENT = ["科室"]
|
DEPARTMENT = ['科室']
|
||||||
# 主治医生
|
# 主治医生
|
||||||
DOCTOR = ["主治医生"]
|
DOCTOR = ['主治医生']
|
||||||
# 住院号
|
# 住院号
|
||||||
ADMISSION_ID = ["住院号"]
|
ADMISSION_ID = ['住院号']
|
||||||
# 医保结算单号码
|
# 医保结算单号码
|
||||||
SETTLEMENT_ID = ["医保结算单号码"]
|
SETTLEMENT_ID = ['医保结算单号码']
|
||||||
# 年龄
|
# 年龄
|
||||||
AGE = ["年龄"]
|
AGE = ['年龄']
|
||||||
# 大写总额
|
# 大写总额
|
||||||
UPPERCASE_MEDICAL_EXPENSES = ["大写总额"]
|
UPPERCASE_MEDICAL_EXPENSES = ['大写总额']
|
||||||
|
|
||||||
SETTLEMENT_LIST_SCHEMA = \
|
SETTLEMENT_LIST_SCHEMA = \
|
||||||
(PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES + PERSONAL_CASH_PAYMENT
|
(PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES + PERSONAL_CASH_PAYMENT
|
||||||
@@ -58,47 +58,47 @@ DISCHARGE_RECORD_SCHEMA = \
|
|||||||
|
|
||||||
COST_LIST_SCHEMA = PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES
|
COST_LIST_SCHEMA = PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES
|
||||||
|
|
||||||
"""
|
'''
|
||||||
别名配置
|
别名配置
|
||||||
"""
|
'''
|
||||||
# 使用别名中的value替换key。考虑到效率问题,只会替换第一个匹配到的key。
|
# 使用别名中的value替换key。考虑到效率问题,只会替换第一个匹配到的key。
|
||||||
HOSPITAL_ALIAS = {
|
HOSPITAL_ALIAS = {
|
||||||
"沐阳": ["沭阳"],
|
'沐阳': ['沭阳'],
|
||||||
"连水": ["涟水"],
|
'连水': ['涟水'],
|
||||||
"唯宁": ["睢宁"],
|
'唯宁': ['睢宁'], # 雕宁
|
||||||
"九〇四": ["904"],
|
'九〇四': ['904'],
|
||||||
"漂水": ["溧水"],
|
'漂水': ['溧水'],
|
||||||
}
|
}
|
||||||
DEPARTMENT_ALIAS = {
|
DEPARTMENT_ALIAS = {
|
||||||
"耳鼻喉": ["耳鼻咽喉"],
|
'耳鼻喉': ['耳鼻咽喉'],
|
||||||
"急症": ["急诊"],
|
'急症': ['急诊'],
|
||||||
}
|
}
|
||||||
|
|
||||||
"""
|
'''
|
||||||
搜索过滤配置
|
搜索过滤配置
|
||||||
"""
|
'''
|
||||||
# 默认会过滤单字
|
# 默认会过滤单字
|
||||||
HOSPITAL_FILTER = ["医院", "人民", "第一", "第二", "第三", "大学", "附属"]
|
HOSPITAL_FILTER = ['医院', '人民', '第一', '第二', '第三', '大学', '附属']
|
||||||
|
|
||||||
DEPARTMENT_FILTER = ["医", "伤", "西", "新"]
|
DEPARTMENT_FILTER = ['医', '伤', '西', '新']
|
||||||
|
|
||||||
"""
|
'''
|
||||||
分词配置
|
分词配置
|
||||||
"""
|
'''
|
||||||
jieba.suggest_freq(("肿瘤", "医院"), True)
|
jieba.suggest_freq(('肿瘤', '医院'), True)
|
||||||
jieba.suggest_freq(("骨", "伤"), True)
|
jieba.suggest_freq(('骨', '伤'), True)
|
||||||
jieba.suggest_freq(("感染", "性"), True)
|
jieba.suggest_freq(('感染', '性'), True)
|
||||||
jieba.suggest_freq(("胆", "道"), True)
|
jieba.suggest_freq(('胆', '道'), True)
|
||||||
jieba.suggest_freq(("脾", "胃"), True)
|
jieba.suggest_freq(('脾', '胃'), True)
|
||||||
|
|
||||||
"""
|
'''
|
||||||
模型配置
|
模型配置
|
||||||
"""
|
'''
|
||||||
SETTLEMENT_IE = Taskflow("information_extraction", schema=SETTLEMENT_LIST_SCHEMA, model="uie-x-base",
|
SETTLEMENT_IE = Taskflow('information_extraction', schema=SETTLEMENT_LIST_SCHEMA, model='uie-x-base',
|
||||||
task_path="model/settlement_list_model", layout_analysis=LAYOUT_ANALYSIS, precision='fp16')
|
task_path='model/settlement_list_model', layout_analysis=LAYOUT_ANALYSIS, precision='fp16')
|
||||||
DISCHARGE_IE = Taskflow("information_extraction", schema=DISCHARGE_RECORD_SCHEMA, model="uie-x-base",
|
DISCHARGE_IE = Taskflow('information_extraction', schema=DISCHARGE_RECORD_SCHEMA, model='uie-x-base',
|
||||||
task_path="model/discharge_record_model", layout_analysis=LAYOUT_ANALYSIS, precision='fp16')
|
task_path='model/discharge_record_model', layout_analysis=LAYOUT_ANALYSIS, precision='fp16')
|
||||||
COST_IE = Taskflow("information_extraction", schema=COST_LIST_SCHEMA, model="uie-x-base", device_id=1,
|
COST_IE = Taskflow('information_extraction', schema=COST_LIST_SCHEMA, model='uie-x-base', device_id=1,
|
||||||
task_path="model/cost_list_model", layout_analysis=LAYOUT_ANALYSIS, precision='fp16')
|
task_path='model/cost_list_model', layout_analysis=LAYOUT_ANALYSIS, precision='fp16')
|
||||||
|
|
||||||
OCR = PaddleOCR(use_angle_cls=False, show_log=False, gpu_id=1, det_db_box_thresh=0.3)
|
OCR = PaddleOCR(use_angle_cls=False, show_log=False, gpu_id=1, det_db_box_thresh=0.3)
|
||||||
|
|||||||
@@ -7,22 +7,23 @@ from collections import defaultdict
|
|||||||
from time import sleep
|
from time import sleep
|
||||||
|
|
||||||
import cv2
|
import cv2
|
||||||
|
import fitz
|
||||||
import jieba
|
import jieba
|
||||||
|
import numpy as np
|
||||||
import requests
|
import requests
|
||||||
|
import zxingcpp
|
||||||
from rapidfuzz import process, fuzz
|
from rapidfuzz import process, fuzz
|
||||||
from sqlalchemy import update
|
from sqlalchemy import update
|
||||||
|
|
||||||
from db import MysqlSession
|
from db import MysqlSession
|
||||||
from db.mysql import BdYljg, BdYlks, ZxIeResult, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec
|
from db.mysql import BdYljg, BdYlks, ZxIeResult, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec
|
||||||
from doc_dewarp import dewarp
|
|
||||||
from log import HOSTNAME
|
from log import HOSTNAME
|
||||||
from paddle_detection import detector
|
|
||||||
from photo_review import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, PERSONAL_CASH_PAYMENT, \
|
from photo_review import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, PERSONAL_CASH_PAYMENT, \
|
||||||
PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, DEPARTMENT, DOCTOR, \
|
PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, DEPARTMENT, DOCTOR, \
|
||||||
ADMISSION_ID, SETTLEMENT_ID, AGE, OCR, SETTLEMENT_IE, DISCHARGE_IE, COST_IE, PHHD_BATCH_SIZE, SLEEP_MINUTES, \
|
ADMISSION_ID, SETTLEMENT_ID, AGE, OCR, SETTLEMENT_IE, DISCHARGE_IE, COST_IE, PHHD_BATCH_SIZE, SLEEP_MINUTES, \
|
||||||
UPPERCASE_MEDICAL_EXPENSES, HOSPITAL_ALIAS, HOSPITAL_FILTER, DEPARTMENT_ALIAS, DEPARTMENT_FILTER
|
UPPERCASE_MEDICAL_EXPENSES, HOSPITAL_ALIAS, HOSPITAL_FILTER, DEPARTMENT_ALIAS, DEPARTMENT_FILTER
|
||||||
from ucloud import ufile
|
from ucloud import ufile
|
||||||
from util import image_util, util
|
from util import image_util, util, html_util
|
||||||
from util.data_util import handle_date, handle_decimal, parse_department, handle_name, \
|
from util.data_util import handle_date, handle_decimal, parse_department, handle_name, \
|
||||||
handle_insurance_type, handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, \
|
handle_insurance_type, handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, \
|
||||||
parse_hospital
|
parse_hospital
|
||||||
@@ -73,6 +74,47 @@ def request_ie_result(task_enum, phrecs):
|
|||||||
raise Exception(f"请求信息抽取结果失败,状态码:{response.status_code}")
|
raise Exception(f"请求信息抽取结果失败,状态码:{response.status_code}")
|
||||||
|
|
||||||
|
|
||||||
|
# 尝试从二维码中获取高清图片
|
||||||
|
def get_better_image_from_qrcode(image, dpi=150):
    """Try to replace a photo with a rendering of the PDF its QR code links to.

    Every barcode found in ``image`` is inspected in turn. When one decodes to
    a URL that starts with the Jiangsu e-invoice base URL, the ``idBase`` value
    is looked up via ``html_util.get_jsczt_id_base``, the PDF is downloaded via
    ``html_util.download_pdf``, and its first page is rendered at ``dpi``.

    Args:
        image: Image handed to ``zxingcpp.read_barcodes``.
        dpi: Target rendering resolution; the page is scaled by ``dpi / 72``.

    Returns:
        ``(img, text)`` for the first barcode that succeeds, where ``img`` is
        the rendered first page converted from RGB to BGR channel order and
        ``text`` is ``page.get_text()``. ``(None, None)`` when no barcode
        yields a usable PDF.
    """
    js_base_url = 'http://einvoice.jsczt.cn'
    # Scale factor is expressed relative to this base resolution.
    default_dpi = 72

    for result in zxingcpp.read_barcodes(image):
        pdf = None
        pdf_path = None
        try:
            url = result.text
            if not url.startswith(js_base_url):
                logging.getLogger('qr').info(f'未知二维码内容:{url}')
                continue

            id_base = html_util.get_jsczt_id_base(url)
            if not id_base:
                # Without an idBase the download URL would end in
                # "idBase=None"; skip this barcode instead of requesting it.
                logging.getLogger('qr').warning(f'未能从二维码页面解析出idBase:{url}')
                continue

            pdf_url = f'{js_base_url}/download?idBase={id_base}'
            pdf_path = html_util.download_pdf(pdf_url)

            pdf = fitz.open(pdf_path)
            page = pdf[0]  # only the first page is used

            zoom = dpi / default_dpi
            mat = fitz.Matrix(zoom, zoom)
            # alpha=False keeps the pixmap at 3 channels, which the
            # RGB -> BGR conversion below requires.
            pix = page.get_pixmap(matrix=mat, alpha=False)

            # Wrap the raw samples as a (height, width, channels) array.
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, -1))
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            return img, page.get_text()
        except Exception as e:
            # Best effort: a failure on one barcode must not stop the others.
            logging.getLogger('error').error('从二维码中获取高清图片时出错', exc_info=e)
            continue
        finally:
            # Compare against None explicitly: a document with zero pages is
            # falsy and would otherwise never be closed.
            if pdf is not None:
                pdf.close()
            if pdf_path:
                util.delete_temp_file(pdf_path)
    return None, None
|
||||||
|
|
||||||
|
|
||||||
# 关键信息提取
|
# 关键信息提取
|
||||||
def information_extraction(ie, phrecs, identity):
|
def information_extraction(ie, phrecs, identity):
|
||||||
result = {}
|
result = {}
|
||||||
@@ -83,60 +125,88 @@ def information_extraction(ie, phrecs, identity):
|
|||||||
|
|
||||||
image = image_util.read(img_path)
|
image = image_util.read(img_path)
|
||||||
|
|
||||||
target_images = []
|
# 尝试从二维码中获取高清图片
|
||||||
target_images += detector.request_book_areas(image) # 识别文档区域并裁剪
|
better_image, text = get_better_image_from_qrcode(image)
|
||||||
if not target_images:
|
zx_ie_results = []
|
||||||
target_images.append(image) # 识别失败
|
if better_image is not None:
|
||||||
angle_count = defaultdict(int, {"0": 0}) # 分割后图片的最优角度统计
|
img_angle = '0'
|
||||||
for target_image in target_images:
|
image = better_image
|
||||||
dewarped_image = dewarp.dewarp_image(target_image) # 去扭曲
|
if text:
|
||||||
angles = image_util.parse_rotation_angles(dewarped_image)
|
info_extract = ie(text)[0]
|
||||||
zx_ie_results = []
|
else:
|
||||||
split_results = image_util.split(dewarped_image)
|
info_extract = ie_temp_image(ie, OCR, image)
|
||||||
for split_result in split_results:
|
ie_result = {'result': info_extract, 'angle': '0'}
|
||||||
if split_result["img"] is None or split_result["img"].size == 0:
|
|
||||||
continue
|
|
||||||
rotated_img = image_util.rotate(split_result["img"], int(angles[0]))
|
|
||||||
ie_results = [{"result": ie_temp_image(ie, OCR, rotated_img), "angle": angles[0]}]
|
|
||||||
if not ie_results[0]["result"] or len(ie_results[0]["result"]) < len(ie.kwargs.get("schema")):
|
|
||||||
rotated_img = image_util.rotate(split_result["img"], int(angles[1]))
|
|
||||||
ie_results.append({"result": ie_temp_image(ie, OCR, rotated_img), "angle": angles[1]})
|
|
||||||
|
|
||||||
now = util.get_default_datetime()
|
now = util.get_default_datetime()
|
||||||
best_angle = ["0", 0]
|
if not ie_result['result']:
|
||||||
for ie_result in ie_results:
|
continue
|
||||||
if not ie_result["result"]:
|
|
||||||
|
result_json = json.dumps(ie_result['result'], ensure_ascii=False)
|
||||||
|
if len(result_json) > 5000:
|
||||||
|
result_json = result_json[:5000]
|
||||||
|
zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity,
|
||||||
|
cfjaddress=phrec.cfjaddress, content=result_json,
|
||||||
|
rotation_angle=int(ie_result['angle']),
|
||||||
|
x_offset=0, y_offset=0, create_time=now,
|
||||||
|
creator=HOSTNAME, update_time=now, updater=HOSTNAME))
|
||||||
|
|
||||||
|
result = merge_result(result, ie_result['result'])
|
||||||
|
else:
|
||||||
|
target_images = []
|
||||||
|
# target_images += detector.request_book_areas(image) # 识别文档区域并裁剪
|
||||||
|
if not target_images:
|
||||||
|
target_images.append(image) # 识别失败
|
||||||
|
angle_count = defaultdict(int, {'0': 0}) # 分割后图片的最优角度统计
|
||||||
|
for target_image in target_images:
|
||||||
|
# dewarped_image = dewarp.dewarp_image(target_image) # 去扭曲
|
||||||
|
dewarped_image = target_image
|
||||||
|
angles = image_util.parse_rotation_angles(dewarped_image)
|
||||||
|
|
||||||
|
split_results = image_util.split(dewarped_image)
|
||||||
|
for split_result in split_results:
|
||||||
|
if split_result['img'] is None or split_result['img'].size == 0:
|
||||||
continue
|
continue
|
||||||
|
rotated_img = image_util.rotate(split_result['img'], int(angles[0]))
|
||||||
|
ie_results = [{'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[0]}]
|
||||||
|
if not ie_results[0]['result'] or len(ie_results[0]['result']) < len(ie.kwargs.get('schema')):
|
||||||
|
rotated_img = image_util.rotate(split_result['img'], int(angles[1]))
|
||||||
|
ie_results.append({'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[1]})
|
||||||
|
now = util.get_default_datetime()
|
||||||
|
best_angle = ['0', 0]
|
||||||
|
for ie_result in ie_results:
|
||||||
|
if not ie_result['result']:
|
||||||
|
continue
|
||||||
|
|
||||||
result_json = json.dumps(ie_result["result"], ensure_ascii=False)
|
result_json = json.dumps(ie_result['result'], ensure_ascii=False)
|
||||||
if len(result_json) > 5000:
|
if len(result_json) > 5000:
|
||||||
result_json = result_json[:5000]
|
result_json = result_json[:5000]
|
||||||
zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity,
|
zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity,
|
||||||
cfjaddress=phrec.cfjaddress, content=result_json,
|
cfjaddress=phrec.cfjaddress, content=result_json,
|
||||||
rotation_angle=int(ie_result["angle"]),
|
rotation_angle=int(ie_result['angle']),
|
||||||
x_offset=split_result["x_offset"],
|
x_offset=split_result['x_offset'],
|
||||||
y_offset=split_result["y_offset"], create_time=now,
|
y_offset=split_result['y_offset'], create_time=now,
|
||||||
creator=HOSTNAME, update_time=now, updater=HOSTNAME))
|
creator=HOSTNAME, update_time=now, updater=HOSTNAME))
|
||||||
|
|
||||||
result = merge_result(result, ie_result["result"])
|
result = merge_result(result, ie_result['result'])
|
||||||
|
|
||||||
if len(ie_result["result"]) > best_angle[1]:
|
if len(ie_result['result']) > best_angle[1]:
|
||||||
best_angle = [ie_result["angle"], len(ie_result["result"])]
|
best_angle = [ie_result['angle'], len(ie_result['result'])]
|
||||||
|
|
||||||
angle_count[best_angle[0]] += 1
|
angle_count[best_angle[0]] += 1
|
||||||
|
img_angle = max(angle_count, key=angle_count.get)
|
||||||
|
|
||||||
img_angle = max(angle_count, key=angle_count.get)
|
if img_angle != '0' or better_image is not None:
|
||||||
if img_angle != "0":
|
|
||||||
image = image_util.rotate(image, int(img_angle))
|
image = image_util.rotate(image, int(img_angle))
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
|
with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
|
||||||
cv2.imwrite(temp_file.name, image)
|
cv2.imwrite(temp_file.name, image)
|
||||||
try:
|
try:
|
||||||
ufile.upload_file(phrec.cfjaddress, temp_file.name)
|
ufile.upload_file(phrec.cfjaddress, temp_file.name)
|
||||||
# 修正旋转角度
|
if img_angle != '0':
|
||||||
for zx_ie_result in zx_ie_results:
|
# 修正旋转角度
|
||||||
zx_ie_result.rotation_angle -= int(img_angle)
|
for zx_ie_result in zx_ie_results:
|
||||||
|
zx_ie_result.rotation_angle -= int(img_angle)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"上传图片({phrec.cfjaddress})失败", exc_info=e)
|
logging.error(f'上传图片({phrec.cfjaddress})失败', exc_info=e)
|
||||||
finally:
|
finally:
|
||||||
util.delete_temp_file(temp_file.name)
|
util.delete_temp_file(temp_file.name)
|
||||||
|
|
||||||
|
|||||||
@@ -13,3 +13,4 @@ sqlacodegen==2.3.0.post1
|
|||||||
sqlalchemy==1.4.52
|
sqlalchemy==1.4.52
|
||||||
tenacity==8.5.0
|
tenacity==8.5.0
|
||||||
ufile==3.2.9
|
ufile==3.2.9
|
||||||
|
zxing-cpp==2.2.0
|
||||||
43
util/html_util.py
Normal file
43
util/html_util.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
import logging
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from tenacity import retry, stop_after_attempt, wait_random
|
||||||
|
|
||||||
|
|
||||||
|
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
       after=lambda x: logging.warning('获取江苏省财政票据idBase失败!'))
def get_jsczt_id_base(url, timeout=10):
    """Fetch a page and return the value of its ``idBase`` input field.

    The call is attempted up to three times with a random 1-3 second wait
    between attempts; the last exception is re-raised if all attempts fail.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the caller indefinitely and
            the retry policy would never get a chance to run.

    Returns:
        The ``value`` attribute of the ``<input name="idBase">`` element, or
        ``None`` when the page contains no such element.

    Raises:
        Exception: If the response status code is not 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f'请求江苏省财政票据失败!状态码: {response.status_code}')

    soup = BeautifulSoup(response.text, 'html.parser')
    hidden_input = soup.find('input', {'name': 'idBase'})
    if hidden_input is None:
        return None
    # Read the value carried by the input field.
    return hidden_input.get('value')
|
||||||
|
|
||||||
|
|
||||||
|
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
       after=lambda x: logging.warning('下载pdf失败!'))
def download_pdf(url, local_filename=None, timeout=30):
    """Download ``url`` to a local file and return that file's path.

    The call is attempted up to three times with a random 1-3 second wait
    between attempts; the last exception is re-raised if all attempts fail.

    Args:
        url: Address of the file to download.
        local_filename: Destination path. When ``None``, a new temporary file
            with a ``.pdf`` suffix is created and used instead.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path the content was written to.

    Raises:
        Exception: If the response status code is not 200.
    """
    import os

    # Only a file created here may be removed on failure; a path supplied by
    # the caller is the caller's to manage.
    created_temp = local_filename is None
    if created_temp:
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
            local_filename = temp_pdf.name

    try:
        # The context manager releases the streamed connection even when the
        # body is not fully consumed.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                raise Exception(f'下载pdf失败!状态码: {response.status_code}')
            with open(local_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # filter out keep-alive new chunks
                        file.write(chunk)
    except Exception:
        # Every retry creates a fresh temporary file, so a failed attempt has
        # to remove its own or the files accumulate.
        if created_temp:
            try:
                os.remove(local_filename)
            except OSError:
                # Cleanup is best effort; the download error is what matters.
                pass
        raise
    return local_filename
|
||||||
Reference in New Issue
Block a user