新增二维码识别替换高清图片功能

This commit is contained in:
2024-09-05 13:29:17 +08:00
parent 53a3dcd508
commit de631bef2e
6 changed files with 255 additions and 198 deletions

View File

@@ -118,3 +118,5 @@
20. 版本号1.13.0 20. 版本号1.13.0
1. 新增文档检测功能 1. 新增文档检测功能
2. 新增扭曲矫正功能 2. 新增扭曲矫正功能
21. 版本号1.14.0
1. 新增二维码识别替换高清图片功能

View File

@@ -1,8 +1,35 @@
x-env: template:
&template &template
image: fcb_photo_review:1.13.10 image: fcb_photo_review:1.14.1
restart: always restart: always
review_template:
&review_template
<<: *template
volumes:
- ./log:/app/log
- ./model:/app/model
deploy:
resources:
reservations:
devices:
- device_ids: [ '0', '1' ]
capabilities: [ 'gpu' ]
driver: 'nvidia'
mask_template:
&mask_template
<<: *template
volumes:
- ./log:/app/log
deploy:
resources:
reservations:
devices:
- device_ids: [ '1' ]
capabilities: [ 'gpu' ]
driver: 'nvidia'
services: services:
det_api: det_api:
<<: *template <<: *template
@@ -13,153 +40,67 @@ services:
volumes: volumes:
- ./log:/app/log - ./log:/app/log
- ./model:/app/model - ./model:/app/model
command: [ "det_api.py" ] # command: [ 'det_api.py' ]
deploy: deploy:
resources: resources:
reservations: reservations:
devices: devices:
- device_ids: [ "0" ] - device_ids: [ '0' ]
capabilities: [ "gpu" ] capabilities: [ 'gpu' ]
driver: "nvidia" driver: 'nvidia'
photo_review_1: photo_review_1:
<<: *template <<: *review_template
container_name: photo_review_1 container_name: photo_review_1
hostname: photo_review_1 hostname: photo_review_1
volumes:
- ./log:/app/log
- ./model:/app/model
depends_on: depends_on:
- det_api - det_api
command: [ "photo_review.py", "--clean", "True" ] command: [ 'photo_review.py', '--clean', 'True' ]
deploy:
resources:
reservations:
devices:
- device_ids: [ "0", "1" ]
capabilities: [ "gpu" ]
driver: "nvidia"
photo_review_2: photo_review_2:
<<: *template <<: *review_template
container_name: photo_review_2 container_name: photo_review_2
hostname: photo_review_2 hostname: photo_review_2
volumes:
- ./log:/app/log
- ./model:/app/model
depends_on: depends_on:
- photo_review_1 - photo_review_1
command: [ "photo_review.py" ] command: [ 'photo_review.py' ]
deploy:
resources:
reservations:
devices:
- device_ids: [ "0", "1" ]
capabilities: [ "gpu" ]
driver: "nvidia"
photo_review_3: photo_review_3:
<<: *template <<: *review_template
container_name: photo_review_3 container_name: photo_review_3
hostname: photo_review_3 hostname: photo_review_3
volumes:
- ./log:/app/log
- ./model:/app/model
depends_on: depends_on:
- photo_review_2 - photo_review_2
command: [ "photo_review.py" ] command: [ 'photo_review.py' ]
deploy:
resources:
reservations:
devices:
- device_ids: [ "0", "1" ]
capabilities: [ "gpu" ]
driver: "nvidia"
photo_review_4: photo_review_4:
<<: *template <<: *review_template
container_name: photo_review_4 container_name: photo_review_4
hostname: photo_review_4 hostname: photo_review_4
volumes:
- ./log:/app/log
- ./model:/app/model
depends_on: depends_on:
- photo_review_3 - photo_review_3
command: [ "photo_review.py" ] command: [ 'photo_review.py' ]
deploy:
resources:
reservations:
devices:
- device_ids: [ "0", "1" ]
capabilities: [ "gpu" ]
driver: "nvidia"
photo_review_5: photo_review_5:
<<: *template <<: *review_template
container_name: photo_review_5 container_name: photo_review_5
hostname: photo_review_5 hostname: photo_review_5
volumes:
- ./log:/app/log
- ./model:/app/model
depends_on: depends_on:
- photo_review_4 - photo_review_4
command: [ "photo_review.py" ] command: [ 'photo_review.py' ]
deploy:
resources:
reservations:
devices:
- device_ids: [ "0", "1" ]
capabilities: [ "gpu" ]
driver: "nvidia"
photo_mask_1: photo_mask_1:
<<: *template <<: *mask_template
container_name: photo_mask_1 container_name: photo_mask_1
hostname: photo_mask_1 hostname: photo_mask_1
volumes:
- ./log:/app/log
depends_on: depends_on:
- photo_review_5 - photo_review_5
command: [ "photo_mask.py", "--clean", "True" ] command: [ 'photo_mask.py', '--clean', 'True' ]
deploy:
resources:
reservations:
devices:
- device_ids: [ "1" ]
capabilities: [ "gpu" ]
driver: "nvidia"
photo_mask_2: photo_mask_2:
<<: *template <<: *mask_template
container_name: photo_mask_2 container_name: photo_mask_2
hostname: photo_mask_2 hostname: photo_mask_2
volumes:
- ./log:/app/log
depends_on: depends_on:
- photo_mask_1 - photo_mask_1
command: [ "photo_mask.py" ] command: [ 'photo_mask.py' ]
deploy:
resources:
reservations:
devices:
- device_ids: [ "1" ]
capabilities: [ "gpu" ]
driver: "nvidia"
# photo_review_6:
# <<: *template
# container_name: photo_review_6
# hostname: photo_review_6
# volumes:
# - ./log:/app/log
# - ./model:/app/model
# depends_on:
# - photo_mask_2
# command: [ "photo_review.py" ]
# deploy:
# resources:
# reservations:
# devices:
# - device_ids: [ "0", "1" ]
# capabilities: [ "gpu" ]
# driver: "nvidia"

View File

@@ -2,9 +2,9 @@ import jieba
from paddlenlp import Taskflow from paddlenlp import Taskflow
from paddleocr import PaddleOCR from paddleocr import PaddleOCR
""" '''
项目配置 项目配置
""" '''
# 每次从数据库获取的案子数量 # 每次从数据库获取的案子数量
PHHD_BATCH_SIZE = 10 PHHD_BATCH_SIZE = 10
# 没有查询到案子的等待时间(分钟) # 没有查询到案子的等待时间(分钟)
@@ -18,35 +18,35 @@ LAYOUT_ANALYSIS = False
信息抽取关键词配置 信息抽取关键词配置
""" """
# 患者姓名 # 患者姓名
PATIENT_NAME = ["患者姓名"] PATIENT_NAME = ['患者姓名']
# 入院日期 # 入院日期
ADMISSION_DATE = ["入院日期"] ADMISSION_DATE = ['入院日期']
# 出院日期 # 出院日期
DISCHARGE_DATE = ["出院日期"] DISCHARGE_DATE = ['出院日期']
# 发生医疗费 # 发生医疗费
MEDICAL_EXPENSES = ["费用总额"] MEDICAL_EXPENSES = ['费用总额']
# 个人现金支付 # 个人现金支付
PERSONAL_CASH_PAYMENT = ["个人现金支付"] PERSONAL_CASH_PAYMENT = ['个人现金支付']
# 个人账户支付 # 个人账户支付
PERSONAL_ACCOUNT_PAYMENT = ["个人账户支付"] PERSONAL_ACCOUNT_PAYMENT = ['个人账户支付']
# 个人自费金额 # 个人自费金额
PERSONAL_FUNDED_AMOUNT = ["自费金额"] PERSONAL_FUNDED_AMOUNT = ['自费金额', '个人自费']
# 医保类别 # 医保类别
MEDICAL_INSURANCE_TYPE = ["医保类型"] MEDICAL_INSURANCE_TYPE = ['医保类型']
# 就诊医院 # 就诊医院
HOSPITAL = ["医院"] HOSPITAL = ['医院']
# 就诊科室 # 就诊科室
DEPARTMENT = ["科室"] DEPARTMENT = ['科室']
# 主治医生 # 主治医生
DOCTOR = ["主治医生"] DOCTOR = ['主治医生']
# 住院号 # 住院号
ADMISSION_ID = ["住院号"] ADMISSION_ID = ['住院号']
# 医保结算单号码 # 医保结算单号码
SETTLEMENT_ID = ["医保结算单号码"] SETTLEMENT_ID = ['医保结算单号码']
# 年龄 # 年龄
AGE = ["年龄"] AGE = ['年龄']
# 大写总额 # 大写总额
UPPERCASE_MEDICAL_EXPENSES = ["大写总额"] UPPERCASE_MEDICAL_EXPENSES = ['大写总额']
SETTLEMENT_LIST_SCHEMA = \ SETTLEMENT_LIST_SCHEMA = \
(PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES + PERSONAL_CASH_PAYMENT (PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES + PERSONAL_CASH_PAYMENT
@@ -58,47 +58,47 @@ DISCHARGE_RECORD_SCHEMA = \
COST_LIST_SCHEMA = PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES COST_LIST_SCHEMA = PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES
""" '''
别名配置 别名配置
""" '''
# 使用别名中的value替换key。考虑到效率问题只会替换第一个匹配到的key。 # 使用别名中的value替换key。考虑到效率问题只会替换第一个匹配到的key。
HOSPITAL_ALIAS = { HOSPITAL_ALIAS = {
"沐阳": ["沭阳"], '沐阳': ['沭阳'],
"连水": ["涟水"], '连水': ['涟水'],
"唯宁": ["睢宁"], '唯宁': ['睢宁'], # 雕宁
"九〇四": ["904"], '九〇四': ['904'],
"漂水": ["溧水"], '漂水': ['溧水'],
} }
DEPARTMENT_ALIAS = { DEPARTMENT_ALIAS = {
"耳鼻喉": ["耳鼻咽喉"], '耳鼻喉': ['耳鼻咽喉'],
"急症": ["急诊"], '急症': ['急诊'],
} }
""" '''
搜索过滤配置 搜索过滤配置
""" '''
# 默认会过滤单字 # 默认会过滤单字
HOSPITAL_FILTER = ["医院", "人民", "第一", "第二", "第三", "大学", "附属"] HOSPITAL_FILTER = ['医院', '人民', '第一', '第二', '第三', '大学', '附属']
DEPARTMENT_FILTER = ["", "", "西", ""] DEPARTMENT_FILTER = ['', '', '西', '']
""" '''
分词配置 分词配置
""" '''
jieba.suggest_freq(("肿瘤", "医院"), True) jieba.suggest_freq(('肿瘤', '医院'), True)
jieba.suggest_freq(("", ""), True) jieba.suggest_freq(('', ''), True)
jieba.suggest_freq(("感染", ""), True) jieba.suggest_freq(('感染', ''), True)
jieba.suggest_freq(("", ""), True) jieba.suggest_freq(('', ''), True)
jieba.suggest_freq(("", ""), True) jieba.suggest_freq(('', ''), True)
""" '''
模型配置 模型配置
""" '''
SETTLEMENT_IE = Taskflow("information_extraction", schema=SETTLEMENT_LIST_SCHEMA, model="uie-x-base", SETTLEMENT_IE = Taskflow('information_extraction', schema=SETTLEMENT_LIST_SCHEMA, model='uie-x-base',
task_path="model/settlement_list_model", layout_analysis=LAYOUT_ANALYSIS, precision='fp16') task_path='model/settlement_list_model', layout_analysis=LAYOUT_ANALYSIS, precision='fp16')
DISCHARGE_IE = Taskflow("information_extraction", schema=DISCHARGE_RECORD_SCHEMA, model="uie-x-base", DISCHARGE_IE = Taskflow('information_extraction', schema=DISCHARGE_RECORD_SCHEMA, model='uie-x-base',
task_path="model/discharge_record_model", layout_analysis=LAYOUT_ANALYSIS, precision='fp16') task_path='model/discharge_record_model', layout_analysis=LAYOUT_ANALYSIS, precision='fp16')
COST_IE = Taskflow("information_extraction", schema=COST_LIST_SCHEMA, model="uie-x-base", device_id=1, COST_IE = Taskflow('information_extraction', schema=COST_LIST_SCHEMA, model='uie-x-base', device_id=1,
task_path="model/cost_list_model", layout_analysis=LAYOUT_ANALYSIS, precision='fp16') task_path='model/cost_list_model', layout_analysis=LAYOUT_ANALYSIS, precision='fp16')
OCR = PaddleOCR(use_angle_cls=False, show_log=False, gpu_id=1, det_db_box_thresh=0.3) OCR = PaddleOCR(use_angle_cls=False, show_log=False, gpu_id=1, det_db_box_thresh=0.3)

View File

@@ -7,22 +7,23 @@ from collections import defaultdict
from time import sleep from time import sleep
import cv2 import cv2
import fitz
import jieba import jieba
import numpy as np
import requests import requests
import zxingcpp
from rapidfuzz import process, fuzz from rapidfuzz import process, fuzz
from sqlalchemy import update from sqlalchemy import update
from db import MysqlSession from db import MysqlSession
from db.mysql import BdYljg, BdYlks, ZxIeResult, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec from db.mysql import BdYljg, BdYlks, ZxIeResult, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec
from doc_dewarp import dewarp
from log import HOSTNAME from log import HOSTNAME
from paddle_detection import detector
from photo_review import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, PERSONAL_CASH_PAYMENT, \ from photo_review import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, PERSONAL_CASH_PAYMENT, \
PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, DEPARTMENT, DOCTOR, \ PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, DEPARTMENT, DOCTOR, \
ADMISSION_ID, SETTLEMENT_ID, AGE, OCR, SETTLEMENT_IE, DISCHARGE_IE, COST_IE, PHHD_BATCH_SIZE, SLEEP_MINUTES, \ ADMISSION_ID, SETTLEMENT_ID, AGE, OCR, SETTLEMENT_IE, DISCHARGE_IE, COST_IE, PHHD_BATCH_SIZE, SLEEP_MINUTES, \
UPPERCASE_MEDICAL_EXPENSES, HOSPITAL_ALIAS, HOSPITAL_FILTER, DEPARTMENT_ALIAS, DEPARTMENT_FILTER UPPERCASE_MEDICAL_EXPENSES, HOSPITAL_ALIAS, HOSPITAL_FILTER, DEPARTMENT_ALIAS, DEPARTMENT_FILTER
from ucloud import ufile from ucloud import ufile
from util import image_util, util from util import image_util, util, html_util
from util.data_util import handle_date, handle_decimal, parse_department, handle_name, \ from util.data_util import handle_date, handle_decimal, parse_department, handle_name, \
handle_insurance_type, handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, \ handle_insurance_type, handle_original_data, handle_hospital, handle_department, handle_id, handle_age, parse_money, \
parse_hospital parse_hospital
@@ -73,6 +74,47 @@ def request_ie_result(task_enum, phrecs):
raise Exception(f"请求信息抽取结果失败,状态码:{response.status_code}") raise Exception(f"请求信息抽取结果失败,状态码:{response.status_code}")
# Try to obtain a high-resolution image through a QR code found in the picture
def get_better_image_from_qrcode(image, dpi=150):
    """Render the e-invoice PDF referenced by a QR code in ``image``.

    Every barcode returned by ``zxingcpp.read_barcodes`` is inspected in order.
    When its text starts with the ``einvoice.jsczt.cn`` base URL, the linked
    page is queried for its ``idBase`` value, the matching PDF is downloaded to
    a temporary file, and the first page is rasterized.

    :param image: image handed unchanged to ``zxingcpp.read_barcodes``.
    :param dpi: target render resolution; the zoom factor is ``dpi / 72``.
    :return: ``(img, text)`` where ``img`` is the rendered first page after a
        ``cv2.COLOR_RGB2BGR`` conversion and ``text`` is ``page.get_text()``;
        ``(None, None)`` when no barcode produced a page.

    Any exception raised while handling one barcode is logged to the
    ``'error'`` logger and the next barcode is tried.
    """
    js_base_url = 'http://einvoice.jsczt.cn'
    default_dpi = 72
    for result in zxingcpp.read_barcodes(image):
        pdf = None
        pdf_path = None
        try:
            url = result.text
            if not url.startswith(js_base_url):
                logging.getLogger('qr').info(f'未知二维码内容:{url}')
                continue

            id_base = html_util.get_jsczt_id_base(url)
            if not id_base:
                # Without an idBase the download URL would be built as
                # "idBase=None" and only fail after the retry loop.
                logging.getLogger('qr').info(f'未能从二维码页面获取idBase:{url}')
                continue

            pdf_url = f'{js_base_url}/download?idBase={id_base}'
            pdf_path = html_util.download_pdf(pdf_url)
            pdf = fitz.open(pdf_path)
            # Only the first page is used.
            page = pdf[0]
            # Scale both axes by the same factor to reach the requested DPI.
            zoom = dpi / default_dpi
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            # Reinterpret the raw samples as a (height, width, channels) array
            # and swap the channel order for OpenCV.
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, -1))
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            return img, page.get_text()
        except Exception as e:
            logging.getLogger('error').error('从二维码中获取高清图片时出错', exc_info=e)
        finally:
            # Compare against None explicitly so the document is closed
            # regardless of how its truthiness is defined.
            if pdf is not None:
                pdf.close()
            if pdf_path:
                util.delete_temp_file(pdf_path)

    return None, None
# 关键信息提取 # 关键信息提取
def information_extraction(ie, phrecs, identity): def information_extraction(ie, phrecs, identity):
result = {} result = {}
@@ -83,60 +125,88 @@ def information_extraction(ie, phrecs, identity):
image = image_util.read(img_path) image = image_util.read(img_path)
target_images = [] # 尝试从二维码中获取高清图片
target_images += detector.request_book_areas(image) # 识别文档区域并裁剪 better_image, text = get_better_image_from_qrcode(image)
if not target_images:
target_images.append(image) # 识别失败
angle_count = defaultdict(int, {"0": 0}) # 分割后图片的最优角度统计
for target_image in target_images:
dewarped_image = dewarp.dewarp_image(target_image) # 去扭曲
angles = image_util.parse_rotation_angles(dewarped_image)
zx_ie_results = [] zx_ie_results = []
split_results = image_util.split(dewarped_image) if better_image is not None:
for split_result in split_results: img_angle = '0'
if split_result["img"] is None or split_result["img"].size == 0: image = better_image
continue if text:
rotated_img = image_util.rotate(split_result["img"], int(angles[0])) info_extract = ie(text)[0]
ie_results = [{"result": ie_temp_image(ie, OCR, rotated_img), "angle": angles[0]}] else:
if not ie_results[0]["result"] or len(ie_results[0]["result"]) < len(ie.kwargs.get("schema")): info_extract = ie_temp_image(ie, OCR, image)
rotated_img = image_util.rotate(split_result["img"], int(angles[1])) ie_result = {'result': info_extract, 'angle': '0'}
ie_results.append({"result": ie_temp_image(ie, OCR, rotated_img), "angle": angles[1]})
now = util.get_default_datetime() now = util.get_default_datetime()
best_angle = ["0", 0] if not ie_result['result']:
for ie_result in ie_results:
if not ie_result["result"]:
continue continue
result_json = json.dumps(ie_result["result"], ensure_ascii=False) result_json = json.dumps(ie_result['result'], ensure_ascii=False)
if len(result_json) > 5000: if len(result_json) > 5000:
result_json = result_json[:5000] result_json = result_json[:5000]
zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity, zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity,
cfjaddress=phrec.cfjaddress, content=result_json, cfjaddress=phrec.cfjaddress, content=result_json,
rotation_angle=int(ie_result["angle"]), rotation_angle=int(ie_result['angle']),
x_offset=split_result["x_offset"], x_offset=0, y_offset=0, create_time=now,
y_offset=split_result["y_offset"], create_time=now,
creator=HOSTNAME, update_time=now, updater=HOSTNAME)) creator=HOSTNAME, update_time=now, updater=HOSTNAME))
result = merge_result(result, ie_result["result"]) result = merge_result(result, ie_result['result'])
else:
target_images = []
# target_images += detector.request_book_areas(image) # 识别文档区域并裁剪
if not target_images:
target_images.append(image) # 识别失败
angle_count = defaultdict(int, {'0': 0}) # 分割后图片的最优角度统计
for target_image in target_images:
# dewarped_image = dewarp.dewarp_image(target_image) # 去扭曲
dewarped_image = target_image
angles = image_util.parse_rotation_angles(dewarped_image)
if len(ie_result["result"]) > best_angle[1]: split_results = image_util.split(dewarped_image)
best_angle = [ie_result["angle"], len(ie_result["result"])] for split_result in split_results:
if split_result['img'] is None or split_result['img'].size == 0:
continue
rotated_img = image_util.rotate(split_result['img'], int(angles[0]))
ie_results = [{'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[0]}]
if not ie_results[0]['result'] or len(ie_results[0]['result']) < len(ie.kwargs.get('schema')):
rotated_img = image_util.rotate(split_result['img'], int(angles[1]))
ie_results.append({'result': ie_temp_image(ie, OCR, rotated_img), 'angle': angles[1]})
now = util.get_default_datetime()
best_angle = ['0', 0]
for ie_result in ie_results:
if not ie_result['result']:
continue
result_json = json.dumps(ie_result['result'], ensure_ascii=False)
if len(result_json) > 5000:
result_json = result_json[:5000]
zx_ie_results.append(ZxIeResult(pk_phhd=phrec.pk_phhd, pk_phrec=phrec.pk_phrec, id=identity,
cfjaddress=phrec.cfjaddress, content=result_json,
rotation_angle=int(ie_result['angle']),
x_offset=split_result['x_offset'],
y_offset=split_result['y_offset'], create_time=now,
creator=HOSTNAME, update_time=now, updater=HOSTNAME))
result = merge_result(result, ie_result['result'])
if len(ie_result['result']) > best_angle[1]:
best_angle = [ie_result['angle'], len(ie_result['result'])]
angle_count[best_angle[0]] += 1 angle_count[best_angle[0]] += 1
img_angle = max(angle_count, key=angle_count.get) img_angle = max(angle_count, key=angle_count.get)
if img_angle != "0":
if img_angle != '0' or better_image is not None:
image = image_util.rotate(image, int(img_angle)) image = image_util.rotate(image, int(img_angle))
with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file: with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
cv2.imwrite(temp_file.name, image) cv2.imwrite(temp_file.name, image)
try: try:
ufile.upload_file(phrec.cfjaddress, temp_file.name) ufile.upload_file(phrec.cfjaddress, temp_file.name)
if img_angle != '0':
# 修正旋转角度 # 修正旋转角度
for zx_ie_result in zx_ie_results: for zx_ie_result in zx_ie_results:
zx_ie_result.rotation_angle -= int(img_angle) zx_ie_result.rotation_angle -= int(img_angle)
except Exception as e: except Exception as e:
logging.error(f"上传图片({phrec.cfjaddress})失败", exc_info=e) logging.error(f'上传图片({phrec.cfjaddress})失败', exc_info=e)
finally: finally:
util.delete_temp_file(temp_file.name) util.delete_temp_file(temp_file.name)

View File

@@ -13,3 +13,4 @@ sqlacodegen==2.3.0.post1
sqlalchemy==1.4.52 sqlalchemy==1.4.52
tenacity==8.5.0 tenacity==8.5.0
ufile==3.2.9 ufile==3.2.9
zxing-cpp==2.2.0

43
util/html_util.py Normal file
View File

@@ -0,0 +1,43 @@
import logging
import tempfile
import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_random
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
       after=lambda x: logging.warning('获取江苏省财政票据idBase失败'))
def get_jsczt_id_base(url, timeout=30):
    """Fetch ``url`` and return the value of its ``idBase`` input field.

    The call is wrapped by ``tenacity.retry``: up to three attempts with a
    random 1-3 second wait, re-raising the last exception (``reraise=True``).

    :param url: page address to request.
    :param timeout: seconds passed to ``requests.get`` so a stalled server
        raises instead of blocking the worker forever.
    :return: the ``value`` attribute of the first ``<input name="idBase">``
        element, or ``None`` when the page has no such element.
    :raises Exception: when the response status code is not 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f'请求江苏省财政票据失败!状态码: {response.status_code}')

    soup = BeautifulSoup(response.text, 'html.parser')
    hidden_input = soup.find('input', {'name': "idBase"})
    if not hidden_input:
        return None
    # The identifier is carried in the input's value attribute.
    return hidden_input.get('value')
@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
       after=lambda x: logging.warning('下载pdf失败'))
def download_pdf(url, local_filename=None, timeout=30):
    """Download ``url`` to a local file and return that file's path.

    The call is wrapped by ``tenacity.retry``: up to three attempts with a
    random 1-3 second wait, re-raising the last exception (``reraise=True``).

    :param url: address of the PDF to download.
    :param local_filename: destination path. When ``None`` a temporary
        ``.pdf`` file is created (``delete=False``) and its path is used.
    :param timeout: seconds passed to ``requests.get`` so a stalled server
        raises instead of blocking the worker forever.
    :return: the path the content was written to.
    :raises Exception: when the response status code is not 200.
    """
    import os  # local import: only needed for cleanup of our own temp file

    created_temp = local_filename is None
    if created_temp:
        # Reserve a unique path; the file is reopened for writing below.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
            local_filename = temp_pdf.name

    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                raise Exception(f'下载pdf失败状态码: {response.status_code}')
            with open(local_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # filter out keep-alive new chunks
                        file.write(chunk)
    except Exception:
        # The caller never learns the temp path when we raise, so remove it
        # here; otherwise each failed attempt would leave a file behind.
        if created_temp:
            try:
                os.remove(local_filename)
            except OSError:
                pass
        raise
    return local_filename