优化二维码解析

2024-09-12 13:56:49 +08:00
parent 7cd0a564a0
commit 6c14910841
3 changed files with 40 additions and 5 deletions
--- a/util/html_util.py
+++ b/util/html_util.py
@@ -1,5 +1,7 @@
 import logging
+import re
 import tempfile
+from urllib.parse import parse_qs, urlparse

 import requests
 from bs4 import BeautifulSoup
@@ -56,3 +58,29 @@ def get_wx_pdf_url(url):
        value = pdf_url.get('href')
        return value
    return None
+
+
+@retry(stop=stop_after_attempt(3), wait=wait_random(1, 3), reraise=True,
+       after=lambda x: logging.warning('获取泰州三院电子发票失败！'))
+def get_tz3y_pdf_url(url):
+    """Return the e-invoice PDF URL ('dzfpUrl') behind an invoice page URL, else None.
+
+    None is returned when the inline script URL prefix, the `fphm` query value or
+    'dzfpUrl' is missing; a non-200 status raises Exception (up to 3 attempts).
+    """
+    response = requests.get(url, timeout=30)  # bounded wait: never hang forever
+    if response.status_code != 200:
+        raise Exception(f'请求泰州三院电子发票失败！状态码: {response.status_code}')
+    script_tag = BeautifulSoup(response.text, 'html.parser').find('script', {'src': None})
+    # Tag.string may be None and parse_qs omits absent keys: treat both as "not found".
+    script_text = (script_tag.string if script_tag else None) or ''
+    url_match = re.search(r'var url="(.*?)"\+fphm;', script_text)
+    fphm_values = parse_qs(urlparse(url).query).get('fphm')
+    if not url_match or not fphm_values:
+        return None
+    # The page script builds the request URL as <prefix> + fphm; do the same here.
+    response = requests.get(url_match.group(1) + fphm_values[0], timeout=30)
+    if response.status_code != 200:
+        raise Exception(f'请求泰州三院电子发票失败！状态码: {response.status_code}')
+    # Non-greedy so the capture stops at the first closing quote after the URL.
+    pdf_match = re.search(r"'dzfpUrl':'(.*?)'", response.text)
+    return pdf_match.group(1) if pdf_match else None