From 9556da47e9e4520e8064bb701fc24f6375d5d89d Mon Sep 17 00:00:00 2001 From: liuyebo <1515783401@qq.com> Date: Fri, 18 Oct 2024 10:05:38 +0800 Subject: [PATCH] =?UTF-8?q?=E8=BF=87=E6=BB=A4=E9=A1=B5=E7=A0=81=E5=BC=82?= =?UTF-8?q?=E5=B8=B8=E7=9A=84=E5=80=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- util/data_util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/util/data_util.py b/util/data_util.py index 65d7705..69fb417 100644 --- a/util/data_util.py +++ b/util/data_util.py @@ -195,6 +195,8 @@ def parse_page_num(page_list): page_texts = [p.get('text', '') for p in page] join = ''.join(page_texts) numbers = re.findall(r'\d+', join) + # 过滤异常值 + numbers = [num for num in numbers if int(num) <= 30] if not numbers: continue pages.append(min(numbers))