diff --git a/util/data_util.py b/util/data_util.py index 65d7705..69fb417 100644 --- a/util/data_util.py +++ b/util/data_util.py @@ -195,6 +195,8 @@ def parse_page_num(page_list): page_texts = [p.get('text', '') for p in page] join = ''.join(page_texts) numbers = re.findall(r'\d+', join) + # 过滤异常值 + numbers = [num for num in numbers if int(num) <= 30] if not numbers: continue pages.append(min(numbers))