过滤页码异常的值
This commit is contained in:
@@ -195,6 +195,8 @@ def parse_page_num(page_list):
|
|||||||
page_texts = [p.get('text', '') for p in page]
|
page_texts = [p.get('text', '') for p in page]
|
||||||
join = ''.join(page_texts)
|
join = ''.join(page_texts)
|
||||||
numbers = re.findall(r'\d+', join)
|
numbers = re.findall(r'\d+', join)
|
||||||
|
        # Filter out abnormal values: keep only numbers that are at most 30.
|
||||||
|
numbers = [num for num in numbers if int(num) <= 30]
|
||||||
if not numbers:
|
if not numbers:
|
||||||
continue
|
continue
|
||||||
pages.append(min(numbers))
|
pages.append(min(numbers))
|
||||||
|
|||||||
Reference in New Issue
Block a user