Files
fcb_photo_review/util/data_util.py
2024-10-18 10:24:52 +08:00

211 lines
5.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import logging
import re
from datetime import datetime
from util import common_util
def handle_decimal(string):
    """Normalize a monetary amount to a plain decimal string.

    Every character except ASCII digits and ``.`` is discarded. When nothing
    is left, the untouched input is handed to
    ``common_util.chinese_money_to_number`` on the chance that it is an
    amount written in Chinese capital numerals.

    Returns:
        ``''`` for falsy input or when the capital-numeral fallback raises;
        whatever ``chinese_money_to_number`` returns when the fallback
        succeeds; otherwise a string with at most two fractional digits,
        truncated to 16 characters.
    """
    if not string:
        return ''
    digits = re.sub(r'[^0-9.]', '', string)
    if not digits:
        # No digits at all: possibly a capital-numeral amount, try that parser.
        try:
            return common_util.chinese_money_to_number(string)
        except Exception as e:
            logging.warning('大写金额解析失败', exc_info=e)
            return ''
    if '.' in digits:
        # Only the last dot is kept as the decimal point; earlier dots are
        # dropped and the fraction is cut to two digits.
        whole, _, fraction = digits.rpartition('.')
        whole = whole.replace('.', '')
        result = whole + ('.' + fraction[:2] if fraction else '')
    elif len(digits) > 2:
        # No decimal point: the last two digits are treated as the fraction.
        result = digits[:-2] + '.' + digits[-2:]
    else:
        result = digits
    return result[:16]
def parse_money(capital_num, num):
    """Pick the amount to trust between a capital-numeral and a numeric text.

    Args:
        capital_num: Amount written in Chinese capital numerals (may be falsy).
        num: Amount written with digits.

    Returns:
        ``(capital_num, parsed_value)`` when ``capital_num`` is truthy and
        ``common_util.chinese_money_to_number`` accepts it; otherwise
        ``(num, handle_decimal(num))``.
    """
    if capital_num:
        try:
            money = common_util.chinese_money_to_number(capital_num)
        except Exception as e:
            # Best effort: log and fall back to the numeric text below.
            logging.warning('大写金额解析失败', exc_info=e)
        else:
            return capital_num, money
    return num, handle_decimal(num)
def handle_date(string):
    """Normalize a free-form date string to ``YYYY-MM-DD``.

    Separators are unified to ``-``, everything that is not a digit or ``-``
    is removed, trailing noise after the day is cut off, and the remainder is
    tried against several year-month-day layouts.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``''`` when the input is
        falsy, shorter than six characters after cleaning, unparseable, or
        its year is not strictly between 2000 and 2100.
    """
    if not string:
        return ''
    # NOTE(review): the three CJK date-unit literals were blank in the
    # reviewed copy of this file (characters stripped, which made
    # ``replace('', '-')`` put a dash between every character). '年', '月' and
    # '日' are the presumed originals -- confirm against the repository.
    string = (
        string.replace('年', '-').replace('月', '-').replace('日', '')
        .replace('/', '-').replace('.', '-')
    )
    string = re.sub(r'[^0-9-]', '', string).strip('-')
    if '-' in string:
        if string.count('-') > 2:
            # Keep only year-month-day; drop everything from the third dash.
            string = '-'.join(string.split('-')[:3])
        day = string[string.rindex('-') + 1:]
        if len(day) > 2:
            # Digits glued after the day (e.g. a time) are cut off.
            string = string[:2 - len(day)]
    else:
        string = string[:8]
    if len(string) < 6:
        return ''
    # Candidate layouts, tried in order.
    formats = (
        '%Y-%m-%d',  # yyyy-MM-dd
        '%y-%m-%d',  # yy-MM-dd
        '%Y%m%d',    # yyyyMMdd
        '%y%m%d',    # yyMMdd
    )
    for fmt in formats:
        try:
            date = datetime.strptime(string, fmt)
        except ValueError:
            continue
        # Only accept years inside the expected range; otherwise try the
        # next layout.
        if 2000 < date.year < 2100:
            return date.strftime('%Y-%m-%d')
    return ''
def handle_hospital(string):
    """Return the hospital text cut to 255 characters (``''`` if falsy)."""
    return string[:255] if string else ''
def handle_department(string):
    """Return the department text cut to 255 characters (``''`` if falsy)."""
    return string[:255] if string else ''
def parse_department(string):
    """Split a department description into its distinct department names.

    Returns:
        An empty list for falsy input (kept as-is for existing callers);
        otherwise a set of the non-empty name fragments.
    """
    result = []
    if not string:
        return result
    # Closing brackets are dropped, opening brackets become separators.
    # NOTE(review): the second and fourth literals were blank in the reviewed
    # copy of this file (``replace('', ' ')`` then split every character
    # apart). Full-width brackets are the presumed originals -- confirm.
    string = string.replace(')', '').replace(')', '').replace('(', ' ').replace('(', ' ')
    # Keep only characters in the CJK range of the pattern, plus spaces.
    string = re.sub(r'[^⺀-鿿 ]', '', string)
    # Drop Chinese numerals.
    string = re.sub(r'[一二三四五六七八九十]', '', string)
    # Drop common filler words.
    # NOTE(review): a second filler word followed '病区' but its literal was
    # blank in the reviewed copy (a no-op); restore it from the repository.
    string = string.replace('病区', '')
    # Separate departments listed together.
    # NOTE(review): separator literal was blank in the reviewed copy; '、' is
    # the presumed original (it survives the character filter above) --
    # confirm.
    string = string.replace('、', ' ')
    return {department for department in string.strip().split(' ') if department}
def handle_name(string):
    """Clean a person's name.

    Removes every character that is neither in the CJK range of the pattern
    nor the ``·`` separator, then cuts the result to 30 characters.
    Returns ``''`` for falsy input.
    """
    if not string:
        return ''
    cleaned = re.sub(r'[^⺀-鿿·]', '', string)
    return cleaned[:30]
def handle_insurance_type(string):
    """Map free-form insurance text to one of the fixed category labels.

    Categories are checked in order and the first one with a keyword
    contained in ``string`` wins. Returns ``''`` for falsy input and
    ``'其他'`` when no keyword matches.
    """
    if not string:
        return ''
    # NOTE(review): the third employee keyword was blank in the reviewed copy
    # of this file; an empty keyword is contained in every string, so every
    # input was classified as '职工医保'. '职工' is the presumed original --
    # confirm against the repository.
    categories = (
        ('职工医保', ('社保', '城保', '职工', '退休')),
        ('居民医保', ('农保', '居民')),
        ('农民工医保', ('农民工',)),
        ('无医保', ('自费', '全费')),
    )
    for label, keys in categories:
        if any(key in string for key in keys):
            return label
    return '其他'
def handle_original_data(string):
    """Return the raw text cut to 255 characters (``''`` if falsy).

    The cut keeps over-long values from failing the database insert.
    """
    return string[:255] if string else ''
def handle_id(string):
    """Return the identifier text cut to 50 characters (``''`` if falsy).

    The cut keeps over-long values from failing the database insert.
    """
    return string[:50] if string else ''
def handle_age(string):
    """Extract the age digits from free-form text.

    Only the text before the first year marker is considered; non-digits are
    removed and the last three digits are returned. Returns ``''`` for falsy
    input or when no digit is found.
    """
    if not string:
        return ''
    # NOTE(review): the separator literal was blank in the reviewed copy of
    # this file (``split('')`` raises ValueError). '岁' is the presumed
    # original, so "3岁5月" yields the years only -- confirm.
    years_part = string.split('岁')[0]
    digits = re.sub(r'\D', '', years_part)
    return digits[-3:]
def parse_hospital(string):
    """Split a hospital description into candidate hospital names.

    The text is first passed through
    ``common_util.traditional_to_simple_chinese``; brackets and the
    ``有限公司`` suffix are removed, and a break is inserted after every
    ``医院`` so that names written back to back are separated.

    Returns:
        A list of non-empty name fragments (empty list for falsy input).
    """
    result = []
    if not string:
        return result
    string = common_util.traditional_to_simple_chinese(string)
    # Closing brackets are dropped, opening brackets become separators.
    # NOTE(review): the second and fourth literals were blank in the reviewed
    # copy of this file (``replace('', ' ')`` then split every character
    # apart). Full-width brackets are the presumed originals -- confirm.
    without_brackets = (
        string.replace(')', '').replace(')', '')
        .replace('(', ' ').replace('(', ' ')
    )
    without_company = without_brackets.replace('有限公司', '')
    separated = without_company.replace('医院', '医院 ')
    # Consecutive separators would otherwise yield empty names.
    result += [name for name in separated.strip().split(' ') if name]
    return result
def parse_page_num(page_list):
    """Extract per-page numbers and the overall page count.

    Args:
        page_list: Iterable of pages; each page is an iterable of mappings
            whose ``'text'`` values are concatenated before searching for
            numbers.

    Returns:
        ``(None, None)`` for falsy input. Otherwise ``(pages, total)`` where
        ``pages`` holds, for every page that contains a usable number, the
        numerically smallest one (as the matched digit string), and ``total``
        is the largest usable number seen on any page as an ``int``
        (``1`` when none was found).
    """
    if not page_list:
        return None, None
    pages = []
    totals = []
    for page in page_list:
        text = ''.join(item.get('text', '') for item in page)
        # Numbers above 30 are treated as outliers and ignored.
        numbers = [num for num in re.findall(r'\d+', text) if int(num) <= 30]
        if not numbers:
            continue
        # Compare numerically: as plain strings '10' sorts before '2'.
        pages.append(min(numbers, key=int))
        totals.append(max(numbers, key=int))
    return pages, max(map(int, totals), default=1)
def handle_tiny_int(num):
    """Cap a number at 127.

    Returns ``None`` for any falsy input (this includes ``0``); otherwise
    ``num`` itself when it is at most 127, else 127. Negative values are
    passed through unchanged.
    """
    if not num:
        return None
    if num <= 127:
        return num
    return 127