201 lines
5.2 KiB
Python
201 lines
5.2 KiB
Python
import logging
|
||
import re
|
||
from datetime import datetime
|
||
|
||
from util import common_util
|
||
|
||
|
||
def handle_decimal(string):
    """Normalise a monetary amount string.

    Every character other than ASCII digits and '.' is dropped. When the
    cleaned text has no decimal point and more than two digits, a point is
    inserted before the last two digits. When it has one or more points, only
    the last one is kept and the fractional part is cut to two digits.

    :param string: raw amount text; may be empty or None.
    :return: the cleaned amount limited to 16 characters, or '' when nothing
        usable remains.
    """
    if not string:
        return ''

    cleaned = re.sub(r'[^0-9.]', '', string)
    if not cleaned:
        return ''

    if '.' in cleaned:
        # Only the right-most point counts as the decimal separator; earlier
        # ones are removed from the integer part.
        integer_part, _, fraction = cleaned.rpartition('.')
        integer_part = integer_part.replace('.', '')
        suffix = '.' + fraction[:2] if fraction else ''
        amount = integer_part + suffix
    elif len(cleaned) > 2:
        # No separator at all: treat the last two digits as the fraction.
        amount = cleaned[:-2] + '.' + cleaned[-2:]
    else:
        amount = cleaned

    return amount[:16]
|
||
|
||
|
||
def parse_money(capital_num, num):
    """Pick the amount source and return it with its parsed value.

    The capitalised (Chinese) amount is tried first through
    ``common_util.chinese_money_to_number``. If it is empty, or the conversion
    raises, the numeric text is used instead and cleaned by
    ``handle_decimal``.

    :param capital_num: amount written in Chinese capital numerals; may be
        empty.
    :param num: amount written with digits.
    :return: a tuple ``(source_text, parsed_amount)``.
    """
    if capital_num:
        try:
            amount = common_util.chinese_money_to_number(capital_num)
        except Exception as err:
            # Best effort: log the failure and fall back to the numeric text.
            logging.warning('大写金额解析失败', exc_info=err)
        else:
            return capital_num, amount

    return num, handle_decimal(num)
|
||
|
||
|
||
def handle_date(string):
    """Normalise a date string to ``YYYY-MM-DD``.

    The separators 年, 月, '/' and '.' are mapped to '-', 日 is dropped, and
    every remaining character other than digits and '-' is removed. Dashed
    input is cut down to three components with a day of at most two
    characters; undashed input is cut to eight digits. The result is then
    parsed as yyyy-MM-dd, yy-MM-dd, yyyyMMdd or yyMMdd, in that order.

    :param string: raw date text; may be empty or None.
    :return: the date formatted as ``%Y-%m-%d``, or '' when the text cannot
        be parsed or the year is not strictly between 2000 and 2100.
    """
    if not string:
        return ''

    text = string
    for old, new in (('年', '-'), ('月', '-'), ('日', ''), ('/', '-'), ('.', '-')):
        text = text.replace(old, new)
    text = re.sub(r'[^0-9-]', '', text).strip('-')

    if '-' not in text:
        # Plain digit run: keep at most yyyyMMdd.
        text = text[:8]
    else:
        if text.count('-') > 2:
            # Discard everything from the third dash onwards.
            text = '-'.join(text.split('-')[:3])
        day = text.rsplit('-', 1)[1]
        if len(day) > 2:
            # Trailing digits (e.g. a time glued to the day) are cut off so
            # the day keeps only its first two characters.
            text = text[:2 - len(day)]

    if len(text) < 6:
        return ''

    # Candidate layouts: yyyy-MM-dd, yy-MM-dd, yyyyMMdd, yyMMdd.
    for fmt in ('%Y-%m-%d', '%y-%m-%d', '%Y%m%d', '%y%m%d'):
        try:
            parsed = datetime.strptime(text, fmt)
        except ValueError:
            continue
        # Restrict the accepted year range (both bounds are exclusive).
        if 2000 < parsed.year < 2100:
            return parsed.strftime('%Y-%m-%d')

    return ''
|
||
|
||
|
||
def handle_hospital(string):
    """Return the hospital name limited to 255 characters.

    :param string: raw hospital name; may be empty or None.
    :return: the first 255 characters of ``string``, or '' for falsy input.
    """
    return string[:255] if string else ''
|
||
|
||
|
||
def handle_department(string):
    """Return the department name limited to 255 characters.

    :param string: raw department name; may be empty or None.
    :return: the first 255 characters of ``string``, or '' for falsy input.
    """
    return string[:255] if string else ''
|
||
|
||
|
||
def parse_department(string):
    """Split a raw department string into a set of department keywords.

    Steps: closing brackets are removed and opening brackets become
    separators (ASCII and full-width forms alike); everything except CJK
    characters and spaces is dropped; Chinese numerals and the filler words
    病区 / 病 are removed; finally 科 is treated as a separator.

    :param string: raw department text; may be empty or None.
    :return: a set of non-empty keywords. An empty set is returned for falsy
        input, so the return type is always ``set``.
    """
    if not string:
        return set()

    # Brackets: handle both the ASCII and the full-width variants. The
    # full-width forms are written as escapes so they survive re-encoding.
    for closing in (')', '\uff09'):
        string = string.replace(closing, '')
    for opening in ('(', '\uff08'):
        string = string.replace(opening, ' ')

    string = re.sub(r'[^⺀-鿿 ]', '', string)  # keep CJK characters and spaces only
    string = re.sub(r'[一二三四五六七八九十]', '', string)  # drop Chinese numerals
    string = string.replace('病区', '').replace('病', '')  # drop common filler words
    string = string.replace('科', ' ')  # 科 separates departments

    return {name for name in string.split(' ') if name}
|
||
|
||
|
||
def handle_name(string):
    """Clean a person name.

    Only CJK characters and the middle dot '·' are kept; everything else is
    removed.

    :param string: raw name text; may be empty or None.
    :return: the cleaned name limited to 30 characters, or '' for falsy
        input.
    """
    if string:
        cleaned = re.sub(r'[^⺀-鿿·]', '', string)
        return cleaned[:30]
    return ''
|
||
|
||
|
||
def handle_insurance_type(string):
    """Map free-text insurance information to a fixed category label.

    Categories are checked in order and the first one with a keyword that
    occurs in ``string`` wins.

    :param string: raw insurance type text; may be empty or None.
    :return: '' for falsy input; otherwise one of '职工医保', '居民医保',
        '农民工医保', '无医保', or '其他' when no keyword matches.
    """
    if not string:
        return ''

    # (label, keywords) pairs; the order defines the matching priority.
    categories = (
        ('职工医保', ('社保', '城保', '职', '退休')),
        ('居民医保', ('农保', '居民')),
        ('农民工医保', ('农民工',)),
        ('无医保', ('自费', '全费')),
    )
    for label, keywords in categories:
        for keyword in keywords:
            if keyword in string:
                return label

    return '其他'
|
||
|
||
|
||
def handle_original_data(string):
    """Return the raw value limited to 255 characters.

    The limit keeps over-long values from failing when stored in the
    database.

    :param string: raw text; may be empty or None.
    :return: the first 255 characters of ``string``, or '' for falsy input.
    """
    return string[:255] if string else ''
|
||
|
||
|
||
def handle_id(string):
    """Return an identifier limited to 50 characters.

    The limit keeps over-long values from failing when stored in the
    database.

    :param string: raw identifier text; may be empty or None.
    :return: the first 50 characters of ``string``, or '' for falsy input.
    """
    return string[:50] if string else ''
|
||
|
||
|
||
def handle_age(string):
    """Extract the age digits from a raw age string.

    Only the text before the first 岁 is considered; its digits are
    concatenated and the last three are returned.

    :param string: raw age text; may be empty or None.
    :return: up to three digits as a string, or '' when there are none.
    """
    if not string:
        return ''

    years_part = string.partition('岁')[0]
    digits = ''.join(re.findall(r'\d', years_part))
    return digits[-3:]
|
||
|
||
|
||
def parse_hospital(string):
    """Split a raw hospital string into candidate hospital names.

    The text is converted with ``common_util.traditional_to_simple_chinese``,
    closing brackets are removed, opening brackets become separators (ASCII
    and full-width forms alike), 有限公司 is dropped, and a separator is
    inserted after every 医院.

    :param string: raw hospital text; may be empty or None.
    :return: a list of non-empty name fragments; ``[]`` for falsy input.
    """
    if not string:
        return []

    text = common_util.traditional_to_simple_chinese(string)

    # Brackets: handle both the ASCII and the full-width variants. The
    # full-width forms are written as escapes so they survive re-encoding.
    for closing in (')', '\uff09'):
        text = text.replace(closing, '')
    for opening in ('(', '\uff08'):
        text = text.replace(opening, ' ')

    text = text.replace('有限公司', '')
    text = text.replace('医院', '医院 ')

    # An opening bracket right after 医院 yields consecutive spaces; drop the
    # empty fragments they would otherwise produce.
    return [part for part in text.split(' ') if part]
|
||
|
||
|
||
def parse_page_num(page_list):
    """Extract the page number of each page and the overall page count.

    For every page the ``'text'`` values of its items are concatenated and
    all digit runs are collected. The numerically smallest run is taken as
    that page's number and the numerically largest as its total; the overall
    total is the largest total across pages.

    Numbers are compared by integer value (not as strings, where '10' sorts
    before '2') but are returned as the matched digit strings.

    :param page_list: iterable of pages, each an iterable of dict-like items
        read with ``.get('text', '')``.
    :return: ``(pages, total)`` where ``pages`` has one entry per page (the
        digit string, or None when the page contains no digits) and
        ``total`` is a digit string or None when no page contains digits.
        ``(None, None)`` for falsy input.
    """
    if not page_list:
        return None, None

    pages = []
    totals = []
    for page in page_list:
        text = ''.join(item.get('text', '') for item in page)
        numbers = re.findall(r'\d+', text)
        if not numbers:
            # Keep ``pages`` aligned with ``page_list`` instead of failing on
            # min()/max() of an empty list.
            pages.append(None)
            continue
        pages.append(min(numbers, key=int))
        totals.append(max(numbers, key=int))

    total = max(totals, key=int) if totals else None
    return pages, total
|
||
|
||
|
||
def handle_tiny_int(num):
    """Cap a number at 127.

    :param num: numeric value; may be None or 0.
    :return: None for falsy input (including 0), ``num`` when it is at most
        127, otherwise 127.
    """
    if not num:
        return None
    if num <= 127:
        return num
    return 127
|