"""Helpers that clean and normalize extracted text fields.

Each ``handle_*`` function takes a raw string and returns a cleaned value
(an empty string when the input is empty or cannot be interpreted); the
``parse_*`` functions split a raw value into structured pieces.
"""
import logging
import re
from datetime import datetime

from util import common_util

# Closing parentheses are dropped and opening parentheses become a space,
# for both ASCII and full-width (U+FF08 / U+FF09) forms.
_PAREN_TABLE = str.maketrans({
    ')': None,
    '\uff09': None,
    '(': ' ',
    '\uff08': ' ',
})

# Date layouts tried in order: yyyy-MM-dd, yy-MM-dd, yyyyMMdd, yyMMdd.
_DATE_FORMATS = ('%Y-%m-%d', '%y-%m-%d', '%Y%m%d', '%y%m%d')

# (keywords, label) pairs checked in order; the first match wins.
_INSURANCE_RULES = (
    (('社保', '城保', '职', '退休'), '职工医保'),
    (('农保', '居民'), '居民医保'),
    (('农民工',), '农民工医保'),
    (('自费', '全费'), '无医保'),
)


# Money fields
def handle_decimal(string):
    """Normalize a raw money string to a decimal string of at most 16 chars.

    Everything except digits and dots is removed. If nothing is left, the
    original text is handed to ``common_util.chinese_money_to_number``
    (presumably a parser for amounts written in Chinese capital numerals --
    confirm against ``common_util``) and its result is returned as-is.

    Args:
        string: Raw text of the amount.

    Returns:
        ``''`` for empty input or when the fallback parser raises; otherwise
        the cleaned amount (or whatever the fallback parser returned).
    """
    if not string:
        return ''
    digits = re.sub(r'[^0-9.]', '', string)
    if not digits:
        # No digits or dots at all: try the Chinese-numeral parser on the
        # untouched input.
        try:
            return common_util.chinese_money_to_number(string)
        except Exception as e:
            # The helper's exception types are not visible here, so the
            # broad catch is kept; a parse failure degrades to ''.
            logging.warning('大写金额解析失败', exc_info=e)
            return ''
    if '.' not in digits:
        # Without a decimal point, the last two digits of a 3+ digit value
        # are treated as the fractional part.
        if len(digits) > 2:
            result = digits[:-2] + '.' + digits[-2:]
        else:
            result = digits
    else:
        # Only the last dot counts as the decimal point; earlier dots are
        # dropped and the fraction is cut to two digits.
        front, back = digits.rsplit('.', 1)
        front = front.replace('.', '')
        if back:
            back = '.' + back[:2]
        result = front + back
    return result[:16]


def parse_money(capital_num, num):
    """Pick the amount from a capital-numeral text or a numeric text.

    Args:
        capital_num: Amount text passed to
            ``common_util.chinese_money_to_number`` when non-empty.
        num: Numeric amount text, used as the fallback.

    Returns:
        ``(capital_num, parsed_value)`` when ``capital_num`` is non-empty and
        parses without raising; otherwise ``(num, handle_decimal(num))``.
    """
    if capital_num:
        try:
            money = common_util.chinese_money_to_number(capital_num)
            return capital_num, money
        except Exception as e:
            logging.warning('大写金额解析失败', exc_info=e)
    return num, handle_decimal(num)


# Date fields
def handle_date(string):
    """Normalize a raw date string to ``YYYY-MM-DD``.

    ``年``, ``月``, ``/`` and ``.`` are treated as separators, ``日`` is
    dropped, and every other non-digit character is removed.

    Args:
        string: Raw text of the date.

    Returns:
        The formatted date, or ``''`` when the input is empty, too short,
        matches none of the supported layouts, or its year is outside
        2001-2099.
    """
    if not string:
        return ''
    string = (string.replace('年', '-').replace('月', '-').replace('日', '')
              .replace('/', '-').replace('.', '-'))
    string = re.sub(r'[^0-9-]', '', string)
    string = string.strip('-')
    if '-' in string:
        # Keep at most three dash-separated fields and cut the last one to
        # two digits, so trailing digits (e.g. a time glued onto the day)
        # are discarded.
        parts = string.split('-')[:3]
        if len(parts[-1]) > 2:
            parts[-1] = parts[-1][:2]
        string = '-'.join(parts)
    elif len(string) > 8:
        string = string[:8]
    if len(string) < 6:
        return ''
    # Try every supported layout until one yields a year in range.
    for fmt in _DATE_FORMATS:
        try:
            date = datetime.strptime(string, fmt)
        except ValueError:
            continue
        # Restrict the accepted year range.
        if 2000 < date.year < 2100:
            return date.strftime('%Y-%m-%d')
    return ''


def handle_hospital(string):
    """Return ``string`` cut to 255 characters, or ``''`` when it is empty."""
    if not string:
        return ''
    return string[:255]


def handle_department(string):
    """Return ``string`` cut to 255 characters, or ``''`` when it is empty."""
    if not string:
        return ''
    return string[:255]


def parse_department(string):
    """Split a raw department string into distinct department names.

    Args:
        string: Raw department text.

    Returns:
        An empty ``list`` when ``string`` is empty; otherwise a ``set`` of
        the non-empty name fragments. The mixed return type is kept for
        backward compatibility.
    """
    result = []
    if not string:
        return result
    # Strip parentheses (half- and full-width).
    string = string.translate(_PAREN_TABLE)
    # Drop every character outside the U+2E80-U+9FFF range except spaces.
    string = re.sub(r'[^⺀-鿿 ]', '', string)
    # Drop Chinese numerals.
    string = re.sub(r'[一二三四五六七八九十]', '', string)
    # Drop common filler words.
    string = string.replace('病区', '').replace('病', '')
    # '科' acts as the separator between departments.
    string = string.replace('科', ' ')
    return {name for name in string.strip().split(' ') if name}


# Name fields
def handle_name(string):
    """Keep only characters in U+2E80-U+9FFF and ``·``, cut to 30 chars."""
    if not string:
        return ''
    return re.sub(r'[^⺀-鿿·]', '', string)[:30]


# Insurance-type fields
def handle_insurance_type(string):
    """Map a raw insurance description onto one of the fixed labels.

    Keyword groups are checked in a fixed order and the first group with a
    keyword contained in ``string`` decides the label.

    Returns:
        ``''`` for empty input, the matching label, or ``'其他'`` when no
        keyword matches.
    """
    if not string:
        return ''
    for keywords, label in _INSURANCE_RULES:
        if any(key in string for key in keywords):
            return label
    return '其他'


# Raw data
def handle_original_data(string):
    """Return ``string`` cut to 255 characters, or ``''`` when it is empty."""
    if not string:
        return ''
    # Truncate so an over-long value does not fail the database insert.
    return string[:255]


# Id fields
def handle_id(string):
    """Return ``string`` cut to 50 characters, or ``''`` when it is empty."""
    if not string:
        return ''
    # Truncate so an over-long value does not fail the database insert.
    return string[:50]


# Age fields
def handle_age(string):
    """Extract the digits that precede the first ``岁``.

    Returns:
        ``''`` for empty input; otherwise the last three digits found in the
        text before the first ``岁`` (possibly an empty string).
    """
    if not string:
        return ''
    before_unit = string.split('岁')[0]
    return re.sub(r'\D', '', before_unit)[-3:]


# Hospital analysis
def parse_hospital(string):
    """Split a raw hospital string into candidate hospital names.

    The text is first passed through
    ``common_util.traditional_to_simple_chinese``. Parentheses (half- and
    full-width) and ``有限公司`` are removed, and a break is inserted after
    every ``医院``.

    Args:
        string: Raw hospital text.

    Returns:
        A list of non-empty name fragments; ``[]`` for empty input.
    """
    result = []
    if not string:
        return result
    string = common_util.traditional_to_simple_chinese(string)
    text = string.translate(_PAREN_TABLE).replace('有限公司', '')
    text = text.replace('医院', '医院 ')
    # Adjacent separators would otherwise yield empty-string "names".
    result += [name for name in text.strip().split(' ') if name]
    return result


def parse_page_num(page_list):
    """Derive page numbers and the page total from per-page text items.

    For every page, the ``'text'`` values of its items are concatenated and
    all digit runs are extracted. The numerically smallest run is recorded
    as that page's number and the numerically largest as its candidate
    total.

    Args:
        page_list: Iterable of pages; each page is an iterable of mappings
            read via ``.get('text', '')``.

    Returns:
        ``(pages, total)``, where ``pages`` lists the page-number strings of
        the pages that contained a number and ``total`` is the largest
        candidate total as an ``int``. ``(None, None)`` when ``page_list``
        is empty or no page contains a number.
    """
    if not page_list:
        return None, None
    pages = []
    totals = []
    for page in page_list:
        text = ''.join(item.get('text', '') for item in page)
        numbers = re.findall(r'\d+', text)
        if not numbers:
            continue
        # Compare numerically: as strings, '10' would sort before '9'.
        pages.append(min(numbers, key=int))
        totals.append(max(numbers, key=int))
    if not totals:
        return None, None
    return pages, max(int(value) for value in totals)


def handle_tiny_int(num):
    """Cap ``num`` at 127.

    Returns:
        ``None`` when ``num`` is falsy (including ``0``); otherwise ``num``
        limited to at most 127.
    """
    if not num:
        return None
    return num if num <= 127 else 127