import logging
import re
from datetime import datetime

from util import util


# Money-type fields
def handle_decimal(string):
    """Normalise a raw amount string to digits with at most two decimals.

    Every character other than ``0-9`` and ``.`` is dropped.  When no
    decimal point is left and there are more than two digits, a point is
    inserted before the last two digits (presumably the separator was lost
    upstream -- confirm with the data source).  When several points are
    present only the last one is kept as the decimal separator.  The
    fractional part is cut to two digits and the result to 16 characters.

    Returns "" for empty input or input without any digit or point.
    """
    if not string:
        return ""
    string = re.sub(r'[^0-9.]', '', string)
    if not string:
        return ""
    if "." not in string:
        if len(string) > 2:
            result = string[:-2] + "." + string[-2:]
        else:
            result = string
    else:
        # The last point is the decimal separator; earlier ones are noise.
        front, back = string.rsplit('.', 1)
        front = front.replace(".", "")
        if back:
            back = "." + back[:2]
        result = front + back
    return result[:16]


def parse_money(capital_num, num):
    """Return ``(source_text, amount)`` for a money field.

    The capital-form amount ``capital_num`` is tried first through
    ``util.chinese_money_to_number``.  If it is empty or the conversion
    raises, the failure is logged and the numeric text ``num`` is cleaned
    with ``handle_decimal`` instead.
    """
    if capital_num:
        try:
            money = util.chinese_money_to_number(capital_num)
        except Exception as e:
            # Best effort: fall through to the numeric form below.
            logging.warning("大写金额解析失败", exc_info=e)
        else:
            return capital_num, money
    return num, handle_decimal(num)


# Date-type fields
def handle_date(string):
    """Normalise a raw date string to ``YYYY-MM-DD``.

    ``年``, ``月``, ``/`` and ``.`` are treated as separators and ``日`` is
    removed; any remaining character other than digits and ``-`` is
    dropped.  Dashed input is cut to three components and the last
    component to two digits; undashed input is cut to eight digits.
    The formats ``%Y-%m-%d``, ``%y-%m-%d``, ``%Y%m%d`` and ``%y%m%d`` are
    tried in that order.

    Returns "" when nothing parses or the year is not strictly between
    2000 and 2100.
    """
    if not string:
        return ""
    string = (string.replace("年", "-").replace("月", "-").replace("日", "")
              .replace("/", "-").replace(".", "-"))
    string = re.sub(r'[^0-9-]', '', string)
    string = string.strip("-")
    if "-" in string:
        # Keep only the first three dash-separated components.
        string = "-".join(string.split("-")[:3])
        day = string[string.rindex("-") + 1:]
        if len(day) > 2:
            # Trim the trailing component down to two digits.
            string = string[:2 - len(day)]
    elif len(string) > 8:
        string = string[:8]
    if len(string) < 6:
        return ""
    # Candidate date formats, tried in order.
    formats = [
        '%Y-%m-%d',  # yyyy-MM-dd
        '%y-%m-%d',  # yy-MM-dd
        '%Y%m%d',    # yyyyMMdd
        '%y%m%d',    # yyMMdd
    ]
    for fmt in formats:
        try:
            date = datetime.strptime(string, fmt)
        except ValueError:
            continue
        # Restrict the accepted year range.
        if 2000 < date.year < 2100:
            return date.strftime("%Y-%m-%d")
    return ""


def handle_hospital(string):
    """Keep only CJK characters and ASCII digits, capped at 255 characters."""
    if not string:
        return ""
    # Only Chinese characters and digits are allowed.
    string = re.sub(r'[^⺀-鿿0-9]', '', string)
    return string[:255]


def handle_department(string):
    """Keep only CJK characters, capped at 255 characters."""
    if not string:
        return ""
    # Only Chinese characters are allowed.
    string = re.sub(r'[^⺀-鿿]', '', string)
    return string[:255]


def parse_department(string):
    """Split a raw department string into a set of department keywords.

    Closing parentheses are removed and opening ones become separators.
    Non-CJK characters (except spaces), Chinese numerals ``一``..``十`` and
    the words ``病区`` / ``病`` are removed, and ``科`` acts as a separator.

    Always returns a ``set`` (empty for empty input).  The original
    returned a list for empty input and a set otherwise.  It also repeated
    the ASCII parenthesis replacement twice; the duplicate is taken to have
    been meant for the full-width forms, which are now handled as well.
    """
    if not string:
        return set()
    # Remove brackets (ASCII and full-width).
    string = (string.replace(")", "").replace("）", "")
              .replace("(", " ").replace("（", " "))
    # Remove non-Chinese characters, except spaces.
    string = re.sub(r'[^⺀-鿿 ]', '', string)
    # Remove Chinese numerals.
    string = re.sub(r'[一二三四五六七八九十]', '', string)
    # Remove common words that carry no meaning here.
    string = string.replace("病区", "").replace("病", "")
    # Separate departments.
    string = string.replace("科", " ")
    return {department for department in string.strip().split(" ") if department}


# Name-type fields
def handle_name(string):
    """Keep only CJK characters and ``·``, capped at 30 characters."""
    if not string:
        return ""
    return re.sub(r'[^⺀-鿿·]', '', string)[:30]


def handle_doctor(string):
    """Keep only CJK characters and ``·``, capped at 20 characters.

    Returns "无" for empty input.
    """
    if not string:
        return "无"
    return re.sub(r'[^⺀-鿿·]', '', string)[:20]


# Medical-insurance type fields
def handle_insurance_type(string):
    """Map a raw insurance description to one of the fixed category labels.

    Rules are checked in order and the first one with a keyword contained
    in ``string`` wins; "其他" is returned when none matches and "" for
    empty input.
    """
    if not string:
        return ""
    # (keywords, label) pairs; order defines precedence.
    rules = [
        (["社保", "城保", "职", "退休"], "职工医保"),
        (["农保", "居民"], "居民医保"),
        (["农民工"], "农民工医保"),
        (["自费", "全费"], "无医保"),
    ]
    for keys, label in rules:
        if any(key in string for key in keys):
            return label
    return "其他"


# Raw data
def handle_original_data(string):
    """Return the input capped at 255 characters ("" for empty input)."""
    if not string:
        return ""
    # Avoid database insert failures caused by over-long values.
    return string[:255]


# Id-type fields
def handle_id(string):
    """Keep only ASCII letters and digits, capped at 50 characters."""
    if not string:
        return ""
    # Only letters and digits are allowed.
    string = re.sub(r'[^0-9a-zA-Z]', '', string)
    # Avoid database insert failures caused by over-long values.
    return string[:50]


# Age-type fields
def handle_age(string):
    """Return the digits found before the first ``岁``, keeping the last three."""
    if not string:
        return ""
    string = string.split("岁")[0]
    num = re.sub(r'\D', '', string)
    return num[-3:]


# Hospital analysis
def parse_hospital(string):
    """Split a raw hospital string into a list of candidate hospital names.

    The text is passed through ``util.traditional_to_simple_chinese``.
    Closing parentheses are removed, opening ones become separators,
    ``有限公司`` is removed and a separator is inserted after every ``医院``.

    Empty fragments produced by adjacent separators are dropped (the
    original kept them as "" entries).  The original also repeated the
    ASCII parenthesis replacement twice; the duplicate is taken to have
    been meant for the full-width forms, which are now handled as well.
    Returns an empty list for empty input.
    """
    if not string:
        return []
    string = util.traditional_to_simple_chinese(string)
    string_without_brackets = (string.replace(")", "").replace("）", "")
                               .replace("(", " ").replace("（", " "))
    string_without_company = string_without_brackets.replace("有限公司", "")
    split_hospitals = string_without_company.replace("医院", "医院 ")
    return [hospital for hospital in split_hospitals.split(" ") if hospital]


def handle_text(string):
    """Return the input capped at 16383 characters ("" for empty input)."""
    if not string:
        return ""
    return string[:16383]