"""Normalisation helpers for text fields (amounts, dates, names, ...).

Every ``handle_*`` function takes a raw string (or a falsy value such as
``None``) and returns a cleaned string; falsy input always yields ``""``.
"""

import re
from datetime import datetime

# Date layouts tried, in order, by handle_date.
_DATE_FORMATS = (
    "%Y-%m-%d",  # yyyy-MM-dd
    "%y-%m-%d",  # yy-MM-dd
    "%Y%m%d",  # yyyyMMdd
    "%y%m%d",  # yyMMdd
)

# Arabic digits and the Chinese numerals one..ten.
_NUMERAL_RE = re.compile(r"\d|一|二|三|四|五|六|七|八|九|十")

# A bracketed annotation: (...), [...], {...} or full-width parentheses.
# The full-width pair is spelled with \uXXXX escapes so it cannot be
# silently folded to ASCII "(" / ")" -- as a bare group "([^()]*)" the
# alternative would match (and delete) every run of ordinary text.
_BRACKETED_RE = re.compile(
    r"\([^()]*\)"
    r"|\[[^\[\]]*\]"
    r"|\{[^{}]*\}"
    r"|\uff08[^\uff08\uff09]*\uff09"
)

_FULLWIDTH_COLON = "\uff1a"


# Amount-like data
def handle_decimal(string):
    """Reduce *string* to a plain decimal number string.

    Everything except digits and dots is dropped.  Only the last dot is
    kept as the decimal point; earlier dots are removed.  The integer part
    is limited to its last 16 digits.  A trailing dot with no fraction
    digits is dropped.
    """
    if not string:
        return ""
    cleaned = re.sub(r"[^0-9.]", "", string)
    if "." in cleaned:
        front, back = cleaned.rsplit(".", 1)
        front = front.replace(".", "")
    else:
        front, back = cleaned, ""
    front = front[-16:]
    if back:
        back = "." + back
    return front + back


# Date-like data
def handle_date(string):
    """Parse a loosely formatted date and return it as ``YYYY-MM-DD``.

    Accepts ``年/月/日``, ``/``, ``.`` or ``-`` separated dates as well as
    compact ``yyyyMMdd`` / ``yyMMdd`` digit strings; trailing time digits
    are cut off.  Returns ``""`` when nothing parses or when the year is
    not strictly between 2000 and 2100.
    """
    if not string:
        return ""
    string = (
        string.replace("年", "-")
        .replace("月", "-")
        .replace("日", "")
        .replace("/", "-")
        .replace(".", "-")
    )
    string = re.sub(r"[^0-9-]", "", string).strip("-")

    if "-" in string:
        # Keep only the first three dash-separated fields.
        if string.count("-") > 2:
            string = "-".join(string.split("-")[:3])
        # Digits glued after the day (e.g. a time of day) are cut so the
        # day field is at most two characters long.
        day = string[string.rindex("-") + 1:]
        if len(day) > 2:
            string = string[: 2 - len(day)]
    else:
        # Compact form: at most yyyyMMdd.
        string = string[:8]

    if len(string) < 6:
        return ""

    for fmt in _DATE_FORMATS:
        try:
            parsed = datetime.strptime(string, fmt)
        except ValueError:
            continue
        # Restrict the accepted year range (both bounds exclusive).
        if 2000 < parsed.year < 2100:
            return parsed.strftime("%Y-%m-%d")
    return ""


def handle_hospital(string):
    """Return *string* truncated to 255 characters (``""`` if falsy)."""
    if not string:
        return ""
    return string[:255]


def handle_department(string):
    """Return *string* truncated to 255 characters (``""`` if falsy)."""
    if not string:
        return ""
    return string[:255]


def parse_department(string):
    """Return progressively simplified candidate names for a department.

    The list starts with the (truncated) original and then adds, each only
    when it differs from the previous stage:

    1. the name without Arabic digits and the Chinese numerals one..ten;
    2. that name without bracketed annotations -- ``()``, ``[]``, ``{}``
       and full-width parentheses;
    3. the text up to the first "科" with "科" appended.

    Falsy input yields an empty list.
    """
    result = []
    if not string:
        return result
    result.append(handle_department(string))

    string_without_num = _NUMERAL_RE.sub("", string)
    if string_without_num != string:
        result.append(handle_department(string_without_num))

    string_without_brackets = _BRACKETED_RE.sub("", string_without_num)
    if string_without_brackets != string_without_num:
        result.append(handle_department(string_without_brackets))

    pure_string = string_without_brackets.split("科")[0] + "科"
    if pure_string != string_without_brackets:
        result.append(handle_department(pure_string))
    return result


# Name-like data
def handle_name(string):
    """Keep only characters in the range ⺀..鿿 plus "·"; cap at 30 chars."""
    if not string:
        return ""
    return re.sub(r'[^⺀-鿿·]', '', string)[:30]


# Insurance-type data
def handle_insurance_type(string):
    """Strip ASCII and full-width colons, then cap at 255 characters."""
    if not string:
        return ""
    return string.replace(":", "").replace(_FULLWIDTH_COLON, "")[:255]


# Raw data
def handle_original_data(string):
    """Return *string* truncated to 255 characters (``""`` if falsy)."""
    if not string:
        return ""
    # Guard against database write failures caused by over-long values.
    return string[:255]


# Id-like data
def handle_id(string):
    """Return *string* truncated to 50 characters (``""`` if falsy)."""
    if not string:
        return ""
    # Guard against database write failures caused by over-long values.
    return string[:50]


# Age-like data
def handle_age(string):
    """Return the digits found before the first "岁", keeping the last three."""
    if not string:
        return ""
    string = string.split("岁")[0]
    num = re.sub(r"\D", "", string)
    return num[-3:]