"""Normalisation helpers for amount, date, department and name fields."""

import re
from datetime import datetime
from typing import List

# Everything except ASCII digits and the decimal point.
_NON_DECIMAL_RE = re.compile(r'[^0-9.]')

# Everything except ASCII digits and the hyphen separator.
_NON_DATE_RE = re.compile(r'[^0-9-]')

# Maps the Chinese year/month markers and the "/" and "." separators to "-";
# the day marker is simply dropped.
_DATE_SEPARATORS = str.maketrans({"年": "-", "月": "-", "日": None, "/": "-", ".": "-"})

# Accepted layouts, tried in order: yyyy-MM-dd, yy-MM-dd, yyyyMMdd, yyMMdd.
_DATE_FORMATS = ('%Y-%m-%d', '%y-%m-%d', '%Y%m%d', '%y%m%d')

# Any decimal digit or one of the Chinese numerals one..ten.
_DEPARTMENT_NUMERAL_RE = re.compile(r'\d|一|二|三|四|五|六|七|八|九|十')

# A non-nested bracketed group: (...), [...], {...} or full-width （...）.
# The last alternative must use the full-width parentheses: written with ASCII
# parentheses it becomes a capturing group that matches (and deletes) every run
# of non-parenthesis characters, i.e. the whole department name.
_DEPARTMENT_BRACKET_RE = re.compile(
    r'\([^()]*\)|\[[^\[\]]*\]|\{[^\{\}]*\}|（[^（）]*）'
)

# Everything outside the code-point range U+2E80..U+9FFF and the middle dot.
_NON_NAME_RE = re.compile(r'[^⺀-鿿·]')


def handle_decimal(string: str) -> str:
    """Clean an amount-like string down to digits and a single decimal point.

    Every character other than ``0-9`` and ``.`` is removed (so a sign or a
    currency symbol is dropped). When the cleaned text contains a decimal
    point, the last ``.`` is taken as the decimal separator, any earlier dots
    are removed from the integer part, and the integer part is cut to its last
    16 digits. A trailing ``.`` with no fractional digits is dropped.

    Text without any decimal point is returned after cleaning, with no
    16-digit cap applied.

    Args:
        string: Raw amount text; a falsy value yields ``""``.

    Returns:
        The cleaned amount text, possibly empty.
    """
    if not string:
        return ""
    cleaned = _NON_DECIMAL_RE.sub('', string)
    if "." not in cleaned:
        return cleaned
    integer_part, fraction = cleaned.rsplit('.', 1)
    # Earlier dots are treated as grouping noise; keep only the last 16 digits.
    integer_part = integer_part.replace(".", "")[-16:]
    return integer_part + ("." + fraction if fraction else "")


def handle_date(string: str) -> str:
    """Clean a date-like string and return it only if it parses as a date.

    The Chinese year/month markers, ``/`` and ``.`` become ``-``; everything
    that is not an ASCII digit or ``-`` is then removed. The result is cut to
    10 characters when it contains ``-`` and to 8 characters otherwise, and is
    returned only if ``is_valid_date_format`` accepts it.

    Args:
        string: Raw date text; a falsy value yields ``""``.

    Returns:
        The cleaned date text (not re-formatted or zero-padded), or ``""``
        when it does not match any accepted layout.
    """
    if not string:
        return ""
    cleaned = _NON_DATE_RE.sub('', string.translate(_DATE_SEPARATORS))
    # Drop anything trailing the date part (e.g. digits of a time of day).
    max_length = 10 if "-" in cleaned else 8
    cleaned = cleaned[:max_length]
    return cleaned if is_valid_date_format(cleaned) else ""


def is_valid_date_format(date_str: str) -> bool:
    """Return whether ``date_str`` parses under one of the accepted layouts.

    Accepted layouts are ``yyyy-MM-dd``, ``yy-MM-dd``, ``yyyyMMdd`` and
    ``yyMMdd``. Strings shorter than 6 characters are rejected outright.

    Args:
        date_str: Candidate date text.

    Returns:
        ``True`` if ``datetime.strptime`` accepts the text with any layout.
    """
    if len(date_str) < 6:
        return False
    for fmt in _DATE_FORMATS:
        try:
            datetime.strptime(date_str, fmt)
        except ValueError:
            continue
        return True
    return False


def handle_department(string: str) -> List[str]:
    """Build progressively simplified variants of a department name.

    The variants are produced in order, each one appended only when it differs
    from the previous step:

    1. the original text;
    2. the text with digits and the Chinese numerals one..ten removed;
    3. that text with non-nested bracketed groups removed (ASCII ``()``,
       ``[]``, ``{}`` and full-width parentheses);
    4. the part before the first "科" with "科" appended (when the text
       contains no "科", this appends "科" to the whole text).

    Args:
        string: Raw department name; a falsy value yields ``[]``.

    Returns:
        The list of variants, starting with the original text.
    """
    candidates: List[str] = []
    if not string:
        return candidates
    candidates.append(string)

    without_numerals = _DEPARTMENT_NUMERAL_RE.sub('', string)
    if without_numerals != string:
        candidates.append(without_numerals)

    without_brackets = _DEPARTMENT_BRACKET_RE.sub("", without_numerals)
    if without_brackets != without_numerals:
        candidates.append(without_brackets)

    trimmed = without_brackets.split("科")[0] + "科"
    if trimmed != without_brackets:
        candidates.append(trimmed)
    return candidates


def handle_name(string: str) -> str:
    """Strip a name down to characters in U+2E80..U+9FFF plus the middle dot.

    Args:
        string: Raw name text; a falsy value yields ``""``.

    Returns:
        The text with every other character (Latin letters, digits, spaces,
        ASCII punctuation, ...) removed.
    """
    if not string:
        return ""
    return _NON_NAME_RE.sub('', string)