Files
fcb_photo_review/photo_review/util/data_util.py
2024-07-04 17:22:56 +08:00

138 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from datetime import datetime
def handle_decimal(string):
    """Normalize a monetary amount to a plain ``digits[.digits]`` string.

    Every character other than an ASCII digit or "." is dropped.  If more
    than one dot remains, only the last one is kept as the decimal point and
    the earlier ones are removed.  The integer part is cut down to its last
    16 digits; the fractional part is kept in full.

    Returns "" for empty or None input.
    """
    if not string:
        return ""
    cleaned = re.sub(r'[^0-9.]', '', string)
    integer_part, dot, fraction = cleaned.rpartition('.')
    if dot:
        # Dots before the last one are not decimal points; drop them.
        integer_part = integer_part.replace(".", "")
    else:
        # rpartition puts the whole text in the last slot when no dot exists.
        integer_part, fraction = cleaned, ""
    integer_part = integer_part[-16:]
    if not fraction:
        return integer_part
    return integer_part + "." + fraction
def handle_date(string):
    """Normalize a loosely formatted date string to ``YYYY-MM-DD``.

    The year/month/day markers, "/" and "." are turned into dashes, every
    remaining character that is not an ASCII digit or "-" is removed, and the
    result is tried against ``yyyy-MM-dd``, ``yy-MM-dd``, ``yyyyMMdd`` and
    ``yyMMdd`` in that order.

    Returns "" for empty or None input, for text shorter than six characters
    after cleaning, when no format matches, or when the parsed year is not
    strictly between 2000 and 2100.
    """
    if not string:
        return ""
    # NOTE(review): the three CJK search strings below were empty in the
    # reviewed copy (str.replace("", "-") would insert a dash between every
    # character and make every input fail).  They are reconstructed from the
    # "-", "-", "" replacement pattern -- confirm against the repository.
    normalized = (
        string.replace("年", "-")
        .replace("月", "-")
        .replace("日", "")
        .replace("/", "-")
        .replace(".", "-")
    )
    normalized = re.sub(r'[^0-9-]', '', normalized).strip("-")
    if "-" in normalized:
        # Keep at most year-month-day and cap the last field at two digits so
        # trailing digits (e.g. a time of day glued to the day) are dropped.
        parts = normalized.split("-")[:3]
        parts[-1] = parts[-1][:2]
        normalized = "-".join(parts)
    else:
        # Undelimited form: anything beyond yyyyMMdd is discarded.
        normalized = normalized[:8]
    if len(normalized) < 6:
        return ""
    for fmt in ('%Y-%m-%d', '%y-%m-%d', '%Y%m%d', '%y%m%d'):
        try:
            parsed = datetime.strptime(normalized, fmt)
        except ValueError:
            continue
        # Only years 2001-2099 are accepted (both bounds are exclusive).
        if 2000 < parsed.year < 2100:
            return parsed.strftime("%Y-%m-%d")
    return ""
def handle_hospital(string):
    """Return the hospital name capped at 255 characters ("" if empty/None)."""
    return string[:255] if string else ""
def handle_department(string):
    """Return the department name capped at 255 characters ("" if empty/None)."""
    return string[:255] if string else ""
def parse_department(string):
    """Build a list of progressively simplified department-name candidates.

    Candidates are appended in this order, each one only when it differs from
    the text it was derived from:

    1. the input itself;
    2. the input without digits and the Chinese numerals one to ten;
    3. (2) without bracketed segments -- (), [], {} and full-width ();
    4. (3) cut after the first "科" (the suffix is appended when absent).

    Every candidate is passed through ``handle_department``.  Returns an
    empty list for empty or None input.
    """
    candidates = []
    if not string:
        return candidates
    candidates.append(handle_department(string))

    without_numbers = re.sub(r'\d|一|二|三|四|五|六|七|八|九|十', '', string)
    if without_numbers != string:
        candidates.append(handle_department(without_numbers))

    # NOTE(review): the last alternative was "[^]*" in the reviewed copy,
    # which is an unterminated character set and raises re.error.  It is
    # reconstructed as the full-width parenthesis variant of the first
    # alternative -- confirm against the repository.
    without_brackets = re.sub(
        r'\([^()]*\)|\[[^\[\]]*\]|\{[^\{\}]*\}|([^()]*)',
        "",
        without_numbers,
    )
    if without_brackets != without_numbers:
        candidates.append(handle_department(without_brackets))

    # NOTE(review): the separator/suffix literal was empty in the reviewed
    # copy (str.split("") raises ValueError).  "科" is a best guess based on
    # the split-then-reappend pattern -- confirm against the repository.
    department_suffix = "科"
    pure_name = without_brackets.split(department_suffix)[0] + department_suffix
    if pure_name != without_brackets:
        candidates.append(handle_department(pure_name))
    return candidates
def handle_name(string):
    """Clean a person's name.

    Keeps only characters in the range ⺀-鿿 and the "·" separator, then caps
    the result at 30 characters.  Returns "" for empty or None input.
    """
    if not string:
        return ""
    kept = re.sub(r'[^⺀-鿿·]', '', string)
    return kept[:30]
def handle_insurance_type(string):
    """Clean a medical-insurance-type value.

    Removes ASCII and full-width colons and caps the result at 255
    characters.  Returns "" for empty or None input.
    """
    if not string:
        return ""
    # NOTE(review): the second search string was empty in the reviewed copy,
    # which made that replace() a no-op.  It is reconstructed as the
    # full-width colon -- confirm against the repository.
    return string.replace(":", "").replace(":", "")[:255]
def handle_original_data(string):
    """Return the raw value capped at 255 characters ("" if empty/None).

    The cap keeps over-long values from failing when stored in the database.
    """
    return string[:255] if string else ""
def handle_id(string):
    """Return the identifier capped at 50 characters ("" if empty/None).

    The cap keeps over-long values from failing when stored in the database.
    """
    return string[:50] if string else ""
def handle_age(string):
    """Extract an age as a string of at most three digits.

    Only the text before the first "岁" (years-old marker) is considered;
    non-digit characters are removed from it and the last three digits are
    returned.  Returns "" for empty or None input or when no digit is found.
    """
    if not string:
        return ""
    # NOTE(review): the separator was empty in the reviewed copy, and
    # str.split("") raises ValueError on every call.  "岁" is reconstructed
    # from context -- confirm against the repository.
    years_text = string.split("岁")[0]
    digits = re.sub(r'\D', '', years_text)
    return digits[-3:]