Files
fcb_photo_review/util/data_util.py

190 lines
5.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import logging
import re
from datetime import datetime
from util import util
def handle_decimal(string):
    """Clean a monetary amount taken from free text into a plain decimal string.

    Every character other than an ASCII digit or ``.`` is discarded. If no
    decimal point is left and more than two digits remain, a point is
    inserted before the last two digits (``"12345"`` -> ``"123.45"``). If
    several points are present, only the last one is kept as the separator
    and at most two fractional digits are retained.

    Args:
        string: Raw amount text; may be empty or ``None``.

    Returns:
        The cleaned amount cut to at most 16 characters, or ``""`` when the
        input is empty or nothing usable is left after cleaning.
    """
    cleaned = re.sub(r'[^0-9.]', '', string) if string else ""
    if not cleaned:
        return ""
    whole, point, fraction = cleaned.rpartition(".")
    if not point:
        # No decimal point at all: short values pass through unchanged,
        # longer ones get a point before their last two digits.
        if len(cleaned) <= 2:
            amount = cleaned
        else:
            amount = cleaned[:-2] + "." + cleaned[-2:]
    else:
        # Only the last point is the separator; earlier ones are removed.
        whole = whole.replace(".", "")
        amount = whole + ("." + fraction[:2] if fraction else "")
    return amount[:16]
def parse_money(capital_num, num):
    """Choose the amount to keep, preferring the capital-numeral form.

    Args:
        capital_num: Amount written in Chinese capital numerals; may be empty.
        num: Amount written in digits; used as the fallback.

    Returns:
        A ``(raw_text, amount)`` pair. When ``capital_num`` is non-empty and
        ``util.chinese_money_to_number`` converts it without raising, the
        pair is ``(capital_num, converted_value)``. In every other case it is
        ``(num, handle_decimal(num))``.
    """
    if capital_num:
        try:
            money = util.chinese_money_to_number(capital_num)
        except Exception as e:
            # Best effort: record the failure and fall back to the digits.
            logging.warning("大写金额解析失败", exc_info=e)
        else:
            return capital_num, money
    return num, handle_decimal(num)
def handle_date(string):
    """Normalize a date taken from free text to ``YYYY-MM-DD``.

    Year/month markers, ``/`` and ``.`` are turned into ``-``; the day marker
    and every other non-digit character are removed. Dashed input keeps at
    most three fields and at most two digits in the last field, so a time
    glued onto the day is discarded. Undashed input keeps at most eight
    digits. The result is then tried as ``yyyy-MM-dd``, ``yy-MM-dd``,
    ``yyyyMMdd`` and ``yyMMdd``, in that order.

    Args:
        string: Raw date text; may be empty or ``None``.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``""`` when the input is
        empty, shorter than six characters after cleaning, matches none of
        the formats, or has a year outside 2001-2099.
    """
    if not string:
        return ""
    # NOTE(review): the first three search strings were blank in the reviewed
    # copy of this file (characters lost in extraction). A blank search
    # string makes str.replace insert "-" between every character, so no date
    # could parse. Restored as the year/month/day markers implied by the
    # replacement values -- confirm against the repository.
    string = (
        string.replace("年", "-")
        .replace("月", "-")
        .replace("日", "")
        .replace("/", "-")
        .replace(".", "-")
    )
    string = re.sub(r'[^0-9-]', '', string).strip("-")
    if "-" in string:
        # Keep year-month-day only, and at most two digits for the last field.
        fields = string.split("-")[:3]
        fields[-1] = fields[-1][:2]
        string = "-".join(fields)
    else:
        string = string[:8]
    if len(string) < 6:
        return ""
    # Candidate layouts, tried in order: yyyy-MM-dd, yy-MM-dd, yyyyMMdd, yyMMdd.
    for fmt in ('%Y-%m-%d', '%y-%m-%d', '%Y%m%d', '%y%m%d'):
        try:
            date = datetime.strptime(string, fmt)
        except ValueError:
            continue
        # Accept only years strictly between 2000 and 2100.
        if 2000 < date.year < 2100:
            return date.strftime("%Y-%m-%d")
    return ""
def handle_hospital(string):
    """Return the hospital text cut to 255 characters, or ``""`` if it is empty or ``None``."""
    return string[:255] if string else ""
def handle_department(string):
    """Return the department text cut to 255 characters, or ``""`` if it is empty or ``None``."""
    return string[:255] if string else ""
def parse_department(string):
result = []
if not string:
return result
string = re.sub(r'\([^()]*\)|\[[^\[\]]*]|\{[^{}]*}|[^]*|[^⺀-鿿]', '', string)[:255]
if string == "":
return result
result.append(string)
string_without_num = re.sub(r'\d|一|二|三|四|五|六|七|八|九|十', '', string)
if string == "":
return result
if string_without_num != string:
result.append(string_without_num)
pure_string = string_without_num.split("")[0] + ""
if string == "":
return result
if pure_string != string_without_num:
result.append(pure_string)
pure_string_without_io = pure_string.replace("", "").replace("", "")
if string == "":
return result
if pure_string_without_io != pure_string:
result.append(pure_string)
return result
def handle_name(string):
    """Clean a person's name.

    Keeps only characters in the ``⺀-鿿`` range plus the middle dot ``·``,
    then cuts the result to 30 characters.

    Args:
        string: Raw name text; may be empty or ``None``.

    Returns:
        The cleaned name, or ``""`` when the input is empty or ``None``.
    """
    if string:
        kept = re.sub(r'[^⺀-鿿·]', '', string)
        return kept[:30]
    return ""
def handle_insurance_type(string):
    """Map free-text insurance wording onto one of the fixed category labels.

    Categories are checked in the order listed below, and the first one with
    a keyword contained in ``string`` wins.

    Args:
        string: Raw insurance-type text; may be empty or ``None``.

    Returns:
        ``""`` for empty input; otherwise "职工医保", "居民医保",
        "农民工医保" or "无医保" according to the first matching keyword
        group, or "其他" when no keyword matches.
    """
    if not string:
        return ""
    # NOTE(review): the third employee keyword was blank in the reviewed copy
    # of this file. A blank keyword is a substring of every string, so every
    # non-empty input was classified as "职工医保". Restored as "职工" to
    # match the label it maps to -- confirm against the repository.
    categories = (
        ("职工医保", ("社保", "城保", "职工", "退休")),
        ("居民医保", ("农保", "居民")),
        ("农民工医保", ("农民工",)),
        ("无医保", ("自费", "全费")),
    )
    for label, keys in categories:
        if any(key in string for key in keys):
            return label
    return "其他"
def handle_original_data(string):
    """Return the raw text cut to 255 characters, or ``""`` if it is empty or ``None``.

    The cut keeps over-long values from failing when stored in the database.
    """
    return string[:255] if string else ""
def handle_id(string):
    """Return an identifier cut to 50 characters, or ``""`` if it is empty or ``None``.

    The cut keeps over-long values from failing when stored in the database.
    """
    return string[:50] if string else ""
def handle_age(string):
    """Extract the age in years from free text.

    Only the text before the first "岁" marker is considered. Non-digit
    characters are removed from it and the last three remaining digits are
    returned.

    Args:
        string: Raw age text; may be empty or ``None``.

    Returns:
        Up to three digits as a string, or ``""`` when the input is empty or
        no digit precedes the marker.
    """
    if not string:
        return ""
    # NOTE(review): the separator literal was blank in the reviewed copy of
    # this file, and str.split("") raises ValueError on every call. Restored
    # as the "岁" (years) marker -- confirm against the repository.
    years_part = string.partition("岁")[0]
    num = re.sub(r'\D', '', years_part)
    return num[-3:]
def parse_hospital(string):
    """Split hospital text into candidate hospital names.

    The text is first passed through ``util.traditional_to_simple_chinese``
    (presumably a Traditional-to-Simplified conversion; the helper is not
    visible here). Closing parentheses are removed and opening parentheses
    become spaces, so a bracketed alias becomes its own candidate. The
    "有限公司" suffix is removed and a break is inserted after every "医院".

    Args:
        string: Raw hospital text; may be empty or ``None``.

    Returns:
        A list of non-empty candidate names in order of appearance; empty
        when the input is empty.
    """
    result = []
    if not string:
        return result
    string = util.traditional_to_simple_chinese(string)
    # NOTE(review): two of the four parenthesis literals were blank in the
    # reviewed copy of this file; a blank search string makes str.replace
    # insert a space between every character. They are restored as the ASCII
    # counterparts of the full-width parentheses -- confirm against the
    # repository.
    for closing in (")", ")"):
        string = string.replace(closing, "")
    for opening in ("(", "("):
        string = string.replace(opening, " ")
    string = string.replace("有限公司", "")
    string = string.replace("医院", "医院 ")
    # Adjacent separators (e.g. "医院" directly followed by "(") would give
    # empty entries with a plain split; drop them.
    result += [name for name in string.split(" ") if name]
    return result