# fcb_photo_review/util/data_util.py

import logging
import re
from datetime import datetime

from util import common_util


# Normalize monetary amount values.
def handle_decimal(string):
    """Normalize a raw amount string into a plain decimal string.

    Every character except digits and dots is discarded. If nothing is left,
    the untouched input is passed to ``common_util.chinese_money_to_number``
    (presumably a parser for amounts written in Chinese capital numerals) and
    its result is returned as-is; a failure there is logged and yields ``''``.

    Digits without any dot that are longer than two characters get a dot
    inserted before the last two digits (presumably the decimal point was
    lost upstream). When dots are present, only the last one is kept as the
    decimal point and at most two fractional digits survive.

    Returns ``''`` for falsy input. The digit-based result is cut to 16
    characters.
    """
    if not string:
        return ''

    digits = re.sub(r'[^0-9.]', '', string)
    if not digits:
        # Possibly a capital-numeral amount; try to parse the raw input as one.
        try:
            return common_util.chinese_money_to_number(string)
        except Exception as err:
            logging.warning('大写金额解析失败', exc_info=err)
            return ''

    if '.' in digits:
        # Only the last dot counts as the decimal point; earlier ones are dropped.
        integer_part, _, fraction = digits.rpartition('.')
        integer_part = integer_part.replace('.', '')
        normalized = integer_part + ('.' + fraction[:2] if fraction else '')
    elif len(digits) > 2:
        normalized = digits[:-2] + '.' + digits[-2:]
    else:
        normalized = digits
    return normalized[:16]


def parse_money(capital_num, num):
    """Pick the source text and parsed value for an amount.

    When ``capital_num`` is truthy and ``common_util.chinese_money_to_number``
    accepts it, returns ``(capital_num, parsed_value)``. Otherwise (no
    capital text, or parsing raised and was logged) returns
    ``(num, handle_decimal(num))``.
    """
    if not capital_num:
        return num, handle_decimal(num)

    try:
        parsed = common_util.chinese_money_to_number(capital_num)
    except Exception as err:
        # Best effort: log the failure and fall back to the numeric text.
        logging.warning('大写金额解析失败', exc_info=err)
        return num, handle_decimal(num)
    return capital_num, parsed


# Normalize date values.
def handle_date(string):
    """Normalize a raw date string to ``YYYY-MM-DD``.

    The separators '年', '月', '/' and '.' are unified to '-', '日' is
    removed, and every character other than digits and '-' is discarded.
    Dashed input is cut at the third dash and its last field is limited to
    two digits; undashed input is limited to eight digits. The remainder is
    tried against yyyy-MM-dd, yy-MM-dd, yyyyMMdd and yyMMdd, in that order.

    Returns ``''`` for falsy input, for fewer than six remaining characters,
    or when no format yields a year strictly between 2000 and 2100.
    """
    if not string:
        return ''

    # Unify the separators and drop the day marker.
    for old, new in (('年', '-'), ('月', '-'), ('日', ''), ('/', '-'), ('.', '-')):
        string = string.replace(old, new)
    cleaned = re.sub(r'[^0-9-]', '', string).strip('-')

    if '-' not in cleaned:
        cleaned = cleaned[:8]
    else:
        if cleaned.count('-') > 2:
            # Keep only the first three dash-separated fields.
            cleaned = '-'.join(cleaned.split('-')[:3])
        # Trailing characters beyond a two-digit day (e.g. a glued-on time) are cut.
        overflow = len(cleaned.rsplit('-', 1)[1]) - 2
        if overflow > 0:
            cleaned = cleaned[:-overflow]

    if len(cleaned) < 6:
        return ''

    # Candidate formats: yyyy-MM-dd, yy-MM-dd, yyyyMMdd, yyMMdd.
    candidate_formats = ('%Y-%m-%d', '%y-%m-%d', '%Y%m%d', '%y%m%d')
    for fmt in candidate_formats:
        try:
            parsed = datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
        # Only accept years inside the expected range.
        if 2000 < parsed.year < 2100:
            return parsed.strftime('%Y-%m-%d')
    return ''


def handle_hospital(string):
    """Return the hospital text cut to 255 characters, or ``''`` if it is falsy."""
    return string[:255] if string else ''


def handle_department(string):
    """Return the department text cut to 255 characters, or ``''`` if it is falsy."""
    return string[:255] if string else ''


def parse_department(string):
    """Break a raw department string into a set of department keywords.

    Brackets, characters outside the kept CJK range, Chinese numerals and the
    filler words '病区' / '病' are removed; '科' and opening brackets act as
    separators between keywords.

    Always returns a ``set`` (empty for falsy input), so callers can rely on
    set operations regardless of the input.
    """
    if not string:
        # Same type as the non-empty path; previously this returned a list.
        return set()

    # Drop closing brackets; opening brackets become separators.
    string = string.replace(')', '').replace('）', '').replace('(', ' ').replace('（', ' ')
    string = re.sub(r'[^⺀-鿿 ]', '', string)  # keep only characters in the CJK range, plus spaces
    string = re.sub(r'[一二三四五六七八九十]', '', string)  # drop Chinese numerals
    string = string.replace('病区', '').replace('病', '')  # drop common filler words
    string = string.replace('科', ' ')  # '科' marks the end of a department name
    # Consecutive separators yield empty fragments; skip them.
    return {department for department in string.strip().split(' ') if department}


# Normalize personal name values.
def handle_name(string):
    """Strip a name down to CJK-range characters and '·', cut to 30 characters.

    Returns ``''`` for falsy input.
    """
    if string:
        return re.sub(r'[^⺀-鿿·]', '', string)[:30]
    return ''


# Normalize medical insurance type values.
def handle_insurance_type(string):
    """Map free-form insurance text onto one of the fixed category labels.

    Categories are checked in a fixed order and the first one with a keyword
    contained in ``string`` wins; ``'其他'`` is returned when nothing matches
    and ``''`` when the input is falsy.
    """
    if not string:
        return ''

    # Order matters: earlier rows take precedence over later ones.
    keyword_table = (
        ('职工医保', ('社保', '城保', '职', '退休')),
        ('居民医保', ('农保', '居民')),
        ('农民工医保', ('农民工',)),
        ('无医保', ('自费', '全费')),
    )
    for label, keywords in keyword_table:
        for keyword in keywords:
            if keyword in string:
                return label
    return '其他'


# Normalize raw (original) data values.
def handle_original_data(string):
    """Return the raw text cut to 255 characters, or ``''`` if it is falsy."""
    # The cut keeps over-long values from failing the database insert.
    return string[:255] if string else ''


# Normalize id values.
def handle_id(string):
    """Return the id text cut to 50 characters, or ``''`` if it is falsy."""
    # The cut keeps over-long values from failing the database insert.
    return string[:50] if string else ''


# Normalize age values.
def handle_age(string):
    """Extract the age digits from a raw age string.

    Only the text before the first '岁' is considered; non-digit characters
    are removed and the last three digits are returned. Returns ``''`` for
    falsy input or when no digits remain.
    """
    if not string:
        return ''
    before_marker, _, _ = string.partition('岁')
    return re.sub(r'\D', '', before_marker)[-3:]


# Break a hospital string into candidate names.
def parse_hospital(string):
    """Split a raw hospital string into a list of candidate hospital names.

    The text is first passed through
    ``common_util.traditional_to_simple_chinese`` (presumably a Traditional to
    Simplified Chinese conversion). Brackets and '有限公司' are then removed,
    and the text is split after every '医院' and at opening brackets.

    Returns a list of non-empty fragments; ``[]`` for falsy input.
    """
    if not string:
        return []

    string = common_util.traditional_to_simple_chinese(string)
    # Drop closing brackets; opening brackets become separators.
    string = string.replace(')', '').replace('）', '').replace('(', ' ').replace('（', ' ')
    string = string.replace('有限公司', '')
    # Insert a separator after every '医院' so concatenated names come apart.
    string = string.replace('医院', '医院 ')
    # split(' ') yields '' between consecutive separators (e.g. '医院' followed
    # by a bracket); drop those so the result never contains empty names.
    return [name for name in string.strip().split(' ') if name]


def parse_page_num(page_list):
    """Extract per-page numbers and the overall page count from page text.

    ``page_list`` is an iterable of pages, each an iterable of dicts whose
    ``'text'`` values are concatenated and scanned for digit runs. Numbers
    above 30 are ignored as outliers. For every page that still has numbers,
    the numerically smallest one is recorded as that page's number and the
    numerically largest as a candidate total.

    Returns ``(None, None)`` for falsy input; otherwise ``(pages, total)``
    where ``pages`` is a list of digit strings (pages without usable numbers
    are skipped) and ``total`` is an int that defaults to 1 when no numbers
    were found.
    """
    if not page_list:
        return None, None

    pages = []
    totals = []
    for page in page_list:
        joined = ''.join(p.get('text', '') for p in page)
        # Filter out outliers.
        numbers = [num for num in re.findall(r'\d+', joined) if int(num) <= 30]
        if not numbers:
            continue
        # Compare by numeric value: as plain strings '10' sorts before '2'.
        pages.append(min(numbers, key=int))
        totals.append(max(numbers, key=int))

    total = int(max(totals, key=int)) if totals else 1
    return pages, total


def handle_tiny_int(num):
    """Cap a number at 127; falsy input (including 0) yields ``None``."""
    if not num:
        return None
    if num <= 127:
        return num
    return 127