"""Project configuration: runtime switches, extraction schemas, alias and
filter tables, jieba segmentation hints, and the shared model instances.

Importing this module instantiates three PaddleNLP ``Taskflow`` objects and
one ``PaddleOCR`` object at module level.
"""
import jieba
from paddlenlp import Taskflow
from paddleocr import PaddleOCR

# ---------------------------------------------------------------------------
# Project settings
# ---------------------------------------------------------------------------
''' 项目配置 '''
# Number of cases fetched from the database per batch.
PHHD_BATCH_SIZE = 10
# Wait time, in minutes, when no cases were found.
SLEEP_MINUTES = 5
# Whether to send error-report emails.
SEND_ERROR_EMAIL = True
# Whether to enable layout analysis (passed to every Taskflow below).
LAYOUT_ANALYSIS = False

# ---------------------------------------------------------------------------
# Information-extraction keyword settings
# ---------------------------------------------------------------------------
""" 信息抽取关键词配置 """
PATIENT_NAME = ['患者姓名']                        # patient name
ADMISSION_DATE = ['入院日期']                      # admission date
DISCHARGE_DATE = ['出院日期']                      # discharge date
MEDICAL_EXPENSES = ['费用总额']                    # medical expenses incurred
PERSONAL_CASH_PAYMENT = ['个人现金支付']           # personal cash payment
PERSONAL_ACCOUNT_PAYMENT = ['个人账户支付']        # personal account payment
PERSONAL_FUNDED_AMOUNT = ['自费金额', '个人自费']  # personal self-funded amount
MEDICAL_INSURANCE_TYPE = ['医保类型']              # medical insurance type
HOSPITAL = ['医院']                                # hospital visited
DEPARTMENT = ['科室']                              # department visited
DOCTOR = ['主治医生']                              # attending doctor
ADMISSION_ID = ['住院号']                          # admission number
SETTLEMENT_ID = ['医保结算单号码']                 # insurance settlement number
AGE = ['年龄']                                     # age
UPPERCASE_MEDICAL_EXPENSES = ['大写总额']          # total written in capital numerals

# Each schema is a fresh flat list; element order follows the groups below.
SETTLEMENT_LIST_SCHEMA = [
    *PATIENT_NAME,
    *ADMISSION_DATE,
    *DISCHARGE_DATE,
    *MEDICAL_EXPENSES,
    *PERSONAL_CASH_PAYMENT,
    *PERSONAL_ACCOUNT_PAYMENT,
    *PERSONAL_FUNDED_AMOUNT,
    *MEDICAL_INSURANCE_TYPE,
    *ADMISSION_ID,
    *SETTLEMENT_ID,
    *UPPERCASE_MEDICAL_EXPENSES,
]
DISCHARGE_RECORD_SCHEMA = [
    *HOSPITAL,
    *DEPARTMENT,
    *PATIENT_NAME,
    *ADMISSION_DATE,
    *DISCHARGE_DATE,
    *DOCTOR,
    *ADMISSION_ID,
    *AGE,
]
COST_LIST_SCHEMA = [
    *PATIENT_NAME,
    *ADMISSION_DATE,
    *DISCHARGE_DATE,
    *MEDICAL_EXPENSES,
]

# ---------------------------------------------------------------------------
# Alias settings
# ---------------------------------------------------------------------------
''' 别名配置 '''
# The key is replaced by the value from the alias table. For efficiency, only
# the first matching key is replaced.
HOSPITAL_ALIAS = {
    '沐阳': ['沭阳'],
    '连水': ['涟水'],
    # NOTE(review): the original trailing comment on this entry reads "雕宁",
    # presumably another variant of the same name; it is not mapped here.
    '唯宁': ['睢宁'],
    '九〇四': ['904'],
    '漂水': ['溧水'],
}
DEPARTMENT_ALIAS = {
    '耳鼻喉': ['耳鼻咽喉'],
    '急症': ['急诊'],
}

# ---------------------------------------------------------------------------
# Search filter settings
# ---------------------------------------------------------------------------
''' 搜索过滤配置 '''
# Single characters are filtered by default.
HOSPITAL_FILTER = ['医院', '人民', '第一', '第二', '第三', '大学', '附属']
DEPARTMENT_FILTER = ['医', '伤', '西', '新']

# ---------------------------------------------------------------------------
# Word-segmentation settings
# ---------------------------------------------------------------------------
''' 分词配置 '''
# Segment pairs registered with jieba.suggest_freq(..., True), in this order.
for _segment_pair in (
    ('肿瘤', '医院'),
    ('骨', '伤'),
    ('感染', '性'),
    ('胆', '道'),
    ('脾', '胃'),
):
    jieba.suggest_freq(_segment_pair, True)
del _segment_pair  # keep the loop variable out of the module namespace

# ---------------------------------------------------------------------------
# Model settings
# ---------------------------------------------------------------------------
''' 模型配置 '''


def _build_uie(schema, task_path, **extra):
    """Create an 'information_extraction' Taskflow with the shared settings.

    Args:
        schema: List of keywords to extract.
        task_path: Directory of the model to load.
        **extra: Additional keyword arguments forwarded to ``Taskflow``
            (used below to pass ``device_id``).

    Returns:
        The constructed ``Taskflow`` instance.
    """
    return Taskflow(
        'information_extraction',
        schema=schema,
        model='uie-x-base',
        task_path=task_path,
        layout_analysis=LAYOUT_ANALYSIS,
        precision='fp16',
        **extra,
    )


# Settlement-list and discharge-record extractors leave device_id unspecified.
SETTLEMENT_IE = _build_uie(SETTLEMENT_LIST_SCHEMA, 'model/settlement_list_model')
DISCHARGE_IE = _build_uie(DISCHARGE_RECORD_SCHEMA, 'model/discharge_record_model')
# The cost-list extractor is explicitly created with device_id=1.
COST_IE = _build_uie(COST_LIST_SCHEMA, 'model/cost_list_model', device_id=1)

# OCR engine; uses a custom recognition model directory and gpu_id=1.
OCR = PaddleOCR(
    gpu_id=1,
    use_angle_cls=False,
    show_log=False,
    det_db_thresh=0.1,
    det_db_box_thresh=0.3,
    det_limit_side_len=1248,
    drop_score=0.3,
    rec_model_dir='model/ocr/openatom_rec_repsvtr_ch_infer',
    rec_algorithm='SVTR_LCNet',
)