# Central configuration module: runtime settings, information-extraction
# keyword lists, alias/filter tables, jieba segmentation tuning, and the
# model objects (three PaddleNLP Taskflow extractors and one PaddleOCR
# instance). The models are constructed at import time.

import jieba
from paddlenlp import Taskflow
from paddleocr import PaddleOCR

# ---------------------------------------------------------------------------
# Project settings
# ---------------------------------------------------------------------------

# Number of cases fetched from the database per batch.
PHHD_BATCH_SIZE = 10

# Waiting time, in minutes, when a query returns no cases.
SLEEP_MINUTES = 5

# Whether to send an e-mail when an error occurs.
SEND_ERROR_EMAIL = True

# Whether layout analysis is enabled (passed to every Taskflow below).
LAYOUT_ANALYSIS = False

# ---------------------------------------------------------------------------
# Information-extraction keywords
# ---------------------------------------------------------------------------

PATIENT_NAME = ["患者姓名"]  # patient name
ADMISSION_DATE = ["入院日期"]  # admission date
DISCHARGE_DATE = ["出院日期"]  # discharge date
MEDICAL_EXPENSES = ["费用总额"]  # medical expenses incurred
PERSONAL_CASH_PAYMENT = ["个人现金支付"]  # paid personally in cash
PERSONAL_ACCOUNT_PAYMENT = ["个人账户支付"]  # paid from the personal account
PERSONAL_FUNDED_AMOUNT = ["自费金额", "个人自费"]  # self-funded amount
MEDICAL_INSURANCE_TYPE = ["医保类型"]  # medical insurance category
HOSPITAL = ["医院"]  # hospital visited
DEPARTMENT = ["科室"]  # department visited
DOCTOR = ["主治医生"]  # attending doctor
ADMISSION_ID = ["住院号"]  # inpatient (admission) number
SETTLEMENT_ID = ["医保结算单号码"]  # insurance settlement sheet number
AGE = ["年龄"]  # age
UPPERCASE_MEDICAL_EXPENSES = ["大写总额"]  # total amount written in capitals

# Extraction schema for settlement lists.
SETTLEMENT_LIST_SCHEMA = [
    *PATIENT_NAME,
    *ADMISSION_DATE,
    *DISCHARGE_DATE,
    *MEDICAL_EXPENSES,
    *PERSONAL_CASH_PAYMENT,
    *PERSONAL_ACCOUNT_PAYMENT,
    *PERSONAL_FUNDED_AMOUNT,
    *MEDICAL_INSURANCE_TYPE,
    *ADMISSION_ID,
    *SETTLEMENT_ID,
    *UPPERCASE_MEDICAL_EXPENSES,
]

# Extraction schema for discharge records.
DISCHARGE_RECORD_SCHEMA = [
    *HOSPITAL,
    *DEPARTMENT,
    *PATIENT_NAME,
    *ADMISSION_DATE,
    *DISCHARGE_DATE,
    *DOCTOR,
    *ADMISSION_ID,
    *AGE,
]

# Extraction schema for cost lists.
COST_LIST_SCHEMA = [
    *PATIENT_NAME,
    *ADMISSION_DATE,
    *DISCHARGE_DATE,
    *MEDICAL_EXPENSES,
]

# ---------------------------------------------------------------------------
# Aliases
# ---------------------------------------------------------------------------

# Each key is replaced by the value(s) listed for it. For efficiency, only
# the first key that matches is replaced.
# NOTE(review): the keys look like OCR misreadings of the place names in the
# values -- confirm against the code that consumes these tables.
HOSPITAL_ALIAS = {
    "沐阳": ["沭阳"],
    "连水": ["涟水"],
    "唯宁": ["睢宁"],  # original note lists another variant: 雕宁
    "九〇四": ["904"],
    "漂水": ["溧水"],
}

DEPARTMENT_ALIAS = {
    "耳鼻喉": ["耳鼻咽喉"],
    "急症": ["急诊"],
}

# ---------------------------------------------------------------------------
# Search filters
# ---------------------------------------------------------------------------

# Single characters are filtered by default.
HOSPITAL_FILTER = ["医院", "人民", "第一", "第二", "第三", "大学", "附属"]
DEPARTMENT_FILTER = ["医", "伤", "西", "新"]

# ---------------------------------------------------------------------------
# Word segmentation
# ---------------------------------------------------------------------------

# Each tuple is handed to jieba.suggest_freq with tuning enabled; per the
# jieba documentation, a tuple names the pieces a word should be cut into.
_FORCED_SPLITS = (
    ("肿瘤", "医院"),
    ("骨", "伤"),
    ("感染", "性"),
    ("胆", "道"),
    ("脾", "胃"),
)
for _parts in _FORCED_SPLITS:
    jieba.suggest_freq(_parts, True)

# ---------------------------------------------------------------------------
# Models
# ---------------------------------------------------------------------------

# Keyword arguments common to all three information-extraction Taskflows.
_UIE_SHARED_OPTIONS = {
    "model": "uie-x-base",
    "layout_analysis": LAYOUT_ANALYSIS,
    "precision": "fp16",
}

# Extractor for settlement lists.
SETTLEMENT_IE = Taskflow(
    "information_extraction",
    schema=SETTLEMENT_LIST_SCHEMA,
    task_path="model/settlement_list_model",
    **_UIE_SHARED_OPTIONS,
)

# Extractor for discharge records.
DISCHARGE_IE = Taskflow(
    "information_extraction",
    schema=DISCHARGE_RECORD_SCHEMA,
    task_path="model/discharge_record_model",
    **_UIE_SHARED_OPTIONS,
)

# Extractor for cost lists; unlike the two above it is given device_id=1.
COST_IE = Taskflow(
    "information_extraction",
    schema=COST_LIST_SCHEMA,
    device_id=1,
    task_path="model/cost_list_model",
    **_UIE_SHARED_OPTIONS,
)

OCR = PaddleOCR(
    device="gpu:0",
    ocr_version="PP-OCRv4",
    use_textline_orientation=False,
    # Detection pixel threshold: in the output probability map, only pixels
    # scoring above this value are treated as text pixels.
    text_det_thresh=0.1,
    # Detection box threshold: a detected box counts as a text region when
    # the mean score of all pixels inside it exceeds this value.
    text_det_box_thresh=0.3,
)