diff --git a/docker-compose.yml b/docker-compose.yml index db984d6..7ac9751 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,6 @@ x-env: &template - image: fcb_photo_review:1.12.6 + image: fcb_photo_review:1.12.7 restart: always services: diff --git a/photo_review/__init__.py b/photo_review/__init__.py index f1a40d4..4d991da 100644 --- a/photo_review/__init__.py +++ b/photo_review/__init__.py @@ -1,5 +1,6 @@ import socket +import jieba from paddlenlp import Taskflow from paddleocr import PaddleOCR @@ -67,14 +68,23 @@ COST_LIST_SCHEMA = PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPE """ # 使用别名中的value替换key。考虑到效率问题,只会替换第一个匹配到的key。 HOSPITAL_ALIAS = { - "江阴": ["江阴市"], - "溧阳": ["溧阳市"], - "六合": ["六合区"], "沐阳": ["沭阳"], "连水": ["涟水"], - "中医医院": ["中医院"], + "唯宁": ["睢宁"], + "九〇四": ["904"], + "漂水": ["溧水"], } +""" +搜索过滤配置 +""" +HOSPITAL_FILTER = ["医院", "省", "市", "县", "区", "州", "人民", "第一", "第二", "第三", "大学", "附属"] + +""" +分词配置 +""" +jieba.suggest_freq(('肿瘤', '医院'), True) + """ 模型配置 """ diff --git a/photo_review/photo_review.py b/photo_review/photo_review.py index 59a98cf..2b8bd27 100644 --- a/photo_review/photo_review.py +++ b/photo_review/photo_review.py @@ -7,15 +7,17 @@ from collections import defaultdict from time import sleep import cv2 +import jieba import requests -from sqlalchemy import update, or_ +from rapidfuzz import process, fuzz +from sqlalchemy import update from db import MysqlSession from db.mysql import BdYljg, BdYlks, ZxIeResult, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec from photo_review import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, PERSONAL_CASH_PAYMENT, \ PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, DEPARTMENT, DOCTOR, \ ADMISSION_ID, SETTLEMENT_ID, AGE, OCR, SETTLEMENT_IE, DISCHARGE_IE, COST_IE, PHHD_BATCH_SIZE, SLEEP_MINUTES, \ - UPPERCASE_MEDICAL_EXPENSES, HOSTNAME, HOSPITAL_ALIAS + UPPERCASE_MEDICAL_EXPENSES, HOSTNAME, HOSPITAL_ALIAS, HOSPITAL_FILTER from ucloud import ufile from util import image_util, util from util.data_util import handle_date, handle_decimal, parse_department, handle_name, \ @@ -189,6 +191,36 @@ def save_or_update_ie(table, pk_phhd, data): session.close() +def search_hospital(hospital): + def _filter_search_keywords(keywords): + keywords = [x for x in keywords if x not in HOSPITAL_FILTER] + result1 = "" + result2 = "" + for keyword in keywords: + if "医院" in keyword: + break + result2 = result1 + result1 = keyword + result = [result1] + if result2: + result.append(result2) + return result + + cut_list = jieba.lcut(hospital) + session = MysqlSession() + yljg = session.query(BdYljg.pk_yljg, BdYljg.name).filter(BdYljg.name.like(f"%{'%'.join(cut_list)}%")).all() + if not yljg: + filter_keywords = _filter_search_keywords(cut_list) + for filter_keyword in filter_keywords: + yljg = session.query(BdYljg.pk_yljg, BdYljg.name).filter(BdYljg.name.like(f"%{filter_keyword}%")).all() + if yljg: + break + session.close() + yljg = {row.pk_yljg: row.name for row in yljg} + best_match = process.extractOne(hospital, yljg, scorer=fuzz.partial_token_set_ratio) + return best_match + + def settlement_task(pk_phhd, settlement_list, identity): settlement_list_ie_result = information_extraction(SETTLEMENT_IE, settlement_list, identity) settlement_data = { @@ -241,25 +273,31 @@ def discharge_task(pk_phhd, discharge_record, identity): discharge_data["discharge_date"] = handle_date(discharge_data["discharge_date_str"]) if hospitals: - hospital_like_conditions = [] + match_hospitals = [] for hospital in hospitals: parsed_hospitals = parse_hospital(hospital) for parsed_hospital in parsed_hospitals: - hospital_index = parsed_hospital.find("医院") - if hospital_index != -1 and hospital_index != len(parsed_hospital) - 2: - parsed_hospital = parsed_hospital[:hospital_index + 2] + "%" + parsed_hospital[hospital_index + 2:] - hospital_like_conditions.append(BdYljg.name.like(f'%{parsed_hospital}%')) + search_result = search_hospital(parsed_hospital) + match_hospitals.append(search_result) + if search_result and search_result[1] == 100: + break for hospital_alias_key in HOSPITAL_ALIAS.keys(): if hospital_alias_key in parsed_hospital: for hospital_alias in HOSPITAL_ALIAS[hospital_alias_key]: new_hospital = parsed_hospital.replace(hospital_alias_key, hospital_alias) - hospital_like_conditions.append(BdYljg.name.like(f'%{new_hospital}%')) + match_hospitals.append(search_hospital(new_hospital)) break - session = MysqlSession() - yljg = session.query(BdYljg.pk_yljg, BdYljg.name).filter(or_(*hospital_like_conditions)).limit(1).one_or_none() - session.close() - if yljg: - discharge_data["pk_yljg"] = yljg.pk_yljg + best_match = None + best_score = 0 + for match_hospital in match_hospitals: + if match_hospital and match_hospital[1] > best_score: + best_match = match_hospital + best_score = match_hospital[1] + if best_score == 100: + break + + if best_match: + discharge_data["pk_yljg"] = best_match[2] if departments: department_values = [] for dept in departments: diff --git a/util/data_util.py b/util/data_util.py index aa97138..9f20831 100644 --- a/util/data_util.py +++ b/util/data_util.py @@ -184,5 +184,6 @@ def parse_hospital(string): string = util.traditional_to_simple_chinese(string) string_without_brackets = string.replace(")", "").replace(")", "").replace("(", " ").replace("(", " ") string_without_company = string_without_brackets.replace("有限公司", "") - result += string_without_company.split(" ") + split_hospitals = string_without_company.replace("医院", "医院 ") + result += split_hospitals.strip().split(" ") return result