使用分词模糊查询优化医院的匹配

This commit is contained in:
2024-08-16 16:04:59 +08:00
parent 729167abca
commit 478f98abfd
4 changed files with 68 additions and 19 deletions

View File

@@ -1,6 +1,6 @@
x-env: x-env:
&template &template
image: fcb_photo_review:1.12.6 image: fcb_photo_review:1.12.7
restart: always restart: always
services: services:

View File

@@ -1,5 +1,6 @@
import socket import socket
import jieba
from paddlenlp import Taskflow from paddlenlp import Taskflow
from paddleocr import PaddleOCR from paddleocr import PaddleOCR
@@ -67,14 +68,23 @@ COST_LIST_SCHEMA = PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPE
""" """
# 使用别名中的value替换key。考虑到效率问题只会替换第一个匹配到的key。 # 使用别名中的value替换key。考虑到效率问题只会替换第一个匹配到的key。
HOSPITAL_ALIAS = { HOSPITAL_ALIAS = {
"江阴": ["江阴市"],
"溧阳": ["溧阳市"],
"六合": ["六合区"],
"沐阳": ["沭阳"], "沐阳": ["沭阳"],
"连水": ["涟水"], "连水": ["涟水"],
"中医医院": ["中医院"], "唯宁": ["睢宁"],
"九〇四": ["904"],
"漂水": ["溧水"],
} }
"""
搜索过滤配置
"""
HOSPITAL_FILTER = ["医院", "", "", "", "", "", "人民", "第一", "第二", "第三", "大学", "附属"]
"""
分词配置
"""
jieba.suggest_freq(('肿瘤', '医院'), True)
""" """
模型配置 模型配置
""" """

View File

@@ -7,15 +7,17 @@ from collections import defaultdict
from time import sleep from time import sleep
import cv2 import cv2
import jieba
import requests import requests
from sqlalchemy import update, or_ from rapidfuzz import process, fuzz
from sqlalchemy import update
from db import MysqlSession from db import MysqlSession
from db.mysql import BdYljg, BdYlks, ZxIeResult, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec from db.mysql import BdYljg, BdYlks, ZxIeResult, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec
from photo_review import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, PERSONAL_CASH_PAYMENT, \ from photo_review import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, PERSONAL_CASH_PAYMENT, \
PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, DEPARTMENT, DOCTOR, \ PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, DEPARTMENT, DOCTOR, \
ADMISSION_ID, SETTLEMENT_ID, AGE, OCR, SETTLEMENT_IE, DISCHARGE_IE, COST_IE, PHHD_BATCH_SIZE, SLEEP_MINUTES, \ ADMISSION_ID, SETTLEMENT_ID, AGE, OCR, SETTLEMENT_IE, DISCHARGE_IE, COST_IE, PHHD_BATCH_SIZE, SLEEP_MINUTES, \
UPPERCASE_MEDICAL_EXPENSES, HOSTNAME, HOSPITAL_ALIAS UPPERCASE_MEDICAL_EXPENSES, HOSTNAME, HOSPITAL_ALIAS, HOSPITAL_FILTER
from ucloud import ufile from ucloud import ufile
from util import image_util, util from util import image_util, util
from util.data_util import handle_date, handle_decimal, parse_department, handle_name, \ from util.data_util import handle_date, handle_decimal, parse_department, handle_name, \
@@ -189,6 +191,36 @@ def save_or_update_ie(table, pk_phhd, data):
session.close() session.close()
def search_hospital(hospital):
    """Find the medical institution whose name best matches ``hospital``.

    The name is tokenized with jieba and the tokens are joined with ``%`` to
    build an ordered SQL ``LIKE`` pattern.  When that pattern matches no row,
    up to two fallback keywords (the tokens that immediately precede the first
    token containing "医院", with ``HOSPITAL_FILTER`` entries removed) are
    tried one at a time.  The rows found are then ranked against the original
    string with ``fuzz.partial_token_set_ratio``.

    :param hospital: hospital name to look up.
    :return: the ``process.extractOne`` result for a mapping of
        ``pk_yljg -> name`` (callers read index 1 as the score and index 2 as
        the ``pk_yljg`` key), or ``None`` when the input is blank or no
        candidate row was found.
    """

    def _filter_search_keywords(keywords):
        """Return the fallback keywords, nearest-to-"医院" first."""
        keywords = [x for x in keywords if x not in HOSPITAL_FILTER]
        result1 = ""
        result2 = ""
        for keyword in keywords:
            if "医院" in keyword:
                break
            # Keep a sliding window of the last two tokens seen so far.
            result2 = result1
            result1 = keyword
        # An empty keyword would turn into LIKE '%%', which matches every
        # row in the table, so only non-empty keywords are returned.
        return [keyword for keyword in (result1, result2) if keyword]

    # A blank name carries no information and would also degenerate into
    # LIKE '%%'; skip the database round trip entirely.
    if not hospital or not hospital.strip():
        return None

    cut_list = jieba.lcut(hospital)
    session = MysqlSession()
    try:
        yljg = session.query(BdYljg.pk_yljg, BdYljg.name).filter(
            BdYljg.name.like(f"%{'%'.join(cut_list)}%")).all()
        if not yljg:
            for filter_keyword in _filter_search_keywords(cut_list):
                yljg = session.query(BdYljg.pk_yljg, BdYljg.name).filter(
                    BdYljg.name.like(f"%{filter_keyword}%")).all()
                if yljg:
                    break
    finally:
        # Release the session even when a query raises.
        session.close()

    candidates = {row.pk_yljg: row.name for row in yljg}
    return process.extractOne(hospital, candidates, scorer=fuzz.partial_token_set_ratio)
def settlement_task(pk_phhd, settlement_list, identity): def settlement_task(pk_phhd, settlement_list, identity):
settlement_list_ie_result = information_extraction(SETTLEMENT_IE, settlement_list, identity) settlement_list_ie_result = information_extraction(SETTLEMENT_IE, settlement_list, identity)
settlement_data = { settlement_data = {
@@ -241,25 +273,31 @@ def discharge_task(pk_phhd, discharge_record, identity):
discharge_data["discharge_date"] = handle_date(discharge_data["discharge_date_str"]) discharge_data["discharge_date"] = handle_date(discharge_data["discharge_date_str"])
if hospitals: if hospitals:
hospital_like_conditions = [] match_hospitals = []
for hospital in hospitals: for hospital in hospitals:
parsed_hospitals = parse_hospital(hospital) parsed_hospitals = parse_hospital(hospital)
for parsed_hospital in parsed_hospitals: for parsed_hospital in parsed_hospitals:
hospital_index = parsed_hospital.find("医院") search_result = search_hospital(parsed_hospital)
if hospital_index != -1 and hospital_index != len(parsed_hospital) - 2: match_hospitals.append(search_result)
parsed_hospital = parsed_hospital[:hospital_index + 2] + "%" + parsed_hospital[hospital_index + 2:] if search_result and search_result[1] == 100:
hospital_like_conditions.append(BdYljg.name.like(f'%{parsed_hospital}%')) break
for hospital_alias_key in HOSPITAL_ALIAS.keys(): for hospital_alias_key in HOSPITAL_ALIAS.keys():
if hospital_alias_key in parsed_hospital: if hospital_alias_key in parsed_hospital:
for hospital_alias in HOSPITAL_ALIAS[hospital_alias_key]: for hospital_alias in HOSPITAL_ALIAS[hospital_alias_key]:
new_hospital = parsed_hospital.replace(hospital_alias_key, hospital_alias) new_hospital = parsed_hospital.replace(hospital_alias_key, hospital_alias)
hospital_like_conditions.append(BdYljg.name.like(f'%{new_hospital}%')) match_hospitals.append(search_hospital(new_hospital))
break break
session = MysqlSession() best_match = None
yljg = session.query(BdYljg.pk_yljg, BdYljg.name).filter(or_(*hospital_like_conditions)).limit(1).one_or_none() best_score = 0
session.close() for match_hospital in match_hospitals:
if yljg: if match_hospital and match_hospital[1] > best_score:
discharge_data["pk_yljg"] = yljg.pk_yljg best_match = match_hospital
best_score = match_hospital[1]
if best_score == 100:
break
if best_match:
discharge_data["pk_yljg"] = best_match[2]
if departments: if departments:
department_values = [] department_values = []
for dept in departments: for dept in departments:

View File

@@ -184,5 +184,6 @@ def parse_hospital(string):
string = util.traditional_to_simple_chinese(string) string = util.traditional_to_simple_chinese(string)
string_without_brackets = string.replace(")", "").replace("", "").replace("(", " ").replace("", " ") string_without_brackets = string.replace(")", "").replace("", "").replace("(", " ").replace("", " ")
string_without_company = string_without_brackets.replace("有限公司", "") string_without_company = string_without_brackets.replace("有限公司", "")
result += string_without_company.split(" ") split_hospitals = string_without_company.replace("医院", "医院 ")
result += split_hospitals.strip().split(" ")
return result return result