使用分词模糊查询优化医院的匹配

This commit is contained in:
2024-08-16 16:04:59 +08:00
parent 729167abca
commit 478f98abfd
4 changed files with 68 additions and 19 deletions

View File

@@ -1,6 +1,6 @@
x-env:
&template
image: fcb_photo_review:1.12.6
image: fcb_photo_review:1.12.7
restart: always
services:

View File

@@ -1,5 +1,6 @@
import socket
import jieba
from paddlenlp import Taskflow
from paddleocr import PaddleOCR
@@ -67,14 +68,23 @@ COST_LIST_SCHEMA = PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPE
"""
# Hospital-name alias table: when a key occurs in a hospital name, the key is
# replaced with each of its values to produce alternative names to search.
# For efficiency, only the first key that matches is replaced.
HOSPITAL_ALIAS = {
"江阴": ["江阴市"],
"溧阳": ["溧阳市"],
"六合": ["六合区"],
"沐阳": ["沭阳"],
"连水": ["涟水"],
"中医医院": ["中医院"],
"唯宁": ["睢宁"],
"九〇四": ["904"],
"漂水": ["溧水"],
}
"""
搜索过滤配置
"""
# Segments listed here are dropped from the segmented hospital name before the
# fallback search keywords are chosen.
# NOTE(review): the empty-string entries may be characters lost in
# extraction/encoding of this view -- confirm against the repository copy.
HOSPITAL_FILTER = ["医院", "", "", "", "", "", "人民", "第一", "第二", "第三", "大学", "附属"]
"""
分词配置
"""
# Tune jieba's word frequencies so that "肿瘤" and "医院" are segmented as two
# separate words rather than one combined token.
jieba.suggest_freq(('肿瘤', '医院'), True)
"""
模型配置
"""

View File

@@ -7,15 +7,17 @@ from collections import defaultdict
from time import sleep
import cv2
import jieba
import requests
from sqlalchemy import update, or_
from rapidfuzz import process, fuzz
from sqlalchemy import update
from db import MysqlSession
from db.mysql import BdYljg, BdYlks, ZxIeResult, ZxIeCost, ZxIeDischarge, ZxIeSettlement, ZxPhhd, ZxPhrec
from photo_review import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, PERSONAL_CASH_PAYMENT, \
PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, DEPARTMENT, DOCTOR, \
ADMISSION_ID, SETTLEMENT_ID, AGE, OCR, SETTLEMENT_IE, DISCHARGE_IE, COST_IE, PHHD_BATCH_SIZE, SLEEP_MINUTES, \
UPPERCASE_MEDICAL_EXPENSES, HOSTNAME, HOSPITAL_ALIAS
UPPERCASE_MEDICAL_EXPENSES, HOSTNAME, HOSPITAL_ALIAS, HOSPITAL_FILTER
from ucloud import ufile
from util import image_util, util
from util.data_util import handle_date, handle_decimal, parse_department, handle_name, \
@@ -189,6 +191,36 @@ def save_or_update_ie(table, pk_phhd, data):
session.close()
def search_hospital(hospital):
    """Find the medical institution whose name best matches ``hospital``.

    The name is segmented with jieba and the segments, joined by ``%``
    wildcards, are used in a SQL ``LIKE`` query on ``BdYljg.name``. If that
    query returns no rows, looser single-keyword ``LIKE`` queries are tried
    in turn until one returns rows. The rows found are then ranked against
    ``hospital`` with ``fuzz.partial_token_set_ratio``.

    :param hospital: hospital name text to look up.
    :return: the result of ``process.extractOne`` over the mapping
        ``{pk_yljg: name}`` built from the rows found. Callers in this file
        read index 1 as the score and index 2 as the ``pk_yljg`` key, and
        treat a falsy result as "no match".
    """

    def _filter_search_keywords(keywords):
        """Choose up to two fallback keywords from the segmented name.

        Segments listed in ``HOSPITAL_FILTER`` are ignored. Scanning stops
        at the first remaining segment that contains "医院". The last
        segment seen before stopping comes first in the result, followed by
        the one before it when there is one.
        """
        kept = []
        for keyword in keywords:
            if keyword in HOSPITAL_FILTER:
                continue
            if "医院" in keyword:
                break
            kept.append(keyword)
        # NOTE(review): when nothing is kept the result is [""], which makes
        # the fallback LIKE pattern "%%". This mirrors the original behavior;
        # confirm whether matching against every row is intended.
        latest = kept[-1] if kept else ""
        earlier = kept[-2] if len(kept) > 1 else ""
        result = [latest]
        if earlier:
            result.append(earlier)
        return result

    cut_list = jieba.lcut(hospital)
    session = MysqlSession()
    try:
        # Strict pass: every segment must appear in the name, in order.
        yljg = session.query(BdYljg.pk_yljg, BdYljg.name).filter(
            BdYljg.name.like(f"%{'%'.join(cut_list)}%")).all()
        if not yljg:
            # Fallback pass: try single keywords until one yields rows.
            for filter_keyword in _filter_search_keywords(cut_list):
                yljg = session.query(BdYljg.pk_yljg, BdYljg.name).filter(
                    BdYljg.name.like(f"%{filter_keyword}%")).all()
                if yljg:
                    break
    finally:
        # Release the session even when a query raises.
        session.close()

    candidates = {row.pk_yljg: row.name for row in yljg}
    return process.extractOne(hospital, candidates, scorer=fuzz.partial_token_set_ratio)
def settlement_task(pk_phhd, settlement_list, identity):
settlement_list_ie_result = information_extraction(SETTLEMENT_IE, settlement_list, identity)
settlement_data = {
@@ -241,25 +273,31 @@ def discharge_task(pk_phhd, discharge_record, identity):
discharge_data["discharge_date"] = handle_date(discharge_data["discharge_date_str"])
if hospitals:
hospital_like_conditions = []
match_hospitals = []
for hospital in hospitals:
parsed_hospitals = parse_hospital(hospital)
for parsed_hospital in parsed_hospitals:
hospital_index = parsed_hospital.find("医院")
if hospital_index != -1 and hospital_index != len(parsed_hospital) - 2:
parsed_hospital = parsed_hospital[:hospital_index + 2] + "%" + parsed_hospital[hospital_index + 2:]
hospital_like_conditions.append(BdYljg.name.like(f'%{parsed_hospital}%'))
search_result = search_hospital(parsed_hospital)
match_hospitals.append(search_result)
if search_result and search_result[1] == 100:
break
for hospital_alias_key in HOSPITAL_ALIAS.keys():
if hospital_alias_key in parsed_hospital:
for hospital_alias in HOSPITAL_ALIAS[hospital_alias_key]:
new_hospital = parsed_hospital.replace(hospital_alias_key, hospital_alias)
hospital_like_conditions.append(BdYljg.name.like(f'%{new_hospital}%'))
match_hospitals.append(search_hospital(new_hospital))
break
session = MysqlSession()
yljg = session.query(BdYljg.pk_yljg, BdYljg.name).filter(or_(*hospital_like_conditions)).limit(1).one_or_none()
session.close()
if yljg:
discharge_data["pk_yljg"] = yljg.pk_yljg
best_match = None
best_score = 0
for match_hospital in match_hospitals:
if match_hospital and match_hospital[1] > best_score:
best_match = match_hospital
best_score = match_hospital[1]
if best_score == 100:
break
if best_match:
discharge_data["pk_yljg"] = best_match[2]
if departments:
department_values = []
for dept in departments:

View File

@@ -184,5 +184,6 @@ def parse_hospital(string):
string = util.traditional_to_simple_chinese(string)
string_without_brackets = string.replace(")", "").replace("", "").replace("(", " ").replace("", " ")
string_without_company = string_without_brackets.replace("有限公司", "")
result += string_without_company.split(" ")
split_hospitals = string_without_company.replace("医院", "医院 ")
result += split_hospitals.strip().split(" ")
return result