From d5181c33b84b7d573db22bd2acc18194adc98b6b Mon Sep 17 00:00:00 2001
From: liuyebo <1515783401@qq.com>
Date: Tue, 20 Aug 2024 16:54:36 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E7=A7=91=E5=AE=A4=E7=9A=84?=
 =?UTF-8?q?=E5=8C=B9=E9=85=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 photo_review/photo_review.py | 61 +++++++++++++++++++++++++++++------
 util/data_util.py            | 31 ++++++------------
 2 files changed, 61 insertions(+), 31 deletions(-)

diff --git a/photo_review/photo_review.py b/photo_review/photo_review.py
index 69fccce..e4f1e38 100644
--- a/photo_review/photo_review.py
+++ b/photo_review/photo_review.py
@@ -221,6 +221,40 @@ def search_hospital(hospital):
     return best_match
 
 
+def search_department(department):
+    """Return the best fuzzy match for ``department`` among BdYlks names.
+
+    Candidate rows are fetched with a LIKE pattern that joins the jieba
+    tokens of ``department`` in order; when that finds nothing, each token
+    is tried on its own until one of them returns rows.
+
+    Returns whatever ``process.extractOne`` yields for a ``{pk_ylks: name}``
+    mapping (callers read index 1 as the score and index 2 as the pk_ylks
+    key), or None when the input is empty or no candidate row exists.
+    """
+    cut_list = jieba.lcut(department) if department else []
+    if not cut_list:
+        # An empty pattern would degrade to LIKE '%%' and match every row.
+        return None
+    session = MysqlSession()
+    try:
+        ylks = session.query(BdYlks.pk_ylks, BdYlks.name) \
+            .filter(BdYlks.name.like(f"%{'%'.join(cut_list)}%")).all()
+        if not ylks:
+            for filter_keyword in cut_list:
+                ylks = session.query(BdYlks.pk_ylks, BdYlks.name) \
+                    .filter(BdYlks.name.like(f"%{filter_keyword}%")).all()
+                if ylks:
+                    break
+    finally:
+        # Close the session even if a query raises.
+        session.close()
+    if not ylks:
+        return None
+    candidates = {row.pk_ylks: row.name for row in ylks}
+    return process.extractOne(department, candidates, scorer=fuzz.partial_token_set_ratio)
+
+
 def settlement_task(pk_phhd, settlement_list, identity):
     settlement_list_ie_result = information_extraction(SETTLEMENT_IE, settlement_list, identity)
     settlement_data = {
@@ -299,17 +333,22 @@ def discharge_task(pk_phhd, discharge_record, identity):
         if best_match:
             discharge_data["pk_yljg"] = best_match[2]
     if departments:
-        department_values = []
-        for dept in departments:
-            department_values += parse_department(dept)
-        department_values = list(set(department_values))
-        if department_values:
-            session = MysqlSession()
-            ylks = session.query(BdYlks.pk_ylks, BdYlks.name) \
-                .filter(BdYlks.name.in_(department_values)).limit(1).one_or_none()
-            session.close()
-            if ylks:
-                discharge_data["pk_ylks"] = ylks.pk_ylks
+        best_department = None
+        best_score = 0
+        for department in departments:
+            for parsed_department in parse_department(department):
+                search_result = search_department(parsed_department)
+                if search_result and search_result[1] > best_score:
+                    best_department = search_result
+                    best_score = search_result[1]
+                if best_score >= 100:
+                    break
+            # A perfect score cannot be beaten, so stop querying altogether
+            # instead of only leaving the inner loop.
+            if best_score >= 100:
+                break
+        if best_department:
+            discharge_data["pk_ylks"] = best_department[2]
     save_or_update_ie(ZxIeDischarge, pk_phhd, discharge_data)
 
 
diff --git a/util/data_util.py b/util/data_util.py
index 9f20831..ee0d5c9 100644
--- a/util/data_util.py
+++ b/util/data_util.py
@@ -102,26 +102,17 @@ def parse_department(string):
     result = []
     if not string:
         return result
-    string = re.sub(r'\([^()]*\)|\[[^\[\]]*]|\{[^{}]*}|([^()]*)|[^⺀-鿿]', '', string)[:255]
-    if string == "科":
-        return result
-    result.append(string)
-    string_without_num = re.sub(r'\d|一|二|三|四|五|六|七|八|九|十', '', string)
-    if string == "科":
-        return result
-    if string_without_num != string:
-        result.append(string_without_num)
-    pure_string = string_without_num.split("科")[0] + "科"
-    if string == "科":
-        return result
-    if pure_string != string_without_num:
-        result.append(pure_string)
-    pure_string_without_io = pure_string.replace("内", "").replace("外", "")
-    if string == "科":
-        return result
-    if pure_string_without_io != pure_string:
-        result.append(pure_string)
-    return result
+
+    # An opening bracket separates two names; closing brackets are dropped
+    # by the next substitution along with every other non-CJK character.
+    string = re.sub(r'[(\uff08]', ' ', string)
+    string = re.sub(r'[^⺀-鿿 ]', '', string)  # keep only CJK characters and spaces
+    string = re.sub(r'[一二三四五六七八九十]', '', string)  # drop Chinese numerals
+    string = string.replace("科", " ")  # "科" ends a department name, so split on it
+    for department in string.split():
+        # Always return a list, deduplicated in first-seen order, so the
+        # empty and non-empty cases agree and iteration order is stable.
+        if department not in result:
+            result.append(department)
+    return result
 
 
 # 处理姓名类数据