diff --git a/config/mysql.py b/config/mysql.py index 69931fe..6fd9a77 100644 --- a/config/mysql.py +++ b/config/mysql.py @@ -16,7 +16,7 @@ PASSWORD = 'test9Root' DB_URL = f'mysql+pymysql://{USERNAME}:{PASSWORD}@{HOSTNAME}:{PORT}/{DATABASE}' # 是否打印执行的sql -SHOW_SQL = False +SHOW_SQL = True Engine = create_engine(DB_URL, echo=SHOW_SQL) Base = declarative_base(Engine) diff --git a/config/photo_review.py b/config/photo_review.py index 89a307a..08dc4aa 100644 --- a/config/photo_review.py +++ b/config/photo_review.py @@ -6,3 +6,6 @@ PHHD_BATCH_SIZE = 10 # 没有查询到案子的等待时间(分钟) SLEEP_MINUTES = 5 + +# 是否发送报错邮件 +SEND_ERROR_EMAIL = True diff --git a/main.py b/main.py index a947d9c..4ffa10f 100644 --- a/main.py +++ b/main.py @@ -3,7 +3,7 @@ import traceback from auto_email.error_email import send_an_error_email from config.log import LOGGING_CONFIG -from config.photo_review import RETRY_TIME +from config.photo_review import RETRY_TIME, SEND_ERROR_EMAIL from photo_review.photo_review import main # 项目必须从此处启动,否则代码中的相对路径可能导致错误的发生 @@ -19,5 +19,7 @@ if __name__ == '__main__': main() except Exception as e: log.error(traceback.format_exc()) - send_an_error_email(program_name='照片审核关键信息抽取脚本', error_name=repr(e), error_detail=traceback.format_exc()) + if SEND_ERROR_EMAIL: + send_an_error_email(program_name='照片审核关键信息抽取脚本', error_name=repr(e), + error_detail=traceback.format_exc()) continue diff --git a/photo_review/photo_review.py b/photo_review/photo_review.py index 8bb3a84..2338575 100644 --- a/photo_review/photo_review.py +++ b/photo_review/photo_review.py @@ -3,6 +3,7 @@ import logging from time import sleep from paddlenlp import Taskflow +from sqlalchemy import update from config.keys import PATIENT_NAME, ADMISSION_DATE, DISCHARGE_DATE, MEDICAL_EXPENSES, PERSONAL_CASH_PAYMENT, \ PERSONAL_ACCOUNT_PAYMENT, PERSONAL_FUNDED_AMOUNT, MEDICAL_INSURANCE_TYPE, HOSPITAL, DEPARTMENT, DOCTOR @@ -16,7 +17,7 @@ from photo_review.entity.zx_ie_settlement import ZxIeSettlement from photo_review.entity.zx_ocr import ZxOcr from photo_review.entity.zx_phhd import ZxPhhd from photo_review.entity.zx_phrec import ZxPhrec -from photo_review.util.data_util import handle_date, handle_decimal +from photo_review.util.data_util import handle_date, handle_decimal, handle_department from photo_review.util.ucloud import get_private_url @@ -27,6 +28,7 @@ def information_extraction(schema, phrecs, task_path): pic_path = get_private_url(phrec.cfjaddress) if pic_path: ie = Taskflow("information_extraction", schema=schema, model="uie-x-base", task_path=task_path) + # 批量抽取写法:(ie([{"doc": "./data/6.jpg"}, {"doc": "./data/7.jpg"}]) result = ie({"doc": pic_path}) result_json = json.dumps(result, ensure_ascii=False) @@ -173,13 +175,18 @@ def photo_review(pk_phhd): discharge_data["hospital"] = yljg.name department_value = get_values_of_keys(discharge_record_ie_result, department_key) if department_value: - session = MysqlSession() - ylks = session.query(BdYlks.pk_ylks, BdYlks.name) \ - .filter(BdYlks.name.in_(department_value)).limit(1).one_or_none() - session.close() - if ylks: - discharge_data["pk_ylks"] = ylks.pk_ylks - discharge_data["department"] = ylks.name + department_values = [] + for dept in department_value: + department_values += handle_department(dept) + department_values = list(set(department_values)) + if department_values: + session = MysqlSession() + ylks = session.query(BdYlks.pk_ylks, BdYlks.name) \ + .filter(BdYlks.name.in_(department_values)).limit(1).one_or_none() + session.close() + if ylks: + discharge_data["pk_ylks"] = ylks.pk_ylks + discharge_data["department"] = ylks.name save_or_update_ie(ZxIeDischarge, pk_phhd, discharge_data) cost_list_ie_result = information_extraction(cost_list_schema, cost_list, "config/model/cost_list_model") @@ -192,19 +199,16 @@ def photo_review(pk_phhd): } cost_data["admission_date"] = handle_date(cost_data["admission_date_str"]) cost_data["discharge_date"] = handle_date(cost_data["discharge_date_str"]) - cost_data["medical_expenses"] = handle_date(cost_data["medical_expenses_str"]) + cost_data["medical_expenses"] = handle_decimal(cost_data["medical_expenses_str"]) save_or_update_ie(ZxIeCost, pk_phhd, cost_data) def main(): - # 最后处理的报销案子pk - last_pk_phhd = 0 # 持续检测新案子 while 1: session = MysqlSession() phhds = session.query(ZxPhhd.pk_phhd) \ - .filter(ZxPhhd.pk_phhd > last_pk_phhd) \ - .filter(ZxPhhd.cStatus == '2') \ + .filter(ZxPhhd.exsuccess_flag == '1') \ .limit(PHHD_BATCH_SIZE) \ .all() session.close() @@ -212,7 +216,13 @@ def main(): for phhd in phhds: pk_phhd = phhd.pk_phhd photo_review(pk_phhd) - last_pk_phhd = pk_phhd + + # 识别完成更新标识 + session = MysqlSession() + stmt = (update(ZxPhhd).where(ZxPhhd.pk_phhd == pk_phhd).values(exsuccess_flag=8)) + session.execute(stmt) + session.commit() + session.close() else: # 没有查询到新案子,等待一段时间后再查 sleep_minutes = SLEEP_MINUTES diff --git a/photo_review/util/data_util.py b/photo_review/util/data_util.py index a6f951c..36f0c5d 100644 --- a/photo_review/util/data_util.py +++ b/photo_review/util/data_util.py @@ -6,7 +6,12 @@ from datetime import datetime def handle_decimal(string): if not string: return "" - return re.sub(r'[^0-9.]', '', string) + string = re.sub(r'[^0-9.]', '', string) + front, back = string.rsplit('.', 1) + front = front.replace(".", "") + if back: + back = "." + back + return front + back # 处理日期类数据 @@ -14,8 +19,13 @@ def handle_date(string): if not string: return "" - string = string.replace("年", "-").replace("月", "-").replace("日", "") + string = string.replace("年", "-").replace("月", "-").replace("日", "").replace("/", "-").replace(".", "-") string = re.sub(r'[^0-9-]', '', string) + length = len(string) + if length > 8 and "-" not in string: + string = string[:8] + elif length > 10 and "-" in string: + string = string[:10] if is_valid_date_format(string): return string else: @@ -48,3 +58,20 @@ def is_valid_date_format(date_str): pass return False + + +def handle_department(string): + result = [] + if not string: + return result + result.append(string) + string_without_num = re.sub(r'\d|一|二|三|四|五|六|七|八|九|十', '', string) + if string_without_num != string: + result.append(string_without_num) + string_without_brackets = re.sub(r'\([^()]*\)|\[[^\[\]]*\]|\{[^\{\}]*\}|([^()]*)', "", string_without_num) + if string_without_brackets != string_without_num: + result.append(string_without_brackets) + pure_string = string_without_brackets.split("科")[0] + "科" + if pure_string != string_without_brackets: + result.append(pure_string) + return result