更新OCR版本,Beta版,还不能上线

This commit is contained in:
2025-09-15 15:41:30 +08:00
parent d266c2828c
commit 670172e79e
9 changed files with 117 additions and 110 deletions

View File

@@ -2,9 +2,9 @@ import jieba
from paddlenlp import Taskflow
from paddleocr import PaddleOCR
'''
"""
项目配置
'''
"""
# 每次从数据库获取的案子数量
PHHD_BATCH_SIZE = 10
# 没有查询到案子的等待时间(分钟)
@@ -18,35 +18,35 @@ LAYOUT_ANALYSIS = False
信息抽取关键词配置
"""
# 患者姓名
PATIENT_NAME = ['患者姓名']
PATIENT_NAME = ["患者姓名"]
# 入院日期
ADMISSION_DATE = ['入院日期']
ADMISSION_DATE = ["入院日期"]
# 出院日期
DISCHARGE_DATE = ['出院日期']
DISCHARGE_DATE = ["出院日期"]
# 发生医疗费
MEDICAL_EXPENSES = ['费用总额']
MEDICAL_EXPENSES = ["费用总额"]
# 个人现金支付
PERSONAL_CASH_PAYMENT = ['个人现金支付']
PERSONAL_CASH_PAYMENT = ["个人现金支付"]
# 个人账户支付
PERSONAL_ACCOUNT_PAYMENT = ['个人账户支付']
PERSONAL_ACCOUNT_PAYMENT = ["个人账户支付"]
# 个人自费金额
PERSONAL_FUNDED_AMOUNT = ['自费金额', '个人自费']
PERSONAL_FUNDED_AMOUNT = ["自费金额", "个人自费"]
# 医保类别
MEDICAL_INSURANCE_TYPE = ['医保类型']
MEDICAL_INSURANCE_TYPE = ["医保类型"]
# 就诊医院
HOSPITAL = ['医院']
HOSPITAL = ["医院"]
# 就诊科室
DEPARTMENT = ['科室']
DEPARTMENT = ["科室"]
# 主治医生
DOCTOR = ['主治医生']
DOCTOR = ["主治医生"]
# 住院号
ADMISSION_ID = ['住院号']
ADMISSION_ID = ["住院号"]
# 医保结算单号码
SETTLEMENT_ID = ['医保结算单号码']
SETTLEMENT_ID = ["医保结算单号码"]
# 年龄
AGE = ['年龄']
AGE = ["年龄"]
# 大写总额
UPPERCASE_MEDICAL_EXPENSES = ['大写总额']
UPPERCASE_MEDICAL_EXPENSES = ["大写总额"]
SETTLEMENT_LIST_SCHEMA = \
(PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES + PERSONAL_CASH_PAYMENT
@@ -58,57 +58,55 @@ DISCHARGE_RECORD_SCHEMA = \
COST_LIST_SCHEMA = PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES
'''
"""
别名配置
'''
"""
# 使用别名中的value替换key。考虑到效率问题只会替换第一个匹配到的key。
HOSPITAL_ALIAS = {
'沐阳': ['沭阳'],
'连水': ['涟水'],
'唯宁': ['睢宁'], # 雕宁
'九〇四': ['904'],
'漂水': ['溧水'],
"沐阳": ["沭阳"],
"连水": ["涟水"],
"唯宁": ["睢宁"], # 雕宁
"九〇四": ["904"],
"漂水": ["溧水"],
}
DEPARTMENT_ALIAS = {
'耳鼻喉': ['耳鼻咽喉'],
'急症': ['急诊'],
"耳鼻喉": ["耳鼻咽喉"],
"急症": ["急诊"],
}
'''
"""
搜索过滤配置
'''
"""
# 默认会过滤单字
HOSPITAL_FILTER = ['医院', '人民', '第一', '第二', '第三', '大学', '附属']
HOSPITAL_FILTER = ["医院", "人民", "第一", "第二", "第三", "大学", "附属"]
DEPARTMENT_FILTER = ['', '', '西', '']
DEPARTMENT_FILTER = ["", "", "西", ""]
'''
"""
分词配置
'''
jieba.suggest_freq(('肿瘤', '医院'), True)
jieba.suggest_freq(('', ''), True)
jieba.suggest_freq(('感染', ''), True)
jieba.suggest_freq(('', ''), True)
jieba.suggest_freq(('', ''), True)
"""
jieba.suggest_freq(("肿瘤", "医院"), True)
jieba.suggest_freq(("", ""), True)
jieba.suggest_freq(("感染", ""), True)
jieba.suggest_freq(("", ""), True)
jieba.suggest_freq(("", ""), True)
'''
"""
模型配置
'''
SETTLEMENT_IE = Taskflow('information_extraction', schema=SETTLEMENT_LIST_SCHEMA, model='uie-x-base',
task_path='model/settlement_list_model', layout_analysis=LAYOUT_ANALYSIS, precision='fp16')
DISCHARGE_IE = Taskflow('information_extraction', schema=DISCHARGE_RECORD_SCHEMA, model='uie-x-base',
task_path='model/discharge_record_model', layout_analysis=LAYOUT_ANALYSIS, precision='fp16')
COST_IE = Taskflow('information_extraction', schema=COST_LIST_SCHEMA, model='uie-x-base', device_id=1,
task_path='model/cost_list_model', layout_analysis=LAYOUT_ANALYSIS, precision='fp16')
"""
SETTLEMENT_IE = Taskflow("information_extraction", schema=SETTLEMENT_LIST_SCHEMA, model="uie-x-base",
task_path="model/settlement_list_model", layout_analysis=LAYOUT_ANALYSIS, precision="fp16")
DISCHARGE_IE = Taskflow("information_extraction", schema=DISCHARGE_RECORD_SCHEMA, model="uie-x-base",
task_path="model/discharge_record_model", layout_analysis=LAYOUT_ANALYSIS, precision="fp16")
COST_IE = Taskflow("information_extraction", schema=COST_LIST_SCHEMA, model="uie-x-base", device_id=1,
task_path="model/cost_list_model", layout_analysis=LAYOUT_ANALYSIS, precision="fp16")
OCR = PaddleOCR(
gpu_id=1,
use_angle_cls=False,
show_log=False,
det_db_thresh=0.1,
det_db_box_thresh=0.3,
det_limit_side_len=1248,
drop_score=0.3,
rec_model_dir='model/ocr/openatom_rec_repsvtr_ch_infer',
rec_algorithm='SVTR_LCNet',
)
device="gpu:0",
ocr_version="PP-OCRv4",
use_textline_orientation=False,
# 检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点
text_det_thresh=0.1,
# 检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域
text_det_box_thresh=0.3,
)

View File

@@ -36,14 +36,15 @@ def merge_result(result1, result2):
return result1
def ie_temp_image(ie, ocr, image):
def ie_temp_image(ie, ocr, image, is_screenshot=False):
with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
cv2.imwrite(temp_file.name, image)
ie_result = []
ocr_pure_text = ''
angle = '0'
try:
layout = util.get_ocr_layout(ocr, temp_file.name)
layout, angle = util.get_ocr_layout(ocr, temp_file.name, is_screenshot)
if not layout:
# 无识别结果
ie_result = []
@@ -61,7 +62,7 @@ def ie_temp_image(ie, ocr, image):
os.remove(temp_file.name)
except Exception as e:
logging.info(f"删除临时文件 {temp_file.name} 时出错", exc_info=e)
return ie_result, ocr_pure_text
return ie_result, ocr_pure_text, angle
# 关键信息提取
@@ -159,7 +160,7 @@ def information_extraction(ie, phrecs, identity):
if not img_path:
continue
image = image_util.read(img_path)
image, exif_data = image_util.read(img_path)
if image is None:
# 图片可能因为某些原因获取不到
continue
@@ -175,7 +176,7 @@ def information_extraction(ie, phrecs, identity):
if text:
info_extract = ie(text)[0]
else:
info_extract = ie_temp_image(ie, OCR, image)[0]
info_extract = ie_temp_image(ie, OCR, image, True)[0]
ie_result = {'result': info_extract, 'angle': '0'}
now = util.get_default_datetime()
@@ -193,27 +194,20 @@ def information_extraction(ie, phrecs, identity):
result = merge_result(result, ie_result['result'])
else:
is_screenshot = image_util.is_screenshot(image, exif_data)
target_images = []
# target_images += detector.request_book_areas(image) # 识别文档区域并裁剪
if not target_images:
target_images.append(image) # 识别失败
angle_count = defaultdict(int, {'0': 0}) # 分割后图片的最优角度统计
for target_image in target_images:
# dewarped_image = dewarp.dewarp_image(target_image) # 去扭曲
dewarped_image = target_image
angles = image_util.parse_rotation_angles(dewarped_image)
split_results = image_util.split(dewarped_image)
split_results = image_util.split(target_image)
for split_result in split_results:
if split_result['img'] is None or split_result['img'].size == 0:
continue
rotated_img = image_util.rotate(split_result['img'], int(angles[0]))
ie_temp_result = ie_temp_image(ie, OCR, rotated_img)
ie_temp_result = ie_temp_image(ie, OCR, split_result['img'], is_screenshot)
ocr_text += ie_temp_result[1]
ie_results = [{'result': ie_temp_result[0], 'angle': angles[0]}]
if not ie_results[0]['result'] or len(ie_results[0]['result']) < len(ie.kwargs.get('schema')):
rotated_img = image_util.rotate(split_result['img'], int(angles[1]))
ie_results.append({'result': ie_temp_image(ie, OCR, rotated_img)[0], 'angle': angles[1]})
ie_results = [{'result': ie_temp_result[0], 'angle': ie_temp_result[2]}]
now = util.get_default_datetime()
best_angle = ['0', 0]
for ie_result in ie_results: