diff --git a/Dockerfile b/Dockerfile
index 39a4e3c..82fa0e1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,6 +15,7 @@ ENV PYTHONUNBUFFERED=1 \
 COPY requirements.txt /app/requirements.txt
 COPY packages /app/packages
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo '$TZ' > /etc/timezone \
+    && python3 -m pip install --upgrade pip \
     && pip install --no-cache-dir -r requirements.txt \
     && pip uninstall -y onnxruntime onnxruntime-gpu \
     && pip install onnxruntime-gpu==1.18.0 --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
diff --git a/Dockerfile.dev b/Dockerfile.dev
new file mode 100644
index 0000000..4a477f6
--- /dev/null
+++ b/Dockerfile.dev
@@ -0,0 +1,33 @@
+# Use the official PaddleX image as the base
+FROM ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.1.2-paddlepaddle3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5
+
+# Set the working directory
+WORKDIR /app
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1 \
+    # Set the time zone
+    TZ=Asia/Shanghai \
+    # Set the pip mirror URL to speed up installs
+    PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple
+
+# Install language-pack-en and openssh-server
+RUN apt update && \
+    apt install -y language-pack-en && \
+    apt install -y openssh-server
+
+# Configure the SSH service
+RUN mkdir /var/run/sshd && \
+    # Set the root password; change as needed
+    echo 'root:fcb0102' | chpasswd && \
+    # Allow root login over SSH
+    sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
+
+# Copy the current directory into /app in the container
+COPY . /app
+
+# Expose port 22
+EXPOSE 22
+
+# Start the SSH service
+CMD ["/usr/sbin/sshd", "-D"]
\ No newline at end of file
diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml
new file mode 100644
index 0000000..790bde0
--- /dev/null
+++ b/docker-compose.dev.yml
@@ -0,0 +1,26 @@
+services:
+  fcb_ai_dev:
+    image: fcb_ai_dev:0.0.10
+    build:
+      context: .
+      dockerfile: Dockerfile.dev
+    # Container name; customize if needed
+    container_name: fcb_ai_dev
+    hostname: fcb_ai_dev
+    # Always restart the container
+    restart: always
+    # Port mapping; change the host port as needed
+    ports:
+      - "8022:22"
+    # Volume mappings; adjust to the actual paths
+    volumes:
+      - ./log:/app/log
+      - ./model:/app/model
+    # Enable GPU support
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - device_ids: [ '0', '1' ]
+              capabilities: [ 'gpu' ]
+              driver: 'nvidia'
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 4b8e985..e50fc9b 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,6 +1,6 @@
 x-env: &template
-  image: fcb_photo_review:1.15.4
+  image: fcb_photo_review:1.15.7
   restart: always
 
 
 x-review:
diff --git a/document/Linux下搭建和部署Paddle相关项目.docx b/document/Linux下搭建和部署Paddle相关项目.docx
new file mode 100644
index 0000000..73d37c1
Binary files /dev/null and b/document/Linux下搭建和部署Paddle相关项目.docx differ
diff --git a/document/OCR工作效率统计.xlsx b/document/OCR工作效率统计.xlsx
new file mode 100644
index 0000000..45c2b27
Binary files /dev/null and b/document/OCR工作效率统计.xlsx differ
diff --git a/document/PaddleOCR命令.md b/document/PaddleOCR命令.md
new file mode 100644
index 0000000..9d1b66d
--- /dev/null
+++ b/document/PaddleOCR命令.md
@@ -0,0 +1,153 @@
+# PaddleOCR
+
+------
+
+## Dataset
+
+Everything in this section is done inside the PPOCRLabel directory
+
+```bash
+# Enter the PPOCRLabel directory
+cd .\PPOCRLabel\
+```
+
+### Labeling
+
+You can also run PPOCRLabel.py directly with PyCharm's Run, but it defaults to English
+
+```bash
+# Run the labeling app in Chinese
+python PPOCRLabel.py --lang ch
+# Labeling with key information extraction (KIE)
+python PPOCRLabel.py --lang ch --kie True
+```
+
+### Splitting the dataset
+
+```bash
+python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --datasetRootPath ../train_data/drivingData
+```
+
+------
+
+## Detection model
+
+Return to the project root directory first
+
+### Training
+
+```bash
+python tools/train.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_student.yml
+```
+
+### Testing
+
+```bash
+python tools/infer_det.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_student.yml -o Global.pretrained_model=output/det_v4_bankcard/best_accuracy.pdparams Global.infer_img=train_data/drivingData/1.jpg
+```
+
+### Resuming training
+
+```bash
+python tools/train.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_student.yml -o Global.checkpoints=./output/det_v4_bankcard/latest
+```
+
+------
+
+## Recognition model
+
+### Training
+
+```bash
+python tools/train.py -c configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_ampO2_ultra.yml
+```
+
+### Testing
+
+```bash
+python tools/infer_rec.py -c configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_ampO2_ultra.yml -o Global.pretrained_model=output/rec_v4_bankcard/best_accuracy.pdparams Global.infer_img=train_data/drivingData/crop_img/1_crop_0.jpg
+```
+
+------
+
+## Inference models
+
+### Converting the detection model
+
+```bash
+python tools/export_model.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_student.yml -o Global.pretrained_model=output/det_v4_bankcard/best_accuracy.pdparams
+```
+
+### Converting the recognition model
+
+```bash
+python tools/export_model.py -c configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_ampO2_ultra.yml -o Global.pretrained_model=output/rec_v4_bankcard/best_accuracy.pdparams
+```
+
+### Detection + recognition test
+
+```bash
+python tools/infer/predict_system.py --det_model_dir=inference_model/det_v4_bankcard --rec_model_dir=inference_model/rec_v4_bankcard --rec_char_dict_path=ppocr/utils/num_dict.txt --image_dir=train_data/drivingData/1.jpg
+```
+
+------
+
+## Mobile models (Paddle Lite)
+
+### Converting the detection model
+
+```bash
+paddle_lite_opt --model_file=inference_model/det_v4_bankcard/inference.pdmodel --param_file=inference_model/det_v4_bankcard/inference.pdiparams --optimize_out=inference_model/det_v4_nb_bankcard --valid_targets=arm --optimize_out_type=naive_buffer
+```
+
+### Converting the recognition model
+
+```bash
+paddle_lite_opt --model_file=inference_model/rec_v4_bankcard/inference.pdmodel --param_file=inference_model/rec_v4_bankcard/inference.pdiparams --optimize_out=inference_model/rec_v4_nb_bankcard --valid_targets=arm --optimize_out_type=naive_buffer
+```
+
+------
+
+------
+
+# PaddleNLP
+
+## Dataset
+
+Use Label Studio for data annotation; installation steps are omitted here
+
+```bash
+# Open Anaconda Prompt
+# Activate the environment where Label Studio is installed
+conda activate label-studio
+# Start Label Studio
+label-studio start
+```
+
+[Labeling workflow](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/applications/information_extraction/label_studio_doc.md)
+
+### Data conversion
+
+```bash
+# Run from inside PaddleNLP\applications\information_extraction
+python label_studio.py --label_studio_file ./document/data/label_studio.json --save_dir ./document/data --splits 0.8 0.1 0.1 --task_type ext
+```
+
+
+
+------
+
+## Model training
+
+```bash
+# Run from inside PaddleNLP\applications\information_extraction\document (two-GPU training)
+python -u -m paddle.distributed.launch --gpus "0,1" finetune.py --device gpu --logging_steps 5 --save_steps 25 --eval_steps 25 --seed 42 --model_name_or_path uie-x-base --output_dir ./checkpoint/model_best --train_path data/train.txt --dev_path data/dev.txt --max_seq_len 512 --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --num_train_epochs 10 --learning_rate 1e-5 --do_train --do_eval --do_export --export_model_dir ./checkpoint/model_best --overwrite_output_dir --disable_tqdm False --metric_for_best_model eval_f1 --load_best_model_at_end True --save_total_limit 1
+```
+
+------
+
+References:
+
+[PaddleOCR训练属于自己的模型详细教程](https://blog.csdn.net/qq_52852432/article/details/131817619?utm_medium=distribute.pc_relevant.none-task-blog-2~default~baidujs_baidulandingword~default-0-131817619-blog-124628731.235^v40^pc_relevant_3m_sort_dl_base1&spm=1001.2101.3001.4242.1&utm_relevant_index=3)
+[端侧部署](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.7/deploy/lite/readme_ch.md) +[PaddleNLP关键信息抽取](https://blog.csdn.net/z5z5z5z56/article/details/130346646) \ No newline at end of file diff --git a/document/paddle镜像自带依赖.md b/document/paddle镜像自带依赖.md new file mode 100644 index 0000000..0fc10ff --- /dev/null +++ b/document/paddle镜像自带依赖.md @@ -0,0 +1,329 @@ +anyio 4.0.0 +astor 0.8.1 +certifi 2019.11.28 +chardet 3.0.4 +dbus-python 1.2.16 +decorator 5.1.1 +distro-info 0.23+ubuntu1.1 +exceptiongroup 1.1.3 +h11 0.14.0 +httpcore 1.0.2 +httpx 0.25.1 +idna 2.8 +numpy 1.26.2 +opt-einsum 3.3.0 +paddlepaddle-gpu 2.6.1.post120 +Pillow 10.1.0 +pip 24.0 +protobuf 4.25.0 +PyGObject 3.36.0 +python-apt 2.0.1+ubuntu0.20.4.1 +requests 2.22.0 +requests-unixsocket 0.2.0 +setuptools 68.2.2 +six 1.14.0 +sniffio 1.3.0 +unattended-upgrades 0.1 +urllib3 1.25.8 + + + + + + + +ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.1.0-gpu-cuda12.9-cudnn9.9: + +python:3.10.12 + +Package Version +------------------------ ---------- +anyio 4.9.0 +certifi 2025.6.15 +decorator 5.2.1 +exceptiongroup 1.3.0 +h11 0.16.0 +httpcore 1.0.9 +httpx 0.28.1 +idna 3.10 +networkx 3.4.2 +numpy 2.2.6 +nvidia-cublas-cu12 12.9.0.13 +nvidia-cuda-cccl-cu12 12.9.27 +nvidia-cuda-cupti-cu12 12.9.19 +nvidia-cuda-nvrtc-cu12 12.9.41 +nvidia-cuda-runtime-cu12 12.9.37 +nvidia-cudnn-cu12 9.9.0.52 +nvidia-cufft-cu12 11.4.0.6 +nvidia-cufile-cu12 1.14.0.30 +nvidia-curand-cu12 10.3.10.19 +nvidia-cusolver-cu12 11.7.4.40 +nvidia-cusparse-cu12 12.5.9.5 +nvidia-cusparselt-cu12 0.7.1 +nvidia-nccl-cu12 2.26.5 +nvidia-nvjitlink-cu12 12.9.41 +nvidia-nvtx-cu12 12.9.19 +opt-einsum 3.3.0 +paddlepaddle-gpu 3.1.0 +pillow 11.2.1 +pip 25.1.1 +protobuf 6.31.1 +setuptools 59.6.0 +sniffio 1.3.1 +typing_extensions 4.14.0 +wheel 0.37.1 + + + + + +ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.1.2-paddlepaddle3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 + +python:3.10.18 + +Package Version Editable project location +------------------------- -------------------- ------------------------- +aiohappyeyeballs 2.6.1 +aiohttp 3.12.13 +aiosignal 1.4.0 +aistudio_sdk 0.3.5 +albucore 0.0.13+pdx +albumentations 1.4.10+pdx +alembic 1.16.2 +annotated-types 0.7.0 +anyio 4.9.0 +astor 0.8.1 +asttokens 3.0.0 +async-timeout 4.0.3 +attrdict3 2.0.2 +attrs 25.3.0 +babel 2.17.0 +bce-python-sdk 0.9.35 +beautifulsoup4 4.13.4 +blinker 1.9.0 +cachetools 6.1.0 +certifi 2019.11.28 +cffi 1.17.1 +chardet 3.0.4 +charset-normalizer 3.4.2 +chinese-calendar 1.8.0 +click 8.2.1 +cloudpickle 3.1.1 +colorama 0.4.6 +colorlog 6.9.0 +ConfigSpace 1.2.1 +contourpy 1.3.2 +cssselect 1.3.0 +cssutils 2.11.1 +cycler 0.12.1 +Cython 3.1.2 +dataclasses-json 0.6.7 +datasets 3.6.0 +dbus-python 1.2.16 +decorator 5.2.1 +decord 0.6.0 +descartes 1.1.0 +dill 0.3.4 +distro 1.9.0 +distro-info 0.23+ubuntu1.1 +easydict 1.13 +einops 0.8.1 +et_xmlfile 2.0.0 +exceptiongroup 1.2.2 +executing 2.2.0 +faiss-cpu 1.8.0.post1 +fastapi 0.116.0 +filelock 3.18.0 +fire 0.7.0 +FLAML 2.3.5 +Flask 3.1.1 +flask-babel 4.0.0 +fonttools 4.58.5 +frozenlist 1.7.0 +fsspec 2025.3.0 +ftfy 6.3.1 +future 1.0.0 +gast 0.3.3 +GPUtil 1.4.0 +greenlet 3.2.3 +h11 0.14.0 +h5py 3.14.0 +hf-xet 1.1.5 +hpbandster 0.7.4 +httpcore 1.0.7 +httpx 0.28.1 +httpx-sse 0.4.1 +huggingface-hub 0.33.2 +idna 2.8 +imageio 2.37.0 +imagesize 1.4.1 +imgaug 0.4.0+pdx +ipython 8.37.0 +itsdangerous 2.2.0 +jedi 0.19.2 +jieba 0.42.1 +Jinja2 3.1.6 +jiter 0.10.0 +joblib 1.5.1 +jsonpatch 1.33 +jsonpointer 3.0.0 +jsonschema 4.24.0 +jsonschema-specifications 2025.4.1 +kiwisolver 
1.4.8 +langchain 0.3.26 +langchain-community 0.3.27 +langchain-core 0.3.68 +langchain-openai 0.3.27 +langchain-text-splitters 0.3.8 +langsmith 0.4.4 +lapx 0.5.11.post1 +lazy_loader 0.4 +llvmlite 0.44.0 +lmdb 1.6.2 +lxml 6.0.0 +Mako 1.3.10 +markdown-it-py 3.0.0 +MarkupSafe 3.0.2 +marshmallow 3.26.1 +matplotlib 3.5.3 +matplotlib-inline 0.1.7 +mdurl 0.1.2 +more-itertools 10.7.0 +motmetrics 1.4.0 +msgpack 1.1.1 +multidict 6.6.3 +multiprocess 0.70.12.2 +mypy_extensions 1.1.0 +netifaces 0.11.0 +networkx 3.4.2 +numba 0.61.2 +numpy 1.24.4 +nuscenes-devkit 1.1.11+pdx +onnx 1.17.0 +onnxoptimizer 0.3.13 +openai 1.93.1 +opencv-contrib-python 4.10.0.84 +openpyxl 3.1.5 +opt-einsum 3.3.0 +optuna 4.4.0 +orjson 3.10.18 +packaging 24.2 +paddle2onnx 2.0.2rc3 +paddle3d 0.0.0 +paddleclas 2.6.0 +paddledet 0.0.0 +paddlefsl 1.1.0 +paddlenlp 2.8.0.post0 +paddlepaddle-gpu 3.0.0 +paddleseg 0.0.0.dev0 +paddlets 1.1.0 +paddlex 3.1.2 /root/PaddleX +pandas 1.3.5 +parso 0.8.4 +patsy 1.0.1 +pexpect 4.9.0 +pillow 11.1.0 +pip 25.1.1 +polygraphy 0.49.24 +ppvideo 2.3.0 +premailer 3.10.0 +prettytable 3.16.0 +prompt_toolkit 3.0.51 +propcache 0.3.2 +protobuf 6.30.1 +psutil 7.0.0 +ptyprocess 0.7.0 +pure_eval 0.2.3 +py-cpuinfo 9.0.0 +pyarrow 20.0.0 +pybind11 2.13.6 +pybind11-stubgen 2.5.1 +pyclipper 1.3.0.post6 +pycocotools 2.0.8 +pycparser 2.22 +pycryptodome 3.23.0 +pydantic 2.11.7 +pydantic_core 2.33.2 +pydantic-settings 2.10.1 +Pygments 2.19.2 +PyGObject 3.36.0 +PyMatting 1.1.14 +pyod 2.0.5 +pypandoc 1.15 +pyparsing 3.2.3 +pypdfium2 4.30.1 +pyquaternion 0.9.9 +Pyro4 4.82 +python-apt 2.0.1+ubuntu0.20.4.1 +python-dateutil 2.9.0.post0 +python-docx 1.2.0 +python-dotenv 1.1.1 +pytz 2025.2 +PyWavelets 1.3.0 +PyYAML 6.0.2 +RapidFuzz 3.13.0 +rarfile 4.2 +ray 2.47.1 +referencing 0.36.2 +regex 2024.11.6 +requests 2.32.4 +requests-toolbelt 1.0.0 +requests-unixsocket 0.2.0 +rich 14.0.0 +rpds-py 0.26.0 +ruamel.yaml 0.18.14 +ruamel.yaml.clib 0.2.12 +safetensors 0.5.3 +scikit-image 0.25.2 +scikit-learn 1.3.2 +scipy 1.15.3 +seaborn 0.13.2 +sentencepiece 0.2.0 +seqeval 1.2.2 +serpent 1.41 +setuptools 68.2.2 +shap 0.48.0 +Shapely 1.8.5.post1 +shellingham 1.5.4 +six 1.14.0 +sklearn 0.0 +slicer 0.0.8 +sniffio 1.3.1 +soundfile 0.13.1 +soupsieve 2.7 +SQLAlchemy 2.0.41 +stack-data 0.6.3 +starlette 0.46.2 +statsmodels 0.14.1 +tenacity 9.1.2 +tensorboardX 2.6.4 +tensorrt 10.5.0 +termcolor 3.1.0 +terminaltables 3.1.10 +threadpoolctl 3.6.0 +tifffile 2025.5.10 +tiktoken 0.9.0 +tokenizers 0.19.1 +tomli 2.2.1 +tool_helpers 0.1.2 +tqdm 4.67.1 +traitlets 5.14.3 +typeguard 4.4.4 +typer 0.16.0 +typing_extensions 4.14.1 +typing-inspect 0.9.0 +typing-inspection 0.4.1 +tzdata 2025.2 +ujson 5.10.0 +unattended-upgrades 0.1 +urllib3 1.25.8 +uvicorn 0.35.0 +visualdl 2.5.3 +Wand 0.6.13 +wcwidth 0.2.13 +Werkzeug 3.1.3 +xmltodict 0.14.2 +xxhash 3.5.0 +yacs 0.1.8 +yarl 1.20.1 +zstandard 0.23.0 \ No newline at end of file diff --git a/document/关于使用PaddleOCR训练模型的进展情况说明.docx b/document/关于使用PaddleOCR训练模型的进展情况说明.docx new file mode 100644 index 0000000..4d0bff6 Binary files /dev/null and b/document/关于使用PaddleOCR训练模型的进展情况说明.docx differ diff --git a/document/医保类型识别结果.xlsx b/document/医保类型识别结果.xlsx new file mode 100644 index 0000000..778bac3 Binary files /dev/null and b/document/医保类型识别结果.xlsx differ diff --git a/photo_mask/auto_photo_mask.py b/photo_mask/auto_photo_mask.py index eea755e..e82685b 100644 --- a/photo_mask/auto_photo_mask.py +++ b/photo_mask/auto_photo_mask.py @@ -62,7 +62,7 @@ def find_boxes(content, layout, offset=0, length=None, improve=False, image_path 
captured_image, offset_x, offset_y = image_util.expand_to_a4_size(captured_image) cv2.imwrite(temp_file.name, captured_image) try: - layouts = util.get_ocr_layout(OCR, temp_file.name) + layouts, _ = util.get_ocr_layout(OCR, temp_file.name) except TypeError: # 如果是类型错误,大概率是没识别到文字 layouts = [] @@ -100,7 +100,7 @@ def get_mask_layout(image, name, id_card_num): result = [] try: try: - layouts = util.get_ocr_layout(OCR, temp_file.name) + layouts, _ = util.get_ocr_layout(OCR, temp_file.name) # layouts = OCR.parse({"doc": temp_file.name})["layout"] except TypeError: # 如果是类型错误,大概率是没识别到文字 @@ -198,7 +198,7 @@ def mask_photo(img_url, name, id_card_num, color=(255, 255, 255)): return do_mask, i # 打开图片 - image = image_util.read(img_url) + image, _ = image_util.read(img_url) if image is None: return False, image original_image = image diff --git a/photo_mask/photo_mask_error_check.py b/photo_mask/photo_mask_error_check.py index ca5ecc8..fc5e4f7 100644 --- a/photo_mask/photo_mask_error_check.py +++ b/photo_mask/photo_mask_error_check.py @@ -23,7 +23,7 @@ def check_error(error_ocr): image = mask_photo(img_url, name, id_card_num, (0, 0, 0))[1] final_img_url = ufile.get_private_url(error_ocr.cfjaddress, "drg100") - final_image = image_util.read(final_img_url) + final_image, _ = image_util.read(final_img_url) return image_util.combined(final_image, image) diff --git a/photo_review.py b/photo_review.py index 232376a..ef05ef3 100644 --- a/photo_review.py +++ b/photo_review.py @@ -13,14 +13,14 @@ from photo_review import auto_photo_review, SEND_ERROR_EMAIL # 项目必须从此处启动,否则代码中的相对路径可能导致错误的发生 if __name__ == '__main__': - program_name = '照片审核自动识别脚本' + program_name = "照片审核自动识别脚本" logging.config.dictConfig(LOGGING_CONFIG) parser = argparse.ArgumentParser() parser.add_argument("--clean", default=False, type=bool, help="是否将识别中的案子改为待识别状态") args = parser.parse_args() if args.clean: - # 主要用于启动时,清除仍在涂抹中的案子 + # 主要用于启动时,清除仍在识别中的案子 session = MysqlSession() update_flag = (update(ZxPhhd).where(ZxPhhd.exsuccess_flag == "2").values(exsuccess_flag="1")) session.execute(update_flag) @@ -34,7 +34,7 @@ if __name__ == '__main__': logging.info(f"【{program_name}】开始运行") auto_photo_review.main() except Exception as e: - error_logger = logging.getLogger('error') + error_logger = logging.getLogger("error") error_logger.error(traceback.format_exc()) if SEND_ERROR_EMAIL: send_error_email(program_name, repr(e), traceback.format_exc()) diff --git a/photo_review/__init__.py b/photo_review/__init__.py index 5b8be9f..e35c6d0 100644 --- a/photo_review/__init__.py +++ b/photo_review/__init__.py @@ -2,9 +2,9 @@ import jieba from paddlenlp import Taskflow from paddleocr import PaddleOCR -''' +""" 项目配置 -''' +""" # 每次从数据库获取的案子数量 PHHD_BATCH_SIZE = 10 # 没有查询到案子的等待时间(分钟) @@ -18,35 +18,35 @@ LAYOUT_ANALYSIS = False 信息抽取关键词配置 """ # 患者姓名 -PATIENT_NAME = ['患者姓名'] +PATIENT_NAME = ["患者姓名"] # 入院日期 -ADMISSION_DATE = ['入院日期'] +ADMISSION_DATE = ["入院日期"] # 出院日期 -DISCHARGE_DATE = ['出院日期'] +DISCHARGE_DATE = ["出院日期"] # 发生医疗费 -MEDICAL_EXPENSES = ['费用总额'] +MEDICAL_EXPENSES = ["费用总额"] # 个人现金支付 -PERSONAL_CASH_PAYMENT = ['个人现金支付'] +PERSONAL_CASH_PAYMENT = ["个人现金支付"] # 个人账户支付 -PERSONAL_ACCOUNT_PAYMENT = ['个人账户支付'] +PERSONAL_ACCOUNT_PAYMENT = ["个人账户支付"] # 个人自费金额 -PERSONAL_FUNDED_AMOUNT = ['自费金额', '个人自费'] +PERSONAL_FUNDED_AMOUNT = ["自费金额", "个人自费"] # 医保类别 -MEDICAL_INSURANCE_TYPE = ['医保类型'] +MEDICAL_INSURANCE_TYPE = ["医保类型"] # 就诊医院 -HOSPITAL = ['医院'] +HOSPITAL = ["医院"] # 就诊科室 -DEPARTMENT = ['科室'] +DEPARTMENT = ["科室"] # 主治医生 -DOCTOR = ['主治医生'] +DOCTOR = ["主治医生"] # 住院号 -ADMISSION_ID = 
['住院号'] +ADMISSION_ID = ["住院号"] # 医保结算单号码 -SETTLEMENT_ID = ['医保结算单号码'] +SETTLEMENT_ID = ["医保结算单号码"] # 年龄 -AGE = ['年龄'] +AGE = ["年龄"] # 大写总额 -UPPERCASE_MEDICAL_EXPENSES = ['大写总额'] +UPPERCASE_MEDICAL_EXPENSES = ["大写总额"] SETTLEMENT_LIST_SCHEMA = \ (PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES + PERSONAL_CASH_PAYMENT @@ -58,57 +58,55 @@ DISCHARGE_RECORD_SCHEMA = \ COST_LIST_SCHEMA = PATIENT_NAME + ADMISSION_DATE + DISCHARGE_DATE + MEDICAL_EXPENSES -''' +""" 别名配置 -''' +""" # 使用别名中的value替换key。考虑到效率问题,只会替换第一个匹配到的key。 HOSPITAL_ALIAS = { - '沐阳': ['沭阳'], - '连水': ['涟水'], - '唯宁': ['睢宁'], # 雕宁 - '九〇四': ['904'], - '漂水': ['溧水'], + "沐阳": ["沭阳"], + "连水": ["涟水"], + "唯宁": ["睢宁"], # 雕宁 + "九〇四": ["904"], + "漂水": ["溧水"], } DEPARTMENT_ALIAS = { - '耳鼻喉': ['耳鼻咽喉'], - '急症': ['急诊'], + "耳鼻喉": ["耳鼻咽喉"], + "急症": ["急诊"], } -''' +""" 搜索过滤配置 -''' +""" # 默认会过滤单字 -HOSPITAL_FILTER = ['医院', '人民', '第一', '第二', '第三', '大学', '附属'] +HOSPITAL_FILTER = ["医院", "人民", "第一", "第二", "第三", "大学", "附属"] -DEPARTMENT_FILTER = ['医', '伤', '西', '新'] +DEPARTMENT_FILTER = ["医", "伤", "西", "新"] -''' +""" 分词配置 -''' -jieba.suggest_freq(('肿瘤', '医院'), True) -jieba.suggest_freq(('骨', '伤'), True) -jieba.suggest_freq(('感染', '性'), True) -jieba.suggest_freq(('胆', '道'), True) -jieba.suggest_freq(('脾', '胃'), True) +""" +jieba.suggest_freq(("肿瘤", "医院"), True) +jieba.suggest_freq(("骨", "伤"), True) +jieba.suggest_freq(("感染", "性"), True) +jieba.suggest_freq(("胆", "道"), True) +jieba.suggest_freq(("脾", "胃"), True) -''' +""" 模型配置 -''' -SETTLEMENT_IE = Taskflow('information_extraction', schema=SETTLEMENT_LIST_SCHEMA, model='uie-x-base', - task_path='model/settlement_list_model', layout_analysis=LAYOUT_ANALYSIS, precision='fp16') -DISCHARGE_IE = Taskflow('information_extraction', schema=DISCHARGE_RECORD_SCHEMA, model='uie-x-base', - task_path='model/discharge_record_model', layout_analysis=LAYOUT_ANALYSIS, precision='fp16') -COST_IE = Taskflow('information_extraction', schema=COST_LIST_SCHEMA, model='uie-x-base', device_id=1, - task_path='model/cost_list_model', layout_analysis=LAYOUT_ANALYSIS, precision='fp16') +""" +SETTLEMENT_IE = Taskflow("information_extraction", schema=SETTLEMENT_LIST_SCHEMA, model="uie-x-base", + task_path="model/settlement_list_model", layout_analysis=LAYOUT_ANALYSIS, precision="fp16") +DISCHARGE_IE = Taskflow("information_extraction", schema=DISCHARGE_RECORD_SCHEMA, model="uie-x-base", + task_path="model/discharge_record_model", layout_analysis=LAYOUT_ANALYSIS, precision="fp16") +COST_IE = Taskflow("information_extraction", schema=COST_LIST_SCHEMA, model="uie-x-base", device_id=1, + task_path="model/cost_list_model", layout_analysis=LAYOUT_ANALYSIS, precision="fp16") OCR = PaddleOCR( - gpu_id=1, - use_angle_cls=False, - show_log=False, - det_db_thresh=0.1, - det_db_box_thresh=0.3, - det_limit_side_len=1248, - drop_score=0.3, - rec_model_dir='model/ocr/openatom_rec_repsvtr_ch_infer', - rec_algorithm='SVTR_LCNet', -) + device="gpu:0", + ocr_version="PP-OCRv4", + use_textline_orientation=False, + # 检测像素阈值,输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点 + text_det_thresh=0.1, + # 检测框阈值,检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域 + text_det_box_thresh=0.3, + ) \ No newline at end of file diff --git a/photo_review/auto_photo_review.py b/photo_review/auto_photo_review.py index a0b4028..dca8699 100644 --- a/photo_review/auto_photo_review.py +++ b/photo_review/auto_photo_review.py @@ -36,14 +36,15 @@ def merge_result(result1, result2): return result1 -def ie_temp_image(ie, ocr, image): +def ie_temp_image(ie, ocr, image, is_screenshot=False): with 
tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file: cv2.imwrite(temp_file.name, image) ie_result = [] ocr_pure_text = '' + angle = '0' try: - layout = util.get_ocr_layout(ocr, temp_file.name) + layout, angle = util.get_ocr_layout(ocr, temp_file.name, is_screenshot) if not layout: # 无识别结果 ie_result = [] @@ -61,7 +62,7 @@ def ie_temp_image(ie, ocr, image): os.remove(temp_file.name) except Exception as e: logging.info(f"删除临时文件 {temp_file.name} 时出错", exc_info=e) - return ie_result, ocr_pure_text + return ie_result, ocr_pure_text, angle # 关键信息提取 @@ -159,7 +160,7 @@ def information_extraction(ie, phrecs, identity): if not img_path: continue - image = image_util.read(img_path) + image, exif_data = image_util.read(img_path) if image is None: # 图片可能因为某些原因获取不到 continue @@ -175,7 +176,7 @@ def information_extraction(ie, phrecs, identity): if text: info_extract = ie(text)[0] else: - info_extract = ie_temp_image(ie, OCR, image)[0] + info_extract = ie_temp_image(ie, OCR, image, True)[0] ie_result = {'result': info_extract, 'angle': '0'} now = util.get_default_datetime() @@ -193,27 +194,20 @@ def information_extraction(ie, phrecs, identity): result = merge_result(result, ie_result['result']) else: + is_screenshot = image_util.is_screenshot(image, exif_data) target_images = [] # target_images += detector.request_book_areas(image) # 识别文档区域并裁剪 if not target_images: target_images.append(image) # 识别失败 angle_count = defaultdict(int, {'0': 0}) # 分割后图片的最优角度统计 for target_image in target_images: - # dewarped_image = dewarp.dewarp_image(target_image) # 去扭曲 - dewarped_image = target_image - angles = image_util.parse_rotation_angles(dewarped_image) - - split_results = image_util.split(dewarped_image) + split_results = image_util.split(target_image) for split_result in split_results: if split_result['img'] is None or split_result['img'].size == 0: continue - rotated_img = image_util.rotate(split_result['img'], int(angles[0])) - ie_temp_result = ie_temp_image(ie, OCR, rotated_img) + ie_temp_result = ie_temp_image(ie, OCR, split_result['img'], is_screenshot) ocr_text += ie_temp_result[1] - ie_results = [{'result': ie_temp_result[0], 'angle': angles[0]}] - if not ie_results[0]['result'] or len(ie_results[0]['result']) < len(ie.kwargs.get('schema')): - rotated_img = image_util.rotate(split_result['img'], int(angles[1])) - ie_results.append({'result': ie_temp_image(ie, OCR, rotated_img)[0], 'angle': angles[1]}) + ie_results = [{'result': ie_temp_result[0], 'angle': ie_temp_result[2]}] now = util.get_default_datetime() best_angle = ['0', 0] for ie_result in ie_results: @@ -262,18 +256,18 @@ def information_extraction(ie, phrecs, identity): session.add_all(zx_ie_results) session.commit() - # 添加清晰度测试 - if better_image is None: - # 替换后图片默认清晰 - clarity_result = image_util.parse_clarity(image) - unsharp_flag = 0 if (clarity_result[0] == 0 and clarity_result[1] >= 0.8) else 1 - update_clarity = (update(ZxPhrec).where(ZxPhrec.pk_phrec == phrec.pk_phrec).values( - cfjaddress2=json.dumps(clarity_result), - unsharp_flag=unsharp_flag, - )) - session.execute(update_clarity) - session.commit() - session.close() + # # 添加清晰度测试 + # if better_image is None: + # # 替换后图片默认清晰 + # clarity_result = image_util.parse_clarity(image) + # unsharp_flag = 0 if (clarity_result[0] == 0 and clarity_result[1] >= 0.8) else 1 + # update_clarity = (update(ZxPhrec).where(ZxPhrec.pk_phrec == phrec.pk_phrec).values( + # cfjaddress2=json.dumps(clarity_result), + # unsharp_flag=unsharp_flag, + # )) + # session.execute(update_clarity) + # 
session.commit() + # session.close() result['ocr_text'] = ocr_text return result @@ -320,7 +314,7 @@ def save_or_update_ie(table, pk_phhd, data): if db_data: # 更新 db_data.update_time = now - db_data.creator = HOSTNAME + db_data.updater = HOSTNAME for k, v in data.items(): setattr(db_data, k, v) else: @@ -421,6 +415,10 @@ def settlement_task(pk_phhd, settlement_list, identity): get_best_value_in_keys(settlement_list_ie_result, MEDICAL_EXPENSES)) settlement_data["medical_expenses_str"] = handle_original_data(parse_money_result[0]) settlement_data["medical_expenses"] = parse_money_result[1] + + if not settlement_data["settlement_id"]: + # 如果没有结算单号就填住院号 + settlement_data["settlement_id"] = settlement_data["admission_id"] save_or_update_ie(ZxIeSettlement, pk_phhd, settlement_data) diff --git a/requirements.txt b/requirements.txt index 6697c5e..cbdcec4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,11 @@ -numpy==1.26.4 -onnxconverter-common==1.14.0 +aistudio_sdk==0.2.6 +onnxconverter-common==1.15.0 +onnxruntime-gpu==1.22.0 OpenCC==1.1.6 -opencv-python==4.6.0.66 paddle2onnx==1.2.3 -paddleclas==2.5.2 -paddlenlp==2.6.1 -paddleocr==2.7.3 -pillow==10.4.0 +paddlenlp==3.0.0b4 +paddleocr==3.1.1 +PyMuPDF==1.26.3 pymysql==1.1.1 -requests==2.32.3 -sqlacodegen==2.3.0.post1 -sqlalchemy==1.4.52 -tenacity==8.5.0 -ufile==3.2.9 -zxing-cpp==2.2.0 \ No newline at end of file +ufile==3.2.11 +zxing-cpp==2.3.0 \ No newline at end of file diff --git a/update_dev.sh b/update_dev.sh new file mode 100644 index 0000000..bec8922 --- /dev/null +++ b/update_dev.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# 项目更新脚本 +echo "开始更新测试项目..." +# 备份docker-compose配置 +cp -i docker-compose.dev.yml docker-compose-backup.dev.yml +# 拉取最新的git +git pull +# 构建新镜像 +docker-compose -f docker-compose.dev.yml build +# 停止旧的容器 +docker-compose -f docker-compose-backup.dev.yml down +# 启动新的容器 +docker-compose -f docker-compose.dev.yml up -d +# 删除docker-compose备份 +rm -f docker-compose-backup.dev.yml +# 查看容器运行情况 +docker ps +# 查看镜像 +docker images +# 结束 +echo "测试项目更新完成,请确认容器版本正确,自行删除过期镜像。" \ No newline at end of file diff --git a/util/image_util.py b/util/image_util.py index 31a29da..c0f1c7b 100644 --- a/util/image_util.py +++ b/util/image_util.py @@ -1,9 +1,12 @@ import logging import math import urllib.request +from io import BytesIO import cv2 import numpy +from PIL import Image +from PIL.ExifTags import TAGS from paddleclas import PaddleClas from tenacity import retry, stop_after_attempt, wait_random @@ -14,20 +17,36 @@ def read(image_path): """ 从网络或本地读取图片 :param image_path: 网络或本地路径 - :return: NumPy数组形式的图片 + :return: NumPy数组形式的图片, EXIF数据 """ if image_path.startswith("http"): # 发送HTTP请求并获取图像数据 resp = urllib.request.urlopen(image_path, timeout=60) # 将数据读取为字节流 image_data = resp.read() - # 将字节流转换为NumPy数组 - image_np = numpy.frombuffer(image_data, numpy.uint8) - # 解码NumPy数组为OpenCV图像格式 - image = cv2.imdecode(image_np, cv2.IMREAD_COLOR) else: - image = cv2.imread(image_path) - return image + with open(image_path, "rb") as f: + image_data = f.read() + + # 解析EXIF信息(基于原始字节流) + exif_data = {} + try: + # 用PIL打开原始字节流 + with Image.open(BytesIO(image_data)) as img: + # 获取EXIF字典 + exif_info = img._getexif() + if exif_info: + # 将EXIF标签的数字ID转换为可读名称(如36867对应"DateTimeOriginal") + for tag_id, value in exif_info.items(): + tag_name = TAGS.get(tag_id, tag_id) + exif_data[tag_name] = value + except Exception as e: + logging.error("解析EXIF信息失败", exc_info=e) + # 将字节流转换为NumPy数组 + image_np = numpy.frombuffer(image_data, numpy.uint8) + # 解码NumPy数组为OpenCV图像格式 + image = 
cv2.imdecode(image_np, cv2.IMREAD_COLOR) + return image, exif_data def capture(image, rectangle): @@ -61,7 +80,7 @@ def split(image, ratio=1.414, overlap=0.05, x_compensation=3): """ split_result = [] if isinstance(image, str): - image = read(image) + image, _ = read(image) height, width = image.shape[:2] hw_ratio = height / width wh_ratio = width / height @@ -268,3 +287,61 @@ def parse_clarity(image): except Exception as e: logging.error("获取图片清晰度失败", exc_info=e) return clarity_result + + +def is_photo_by_exif(exif_tags): + """分析EXIF数据判断是否为照片""" + # 照片通常包含的EXIF标签 + photo_tags = [ + 'FNumber', # 光圈 + 'ExposureTime', # 曝光时间 + 'ISOSpeedRatings', # ISO + 'FocalLength', # 焦距 + 'LensModel', # 镜头型号 + 'GPSLatitude' # GPS位置信息 + ] + + # 统计照片相关的EXIF标签数量 + photo_tag_count = 0 + if exif_tags: + for tag in photo_tags: + if tag in exif_tags: + photo_tag_count += 1 + # 如果有2个以上照片相关的EXIF标签,倾向于是照片 + if photo_tag_count >= 2: + return True + # 不确定是照片返回False + return False + + +def is_screenshot_by_image_features(image): + """分析图像特征判断是否为截图""" + # 定义边缘像素标准差阈值,小于此阈值则认为图片是截图 + edge_std_threshold = 20.0 + try: + # 检查边缘像素的一致性(截图边缘通常更整齐) + edge_pixels = [] + # 取图像边缘10像素 + edge_pixels.extend(image[:10, :].flatten()) # 顶部边缘 + edge_pixels.extend(image[-10:, :].flatten()) # 底部边缘 + edge_pixels.extend(image[:, :10].flatten()) # 左侧边缘 + edge_pixels.extend(image[:, -10:].flatten()) # 右侧边缘 + + # 计算边缘像素的标准差(值越小说明越一致) + edge_std = numpy.std(edge_pixels) + logging.info(f"边缘像素标准差: {edge_std}") + return edge_std < edge_std_threshold + except Exception as e: + logging.error("图像特征分析失败", exc_info=e) + return False + + +def is_screenshot(image, exif_tags): + """综合判断是否是截图""" + # 先检查EXIF数据 + result_of_exif = is_photo_by_exif(exif_tags) + # 如果有明显的照片EXIF信息,直接判断为照片 + if result_of_exif: + return False + # 分析图像特征 + return is_screenshot_by_image_features(image) diff --git a/util/util.py b/util/util.py index 29b9737..01a6fe6 100644 --- a/util/util.py +++ b/util/util.py @@ -12,9 +12,10 @@ def get_default_datetime(): return datetime.now().strftime('%Y-%m-%d %H:%M:%S') -def get_ocr_layout(ocr, img_path): +def get_ocr_layout(ocr, img_path, is_screenshot=False): """ 获取ocr识别的结果,转为合适的layout形式 + :param is_screenshot: 是否是截图 :param ocr: ocr模型 :param img_path: 图片本地路径 :return: @@ -36,18 +37,18 @@ def get_ocr_layout(ocr, img_path): return True layout = [] - ocr_result = ocr.ocr(img_path, cls=False) - ocr_result = ocr_result[0] + ocr_result = ocr.predict(input=img_path, use_doc_orientation_classify=not is_screenshot, use_doc_unwarping=not is_screenshot) + ocr_result = next(ocr_result) if not ocr_result: - return layout - for segment in ocr_result: - box = segment[0] + return layout, "0" + angle = ocr_result.get("doc_preprocessor_res", {}).get("angle", "0") + for i in range(len(ocr_result.get('rec_texts'))): + box = ocr_result.get("rec_polys")[i].tolist() box = _get_box(box) if not _normal_box(box): continue - text = segment[1][0] - layout.append((box, text)) - return layout + layout.append((box, ocr_result.get("rec_texts")[i])) + return layout, str(angle) def delete_temp_file(temp_files): diff --git a/visual_model_test/visual_model_test.py b/visual_model_test/visual_model_test.py index cef670a..1a33974 100644 --- a/visual_model_test/visual_model_test.py +++ b/visual_model_test/visual_model_test.py @@ -24,7 +24,7 @@ def write_visual_result(image, angle=0, layout=None, result=None): img_name = img[:last_dot_index] img_type = img[last_dot_index + 1:] - img_array = image_util.read(image) + img_array, _ = image_util.read(image) if angle != 0: img_array = 
image_util.rotate(img_array, angle) with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file: @@ -63,7 +63,7 @@ def visual_model_test(model_type, test_img, task_path, schema): img["y_offset"] -= offset_y temp_files_paths.append(temp_file.name) - parsed_doc = util.get_ocr_layout( + parsed_doc, _ = util.get_ocr_layout( PaddleOCR(det_db_box_thresh=0.3, det_db_thresh=0.1, det_limit_side_len=1248, drop_score=0.3, save_crop_res=False), temp_file.name)
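
The following is not part of the diff: a minimal sketch of how the updated helper contracts above are meant to compose once this change lands. `image_util.read` now returns an `(image, exif_data)` pair, `image_util.is_screenshot` combines the EXIF and edge-uniformity checks, and `util.get_ocr_layout` returns `(layout, angle)` built from the PaddleOCR 3.x `predict()` output. The import paths, the sample image path, and the bare `PaddleOCR(...)` construction are assumptions made for illustration only.

```python
# Illustrative sketch only; not part of this change set.
# Assumed: the `util` package is importable from the project root and a local test image exists.
from paddleocr import PaddleOCR

from util import image_util, util

# Constructor kwargs mirror the ones used in photo_review/__init__.py above.
ocr = PaddleOCR(device="gpu:0", ocr_version="PP-OCRv4", use_textline_orientation=False)

img_path = "train_data/drivingData/1.jpg"  # hypothetical sample image

# read() now returns the decoded image plus its EXIF tags (empty dict if none could be parsed)
image, exif_data = image_util.read(img_path)

# Screenshots skip document-orientation classification and unwarping inside get_ocr_layout()
screenshot = image_util.is_screenshot(image, exif_data)

# get_ocr_layout() now returns the (box, text) layout plus the detected page angle as a string
layout, angle = util.get_ocr_layout(ocr, img_path, is_screenshot=screenshot)
for box, text in layout:
    print(angle, box, text)
```

In this sketch the page angle comes back from the OCR pipeline itself, which is why `auto_photo_review.py` above can drop the old `parse_rotation_angles`/`rotate` retry loop.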
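A further hedged note on parameters: this change moves the detector thresholds in `photo_review/__init__.py` to the PaddleOCR 3.x names, while the `visual_model_test.py` hunk still constructs `PaddleOCR` with the 2.x keywords (`det_db_thresh`, `det_db_box_thresh`, `det_limit_side_len`, `drop_score`). If that test helper is also meant to run against the 3.x `predict()` API, a construction consistent with `__init__.py` would look roughly like the sketch below; whether `det_limit_side_len` and `drop_score` need 3.x counterparts (likely `text_det_limit_side_len` and `text_rec_score_thresh`) is an assumption to verify.

```python
# Rough PaddleOCR 3.x equivalent of the 2.x kwargs still used in visual_model_test.py.
# Parameter names are taken from the photo_review/__init__.py hunk in this diff.
from paddleocr import PaddleOCR

ocr = PaddleOCR(
    device="gpu:0",
    ocr_version="PP-OCRv4",
    use_textline_orientation=False,
    # Pixel-level detection threshold (2.x: det_db_thresh)
    text_det_thresh=0.1,
    # Box-level detection threshold (2.x: det_db_box_thresh)
    text_det_box_thresh=0.3,
)
```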