From 7bda4a72cd64fc191f6512eb1a260fd6f8827eeb Mon Sep 17 00:00:00 2001
From: liuyebo <1515783401@qq.com>
Date: Wed, 18 Sep 2024 17:09:15 +0800
Subject: [PATCH] =?UTF-8?q?=E6=96=87=E6=9C=AC=E5=88=86=E7=B1=BB=E6=95=B0?=
 =?UTF-8?q?=E6=8D=AE=E5=87=86=E5=A4=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tool/text_clas.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 tool/text_clas.py

diff --git a/tool/text_clas.py b/tool/text_clas.py
new file mode 100644
index 0000000..b15080a
--- /dev/null
+++ b/tool/text_clas.py
@@ -0,0 +1,28 @@
+import os
+
+from paddleocr import PaddleOCR
+
+OCR = PaddleOCR(use_angle_cls=False, show_log=False, gpu_id=0, det_db_box_thresh=0.3)  # engine shared by all OCR calls
+
+
+def image_to_text(dir_path):
+    """OCR every file directly in dir_path and write its text to dir_path/txt/<file>.txt."""
+    txt_dir = os.path.join(dir_path, 'txt')
+    if not os.path.exists(txt_dir):
+        os.mkdir(txt_dir)
+    for file in os.listdir(dir_path):
+        if os.path.isdir(os.path.join(dir_path, file)):
+            continue
+        # PaddleOCR may return None instead of a line list when no text is detected.
+        ocr_results = OCR.ocr(os.path.join(dir_path, file), cls=False)[0] or []
+        # The text classification model accepts at most 2048 characters per input.
+        text = ''.join(line[1][0] for line in ocr_results)[:2048]
+        with open(os.path.join(txt_dir, f'{file}.txt'), 'w', encoding='utf-8') as f:
+            f.write(text)
+        print(f'{file} 转换成功')
+
+
+if __name__ == '__main__':
+    # Script entry point: OCR the images under the relative 'image' directory.
+    image_to_text('image')
+    print('done')