文本分类数据准备

2024-09-18 17:09:15 +08:00
parent 6d462fc7f8
commit 7bda4a72cd
1 changed files with 28 additions and 0 deletions
--- a/tool/text_clas.py
+++ b/tool/text_clas.py
@@ -0,0 +1,28 @@
 import os
 from paddleocr import PaddleOCR
 # Module-level OCR engine, constructed once when this module is imported.
 # Built with angle classification off, library logging off, GPU id 0 and a
 # detection box threshold of 0.3 (keyword meanings as named by PaddleOCR --
 # NOTE(review): confirm they are still accepted by the installed version).
 OCR = PaddleOCR(use_angle_cls=False, show_log=False, gpu_id=0, det_db_box_thresh=0.3)
 def image_to_text(dir_path):
    """Run OCR on every file directly inside ``dir_path`` and save the text.

    For each non-directory entry ``<name>`` in ``dir_path`` the recognized
    text fragments are concatenated without a separator, truncated to 2048
    characters and written as UTF-8 to ``<dir_path>/txt/<name>.txt``. The
    ``txt`` sub-directory is created when it does not exist yet. One progress
    line is printed per converted file.

    A file for which OCR yields no result produces an empty ``.txt`` file
    instead of aborting the whole run.

    Args:
        dir_path: Directory whose files are passed to the OCR engine.

    Raises:
        FileNotFoundError: If ``dir_path`` does not exist.
    """
    txt_dir = os.path.join(dir_path, 'txt')
    # EAFP instead of exists() + mkdir(): no check-then-create race, and a
    # missing ``dir_path`` still raises exactly as before.
    try:
        os.mkdir(txt_dir)
    except FileExistsError:
        pass
    for file in os.listdir(dir_path):
        src_path = os.path.join(dir_path, file)
        # Sub-directories (including the ``txt`` output directory) are skipped.
        if os.path.isdir(src_path):
            continue
        ocr_output = OCR.ocr(src_path, cls=False)
        # The first element holds the result for this single input. Depending
        # on the PaddleOCR version it may be None (or the whole output may be
        # None) when nothing was detected, so normalize that to an empty list.
        ocr_results = (ocr_output[0] if ocr_output else None) or []
        # Each entry is indexed as entry[1][0] to obtain the recognized string.
        text = ''.join(ocr_result[1][0] for ocr_result in ocr_results)
        text = text[:2048]  # the text classification model accepts at most 2048 characters at a time
        with open(os.path.join(txt_dir, f'{file}.txt'), 'w', encoding='utf-8') as f:
            f.write(text)
        print(f'{file} 转换成功')
 if __name__ == '__main__':
    # Script entry point: convert every file found directly inside the
    # relative 'image' directory to text, then report completion.
    image_dir_path = 'image'
    image_to_text(dir_path=image_dir_path)
    print('done')