"""Convert the images in a directory to plain-text files using PaddleOCR.

Originally added as ``tool/text_clas.py``.
"""
import os

from paddleocr import PaddleOCR

# Shared OCR engine, constructed once at import time so every call reuses it.
# Angle classification is disabled here and in each ``ocr`` call below.
OCR = PaddleOCR(use_angle_cls=False, show_log=False, gpu_id=0, det_db_box_thresh=0.3)

# The downstream text classification model can only accept 2048 characters
# at a time, so the extracted text is cut to this length by default.
MAX_TEXT_CHARS = 2048


def image_to_text(dir_path, max_chars=MAX_TEXT_CHARS):
    """Run OCR on every file directly inside *dir_path* and save the text.

    For each non-directory entry ``name`` in *dir_path*, the recognized text
    fragments are concatenated in the order PaddleOCR returns them, truncated
    to *max_chars* characters, and written as UTF-8 to
    ``<dir_path>/txt/<name>.txt``. The ``txt`` sub-directory is created when
    missing. Sub-directories (including ``txt`` itself) are skipped.

    Args:
        dir_path: Directory containing the files to run OCR on.
        max_chars: Maximum number of characters kept per output file.

    Raises:
        FileNotFoundError: If *dir_path* does not exist.
    """
    txt_dir = os.path.join(dir_path, 'txt')
    try:
        # mkdir (not makedirs) so a missing dir_path still raises instead of
        # being silently created; an existing txt directory is fine.
        os.mkdir(txt_dir)
    except FileExistsError:
        pass

    for file in os.listdir(dir_path):
        src_path = os.path.join(dir_path, file)
        if os.path.isdir(src_path):
            continue

        pages = OCR.ocr(src_path, cls=False)
        # PaddleOCR may return None (or a None first page) when nothing is
        # detected; treat that as "no text" rather than crashing.
        ocr_results = (pages[0] if pages else None) or []

        # Each entry is indexed as entry[1][0] to get its recognized text
        # (PaddleOCR's [box, (text, score)] layout).
        text = ''.join(ocr_result[1][0] for ocr_result in ocr_results)[:max_chars]

        with open(os.path.join(txt_dir, f'{file}.txt'), 'w', encoding='utf-8') as f:
            f.write(text)
        print(f'{file} 转换成功')


if __name__ == '__main__':
    image_dir_path = 'image'
    image_to_text(image_dir_path)
    print('done')