import os

from paddleocr import PaddleOCR

# Shared OCR engine, constructed once at import time and reused for every image.
# Angle classification is disabled both here and at each call site (cls=False).
OCR = PaddleOCR(use_angle_cls=False, show_log=False, gpu_id=0, det_db_box_thresh=0.3)

# The downstream text classification model can only accept 2048 characters
# at a time, so the recognized text is truncated to this length.
_MAX_TEXT_CHARS = 2048


def image_to_text(dir_path):
    """Run OCR on every file directly inside ``dir_path`` and save the text.

    For each non-directory entry ``<name>`` in ``dir_path``, the recognized
    text fragments are concatenated (no separator), truncated to
    ``_MAX_TEXT_CHARS`` characters and written as UTF-8 to
    ``<dir_path>/txt/<name>.txt``. The ``txt`` sub-directory is created when
    it does not exist. Sub-directories of ``dir_path`` are skipped; the scan
    is not recursive.

    Args:
        dir_path: Path of the directory containing the images to convert.

    Raises:
        FileNotFoundError: If ``dir_path`` does not exist.
    """
    # List first so that a missing input directory still fails loudly instead
    # of being silently created by makedirs() below.
    entries = os.listdir(dir_path)

    txt_dir = os.path.join(dir_path, 'txt')
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(txt_dir, exist_ok=True)

    for file in entries:
        image_path = os.path.join(dir_path, file)
        if os.path.isdir(image_path):
            continue

        # ocr() returns one result per input image; only one image is passed,
        # so take element 0. That element can be None when no text was
        # detected, which would otherwise crash the iteration below.
        ocr_results = OCR.ocr(image_path, cls=False)[0] or []

        # Each detection is indexed as [1][0] to get its text; per the
        # PaddleOCR docs the layout is [box, (text, confidence)].
        text = ''.join(ocr_result[1][0] for ocr_result in ocr_results)
        text = text[:_MAX_TEXT_CHARS]

        with open(os.path.join(txt_dir, f'{file}.txt'), 'w', encoding='utf-8') as f:
            f.write(text)
        print(f'{file} 转换成功')


if __name__ == '__main__':
    image_dir_path = 'image'
    image_to_text(image_dir_path)
    print('done')