文本分类数据准备
This commit is contained in:
28
tool/text_clas.py
Normal file
28
tool/text_clas.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
from paddleocr import PaddleOCR
|
||||||
|
|
||||||
|
# Shared OCR engine, built once at import time and reused for every image.
# NOTE(review): keyword meanings are inferred from their names (angle
# classifier off, library logging off, GPU 0, detection box threshold 0.3);
# confirm against the installed PaddleOCR version.
OCR = PaddleOCR(
    use_angle_cls=False,
    show_log=False,
    gpu_id=0,
    det_db_box_thresh=0.3,
)
|
||||||
|
|
||||||
|
|
||||||
|
def image_to_text(dir_path, ocr_engine=None, max_chars=2048):
    """Run OCR over every file in ``dir_path`` and save the recognized text.

    For each regular file ``<name>`` directly inside ``dir_path`` the text of
    all recognized lines is concatenated, truncated to ``max_chars``
    characters, and written as UTF-8 to ``<dir_path>/txt/<name>.txt``.
    Sub-directories (including the ``txt`` output directory) are skipped.

    Args:
        dir_path: Directory containing the input images.
        ocr_engine: Object exposing ``ocr(path, cls=False)``. Defaults to the
            module-level ``OCR`` instance when ``None``.
        max_chars: Maximum number of characters kept per file. The default
            of 2048 is the most the text classification model accepts at
            once.
    """
    engine = OCR if ocr_engine is None else ocr_engine

    txt_dir = os.path.join(dir_path, 'txt')
    # exist_ok avoids the check-then-create race and tolerates re-runs.
    os.makedirs(txt_dir, exist_ok=True)

    for file in os.listdir(dir_path):
        image_path = os.path.join(dir_path, file)
        if os.path.isdir(image_path):
            continue

        pages = engine.ocr(image_path, cls=False)
        # The engine may report "no text detected" as None (or an empty
        # result) instead of a list of lines; treat that as empty text
        # rather than crashing while iterating over None.
        ocr_results = (pages[0] if pages else None) or []

        # Each result is read as (box, (text, score)); keep only the text.
        text = ''.join(ocr_result[1][0] for ocr_result in ocr_results)
        text = text[:max_chars]

        with open(os.path.join(txt_dir, f'{file}.txt'), 'w', encoding='utf-8') as f:
            f.write(text)
        print(f'{file} 转换成功')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: OCR everything under the relative 'image'
    # directory, then report completion.
    image_to_text('image')
    print('done')
|
||||||
Reference in New Issue
Block a user