alonsopg
10/24/2016 - 1:14 AM

pdf -> txt

pdf -> txt

from textract import process
import sys
# Re-import `sys` and set the interpreter-wide default string encoding to
# UTF-8 before any text is processed.
# NOTE(review): `reload` is a builtin only on Python 2; on Python 3 the next
# line raises NameError, so this setup is Python 2-specific.
reload(sys)
sys.setdefaultencoding("utf-8")
# For Spanish ('spa'), the language data must be downloaded from
# https://github.com/tesseract-ocr/langdata/tree/master/spa and placed in
# the corresponding folder.
def transform_files(input_directory, output_directory):
    """OCR every PDF in ``input_directory`` into a UTF-8 ``.txt`` file.

    Each ``*.pdf`` found directly inside ``input_directory`` is passed to
    ``process`` with ``method='tesseract'`` and ``language='spa'``. The
    extracted text, followed by a single newline, is written to
    ``output_directory`` under the PDF's base name with a ``.txt`` extension.
    A progress line is printed for every file.

    Args:
        input_directory: Directory searched (non-recursively) for ``*.pdf``.
        output_directory: Directory that receives the ``.txt`` files. It is
            created, including missing parents, if it does not exist yet.
    """
    import codecs
    import glob
    import os

    # Create the destination up front so the first write cannot fail just
    # because the folder is missing. An empty path means "current directory"
    # and needs no creation.
    if output_directory and not os.path.isdir(output_directory):
        os.makedirs(output_directory)

    # Sorted so the processing order (and the progress log) is deterministic.
    for pdf_path in sorted(glob.glob(os.path.join(input_directory, '*.pdf'))):
        text = process(pdf_path, method='tesseract', language='spa')
        # `process` may hand back an encoded byte string (presumably UTF-8,
        # textract's default output encoding - confirm). Decode explicitly
        # rather than relying on the interpreter's default encoding, so
        # non-ASCII text is handled the same way everywhere.
        if isinstance(text, bytes):
            text = text.decode('utf-8')

        original_filename = os.path.basename(pdf_path)
        stem, _ = os.path.splitext(original_filename)
        path = os.path.join(output_directory, stem + '.txt')

        print('Transforming: %s => %s' % (original_filename, path,))
        # Write each result as soon as it is available instead of keeping
        # the OCR output of every PDF in memory until the end.
        with codecs.open(path, "w", encoding='utf8') as out_file:
            out_file.write(text)
            out_file.write(u"\n")


# Hard-coded locations of the scanned PDFs and of the OCR text output.
input_d = '/Users/user/Desktop/Imagenes/'
out_d = '/Users/user/Desktop/ImagenesTXT_OCR/'

# The IPython `%time` magic is a SyntaxError outside IPython/Jupyter, so the
# run is timed with the standard library instead; this works in both.
import time

_started = time.time()
transform_files(input_d, out_d)
print('Wall time: %.2f s' % (time.time() - _started))