pragyapradhan
7/19/2017 - 10:20 AM

untar/ unzip

untar/ unzip

import glob
import ntpath
import os
import tarfile

from shutil import copyfile


# Directory that extract_all_tars() searches for *.tar archives.
# The value already ends in '/', and the glob pattern appends another one.
folder = 'tars/'
# Root directory under which one sub-directory per file type is created
# and where the per-type combined CSV files are written.
base_path = 'output_by_type'


def create_dir(path):
    """Create the directory ``path``, including missing parents, if absent.

    An already existing directory is left untouched.  Creation is attempted
    directly instead of being guarded by an existence check, so a directory
    that appears between a check and the ``makedirs`` call can no longer
    make this function fail.

    Args:
        path: directory to create.

    Raises:
        OSError: if the directory cannot be created, or if ``path`` exists
            but is not a directory (for example a regular file).
    """
    try:
        os.makedirs(path)
    except OSError:
        # Swallow the error only when the directory is really there now;
        # anything else (permissions, a file in the way, ...) is a genuine
        # failure that the caller has to see.
        if not os.path.isdir(path):
            raise


def extract_all_tars():
    """Extract every ``*.tar`` archive found in ``folder`` below ``output/``.

    Each archive is unpacked into its own directory named after the archive
    file, i.e. ``output/<archive file name>`` (the ``.tar`` suffix is kept
    in the directory name).  The directory is created when missing.
    """
    for mytarfile in glob.glob('%s/*.tar' % folder):
        # ntpath.basename splits on both '/' and '\\', so the file name is
        # isolated no matter which separator glob returned.
        item = ntpath.basename(mytarfile)
        path = 'output/%s' % item
        create_dir(path)

        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only run this on archives from a trusted source.
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(mytarfile) as tar_ref:
            tar_ref.extractall(path)


def rename_to_csv():
    """Rename every ``.txt`` file directly inside ``output/*/`` to ``.csv``.

    Only the trailing extension is swapped.  A ``.txt`` that occurs
    elsewhere in the path (for instance in the name of the containing
    directory) is left alone, so the file always stays in its directory.
    Entries that do not end in ``.txt`` are not touched.
    """
    old_suffix = '.txt'
    new_suffix = '.csv'

    for f in glob.glob('output/*/*'):
        if f.endswith(old_suffix):
            os.rename(f, f[:-len(old_suffix)] + new_suffix)


def create_output_directories_by_type():
    """Create one directory below ``base_path`` per CSV file type.

    The type of a file is the part of its name before the first ``-``.
    All ``output/*/*.csv`` files are scanned and a directory
    ``<base_path>/<type>`` is created for every distinct type found.
    """
    # Collect the distinct type prefixes of all extracted CSV files.
    type_names = {
        ntpath.basename(csv_path).split('-')[0]
        for csv_path in glob.glob('output/*/*.csv')
    }

    # The root has to exist even when no CSV file was found.
    create_dir(base_path)

    for type_name in type_names:
        create_dir('%s/%s' % (base_path, type_name))


def pick_out_files():
    """Copy every extracted CSV into the directory of its file type.

    A file ``output/<archive>/<type>-<rest>.csv`` is copied to
    ``<base_path>/<type>/<type>-<rest>.csv``; the type is the part of the
    file name before the first ``-``.
    """
    for source in glob.glob('output/*/*.csv'):
        file_name = ntpath.basename(source)
        file_type = file_name.split('-')[0]
        destination = '%s/%s/%s' % (base_path, file_type, file_name)
        copyfile(source, destination)


def combine_files():
    """Merge the CSV files of every type directory into one CSV per type.

    For each directory ``<base_path>/<type>`` all ``*.csv`` files in it are
    concatenated into ``<base_path>/<type>/<type>.csv``.  The first file is
    copied completely; from every further file the first line (presumably
    the header row) is dropped.  A combined file left over from an earlier
    run is removed first so that it is not merged into itself.

    Files are processed in sorted name order, which makes the result
    reproducible: glob returns matches in arbitrary order, and that order
    decides which file contributes the first line and how rows are ordered.
    """
    for directory in os.listdir(base_path):
        dir_path = os.path.join(base_path, directory)
        # Stray regular files below base_path are not type directories.
        if not os.path.isdir(dir_path):
            continue

        file_path = os.path.join(dir_path, '%s.csv' % directory)

        # Must happen before globbing, otherwise the old result would be
        # picked up as one of the inputs.
        if os.path.exists(file_path):
            os.remove(file_path)

        all_files = sorted(glob.glob(os.path.join(dir_path, "*.csv")))

        with open(file_path, "wb") as outfile:
            # Tracks whether the output currently ends on a line break, so
            # that an input file lacking a final newline does not get its
            # last row glued to the first row of the next file.
            ends_with_newline = True

            for idx, f in enumerate(all_files):
                with open(f, "rb") as infile:
                    if idx != 0:
                        # Drop the first line of every file but the first.
                        infile.readline()

                    # Stream line by line instead of loading whole files.
                    for line in infile:
                        if not ends_with_newline:
                            outfile.write(b"\n")
                        outfile.write(line)
                        ends_with_newline = line.endswith(b"\n")


if __name__ == '__main__':
    # Pipeline stages in execution order; every stage works on the files
    # the previous one left on disk.
    pipeline = (
        extract_all_tars,
        rename_to_csv,
        create_output_directories_by_type,
        pick_out_files,
        combine_files,
    )
    for stage in pipeline:
        stage()