# untar / unzip
import glob
import ntpath
import os
import tarfile
from shutil import copyfile
# Directory scanned for '*.tar' archives by extract_all_tars(). The trailing
# slash is harmless: the glob pattern adds its own '/' separator.
folder = 'tars/'
# Root directory under which one sub-directory per file type is created and
# into which the extracted CSV files are copied and then combined.
base_path = 'output_by_type'
def create_dir(path):
    """Create directory ``path`` (and any missing parents) if it is absent.

    Calling this for a path that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory could not be created and ``path`` still
            does not exist afterwards (e.g. permission denied, empty path).
    """
    # Attempt the creation first instead of checking beforehand: a separate
    # exists()/makedirs() pair can fail if something creates the directory
    # between the two calls.
    try:
        os.makedirs(path)
    except OSError:
        # An already-existing path is fine; anything else is a real failure.
        if not os.path.exists(path):
            raise
def extract_all_tars():
    """Extract every ``*.tar`` archive found in ``folder``.

    Each archive is unpacked into its own directory ``output/<archive name>``,
    where ``<archive name>`` is the archive's file name including the
    ``.tar`` extension. The directory is created if it does not exist.
    """
    for mytarfile in glob.glob('%s/*.tar' % folder):
        item = ntpath.basename(mytarfile)
        path = 'output/%s' % item
        # One output folder per archive keeps members of different archives
        # from overwriting each other.
        create_dir(path)
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only run this on archives from a trusted source.
        # The ``with`` block closes the archive even if extraction fails.
        with tarfile.open(mytarfile) as tar_ref:
            tar_ref.extractall(path)
def rename_to_csv():
    """Rename every ``.txt`` file one level below ``output/`` to ``.csv``.

    Only the trailing extension is changed. A ``.txt`` that appears earlier
    in the path (for instance in the name of the per-archive directory) is
    left untouched, so the file stays in the directory it was found in.
    """
    suffix = '.txt'
    for f in glob.glob('output/*/*'):
        if f.endswith(suffix):
            # Slice off the extension rather than using str.replace(), which
            # would rewrite every '.txt' occurrence in the whole path.
            os.rename(f, f[:-len(suffix)] + '.csv')
def create_output_directories_by_type():
    """Create one directory under ``base_path`` per distinct CSV file type.

    The type of a file is the part of its base name before the first ``-``.
    Files are discovered with the pattern ``output/*/*.csv``.
    """
    # Collect the distinct type prefixes of all extracted CSV files.
    file_types = {
        ntpath.basename(csv_path).split('-')[0]
        for csv_path in glob.glob('output/*/*.csv')
    }

    # The base output directory must exist before its children are created.
    create_dir(base_path)

    for file_type in file_types:
        create_dir('%s/%s' % (base_path, file_type))
def pick_out_files():
    """Copy each extracted CSV file into the directory for its type.

    The destination is ``<base_path>/<type>/<file name>``, where ``<type>``
    is the part of the file name before the first ``-``. The destination
    directory is expected to exist already.
    """
    for source in glob.glob('output/*/*.csv'):
        name = ntpath.basename(source)
        file_type = name.split('-')[0]
        destination = '%s/%s/%s' % (base_path, file_type, name)
        copyfile(source, destination)
def combine_files():
    """Merge the CSV files of each type directory into a single CSV file.

    For every sub-directory ``<d>`` of ``base_path`` the file
    ``<base_path>/<d>/<d>.csv`` is (re)written. It contains the first input
    file in full, followed by every other input file with its first line
    (the header) dropped. Input files are processed in sorted name order so
    that repeated runs produce the same output. Entries of ``base_path``
    that are not directories are skipped.
    """
    for directory in os.listdir(base_path):
        dir_path = os.path.join(base_path, directory)
        if not os.path.isdir(dir_path):
            # Stray files next to the type directories have nothing to merge.
            continue

        file_path = os.path.join(dir_path, '%s.csv' % directory)
        # Remove a previous result first so it is not picked up as an input.
        if os.path.exists(file_path):
            os.remove(file_path)

        all_files = sorted(glob.glob(os.path.join(dir_path, "*.csv")))
        with open(file_path, "wb") as outfile:
            # True when the last line written lacked a line terminator, so
            # the next row must not be glued onto it.
            needs_newline = False
            for idx, f in enumerate(all_files):
                with open(f, "rb") as infile:
                    if idx != 0:
                        # Keep the header from the first file only.
                        next(infile, None)
                    # Stream line by line instead of loading the whole file.
                    for line in infile:
                        if needs_newline:
                            outfile.write(b"\n")
                        outfile.write(line)
                        needs_newline = not line.endswith(b"\n")
if __name__ == '__main__':
    # The steps depend on each other's output, so they run strictly in this
    # order: unpack, rename, prepare type directories, sort, then merge.
    pipeline = (
        extract_all_tars,
        rename_to_csv,
        create_output_directories_by_type,
        pick_out_files,
        combine_files,
    )
    for step in pipeline:
        step()