Split a text file into several smaller files
1----------------
24
532
1
432
4321
4321
4321
32
32
4
321
432----------------
4
24
21
43
321
34
321
4
324
3
24
32----------------
432
14
32
4
32
432
324
421
2143
from __future__ import division  # force integer division to return a floating point number
from argparse import Action, ArgumentParser
from itertools import islice, izip_longest
from math import ceil
from os.path import splitext
class FileSplitter(object):
    """Command-line helper that splits a text file into several smaller files.

    Each output file is named after the input file, with a running number
    (1, 2, 3, ...) inserted between the base name and the extension.
    """

    def execute(self, argv=None):
        """
        Parse the command line, split the requested file and report the result.

        Exactly one of -f/--files (treat <limit> as the maximum number of new
        files) or -l/--lines (treat <limit> as the maximum number of lines per
        new file) must be given. The two flags are mutually exclusive, so
        argparse rejects a command line that contains both.

        argv -- list of argument strings; None lets argparse read sys.argv.
        Raises ValueError when neither -f nor -l is given.
        """
        # Read arguments and pick the splitting strategy
        args = self._parse_arguments(argv)
        if args.files:
            splitfunc = self.split_by_filecount
        elif args.lines:
            splitfunc = self.split_by_linecount
        else:
            raise ValueError("Must specify either a maximum file count or a maximum line count")
        newfiles = splitfunc(args.file, args.limit)

        # Output message (single-argument print(...) behaves the same on Python 2 and 3)
        print("Split {0} into {1} files".format(args.file, len(newfiles)))

        # Verbose output message: one created file name per line
        if args.verbose:
            for name in newfiles:
                print(name)

    def split_by_filecount(self, myfile, maxfiles):
        """
        Split a text file into, at most, <maxfiles> smaller files.
        Returns a list of the newly created files (empty for an empty input file).
        """
        # Count the lines inside a context manager so the handle is closed again
        with open(myfile, 'r') as infile:
            lines = sum(1 for _ in infile)
        # Lines per file = ceil(lines / maxfiles); relies on "/" being true division
        return self.split_by_linecount(myfile, int(ceil(lines / maxfiles)))

    def split_by_linecount(self, myfile, maxlines):
        """
        Split a text file into several smaller files, each with no more than <maxlines> lines per file.
        Returns a list of the newly created files.
        A limit below 1 creates no files and returns an empty list.
        """
        newfiles = []
        if maxlines < 1:
            # Nothing sensible to write; also covers the empty-file case coming
            # from split_by_filecount, where the computed limit is 0.
            return newfiles
        with open(myfile, 'r') as infile:
            # Pull <maxlines> lines at a time until the file is exhausted
            # (iter() stops as soon as a batch comes back empty).
            batches = iter(lambda: list(islice(infile, maxlines)), [])
            # Write the lines of each batch to a separate, numbered file
            for number, batch in enumerate(batches, start=1):
                newfiles.append(self._write_to_file(batch, myfile, 'w', number))
        return newfiles

    def _write_to_file(self, lines, myfile, mode='w', suffix=''):
        """
        Write lines to a text file, and return the text file name if successful.
        Can optionally supply a suffix, which is inserted between the base name
        and the extension of <myfile>; without one, <myfile> itself is written.
        Entries in <lines> that are None are skipped.
        """
        # Add suffix to filename if supplied
        if suffix:
            base, ext = splitext(myfile)
            newfile = "{0}{1}{2}".format(base, suffix, ext)
        else:
            newfile = myfile
        # Write supplied lines to file, ignoring None placeholders
        with open(newfile, mode) as outfile:
            outfile.writelines(line for line in lines if line is not None)
        return newfile

    def _parse_arguments(self, argv=None):
        """Define necessary inputs and parse command-line arguments"""
        parser = ArgumentParser(description='Split text file into several smaller files')
        parser.add_argument('file', help='File to be split')
        # const=1 is the lower bound enforced by the AssertAtLeastConst action
        parser.add_argument('limit', help='Limit value', type=int, const=1, action=self.AssertAtLeastConst)
        parser.add_argument('-v', '--verbose', help='Verbosity of output message', action='store_true')
        group = parser.add_mutually_exclusive_group()
        group.add_argument('-f', '--files', help='Limit number new files to create', action='store_true')
        group.add_argument('-l', '--lines', help='Limit number of lines per new file', action='store_true')
        return parser.parse_args(argv)

    class AssertAtLeastConst(Action):
        """argparse action that stores a value only if it is at least the action's const."""

        def __call__(self, parser, namespace, values, option_string=None):
            """Ensures that an argument is greater than or equal a given constant"""
            if (self.const is not None) and (values < self.const):
                arg_name = self._get_arg_output_name(option_string)
                parser.error("{0} must be at least {1}".format(arg_name, self.const))
                # Reached only if parser.error() returns; never store a rejected value
                return
            setattr(namespace, self.dest, values)

        def _get_arg_output_name(self, option_string=None):
            """
            Return the argument name, which should be taken from one of the following sources
            (in order of priority): 1. Option String, 2. Metavar, 3. Dest
            Returns None if none of the three is set.
            """
            candidates = (option_string, self.metavar, self.dest)
            return next((text for text in candidates if text is not None), None)
# Run the splitter with the process's command-line arguments when executed as a script.
if __name__ == '__main__':
    FileSplitter().execute()