postprefix
9/4/2016 - 6:36 PM

Split a text file into several smaller files

Split a text file into several smaller files

1----------------
24
532
1
432
4321
4321
4321
32
32
4
321
432----------------
4
24
21
43
321
34
321
4
324
3
24
32----------------
432
14
32
4
32
432
324
421
2143
from __future__ import division # force integer division to return a floating point number

from argparse import ArgumentParser, Action
from os.path import splitext
from math import ceil
from itertools import izip_longest

class FileSplitter(object):
    """
    Command-line utility that splits a text file into several smaller files.

    Each output file is named after the input file, with a 1-based counter inserted
    before the extension (e.g. data.txt -> data1.txt, data2.txt, ...).
    """

    def execute(self, argv=None):
        """
        Split a text file into several smaller files. User must specify either a maximum file
        count (-f/--files) or a maximum line count per file (-l/--lines). The two options are
        mutually exclusive, so argparse rejects a command line that supplies both.

        argv is the list of command-line arguments to parse; when it is None, argparse reads
        the arguments from sys.argv. Raises ValueError if neither -f nor -l is supplied.
        """
        # Read arguments and split file
        args = self._parse_arguments(argv)
        if args.files:
            splitfunc = self.split_by_filecount
        elif args.lines:
            splitfunc = self.split_by_linecount
        else:
            raise ValueError("Must specify either a maximum file count or a maximum line count")
        newfiles = splitfunc(args.file, args.limit)

        # Output message (single-argument print() behaves the same on Python 2 and 3)
        print("Split {0} into {1} files".format(args.file, len(newfiles)))
        # Verbose output message
        if args.verbose:
            for newfile in newfiles:
                print(newfile)

    def split_by_filecount(self, myfile, maxfiles):
        """
        Split a text file into, at most, <maxfiles> smaller files.
        Returns a list of the newly created files (empty if the input file has no lines).
        """
        # Count the lines inside a context manager so the handle is closed promptly
        with open(myfile, 'r') as infile:
            linecount = sum(1 for _ in infile)

        # Integer ceiling division: gives the same result whether or not
        # "from __future__ import division" is in effect for this module
        lines_per_file = -(-linecount // maxfiles)
        return self.split_by_linecount(myfile, lines_per_file)

    def split_by_linecount(self, myfile, maxlines):
        """
        Split a text file into several smaller files, each with no more than <maxlines> lines per file.
        Returns a list of the newly created files. If <maxlines> is less than 1, nothing is
        written and an empty list is returned.
        """
        newfiles = []
        if maxlines < 1:
            return newfiles

        with open(myfile, 'r') as infile:
            chunk = []
            for line in infile:
                chunk.append(line)
                # Flush the chunk as soon as it is full, so that at most <maxlines>
                # lines are held in memory at any time
                if len(chunk) == maxlines:
                    newfiles.append(self._write_to_file(chunk, myfile, 'w', len(newfiles) + 1))
                    chunk = []

            # Write the remaining lines (a final, shorter chunk) if there are any
            if chunk:
                newfiles.append(self._write_to_file(chunk, myfile, 'w', len(newfiles) + 1))

        return newfiles

    def _write_to_file(self, lines, myfile, mode='w', suffix=''):
        """
        Write lines to a text file, and return the text file name if successful.
        Can optionally supply a suffix, which is inserted between the file name and its
        extension. Entries in <lines> that are None are skipped.
        """
        # Add suffix to filename if supplied
        if suffix:
            fn, ext = splitext(myfile)
            newfile = "{0}{1}{2}".format(fn, suffix, ext)
        else:
            newfile = myfile

        # Write supplied lines to file, ignoring None placeholders
        with open(newfile, mode) as f:
            f.writelines(line for line in lines if line is not None)

        return newfile

    def _parse_arguments(self, argv=None):
        """
        Define necessary inputs and parse command-line arguments.
        Returns the argparse namespace with attributes: file, limit, verbose, files, lines.
        """
        parser = ArgumentParser(description='Split text file into several smaller files')
        parser.add_argument('file', help='File to be split')
        # const holds the minimum accepted value, which AssertAtLeastConst enforces
        parser.add_argument('limit', help='Limit value', type=int, const=1, action=self.AssertAtLeastConst)
        parser.add_argument('-v', '--verbose', help='Verbosity of output message', action='store_true')

        # -f and -l select how <limit> is interpreted; at most one of them may be given
        group = parser.add_mutually_exclusive_group()
        group.add_argument('-f', '--files', help='Limit number new files to create', action='store_true')
        group.add_argument('-l', '--lines', help='Limit number of lines per new file', action='store_true')

        return parser.parse_args(argv)

    class AssertAtLeastConst(Action):
        """argparse action that stores the value only if it is at least the action's const"""

        def __call__(self, parser, namespace, values, option_string=None):
            """Ensures that an argument is greater than or equal a given constant"""
            if (self.const is not None) and (values < self.const):
                arg_name = self._get_arg_output_name(option_string)
                # parser.error prints the usage message and exits
                parser.error("{0} must be at least {1}".format(arg_name, self.const))
            else:
                setattr(namespace, self.dest, values)

        def _get_arg_output_name(self, option_string=None):
            """
            Return the argument name, which should be taken from one of the following sources
            (in order of priority): 1. Option String, 2. Metavar, 3. Dest
            Returns None if none of them is set.
            """
            for text in (option_string, self.metavar, self.dest):
                if text is not None:
                    return text

            return None

if __name__ == '__main__':
    # Script entry point: no argv is passed, so the arguments come from the command line
    splitter = FileSplitter()
    splitter.execute()