onlyforbopi
1/9/2017 - 7:58 AM

Python.OS.File.Directory.Manipulation

Python.OS.File.Directory.Manipulation #python #Python #File #file #Directory #directory #dir #filemanipulation #OS #shutil #PythonModules

PYTHON_FILE_DIR_MANIPULATION

1.   DirCmp.py                 : Compare two directories

# #################
# Ways to load a file into a specific data structure
2.   filetotuple.py            : Read a file, store lines in tuple (non stripped)
3.   filetotuplestrip.py       : Read a file, store stripped lines in tuple
4.   filetolist.py             : Read a file, store lines in list (non stripped)
5.   filetoliststrip.py        : Read a file, store stripped lines in list
6.   filetodict.py             : Read a file, slice substring, populate dictionary
7.   filetodictcounter.py      : Read a file, slice substring, populate dictionary, with counter
8.   dicttofile.py             : Read a dictionary, output it to a file formatted to be imported

# #################
# Ways to count elements of a file
9.   word_count_file.py        : Read a file, count number of words
10.  char_count_file.py        : Read a file, count number of characters (2 versions)
11.  line_count_file.py        : Read a file, count number of total lines
12.  combi_count_file.py       : Read a file, count words, lines, characters
13.  occur_count_file.py       : Read a file, count all occurrences of a substring
14.  get_file_size.py          : Get the size of a file in bytes


# ###############
# Ways to read and write to a file
15.  ReadWritetoFile.py        : Shows various ways to read/write to a file
16.  ReadWriteVLargeText.py    : Shows various ways to read/write very large text files
17.  MultiLineFileProc.py      : Basic outline to process multiline files
18.  ReadFileTwiceSeek.py      : Reads a file twice, using seek()
19.  ReadFileMultiCore.py      : Examples of reading files on multiple cores (multiprocessing module)


# ###############
20.  file_abs_path.py          : Get absolute path to a file
21.  PathtoRawString.py        : Expands path to raw string (so we can pass it as a parameter)
22.  FileCopying.py            : Basic outline of various ways to copy files with Python
23.  FileArchiving.py          : Basic outline of archiving operations for files (tar, gzip etc.)
24.  CheckFileExists.py        : Checks if a specific path is a file or a directory
25.  CountFilesinDirectory.py  : Counts the number of files in a directory
26.  CheckforUNCpath.py        : Check if a UNC file or directory exists
27.  GetCurrentWorking.py      : Get the current working directory
28.  get_larger_files.py       : Get files larger than a specific size (recursive)
29.  get_smaller_files.py      : Get files smaller than a specific size (recursive)
30.  get_cwd.py                : Function that returns the current working directory
31.  get_larger_files.nr.py    : Get larger or smaller files, non recursive
32.  Remove_duplicates.py      : Removes duplicate lines (or based on substring)
33.  File_Backup.py            : Premade file backup in the same directory as the script
34.  RecursiveDirCopy.py       : Recursively copy a directory tree from one dir to another


import sys
import os
import shutil


file_in = sys.argv[1]
file_ot = str(file_in) + ".bak"


def filebackup(file_in):
    r'''
    Name: filebackup
    Description: Premade copy, appending .bak
    Input: <file_in> -> path to file
    Output: None
    Usage: filebackup(file_in)
    Notes: Windows paths need \\ or r' '
    '''
    import shutil

    file_ot = str(file_in) + ".bak"
    try:
        shutil.copy(file_in, file_ot)
    except (IOError, OSError) as err:
        print("Problem backing up " + str(file_in) + ": " + str(err))


filebackup(file_in)
help(filebackup)

def dicttofilepy(dict_in, file_ot):
    '''
    Name: dicttofilepy
    Description: Outputs a file that contains the 
               definition of <dict_in> ready to be
               imported (See Notes and sample code)
    Input: <input data structure> <filename>
    Output: <filename> that contains input in python format
    Usage: dicttofilepy(project_info, "OUTDATA.py")
    Notes: 
        dicttofilepy(project_info, "OUTDATA.py")
        import OUTDATA
        data = OUTDATA.allData
    Notes2: Check also in 'Python Useful Tricks'
    '''
    
    import pprint
    
    data = dict(dict_in)
    
    # check if file_ot ends in .py
    if file_ot.lower().endswith('.py'):
        try:
            resultFile = open(file_ot, 'w')
            resultFile.write('allData = ' + pprint.pformat(data))
            resultFile.close()
        except (IOError, OSError) as err:
            print("Problem writing data to: " + str(file_ot) + " (" + str(err) + ")")
            return False
        else:
            return True
    return False



    

def py_format(data_in, file_ot):
    '''
    Name: py_format
    Description: Outputs a file that contains the 
               definition of <data_in> ready to be
               imported (See Notes and sample code)
    Input: <input data structure> <filename>
    Output: <filename> that contains input in python format
    Usage: py_format(project_info, "OUTDATA.py")
    Notes: 
        py_format(project_info, "OUTDATA.py")
        import OUTDATA
        data = OUTDATA.allData
    Notes2: Check also in 'Python Useful Tricks'
    '''
    
    import pprint
    
    data = dict(data_in)
    
    # check if file_ot ends in .py
    if file_ot.lower().endswith('.py'):
        try:
            resultFile = open(file_ot, 'w')
            resultFile.write('allData = ' + pprint.pformat(data))
            resultFile.close()
        except (IOError, OSError) as err:
            print("Problem writing data to: " + str(file_ot) + " (" + str(err) + ")")
            return False
        else:
            return True
    return False
    
    
    
def pretty_print(data, indent_depth):
    '''
    Name: pretty_print
    Description: pretty-prints a data structure
    Input: <List/Tuple/Dict>, indent_depth
    Output: stdout
    Usage: pretty_print(dict_in, 4)
    Notes: Works on any kind of data structure. 
    Requires: pprint module
    '''
    import pprint
    try:
        pprint.pprint(data, indent=indent_depth)
    except Exception:
        print("Pretty print failed")

def loadfiletodictcounter(file_in, start, length, strip="yes"):
    '''
    Name:           loadfiletodictcounter
    
    Function:       loadfiletodictcounter(file_in, start, length, strip="yes")
    
    Description:    Loads a text file to a dictionary, based on specific
                    substring provided. Those substrings will be used as keys.
                    Each key will hold a list of [  counter, <line1>, <line2> .. ]
                    *line1, line2 share the same key.
    
    Input:          1. <file_in> : Input text file (Best encoded in utf-8)                   
                    2. start     : Start of substring 
                    3. length    : Length of substring
                    4. [ strip="yes" | strip="no" ]
    
    Output:         (<dictionary>, <int>)
                    1. <dictionary> : produced dictionary, with substrings as keys.
                    2. <int>        : number of unique keys (length of dict)
                    
    Usage:          my_in = loadfiletodictcounter(file_in, 26, 3) (default)
                    my_in = loadfiletodictcounter(file_in, 26, 3, strip="no") (non stripping)
    
    Required:       collections (Module -> base Python distro)
                    os          (Module -> base Python distro)
    
    Notes:          Module imports can be embedded in the function. Uncomment if needed.
    
                    'defaultdict' is faster, but it returns an item of type <defaultdict>,
                    which is different than <dict>. Problems may be encountered with module
                    handling of the dict (ie, pretty print). We can cast the produced defaultdict
                    into a dict by doing:
                    dict_out = dict(default_dict)
                    
                    * Differs from simple method -> key line counter.
    '''    
    
    # import required modules
    try:
        import os
        import collections
    except ImportError:
        return "Cannot import required modules"
    
    # declare new default dict (use collections)
    try:
        dict_out = collections.defaultdict(list)    
    except Exception:
        return "Could not initialize collection structure"
    
    
    # check if input file exists
    if not os.path.isfile(file_in):
        return ("ERROR: Reading dictionary.", 0)
    
    
    with open(file_in, 'r') as f:
        for line in f:
        
            # Handle new line strip 
            if strip == "yes":
                line = line.rstrip()
        
        
            # Calc start / end
            start_py = int(start) - 1
            end_py = start_py + int(length)
        
            # parse key
            key_in = line[start_py:end_py]
        
            if key_in not in dict_out.keys():
                dict_out[key_in].append(1)
                dict_out[key_in].append(line)
            else:
                dict_out[key_in][0] += 1
                dict_out[key_in].append(line)
    
    return (dict_out, len(dict_out.keys()))
def loadfiletodict(file_in, start, length, strip="yes"):
    '''
    Name:           loadfiletodict
    
    Function:       loadfiletodict(file_in, start, length, strip="yes")
    
    Description:    Loads a text file to a dictionary, based on specific
                    substring provided. Those substrings will be used as keys.
                    Each key will hold a list of [  <line1>, <line2> .. ]
                    *line1, line2 share the same key.
    
    Input:          1. <file_in> : Input text file (Best encoded in utf-8)                   
                    2. start     : Start of substring 
                    3. length    : Length of substring
                    4. [ strip="yes" | strip="no" ]
    
    Output:         (<dictionary>, <int>)
                    1. <dictionary> : produced dictionary, with substrings as keys.
                    2. <int>        : number of unique keys (length of dict)
                    
    Usage:          my_in = loadfiletodict(file_in, 26, 3) (default)
                    my_in = loadfiletodict(file_in, 26, 3, strip="no") (non stripping)
    
    Required:       collections (Module -> base Python distro)
                    os          (Module -> base Python distro)
    
    Notes:          Module imports can be embedded in the function. Uncomment if needed.
    
                    'defaultdict' is faster, but it returns an item of type <defaultdict>,
                    which is different than <dict>. Problems may be encountered with module
                    handling of the dict (ie, pretty print). We can cast the produced defaultdict
                    into a dict by doing:
                        dict_out = dict(default_dict)
    '''
    # import os
    import os
    
    # import collections
    try:
        import collections
    except ImportError:
        return "Cannot import collections module"
    
    # declare new default dict (use collections)
    try:
        dict_out = collections.defaultdict(list)    
    except Exception:
        return "Could not initialize collection structure"
    
    
    # check if input file exists
    if not os.path.isfile(file_in):
        return ("ERROR: Reading dictionary.", 0)
    
    
    with open(file_in, 'r') as f:
        for line in f:
        
            # Handle new line strip 
            if strip == "yes":
                line = line.rstrip()
            
            # Calc start / end
            start_py = int(start) - 1
            end_py = start_py + int(length)
        
            # parse key
            key_in = line[start_py:end_py]
        
            dict_out[key_in].append(line)
    
    return (dict_out, len(dict_out.keys()))
    


# This is a big improvement, namely it doesn’t crash when fed 
# a big file (though also it’s shorter!). Next we should attempt 
# to speed this up a bit by making use of all these otherwise idle cores.


import multiprocessing as mp

# 'process' is assumed to be defined elsewhere; use one worker per core
cores = mp.cpu_count()

#init objects
pool = mp.Pool(cores)
jobs = []

#create jobs (args must be a tuple, hence the trailing comma)
with open("input.txt") as f:
    for line in f:
        jobs.append( pool.apply_async(process, (line,)) )

#wait for all jobs to finish
for job in jobs:
    job.get()

#clean up
pool.close()

##
# Provided the order in which you process the lines doesn't matter, 
# the above generates a set (pool) of workers, ideally one for each 
# core, before creating a bunch of tasks (jobs), one for each line, 
# for the workers to do. I tend to use the Pool object provided by the 
# multiprocessing module due to ease of use, however, you can spawn and 
# control individual workers using mp.Process if you want finer control. 
# For mere number crunching, the Pool object is very good.

# While the above is now making use of all those cores, it sadly 
# runs into memory problems once again. We specifically use the apply_async 
# function so that the pool isn't blocked while each line processes. 
# However, in doing so, all the data is read into memory once again; this 
# time stored as individual lines associated with each job, waiting in line 
# to be processed. As such, the memory will again overflow. Ideally the 
# method will only read the line into memory when it is its turn to be processed.


import multiprocessing as mp

def process_wrapper(lineID):
    with open("input.txt") as f:
        for i, line in enumerate(f):
            if i != lineID:
                continue
            else:
                process(line)
                break

# one worker per core
cores = mp.cpu_count()

#init objects
pool = mp.Pool(cores)
jobs = []

#create jobs (args must be a tuple, hence the trailing comma)
with open("input.txt") as f:
    for ID, line in enumerate(f):
        jobs.append( pool.apply_async(process_wrapper, (ID,)) )

#wait for all jobs to finish
for job in jobs:
    job.get()

#clean up
pool.close()



# Above we’ve now changed the function fed to pool of workers to 
# include opening the file, locating the specified line, reading 
# it into memory, and then processing it. The only input now stored 
# for each job spawned is the line number, thereby preventing the 
# memory overflow. Sadly, the overhead involved in having to locate 
# the line by reading iteratively through the file for each job is untenable, 
# getting progressively more time consuming as you get further into the file. 
# To avoid this we can use the seek function of file objects which skips you to 
# a particular location within a file. Combining with the tell function, which 
# returns the current location within a file, gives:


import multiprocessing as mp

def process_wrapper(lineByte):
    with open("input.txt") as f:
        f.seek(lineByte)
        line = f.readline()
        process(line)

# one worker per core
cores = mp.cpu_count()

#init objects
pool = mp.Pool(cores)
jobs = []

#create jobs (use readline() instead of 'for line in f': Python 3
#disables tell() while iterating over a text file)
with open("input.txt") as f:
    nextLineByte = f.tell()
    line = f.readline()
    while line:
        jobs.append( pool.apply_async(process_wrapper, (nextLineByte,)) )
        nextLineByte = f.tell()
        line = f.readline()

#wait for all jobs to finish
for job in jobs:
    job.get()

#clean up
pool.close()

# Using seek we can move directly to the correct part of the file, whereupon 
# we read a line into the memory and process it. We have to be careful to correctly 
# handle the first and last lines, but otherwise this does exactly what we set out, 
# namely using all the cores to process a given file while not overflowing the memory.

# I’ll finish this post with a slight upgrade to the above as there is a reasonable
# amount of overhead associated with opening and closing the file for each individual 
# line. If we process multiple lines of the file at a time as a chunk, we can reduce 
# these operations. The biggest technicality when doing this is noting that when you 
# jump to a location in a file, you are likely not located at the start of a line. 
# For a simple file, as in this example, this just means you need to call readline, 
# which reads to next newline character. 

# More complex file types likely require additional code to locate a suitable location
# to start/end a chunk.


import multiprocessing as mp
import os

def process_wrapper(chunkStart, chunkSize):
    # binary mode: the chunk offsets are byte offsets
    with open("input.txt", 'rb') as f:
        f.seek(chunkStart)
        lines = f.read(chunkSize).splitlines()
        for line in lines:
            process(line)

def chunkify(fname, size=1024*1024):
    fileEnd = os.path.getsize(fname)
    # open in binary mode (Python 3 text files forbid relative seeks) and
    # keep the while loop inside the with block so the file stays open
    with open(fname, 'rb') as f:
        chunkEnd = f.tell()
        while True:
            chunkStart = chunkEnd
            f.seek(size, 1)
            f.readline()
            chunkEnd = f.tell()
            yield chunkStart, chunkEnd - chunkStart
            if chunkEnd > fileEnd:
                break

# one worker per core
cores = mp.cpu_count()

#init objects
pool = mp.Pool(cores)
jobs = []

#create jobs
for chunkStart,chunkSize in chunkify("input.txt"):
    jobs.append( pool.apply_async(process_wrapper,(chunkStart,chunkSize)) )

#wait for all jobs to finish
for job in jobs:
    job.get()

#clean up
pool.close()
# ##########################
# READ A FILE TWICE
# To read a file twice we have to use seek()
with open('filename.txt') as fp:
    for line in fp:
        ...
    fp.seek(0)
    for line in fp:
        ...

		
# 
# ################################################################
# reading the whole file into RAM before processing it - readlines()
# possible memory overflow

with open("input.txt") as f:
    data = f.readlines()
    for line in data:
        process(line)

		
for line in open('filename.txt').readlines():
    print(line)


# ################################################################
# read line by line
for line in open('filename.txt'):
    print(line)
		

# read line by line without context manager
file_in = open("input.txt", 'r')
for line in file_in:
    process(line)
file_in.close()


# #################################################################		
# read line by line (context manager approach - auto close file)
with open("input.txt") as f:
    for line in f:
        process(line)

# or

with open(filePath) as infile:
    """
    # read first line and remember it
    first_line = infile.readline()
    # or only skip the first line
    next(infile)
    """
    for line in infile:
        print(line)

	
# #################################################################			
# read line by line using .readline()
# readline(n) outputs at most n bytes of a single line of a file. It does not read more than one line.

fp = open('filename.txt')
while 1:
    line = fp.readline()
    if not line:
        break
    print(line)
fp.close()


my_file = open("D:\\new_dir\\multiplelines.txt", "r")
# Use print to show the line, otherwise it stays in the buffer and is replaced by the next read
print(my_file.readline())
# outputs the first two characters of the next line
print(my_file.readline(2))
my_file.close()

	
	
# read entire file, or character by character using read()
# read([n]) can read more than one line
my_file=open("D:\\new_dir\\multiplelines.txt","r")
my_file.read()

# The read() method just outputs the entire file if number of bytes are not given in 
# the argument. If you execute my_file.read(3), you will get back the first three characters of the file

my_file=open("D:\\new_dir\\multiplelines.txt","r")
my_file.read(3)
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 31 12:45:00 2017

@author: P.Doulgeridis
"""





import os
import sys

rootDir = sys.argv[1]
sizing = sys.argv[2]








# for folderName, subFolders, fileNames in os.walk(rootDir):
	# for fileName in fileNames:
		# filePath = os.path.join(folderName, fileName)
		# if os.path.exists(filePath):
			# fileSize = os.path.getsize(filePath)
			# fileSize = fileSize/1024/1024 # Convert it to MB
			# if fileSize > int(sizing):
				# print("{0}\t{1}".format(fileSize,filePath))
                
                

def get_small_files(rootDir, sizing):
    r"""
    Name: get_small_files
    Function: get_small_files(rootDir, sizing)
    Input: rootDir, sizing
    Output: list
    Usage: print (get_small_files('/', 600))
    Storage: PYTHON FILE UTILITIES
    Notes: Recursive, returns a list of files.    
    """
    
    # Necessary modules
    import os
    import sys
    
    # Initiate list
    list_out = []
    
    # Iterate over os.walk, recursive
    for folderName, subFolders, fileNames in os.walk(rootDir):
        for fileName in fileNames:
            filePath = os.path.join(folderName, fileName)
            
            # check if file exists
            if os.path.exists(filePath):
                fileSize = os.path.getsize(filePath)
                fileSize = fileSize/1024/1024 # Convert it to MB
                
                # Compare and output.
                if fileSize < int(sizing):
                    print("{0}\t{1}".format(fileSize,filePath))
                    list_out.append((fileSize, filePath))
    return list_out

                
                
                
                
                
                
                
                
                
                
                
                
def get_large_files(rootDir, sizing):
    r"""
    Name: get_large_files
    Function: get_large_files(rootDir, sizing)
    Input: rootDir, sizing
    Output: list
    Usage: print (get_large_files('/', 1))
    Storage: PYTHON FILE UTILITIES
    Notes: Recursive, returns a list of files. 
    """
    
    # Necessary modules
    import os
    import sys
    
    # Initiate list
    list_out = []
    
    # Iterate over os.walk, recursive
    for folderName, subFolders, fileNames in os.walk(rootDir):
        for fileName in fileNames:
            filePath = os.path.join(folderName, fileName)
            
            # check if file exists
            if os.path.exists(filePath):
                fileSize = os.path.getsize(filePath)
                fileSize = fileSize/1024/1024 # Convert it to MB
                
                # Compare and output
                if fileSize > int(sizing):
                    print("{0}\t{1}".format(fileSize,filePath))
                    list_out.append((fileSize, filePath))
    return list_out
    
    
    
#print (get_large_files('/', 600))
print (get_small_files('/', 1))
def cwd():
    r"""
    Name: cwd
    Function: Returns current working directory
    Input: None
    Output: cwd
    Usage: print(cwd())
    """
    import os
    return os.getcwd()
#!/usr/bin/python

"""
Playing around with slightly various ways to simulate uniq in Python.
The different strategies are timed.
Only m0() and m1() do not change the order of the data.
`in` is the input file, `out*` are output files.
"""

infile = 'in'  # Change filename to suit your needs.

def m1():
    s = set()
    with open('out1', 'w') as out:
        for line in open(infile):
            if line not in s:
                out.write(line)
                s.add(line)

def m2():
    s = set()
    out = open('out2', 'w')
    for line in open(infile):
        if line not in s:
            out.write(line)
            s.add(line)
    out.close()

def m3():
    s = set()
    for line in open(infile):
        s.add(line)
    out = open('out3', 'w')
    for line in s:
        out.write(line)
    out.close()

def m4():
    s = set()
    for line in open(infile):
        s.add(line)
    open('out4', 'w').writelines(s)

def m5():
    uniqlines = set(open(infile).readlines())
    open('out5', 'w').writelines(uniqlines)

if __name__ == '__main__':
    import timeit
    print('m1', timeit.timeit('m1()', setup='from __main__ import m1', number=1000))
    print('m2', timeit.timeit('m2()', setup='from __main__ import m2', number=1000))
    print('m3', timeit.timeit('m3()', setup='from __main__ import m3', number=1000))
    print('m4', timeit.timeit('m4()', setup='from __main__ import m4', number=1000))
    print('m5', timeit.timeit('m5()', setup='from __main__ import m5', number=1000))
    
    
#####################################################################################

# ##########################################
# Reading / writing file as text mode

# t refers to the text mode. There is no difference between r and rt or w and wt since text mode is the default.

# Documented here:

# Character   Meaning
# 'r'     open for reading (default)
# 'w'     open for writing, truncating the file first
# 'x'     open for exclusive creation, failing if the file already exists
# 'a'     open for writing, appending to the end of the file if it exists
# 'b'     binary mode
# 't'     text mode (default)
# '+'     open a disk file for updating (reading and writing)
# 'U'     universal newlines mode (deprecated)
# The default mode is 'r' (open for reading text, synonym of 'rt').
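
# A quick sketch of the modes above ('example.txt' is a throwaway name
# used only for illustration): 'r'/'rt' and 'w'/'wt' are equivalent.
with open('example.txt', 'wt') as f:    # text write, same as 'w'
    f.write('hello\n')

with open('example.txt', 'rt') as f:    # text read, same as 'r'
    print(f.read())                     # str

with open('example.txt', 'rb') as f:    # binary read
    print(f.read())                     # bytes in Python 3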


#######################################################################

# Open output file for writing
# Python 2: open outfile with mode 'wb' instead of 'w'.
# The csv.writer writes \r\n into the file directly. If
# you don't open the file in binary mode, it will write
# \r\r\n because on Windows text mode will translate each \n into \r\n.
# In Python 3 the required syntax changed, so open outfile
# with the additional parameter newline='' instead.
# Examples:
# Python 2
#with open('/pythonwork/thefile_subset11.csv', 'wb') as outfile:
#   writer = csv.writer(outfile)
# Python 3
#with open('/pythonwork/thefile_subset11.csv', 'w', newline='') as outfile:
#   writer = csv.writer(outfile)
f = open(file_out, 'wt', newline='')      # writing in text mode with no newline translation
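
# Minimal sketch of the Python 3 csv pattern described above; the rows
# and the output filename are made up for illustration.
import csv

rows = [['name', 'qty'], ['widget', 3]]
with open('subset.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerows(rows)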
# ########################        
# Create output directory if it does not exist
print("\n")
print("Creating output directory")
if not os.path.exists(output_dir):
    print("Output directory does not exist. Attempting to create...")
    try:
        os.makedirs(output_dir)
    except OSError:
        print("Could not create directory: " + output_dir)
        sys.exit(6)
    else:
        print("Created: " + str(output_dir))
else:
    print("Directory: " + output_dir + " already exists. Skipping creation.")
    
    
# ###########################        
# Create new output folder in directory + date in filename
print("\n")
new_dir_name = str(output_dir) + '\\' + str(today.strftime('%Y%m%d%H%M') + str(h))
print("Creating sub directory : " + str(new_dir_name))

if not os.path.exists(new_dir_name):
    try:
        os.makedirs(new_dir_name)
    except OSError:
        print("Could not create directory: " + str(new_dir_name))
    else:
        print("Created sub-directory: " + new_dir_name)
else:
    print("Directory: " + new_dir_name + " already exists.")
    
    
    
#############################################




def fileCount(folder):
    "count the number of files in a directory (recursive)"

    count = 0

    for filename in os.listdir(folder):
        path = os.path.join(folder, filename)

        if os.path.isfile(path):
            count += 1
        elif os.path.isdir(path):   # os.path.isfolder does not exist
            count += fileCount(path)

    return count   
    
    

# ###########################
# Run counters
counter_dict = dict()
for input_dir in check_paths:
    counter_dict[input_dir] = 0
    try:
        counter_dict[input_dir] = fileCount(input_dir)
    except OSError:
        counter_dict[input_dir] = "COULD NOT RESOLVE DIRECTORY"


for ckey in counter_dict.keys():
    print("Folder: " + str(ckey) + " contains: " + str(counter_dict[ckey]) + " files.")

#####################################################################

total = 0
for root, dirs, files in os.walk(folder):
    total += len(files)
    
    
####################################################################


import os
cpt = sum([len(files) for r, d, files in os.walk(r"G:\CS\PYTHONPROJECTS")])

####################################################################
def checkiffile(filename):
  import os.path
  return os.path.isfile(filename)
    
    
def checkifdir(filename):
  import os.path
  return os.path.isdir(filename)
    
    

def checkdetecttype(filename):
  import os.path
  success = True
  if os.path.isfile(filename):
    return (success, "File")
  elif os.path.isdir(filename):
    return (success, "Directory")
  elif os.path.islink(filename):
    return (success, "SymLink")
  elif os.path.ismount(filename):
    return (success, "MountPoint")
  else:
    success = False
    return (success, "NotFound")
def fileexists(filepath):
  '''
  Function: fileexists
  Description: Checks for existence of a path
  Input: filepath (or raw string of it)
  Output: Boolean
  Usage: if fileexists(file_in): ...
  Notes: Depending on the system you may need to
  convert to a raw string with r'file_in'.
  '''
  import os.path
  return os.path.exists(filepath)
    

def direxists(filepath):
  import os.path
  return os.path.isdir(filepath)
    
print(os.path.isdir("/home/el"))
print(os.path.exists("/home/el/myfile.txt"))




#####################################################################

def check_d_exists(inputpath, verbose = 'yes'):
    import os
    import sys
    check = 0
    
    if os.path.isdir(inputpath):
        check = 1
    
    if verbose == 'yes':
        if check == 1:
            print ("Child directory located. Proceeding...")
            return True
        else:
            print ("Child directory not located. Terminating...")
            sys.exit(5)
    else:
        if check == 1:
            return True
        else:
            return False 
            
            
            
High-level utilities to create and read compressed and archived files are also provided. They rely on the zipfile and tarfile modules.

shutil.make_archive(base_name, format[, root_dir[, base_dir[, verbose[, dry_run[, owner[, group[, logger]]]]]]])
Create an archive file (e.g. zip or tar) and return its name.

base_name is the name of the file to create, including the path, minus any format-specific extension. format is the archive format: one of “zip” (if the zlib module or external zip executable is available), “tar”, “gztar” (if the zlib module is available), or “bztar” (if the bz2 module is available).

root_dir is a directory that will be the root directory of the archive; ie. we typically chdir into root_dir before creating the archive.

base_dir is the directory where we start archiving from; ie. base_dir will be the common prefix of all files and directories in the archive.

root_dir and base_dir both default to the current directory.

owner and group are used when creating a tar archive. By default, uses the current owner and group.

logger must be an object compatible with PEP 282, usually an instance of logging.Logger.

New in version 2.7.

shutil.get_archive_formats()
Return a list of supported formats for archiving. Each element of the returned sequence is a tuple (name, description).

By default shutil provides these formats:

zip: ZIP file (if the zlib module or external zip executable is available).
tar: uncompressed tar file.
gztar: gzip’ed tar-file (if the zlib module is available).
bztar: bzip2’ed tar-file (if the bz2 module is available).
You can register new formats or provide your own archiver for any existing formats, by using register_archive_format().

New in version 2.7.
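A quick way to see what the local build supports (a small sketch; the exact list varies per system):

import shutil

# each element is a (name, description) tuple
for name, description in shutil.get_archive_formats():
    print(name, '-', description)
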

shutil.register_archive_format(name, function[, extra_args[, description]])
Register an archiver for the format name. function is a callable that will be used to invoke the archiver.

If given, extra_args is a sequence of (name, value) that will be used as extra keywords arguments when the archiver callable is used.

description is used by get_archive_formats() which returns the list of archivers. Defaults to an empty list.

New in version 2.7.

shutil.unregister_archive_format(name)
Remove the archive format name from the list of supported formats.

New in version 2.7.

10.10.2.1. Archiving example
In this example, we create a gzip’ed tar-file archive containing all files found in the .ssh directory of the user:

>>> from shutil import make_archive
>>> import os
>>> archive_name = os.path.expanduser(os.path.join('~', 'myarchive'))
>>> root_dir = os.path.expanduser(os.path.join('~', '.ssh'))
>>> make_archive(archive_name, 'gztar', root_dir)
'/Users/tarek/myarchive.tar.gz'
The resulting archive contains:

$ tar -tzvf /Users/tarek/myarchive.tar.gz
drwx------ tarek/staff       0 2010-02-01 16:23:40 ./
-rw-r--r-- tarek/staff     609 2008-06-09 13:26:54 ./authorized_keys
-rwxr-xr-x tarek/staff      65 2008-06-09 13:26:54 ./config
-rwx------ tarek/staff     668 2008-06-09 13:26:54 ./id_dsa
-rwxr-xr-x tarek/staff     609 2008-06-09 13:26:54 ./id_dsa.pub
-rw------- tarek/staff    1675 2008-06-09 13:26:54 ./id_rsa
-rw-r--r-- tarek/staff     397 2008-06-09 13:26:54 ./id_rsa.pub
-rw-r--r-- tarek/staff   37192 2010-02-06 18:23:10 ./known_hosts
Directory and files operations
shutil.copyfileobj(fsrc, fdst[, length])
Copy the contents of the file-like object fsrc to the file-like object fdst. The integer length, if given, is the buffer size. In particular, a negative length value means to copy the data without looping over the source data in chunks; by default the data is read in chunks to avoid uncontrolled memory consumption. Note that if the current file position of the fsrc object is not 0, only the contents from the current file position to the end of the file will be copied.

shutil.copyfile(src, dst)
Copy the contents (no metadata) of the file named src to a file named dst. dst must be the complete target file name; look at shutil.copy() for a copy that accepts a target directory path. If src and dst are the same files, Error is raised. The destination location must be writable; otherwise, an IOError exception will be raised. If dst already exists, it will be replaced. Special files such as character or block devices and pipes cannot be copied with this function. src and dst are path names given as strings.

shutil.copymode(src, dst)
Copy the permission bits from src to dst. The file contents, owner, and group are unaffected. src and dst are path names given as strings.

shutil.copystat(src, dst)
Copy the permission bits, last access time, last modification time, and flags from src to dst. The file contents, owner, and group are unaffected. src and dst are path names given as strings.

shutil.copy(src, dst)
Copy the file src to the file or directory dst. If dst is a directory, a file with the same basename as src is created (or overwritten) in the directory specified. Permission bits are copied. src and dst are path names given as strings.

shutil.copy2(src, dst)
Similar to shutil.copy(), but metadata is copied as well – in fact, this is just shutil.copy() followed by copystat(). This is similar to the Unix command cp -p.

shutil.ignore_patterns(*patterns)
This factory function creates a function that can be used as a callable for copytree()’s ignore argument, ignoring files and directories that match one of the glob-style patterns provided. See the example below.

New in version 2.6.

shutil.copytree(src, dst, symlinks=False, ignore=None)
Recursively copy an entire directory tree rooted at src. The destination directory, named by dst, must not already exist; it will be created as well as missing parent directories. Permissions and times of directories are copied with copystat(), individual files are copied using shutil.copy2().

If symlinks is true, symbolic links in the source tree are represented as symbolic links in the new tree, but the metadata of the original links is NOT copied; if false or omitted, the contents and metadata of the linked files are copied to the new tree.

If ignore is given, it must be a callable that will receive as its arguments the directory being visited by copytree(), and a list of its contents, as returned by os.listdir(). Since copytree() is called recursively, the ignore callable will be called once for each directory that is copied. The callable must return a sequence of directory and file names relative to the current directory (i.e. a subset of the items in its second argument); these names will then be ignored in the copy process. ignore_patterns() can be used to create such a callable that ignores names based on glob-style patterns.

If exception(s) occur, an Error is raised with a list of reasons.

The source code for this should be considered an example rather than the ultimate tool.

Changed in version 2.3: Error is raised if any exceptions occur during copying, rather than printing a message.

Changed in version 2.5: Create intermediate directories needed to create dst, rather than raising an error. Copy permissions and times of directories using copystat().

Changed in version 2.6: Added the ignore argument to be able to influence what is being copied.

shutil.rmtree(path[, ignore_errors[, onerror]])
Delete an entire directory tree; path must point to a directory (but not a symbolic link to a directory). If ignore_errors is true, errors resulting from failed removals will be ignored; if false or omitted, such errors are handled by calling a handler specified by onerror or, if that is omitted, they raise an exception.

If onerror is provided, it must be a callable that accepts three parameters: function, path, and excinfo. The first parameter, function, is the function which raised the exception; it will be os.path.islink(), os.listdir(), os.remove() or os.rmdir(). The second parameter, path, will be the path name passed to function. The third parameter, excinfo, will be the exception information return by sys.exc_info(). Exceptions raised by onerror will not be caught.

Changed in version 2.6: Explicitly check for path being a symbolic link and raise OSError in that case.
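
A hedged sketch of an onerror handler matching the documented three-parameter signature ('some_tree' is a placeholder path):

import shutil

def log_and_continue(function, path, excinfo):
    # function is the os.* call that failed, excinfo comes from sys.exc_info()
    print("%s failed on %s: %s" % (function.__name__, path, excinfo[1]))

shutil.rmtree('some_tree', onerror=log_and_continue)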

shutil.move(src, dst)
Recursively move a file or directory (src) to another location (dst).

If the destination is an existing directory, then src is moved inside that directory. If the destination already exists but is not a directory, it may be overwritten depending on os.rename() semantics.

If the destination is on the current filesystem, then os.rename() is used. Otherwise, src is copied (using shutil.copy2()) to dst and then removed.

New in version 2.3.
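A sketch of both move behaviours (paths are placeholders):

import shutil

shutil.move('report.txt', 'archive/')          # moved into the existing directory
shutil.move('archive/report.txt', 'old.txt')   # plain rename on the same filesystem
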

exception shutil.Error
This exception collects exceptions that are raised during a multi-file operation. For copytree(), the exception argument is a list of 3-tuples (srcname, dstname, exception).

New in version 2.3.

10.10.1.1. copytree example
This example is the implementation of the copytree() function, described above, with the docstring omitted. It demonstrates many of the other functions provided by this module.

def copytree(src, dst, symlinks=False, ignore=None):
    names = os.listdir(src)
    if ignore is not None:
        ignored_names = ignore(src, names)
    else:
        ignored_names = set()

    os.makedirs(dst)
    errors = []
    for name in names:
        if name in ignored_names:
            continue
        srcname = os.path.join(src, name)
        dstname = os.path.join(dst, name)
        try:
            if symlinks and os.path.islink(srcname):
                linkto = os.readlink(srcname)
                os.symlink(linkto, dstname)
            elif os.path.isdir(srcname):
                copytree(srcname, dstname, symlinks, ignore)
            else:
                copy2(srcname, dstname)
            # XXX What about devices, sockets etc.?
        except (IOError, os.error) as why:
            errors.append((srcname, dstname, str(why)))
        # catch the Error from the recursive copytree so that we can
        # continue with other files
        except Error as err:
            errors.extend(err.args[0])
    try:
        copystat(src, dst)
    except WindowsError:
        # can't copy file access times on Windows
        pass
    except OSError as why:
        errors.extend((src, dst, str(why)))
    if errors:
        raise Error(errors)
Another example that uses the ignore_patterns() helper:

from shutil import copytree, ignore_patterns

copytree(source, destination, ignore=ignore_patterns('*.pyc', 'tmp*'))
This will copy everything except .pyc files and files or directories whose name starts with tmp.

Another example that uses the ignore argument to add a logging call:

from shutil import copytree
import logging

def _logpath(path, names):
    logging.info('Working in %s' % path)
    return []   # nothing will be ignored

copytree(source, destination, ignore=_logpath)
def wordcountfile(file):
    count = 0
    with open(str(file)) as file_in:
        for word in file_in.read().split():
            count += 1
    return count
    

print (wordcountfile(r'D:\panagos\desktop\pytestfile.txt'))
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 31 10:52:24 2017

@author: P.Doulgeridis
"""



def rdupl( file_in, file_ot, keep_first = 'yes', report = 'no' ):
    r'''
    Function: rdupl
    Description: Removes duplicates from file, keeps first
    Input:  <file_in> : string
            <file_ot> : string
    Parameters:     <keep_first = 'no'> or omitted
                    <report = 'yes'> or omitted
    Output: STDOUT, file_ot
    Returns : Tuple of (<lines-seen>, <duplicate_count>, <value-mapping>)
    Usage: rdupl(r'C:\Users\p.doulgeridis\Desktop\testdupl.txt', r'C:\Users\p.doulgeridis\Desktop\testduplout.txt', report = 'yes')
    Assign: a = rdupl('C:\\Users\\p.doulgeridis\\Desktop\\testdupl.txt', 'C:\\Users\\p.doulgeridis\\Desktop\\testduplout.txt', report = 'no')
    Notes: Simple version, for entire line search.
    Notes: Keep first is redundant.
    '''

    # Convert input paths to Raw strings
    #path = r'%s' % variable
    in_file_conv = r'%s' % file_in
    ot_file_conv = r'%s' %file_ot
    
    # Initialize counters
    mapping = {}
    lines_seen = []
    dupl_count = 0
    
    # Keep first mode
    if keep_first == 'yes':
        outfile = open(ot_file_conv, "w")
        for line in open(in_file_conv, "r"):
            if line not in lines_seen: # not a duplicate
                outfile.write(line)
                lines_seen.append(line)
                mapping[line] = 1
            else:
                mapping[line] += 1
                dupl_count += 1        
        outfile.close()

    # Report on mode    
    if report == 'yes':
        print ("N. of Duplicates : " + str(dupl_count))
        print ("N. of Unique : " + str(len(lines_seen)))
        print ("Output file : " + str(file_ot))
        print ("Input file : " + str(file_in))
    
    return (lines_seen, dupl_count, mapping)
        
        
#rdupl(r'C:\Users\p.doulgeridis\Desktop\testdupl.txt', r'C:\Users\p.doulgeridis\Desktop\testduplout.txt', report = 'yes')

#a = rdupl('C:\\Users\\p.doulgeridis\\Desktop\\testdupl.txt', 'C:\\Users\\p.doulgeridis\\Desktop\\testduplout.txt', report = 'no')

#print (a[0])
#print (a[1])
#print (a[2])


############################################################################




def rduplsub( file_in, file_ot, substart, subend, keep_first = 'yes', report = 'no' ):
    r'''
    Function: rduplsub
    Description: Removes duplicates from file based on a substring, keeps first or last
    Input:  <file_in> : string
            <file_ot> : string
            <substart>, <subend> : substring bounds
    Parameters:     <keep_first = 'no'> or omitted
                    <report = 'yes'> or omitted
    Output: STDOUT, file_ot
    Returns : Tuple of (<lines-seen>, <duplicate_count>, <value-mapping>)
    Usage: rduplsub(r'C:\Users\p.doulgeridis\Desktop\testdupl.txt', r'C:\Users\p.doulgeridis\Desktop\testduplout.txt', 4, 7, report = 'yes')
    Assign: a = rduplsub('C:\\Users\\p.doulgeridis\\Desktop\\testdupl.txt', 'C:\\Users\\p.doulgeridis\\Desktop\\testduplout.txt', 4, 7, keep_first = 'no', report = 'no')
    Notes: Substring version of rdupl.
    '''
    # Convert input paths to Raw strings
    #path = r'%s' % variable
    in_file_conv = r'%s' % file_in
    ot_file_conv = r'%s' %file_ot
    
    # Initialize counters
    mapping = {}
    lines_seen = []
    lines_seen2 = []
    dupl_count = 0
    util_list = []
    rerev_list = []
    substr_seen = []
    
    # Keep first mode
    if keep_first == 'yes':
        outfile = open(ot_file_conv, "w")
        for line in open(in_file_conv, "r"):
            fixed_sb = line[substart:subend]
            if fixed_sb not in lines_seen: # not a duplicate
                outfile.write(line)
                lines_seen.append(fixed_sb)
                mapping[fixed_sb] = 1
            else:
                mapping[fixed_sb] += 1
                dupl_count += 1        
        outfile.close()
        
    # Keep last mode
    if keep_first == 'no':
        #print ("Mode set to : No")
        outfile = open(ot_file_conv, "w")
        # store lines in lines_seen list
        for line in open(in_file_conv, "r"):            
            lines_seen.append(line)
        #print ("lines_seen:")
        #print (lines_seen)
        
         # util_list is the reverse
        for i in lines_seen:                            
            util_list.insert(0, i)
        #print ("Reversed list:")
        #print (util_list)
        
        # iterate over util list, extract substring, and check
        for j in util_list:                         
            fixed_sb = j[substart:subend]
            #print (fixed_sb)            
            if fixed_sb not in substr_seen:
                lines_seen2.append(j,)
                substr_seen.append(fixed_sb)
                mapping[fixed_sb] = 1
            else:
                mapping[fixed_sb] += 1
                dupl_count += 1
            #print(lines_seen2)    
        
            
        for k in lines_seen2:
            rerev_list.insert(0, k)
            
        for x in rerev_list:
            outfile.write(x)
        
        outfile.close()
        
        
            
    # Report on mode    
    if report == 'yes':
        print ("N. of Duplicates : " + str(dupl_count))
        print ("N. of Unique : " + str(len(lines_seen)))
        print ("Output file : " + str(file_ot))
        print ("Input file : " + str(file_in))
    
    return (lines_seen, dupl_count, mapping)
        
        
#rduplsub(r'C:\Users\p.doulgeridis\Desktop\testdupl.txt', r'C:\Users\p.doulgeridis\Desktop\testduplout.txt', 16, 22, keep_first = 'no', report = 'yes')

b = rduplsub('C:\\Users\\p.doulgeridis\\Desktop\\testdupl.txt', 'C:\\Users\\p.doulgeridis\\Desktop\\testduplout.txt',16 , 22, keep_first = 'no',  report = 'yes')
print (b[0])
print (b[1])
print (b[2])
def query_count(file, text):
    occur = 0
    lines = 0
    chars = 0
    with open(str(file), 'r') as in_file:
        for line in in_file:
            lines += 1
            chars += len(line)
            if str(text) in line:
                occur += line.count(text)
    return (occur, lines, chars)
    

print (query_count(r'D:\panagos\desktop\pytestfile.txt', 'aaa'))
# -*- coding: utf-8 -*-
"""
Created on Thu Feb  2 09:56:55 2017

@author: P.Doulgeridis
"""

# MULTILINE FILE STRUCTURE
#
# [TYPE][DATE][VALUE]
#
# This script calculates statistics for each TYPE, regardless of lines
# of input, uses dictionaries.
#
# Output :
# TYPE : [ [ LIST OF VALUES ], TYPE-OCCUR, MEAN, STDEV, MAX, MIN]


from collections import defaultdict
import numpy as np


def pretty_print(b):
    '''
    Function: pretty_print
    Description : Pretty prints a dictionary
    Input : Dictionary
    Output: STDOUT
    Usage(print) : pretty_print(b)
    Usage(Assign): b = pretty_print(b) - True
    Notes : Only prints on screen
    '''
    print ("{ ")
    for a in b.keys():
        print ( "\t" + str(a) + " : " + str(b[a]) )
    print ("}\n")

def mean(values):
    mean = 0
    for j in values:
        mean += int(j)
    return mean / len(values)

    
def pstdev(data):
    """Calculates the population standard deviation."""
    
    def _ss(data2):
        """Return sum of square deviations of sequence data."""
        c = mean(data2)
        ss = sum((int(x)-c)**2 for x in data2)
        return ss
    
    n = len(data)
    if n < 2:
        raise ValueError('variance requires at least two data points')
    ss = _ss(data)
    pvar = ss/n # the population variance
    return pvar**0.5
    
    
    
    
#counter = defaultdict(lambda: 0)
counter = {}

ot_file_conv = r'C:\Users\p.doulgeridis\Desktop\testpyout.txt'
in_file_conv = r'C:\Users\p.doulgeridis\Desktop\testpy.txt'

outfile = open(ot_file_conv, "w")

for line in open(in_file_conv, "r"):
    # Cut TYPE
    scen = line[:3]
    # Cut VALUE
    value = line[9:12]
    if scen not in counter:
        # Initialize dictionary element
        counter[scen] = [[], 1, 0, 0, 0, 0 ]
        counter[scen][0].append(value)
        counter[scen][2] = value
    else:
        # Update dictionary element
        counter[scen][0].append(value)
        counter[scen][1] += 1
        counter[scen][2] = mean(counter[scen][0])
        counter[scen][3] = pstdev(counter[scen][0])
        counter[scen][4] = max(counter[scen][0])
        counter[scen][5] = min(counter[scen][0])
        #narray = np.array(counter[scen][0])
        #counter[scen][3] = narray.mean()

# close the output file after the loop, not inside it
outfile.close()

pretty_print(counter)
def count_file(file):
    lines = 0
    with open(str(file), 'r') as in_file:
        for line in in_file:
            lines += 1
    return lines



def filetolist(file):
  '''
  Function: filetolist
  Description: Reads a file, stores in list
  Input: File
  Output: (List, line count)
  Usage: print (filetolist("C:\\Users\\p.doulgeridis\\Desktop\\testpy.txt"))
  Notes: Path needs double \\ or reverse /
  '''
  file_in = str(file)
  lines = list(open(file_in, 'r'))
  return (lines, len(lines))
  
def get_file_count(file, mode = 0):
    import os
    return os.path.getsize(str(file))

print (get_file_count(r'D:\panagos\desktop\pytestfile.txt'))  
  


def GetHumanReadable(size,precision=2):
    suffixes=['B','KB','MB','GB','TB']
    suffixIndex = 0
    while size > 1024 and suffixIndex < 4:
        suffixIndex += 1 #increment the index of the suffix
        size = size/1024.0 #apply the division
    return "%.*f%s"%(precision,size,suffixes[suffixIndex])
    
    
print (GetHumanReadable(get_file_count(r'D:\panagos\desktop\pytestfile.txt')))



def file_size_mb(filePath): return float(os.path.getsize(filePath)) / (1024 * 1024)
def filetotuplestrip(file):
    '''
    Function: filetotuplestrip
    Description: Reads a file, strips \n, stores lines in tuple
    Input: File
    Output: Tuple
    Usage: print (filetotuplestrip("C:\\Users\\p.doulgeridis\\Desktop\\testpy.txt"))
    Notes: Path needs double \\ or reverse /
    '''
    file_in = str(file)
    lines = tuple(open(file_in, 'r'))
    content = tuple(x.strip() for x in lines)
    return content
def filetotuple(file):
  '''
  Function: filetotuple
  Description: Reads a file, stores line in tuple
  Input: File
  Output: Tuple
  Usage: print (filetotuple("C:\\Users\\p.doulgeridis\\Desktop\\testpy.txt"))
  Notes: Path needs double \\ or reverse /
  '''
  file_in = str(file)
  lines = tuple(open(file_in, 'r'))
  return lines
def filetoliststrip(file):
    '''
    Function: filetoliststrip
    Description: Reads a file, stores in list (stripped)
    Input: File
    Output: List
    Usage: print (filetoliststrip("C:\\Users\\p.doulgeridis\\Desktop\\testpy.txt"))
    Notes: Path needs double \\ or reverse /
    '''
    file_in = str(file)
    lines = list(open(file_in, 'r'))
    content = [x.strip() for x in lines] 
    return content 
def filetolist(file):
  '''
  Function: filetolist
  Description: Reads a file, stores in list
  Input: File
  Output: List
  Usage: print (filetolist("C:\\Users\\p.doulgeridis\\Desktop\\testpy.txt"))
  Notes: Path needs double \\ or reverse /
  '''
  file_in = str(file)
  lines = list(open(file_in, 'r'))
  return lines
def file_abs_path(file):
    import os
    return os.path.abspath(str(file))
    
    
print (file_abs_path(r'desktop\pytestfile.txt'))    
    
##########################################################################
#   DIRECTORY COMPARE V.1                                                #
#                                                                        #
#   Usage :                                                              #
#                                                                        #
#   python DirComp.py <dir1> <dir2>                                      #
#                                                                        #
#   Function :                                                           #
#                                                                        #
#   Sizes and compares directories based on filenames, prints out the    #
#   filenames that only exist in one directory of the two, for both      #
#   directories.                                                         #
#                                                                        #
#   Notes :                                                              #
#                                                                        #
#   Main function is "build_files_set" which takes a directory as input  #
#   and parses each file name in full path, relative path, and assigns   #
#   them to a set so we can do set operations.                           #
#                                                                        #
#   The compare_directories function simply calculates the differences   #
#   between the given sets.                                              #
#                                                                        #
#   Modules:                                                             #
#                                                                        #
#   import os                                                            #
#   import sys                                                           #
#   import re                                                            #
#   import subprocess                                                    #
#   import time                                                          #
##########################################################################




import os
import sys
import re
import subprocess
import time
import collections

# Get the script path
def get_script_path():
    return os.path.dirname(os.path.realpath(sys.argv[0]))

# Parse files of directory
def build_files_set(rootdir):
    root_to_subtract = re.compile(r'^.*?' + re.escape(rootdir) + r'[\\/]{0,1}')
    # Assign relative paths to set for comparison
    files_set = set()
    for (dirpath, dirnames, filenames) in os.walk(rootdir):
        for filename in filenames + dirnames:
            full_path = os.path.join(dirpath, filename)
            relative_path = root_to_subtract.sub('', full_path, count=1)
            files_set.add(relative_path)

    return files_set

# Compare sets 
def compare_directories(dir1, dir2):
    files_set1 = build_files_set(dir1)
    files_set2 = build_files_set(dir2)
    return (files_set1 - files_set2, files_set2 - files_set1)

    
def compare_bool(dir1, dir2):
    files_set1 = build_files_set(dir1)
    files_set2 = build_files_set(dir2)
    return collections.Counter(files_set1) == collections.Counter(files_set2)
    
def are_eq(a, b):
    files_set1 = build_files_set(a)
    files_set2 = build_files_set(b)
    return files_set1 == files_set2
    
    
# Get size - pure-Python directory size in bytes (may raise OSError on unreadable files)
def get_size(start_path = '.'):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size
 
# call to the system du command - Unix only
def du(path):
    """disk usage in human readable format (e.g. '2,1GB'); requires the Unix du binary"""
    return subprocess.check_output(['du', '-sh', path]).split()[0].decode('utf-8')
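
# If the du binary is not available (e.g. on Windows), a portable, if slower,
# alternative is to reuse the pure-Python get_size above together with the
# GetHumanReadable helper from earlier in this collection -- a minimal sketch:
def du_portable(path):
    """Directory size in human readable form, pure Python (slower than du)."""
    return GetHumanReadable(get_size(path))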
    

def main():

    # ##
    # Script parameters
    print(' ')
    print("Script run at: " + str(time.strftime("%H:%M:%S")))
    print("Script run from: " + get_script_path())

    # ##
    # Process arguments
    print(' ')
    total = len(sys.argv)
    cmdargs = str(sys.argv)
    # print("The total number of args passed to the script: %d " % total)
    # print("Args list: %s " % cmdargs)

    # print("Script name: %s" % str(sys.argv[0]))
    # print("First argument: %s" % str(sys.argv[1]))
    # print("Second argument: %s" % str(sys.argv[2]))

    dir1 = str(sys.argv[1])
    dir2 = str(sys.argv[2])

    # DEBUG
    # dir1 = '/home/tede/n55115/PD/UTILS/DirectoryComparison/ORIGINFOLDERa'
    # dir2 = '/home/tede/n55115/PD/UTILS/DirectoryComparison/BATCHINFO'

    # ##
    # Compare Directories
    in_dir1, in_dir2 = compare_directories(dir1, dir2)

    # ##
    # Output
    print(' ')
    print('Comparing Files: ')
    print(dir1)
    print(dir2)

    print(' ')
    print('Comparing sizes: ')
    size1 = du(dir1)
    size2 = du(dir2)
    print("Directory " + dir1 + " size: " + str(size1))
    print("Directory " + dir2 + " size: " + str(size2))

    print('\nFiles only in {}:'.format(dir1))
    for relative_path in in_dir1:
        print('* {0}'.format(relative_path))

    print('\nFiles only in {}:'.format(dir2))
    for relative_path in in_dir2:
        print('* {0}'.format(relative_path))

    # Boolean
    return are_eq(dir1, dir2)


if __name__ == '__main__':
    print(main())
def count_file(file):
    '''
    Function: count_file
    Description: Reads a file, counts lines, words and characters
    Input: File
    Output: Tuple -> (lines, words, chars)
    Usage: print (count_file("C:\\Users\\p.doulgeridis\\Desktop\\testpy.txt"))
    Notes: Path needs double \\ or reverse /
    '''
    chars = words = lines = 0
    with open(str(file), 'r') as in_file:
        for line in in_file:
            lines += 1
            words += len(line.split())
            chars += len(line)
    return (lines, words, chars)
def char_count_file(file):
    chars = 0
    with open(str(file), 'r') as in_file:
        for line in in_file:
            chars += len(line)
    return chars
    
    
print(char_count_file(r'D:\panagos\desktop\pytestfile.txt'))



################################################
################################################

def charactercountfile(file):

    def replace_tab(s, tabstop = 4):
        # Expand tabs to spaces, padding to the next tab stop
        result = str()
        for c in s:
            if c == '\t':
                while (len(result) % tabstop != 0):
                    result += ' '
            else:
                result += c
        return result

    count = 0
    with open(str(file)) as file_in:
        for line in file_in:
            line_fixed = replace_tab(line, tabstop = 4)
            for char in line_fixed:
                count += 1
    return count

print(charactercountfile(r'D:\panagos\desktop\pytestfile.txt'))


path = r'%s' % variable  # note: the r prefix only affects the '%s' literal; it does not re-escape the contents of variable


>>> a = r'raw s\tring'
>>> b = 'raw s\\tring'
>>> a
'raw s\\tring'
>>> b
'raw s\\tring'


The double backslashes in the repr above come from the r prefix (raw string):

r'C:\Users\user\Desktop\File_%s.pdf'

A raw string is used because a bare \ might otherwise escape the character that follows it.

>>> strs = "c:\desktop\notebook"

>>> print(strs)               # print treats the \n in \notebook as a newline char
c:\desktop
otebook

>>> strs = r"c:\desktop\notebook"  # the r prefix keeps the \ literal
>>> print(strs)
c:\desktop\notebook

>>> print(repr(strs))   # actual content of strs
'c:\\desktop\\notebook'
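
A way to sidestep the backslash problem entirely is to build paths with
os.path.join or pathlib instead of writing separators by hand. A minimal
sketch (the path components are illustrative):

import os
from pathlib import Path

p = Path('c:/') / 'desktop' / 'notebook'            # forward slashes work everywhere
print(p)                                            # pathlib prints the native separator
print(os.path.join('c:\\', 'desktop', 'notebook'))  # equivalent with os.path (on Windows)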
def makedir(output_dir):
    print("\n")
    print("Creating output directory")
    if not os.path.exists(output_dir):
        print("Output directory does not exist. Attempting to create...")
        try:
            os.makedirs(output_dir)
        except OSError:
            print("Could not create directory: " + output_dir)
            sys.exit(6)
        else:
            print("Created: " + str(output_dir))
    else:
        print("Directory: " + output_dir + " already exists. Skipping creation.")
    
    
    
    
def makedir(output_dir, overwrite="no", vocal="yes"):
    if vocal == "no":
        if not os.path.exists(output_dir):
            try:
                os.makedirs(output_dir)
            except OSError:
                return 'Failed to Create'
        else:
            if overwrite == "no":
                return 'Exists'
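
# On Python 3.2+ the exists/overwrite checks above collapse into a single call --
# a minimal sketch (output_dir is a hypothetical path):
import os

output_dir = 'out/run1'                  # hypothetical
os.makedirs(output_dir, exist_ok=True)   # creates the tree; no error if it already exists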
# get all files in a directory ( no recursive ) with generator
#This is a simple generator expression:

files = (file for file in os.listdir(path) 
         if os.path.isfile(os.path.join(path, file)))

for file in files: # You could shorten this to one line, but it runs a bit long.
    ...

#Or you could make a generator function if it suited you better:

def files(path):
    for file in os.listdir(path):
        if os.path.isfile(os.path.join(path, file)):
            yield file
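
# Usage is the same either way; for example (the path is illustrative):
for f in files('.'):
    print(f)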
#!/usr/bin/python
# str.replace returns a new string; the result must be assigned to take effect
with open("/usr2/py/mount.txt", "r") as s:
    for line in s:
        print(line, end='')
        line = line.replace('mickey', 'minnie')
        print(line, end='')
def absoluteFilePathsRec(directory):
    '''
    Name: absoluteFilePathsRec
    Type: Generator
    Description: Returns all files in dir, recursive
    Input: directory
    Output: Generator object for files in dir
    Usage: for j in absoluteFilePaths(control_path): list.append(j)
    Notes: Generator object; to convert it into
           a list we'll have to iterate, or cast with list()
    Requires: os, os.path
    '''
    
    import os
    
    if not os.path.isdir(directory):
        print("Provided argument is not a directory")
        return
      
    for dirpath,_,filenames in os.walk(directory):
       for f in filenames:
           yield os.path.abspath(os.path.join(dirpath, f))


def absoluteFilePaths(directory, incl_dir=False):
    '''
    Name: absoluteFilePaths
    Type: Generator
    Description: Returns all files in dir, top level
    Input: directory, T/F
           T: include subdirectories in listing
           F: don't include subdirectories in listing
    Output: Generator object for files in dir, top level.
    Usage: for j in absoluteFilePaths(directory, False): ...
    Notes: Generator object; to convert it into a list
           we'll have to iterate, or cast with list()
    Requires: os, os.path
    '''

    import os

    if not os.path.isdir(directory):
        print("Provided argument is not a directory")
        return

    for filename in os.listdir(directory):
        full = os.path.join(directory, filename)
        if not incl_dir and os.path.isdir(full):
            continue
        yield os.path.abspath(full)
           

           
# Example usage (control_path, lista and pretty_print are defined elsewhere):
for j in absoluteFilePaths(control_path, False):
    #print(j)
    lista.append(j)

pretty_print(lista, 1)
print(len(lista))
def walklevel(some_dir, level=1):
    '''
    Name: walklevel
    Description: Walks a directory tree, up to a certain level.
    Input: directory, level
    Output: tuple -> (rootdir, [dirs], [files])
    Usage:  # for j in walklevel(control_path, level=0):
            # print(list(j))
    Notes: Returns a tuple by default, needs parsing.
    Requires: os module
    '''
    some_dir = some_dir.rstrip(os.path.sep)
    assert os.path.isdir(some_dir)
    num_sep = some_dir.count(os.path.sep)
    for root, dirs, files in os.walk(some_dir):
        yield root, dirs, files
        num_sep_this = root.count(os.path.sep)
        if num_sep + level <= num_sep_this:
            del dirs[:]               
        
import os
import shutil
import glob

# working directory
c_dir = os.getcwd()                 # get current working directory
os.listdir(c_dir)                   # list all files in the working directory
os.chdir(os.path.expanduser('~/Data'))  # change working directory (~ must be expanded)


# get all files in a directory
glob.glob('/Users/sebastian/Desktop/*')

# e.g.,  ['/Users/sebastian/Desktop/untitled folder', '/Users/sebastian/Desktop/Untitled.txt']

# walk
tree = os.walk(c_dir)
# moves through sub directories and creates a 'generator' object of tuples:
# (dirpath, [subdirectory1, subdirectory2, ...], [file1, file2, ...]),
#    (...), ...
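
# For example, unpacking the tuples as you iterate:
for dirpath, dirnames, filenames in os.walk(c_dir):
    print(dirpath, dirnames, filenames)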

# check files: returns either True or False
os.path.exists('../rel_path')
os.path.exists('/home/abs_path')
os.path.isfile('./file.txt')
os.path.isdir('./subdir')


# file permissions (True or False)
os.access('./some_file', os.F_OK) # File exists?
os.access('./some_file', os.R_OK) # OK to read?
os.access('./some_file', os.W_OK) # OK to write?
os.access('./some_file', os.X_OK) # OK to execute?
os.access('./some_file', os.X_OK | os.W_OK) # OK to execute or write?

# join (creates operating system dependent paths)
os.path.join('a', 'b', 'c')
# 'a/b/c' on Unix/Linux
# 'a\\b\\c' on Windows
os.path.normpath('a/b/c') # normalizes the path (on Windows, / becomes \\)


# os.path: directory and file names
os.path.samefile('./some_file', '/home/some_file')  # True if those are the same file
os.path.dirname('./some_file')  # returns '.' (everything but the last component)
os.path.basename('./some_file') # returns 'some_file' (only the last component)
os.path.split('./some_file') # returns (dirname, basename), i.e. ('.', 'some_file')
os.path.splitext('./some_file.txt') # returns ('./some_file', '.txt')
os.path.splitdrive('./some_file.txt') # returns ('', './some_file.txt')
os.path.isabs('./some_file.txt') # returns False (not an absolute path)
os.path.abspath('./some_file.txt')


# create and delete files and directories
os.mkdir('./test')  # create a new directory
os.rmdir('./test')  # removes an empty directory
os.removedirs('./test') # removes nested empty directories
os.remove('file.txt')   # removes an individual file
shutil.rmtree('./test') # removes directory (empty or not empty)

os.rename('./dir_before', './renamed') # renames directory if destination doesn't exist
shutil.move('./dir_before', './renamed') # renames directory always

shutil.copytree('./orig', './copy') # copies a directory recursively
shutil.copyfile('file', 'copy')     # copies a file

 
# Getting files of particular type from directory
files = [f for f in os.listdir(s_pdb_dir) if f.endswith(".txt")]
  
# Copy and move
shutil.copyfile("/path/to/file", "/path/to/new/file") 
shutil.copy("/path/to/file", "/path/to/directory")
shutil.move("/path/to/file","/path/to/directory")
   
# Check if file or directory exists
os.path.exists("file or directory")
os.path.isfile("file")
os.path.isdir("directory")
    
# Working directory and absolute path to files
os.getcwd()
os.path.abspath("file")
import os
import sys
import shutil


#######################################################
#   Recursive Directory Copy
#
#   TODO:   
#           
#           Add both to Cacher + Text
#           Move to Laptop Lib
#
#   Author: P.Doulgeridis
#   Function: Recursively copies one directory tree into another
#   Contents: 2 functions that perform the same purpose
#             recursive_copy
#             copytree
#
#   * See function definition for usage.
#
#
########################################################



dir_origin = sys.argv[1]
dir_dest = sys.argv[2]



def recursive_copy(src, dst):
    
    """
    Name: recursive_copy
    Function: recursive_copy(src, dst)
    Description: Copies a directory tree recursively
    Input:  <src> => Directory
            <dst> => Directory
    Output: None
    Usage: recursive_copy(dir_origin, dir_dest)
    Notes: Used to recursively copy directories from one to another.
    """
    
    current = os.getcwd()

    # chdir into src so os.listdir entries resolve; src and dst should be absolute paths
    os.chdir(src)
    print(os.getcwd())
    for item in os.listdir(src):

        print(item)
    
        if os.path.isfile(item):
            shutil.copy(os.path.join(src, item), os.path.join(dst, item))
            
        elif os.path.isdir(item):
            new_dst = os.path.join(dst, item)
            os.mkdir(new_dst)
            recursive_copy(os.path.abspath(item), new_dst)
            
            
            
    os.chdir(current)        
            
recursive_copy(dir_origin, dir_dest)





def copytree(src, dst, symlinks=False, ignore=None):

    """
    Name: copytree
    Function: copytree(src, dst, symlinks=False, ignore=None)
    Description: Copies a directory tree recursively
    Input:  <src> => Directory
            <dst> => Directory
            <symlinks> => T|F
            <ignore> = None default
    Output: None
    Usage: copytree(dir_origin, dir_dest)
    Notes: Used to recursively copy directories from one to another.
    """



    for item in os.listdir(src):
        s = os.path.join(src, item)
        d = os.path.join(dst, item)
        if os.path.isdir(s):
            shutil.copytree(s, d, symlinks, ignore)
        else:
            shutil.copy2(s, d)
            
            
            
            
            
copytree(dir_origin, dir_dest)
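

# On Python 3.8+, shutil.copytree itself accepts dirs_exist_ok=True, which covers
# the main reason for the wrapper above (copying into a destination that exists):
shutil.copytree(dir_origin, dir_dest, dirs_exist_ok=True)   # Python 3.8+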