onlyforbopi
1/18/2018 - 8:34 AM

Python.PDF.PDFManipulation

Python.PDF.PDFManipulation #python #Python #PythonModules #pypdf #PDF #pdf #pdfmanipulation #Modules

How to manipulate PDF files (needs work)

Modules: pyPdf (unmaintained), PyPDF2 (OK version)

#! /usr/bin/env python
 
###############################################################################
##
##  Copyright 2012 Jeet Sukumaran.
##
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 3 of the License, or
##  (at your option) any later version.
##
##  This program is distributed in the hope that it will be useful,
##  but WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##  GNU General Public License for more details.
##
##  You should have received a copy of the GNU General Public License along
##  with this program. If not, see <http://www.gnu.org/licenses/>.
##
###############################################################################
 
"""
Extract specified pages from source PDF.
"""
 
import sys
import os
import argparse
import pyPdf
 
# Script metadata. main() passes __description__ to argparse as the help
# description and reports __version__ for the --version option.
__prog__ = os.path.basename(__file__)
__version__ = "1.0.0"
# NOTE(review): relies on the docstring above being the module docstring of
# the script this snippet was taken from -- confirm when used standalone.
__description__ = __doc__
__author__ = 'Jeet Sukumaran'
__copyright__ = 'Copyright (C) 2012 Jeet Sukumaran.'
 
def main():
    """
    Main CLI handler.

    Parses the command line, copies an inclusive range of pages from the
    source PDF into a new document and writes that document either to the
    file given with -o/--output-filepath or, if none is given, to standard
    output.

    FIRST-PAGE is 1-based. LAST-PAGE is either an absolute 1-based page
    number or, when prefixed with '+', the number of pages to take *after*
    the first page (so '+2' extracts three pages in total).

    Exits with an error message when the page specification is malformed,
    reversed, or lies outside the source document.
    """

    parser = argparse.ArgumentParser(description=__description__)
    parser.add_argument("--version", action="version", version="%(prog)s " + __version__)
    parser.add_argument("src_pdf",
            metavar="SOURCE-PDF",
            type=argparse.FileType('rb'),
            help="path to input pdf file")
    parser.add_argument("first_page",
            metavar="FIRST-PAGE",
            type=int,
            help="number of first page (1-based index: first page is '1')")
    parser.add_argument("last_page",
            metavar="LAST-PAGE",
            type=str,
            help="number of last page; if preceded by '+' (e.g., '+30'), specifies number of pages following first page to extract")
    parser.add_argument("-o", "--output-filepath",
            type=str,
            default=None,
            help="path to output file (if not given, will write to standard output)")

    args = parser.parse_args()
    try:
        # Page numbers on the command line are 1-based, getPage() is 0-based.
        first_page = args.first_page - 1
        try:
            if args.last_page.startswith("+"):
                last_page = args.last_page[1:].replace(" ", "")
                if not last_page:
                    sys.exit("Need to specify number of pages")
                last_page = first_page + int(last_page)
            else:
                last_page = int(args.last_page) - 1
        except ValueError:
            sys.exit("Invalid LAST-PAGE value: %r" % args.last_page)

        # A negative index would silently address pages from the end of the
        # document, so reject it instead of extracting the wrong pages.
        if first_page < 0:
            sys.exit("FIRST-PAGE must be 1 or greater")
        if last_page < first_page:
            sys.exit("LAST-PAGE must not precede FIRST-PAGE")

        pdf_in = pyPdf.PdfFileReader(args.src_pdf)
        num_pages = pdf_in.getNumPages()
        if last_page >= num_pages:
            sys.exit("Requested pages %d-%d, but the source has only %d page(s)"
                     % (first_page + 1, last_page + 1, num_pages))

        pdf_out = pyPdf.PdfFileWriter()
        for pg_num in range(first_page, last_page + 1):
            pdf_out.addPage(pdf_in.getPage(pg_num))

        # The source must stay open until the output is written, so the
        # write happens inside this try block.
        if args.output_filepath:
            out_path = os.path.expandvars(os.path.expanduser(args.output_filepath))
            with open(out_path, "wb") as out_stream:
                pdf_out.write(out_stream)
        else:
            # PDF data is binary: on Python 3 write to the underlying byte
            # stream. Standard output is flushed, not closed, because this
            # function does not own it.
            out_stream = getattr(sys.stdout, "buffer", sys.stdout)
            pdf_out.write(out_stream)
            out_stream.flush()
    finally:
        # argparse.FileType opened the source file; release it on every path.
        args.src_pdf.close()

if __name__ == '__main__':
    main()
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 16 09:21:50 2017

@author: P.Doulgeridis
"""





import sys
import os
import argparse
import pyPdf



# Extract a single page from a PDF into "output2.pdf".
# Command line: INPUT-PDF PAGE-INDEX (PAGE-INDEX is 0-based, as passed to
# getPage()).

# PyPDF2 is required; without it nothing below can work, so stop right away
# instead of running into a chain of follow-up failures.
try:
    from PyPDF2 import PdfFileReader, PdfFileWriter
except ImportError:
    print("Could not load module")
    sys.exit(1)
else:
    print("Pdf Handling module loaded successfully")

if len(sys.argv) < 3:
    sys.exit("Usage: %s INPUT-PDF PAGE-INDEX" % sys.argv[0])

file_in = sys.argv[1]
page_in = sys.argv[2]

# Command-line arguments are strings, but getPage() needs an integer index.
try:
    page_index = int(page_in)
except ValueError:
    sys.exit("PAGE-INDEX must be an integer, got %r" % page_in)

try:
    pdf_file = open(file_in, "rb")
except (IOError, OSError) as err:
    print("Could not assign FileWriter and FileReader objects")
    sys.exit(str(err))

# The input file has to stay open until the output has been written, so all
# remaining steps run inside this with-block. Each step reports its own
# status message and aborts the script on failure (error detail on stderr).
with pdf_file:
    try:
        output = PdfFileWriter()
        pdfIn = PdfFileReader(pdf_file)
    except Exception as err:
        print("Could not assign FileWriter and FileReader objects")
        sys.exit(str(err))
    else:
        print("FileWriter and FileReader objects successfully assigned")

    try:
        output.addPage(pdfIn.getPage(page_index))
    except Exception as err:
        print("Could not extract page from pdf")
        sys.exit(str(err))
    else:
        print("Page succesfully extracted")

    try:
        with open(r"output2.pdf", "wb") as outputStream:
            output.write(outputStream)
    except Exception as err:
        print("Could not copy extracted page to new pdf")
        sys.exit(str(err))
    else:
        print("Page copied successfully")
 def fnPDF_ExtractPages(xFileNameOriginal, xFileNameOutput, xPageStart, xPageEnd):
     """Copy pages xPageStart .. xPageEnd - 1 of a PDF into a new PDF file.

     xFileNameOriginal : path of the PDF to read
     xFileNameOutput   : path of the PDF to create
     xPageStart        : 0-based index of the first page to copy
     xPageEnd          : 0-based index one past the last page to copy

     Nothing is written when the range is empty (xPageStart >= xPageEnd).
     """
     # pyPdf is unmaintained; fall back to PyPDF2, which offers the same
     # reader/writer classes, when pyPdf is not installed.
     try:
         from pyPdf import PdfFileReader, PdfFileWriter
     except ImportError:
         from PyPDF2 import PdfFileReader, PdfFileWriter

     output = PdfFileWriter()
     # open() replaces the Python 2-only file() builtin; the with-blocks make
     # sure both handles are closed even if reading or writing fails.
     with open(xFileNameOriginal, "rb") as sourceStream:
         pdfOne = PdfFileReader(sourceStream)
         for i in range(xPageStart, xPageEnd):
             output.addPage(pdfOne.getPage(i))
         # Write once, after all pages are collected, instead of rewriting
         # the whole output file on every iteration. The source stays open
         # because the writer still needs it while serialising the pages.
         if xPageStart < xPageEnd:
             with open(xFileNameOutput, "wb") as outputStream:
                 output.write(outputStream)
def fnPDF_FindText(xFile, xString):
    """Return the index of the first page whose text matches xString.

    xFile   : the PDF file in which to look
    xString : the string to look for; it is handed to re.search(), so it is
              interpreted as a regular expression

    The extracted page text is reduced to ASCII and lower-cased before the
    search, and the pattern is applied case-insensitively so that search
    strings containing capitals can match as well.

    Returns the 0-based page index, or -1 when no page matches.

    NOTE(review): a later definition of fnPDF_FindText in this file replaces
    this one when both are loaded.
    """
    import re
    import pyPdf
    PageFound = -1
    # open() replaces the Python 2-only file() builtin; the with-block closes
    # the handle once the search is done.
    with open(xFile, "rb") as pdfStream:
        pdfDoc = pyPdf.PdfFileReader(pdfStream)
        for i in range(0, pdfDoc.getNumPages()):
            content = pdfDoc.getPage(i).extractText() + "\n"
            # Drop non-ASCII characters, then decode again so the regular
            # expression always runs against text rather than raw bytes.
            content1 = content.encode('ascii', 'ignore').decode('ascii').lower()
            ResSearch = re.search(xString, content1, re.IGNORECASE)
            if ResSearch is not None:
                PageFound = i
                break
    return PageFound
     

def fnPDF_FindText(xFile, xString):
    """Return the index of the first page whose text matches xString.

    PYTHON 3 VERSION

    xFile   : the PDF file in which to look
    xString : the string to look for; it is handed to re.search(), so it is
              interpreted as a regular expression

    The extracted page text is reduced to ASCII and lower-cased before the
    search, and the pattern is applied case-insensitively so that search
    strings containing capitals can match as well.

    Returns the 0-based page index, or -1 when no page matches.
    """
    import re
    # Module can be replaced by PyPDF2: use it when pyPdf is not installed.
    try:
        import pyPdf as pdfModule
    except ImportError:
        import PyPDF2 as pdfModule

    PageFound = -1
    with open(xFile, "rb") as pdfStream:
        pdfDoc = pdfModule.PdfFileReader(pdfStream)
        for i in range(0, pdfDoc.getNumPages()):
            content = pdfDoc.getPage(i).extractText() + "\n"
            # encode() yields bytes on Python 3; decode again so the str
            # pattern is matched against str (re refuses to mix the two).
            content1 = content.encode('ascii', 'ignore').decode('ascii').lower()
            ResSearch = re.search(xString, content1, re.IGNORECASE)
            if ResSearch is not None:
                PageFound = i
                break
    return PageFound

# doc3333

import os
import sys
import PyPDF2


# Command line: INPUT-PDF OUTPUT-PDF followed by any number of page
# selectors, each either a single page number or a FIRST-LAST range.
if len(sys.argv) < 3:
    sys.exit("Usage: %s INPUT-PDF OUTPUT-PDF [PAGE | FIRST-LAST] ..." % sys.argv[0])

file_in = sys.argv[1]
file_ot = sys.argv[2]
pages = []
ranges = []

# Selectors containing '-' are ranges to be expanded later; everything else
# is kept as a single page (still a string at this point).
for selector in sys.argv[3:]:
    if '-' in selector:
        ranges.append(selector)
    else:
        pages.append(selector)

print(pages)
print(ranges)


def expand_ranges(range_in):
    """Expand a "FIRST-LAST" string into the list of integers it covers.

    Both ends are inclusive, e.g. "3-6" gives [3, 4, 5, 6]. Only the first
    two '-'-separated fields are used. A range whose end lies before its
    start yields an empty list.
    """
    bounds = range_in.split("-")
    first = int(bounds[0])
    last = int(bounds[1])
    return list(range(first, last + 1))
    
#print(expand_ranges(ranges))


# Fold every expanded range into the flat list of requested pages.
for range_spec in ranges:
    pages = pages + expand_ranges(range_spec)

# Single-page selectors are still strings here; normalise every entry to int.
for position, page in enumerate(pages):
    pages[position] = int(page)

print(pages)





# Copy the requested pages into the output PDF. Page numbers in `pages` are
# compared directly with the reader's 0-based page indices, and the output
# keeps the order of the source document. Requested numbers that do not
# exist in the source are ignored.
wanted_pages = set(pages)  # constant-time membership test inside the loop

# The with-blocks close both files even when reading or writing fails. The
# input stays open until the output has been written.
with open(file_in, 'rb') as pdf1File:
    pdf1Reader = PyPDF2.PdfFileReader(pdf1File)
    pdfWriter = PyPDF2.PdfFileWriter()

    for pageNum in range(pdf1Reader.numPages):
        if pageNum in wanted_pages:
            pageObj = pdf1Reader.getPage(pageNum)
            pdfWriter.addPage(pageObj)

    with open(file_ot, 'wb') as pdfOutputFile:
        pdfWriter.write(pdfOutputFile)