Python.PDF.PDFManipulation #python #Python #PythonModules #pypdf #PDF #pdf #pdfmanipulation #Modules
How to manipulate PDF files (needs work)
Modules: pyPdf (unmaintained), PyPDF2 (OK version)
#! /usr/bin/env python
###############################################################################
##
## Copyright 2012 Jeet Sukumaran.
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program. If not, see <http://www.gnu.org/licenses/>.
##
###############################################################################
"""
Extract specified pages from source PDF.
"""
import sys
import os
import argparse
import pyPdf
# Script metadata. __version__ and __description__ are passed to the argparse
# parser built in main(); __description__ reuses the module docstring.
__prog__ = os.path.basename(__file__)
__version__ = "1.0.0"
__description__ = __doc__
__author__ = 'Jeet Sukumaran'
__copyright__ = 'Copyright (C) 2012 Jeet Sukumaran.'
def main():
    """
    Main CLI handler.

    Parses SOURCE-PDF, FIRST-PAGE and LAST-PAGE from the command line and
    writes the selected pages to --output-filepath, or to standard output
    when no path is given.  FIRST-PAGE is 1-based; LAST-PAGE is either a
    1-based page number or '+N', meaning N further pages after FIRST-PAGE.

    Exits with an error message when the '+' form carries no number, when
    LAST-PAGE is not numeric, or when the requested pages lie outside the
    source document.
    """
    parser = argparse.ArgumentParser(description=__description__)
    parser.add_argument("--version", action="version", version="%(prog)s " + __version__)
    parser.add_argument("src_pdf",
            metavar="SOURCE-PDF",
            type=argparse.FileType('rb'),
            help="path to input pdf file")
    parser.add_argument("first_page",
            metavar="FIRST-PAGE",
            type=int,
            help="number of first page (1-based index: first page is '1')")
    parser.add_argument("last_page",
            metavar="LAST-PAGE",
            type=str,
            help="number of last page; if preceded by '+' (e.g., '+30'), specifies number of pages following first page to extract")
    parser.add_argument("-o", "--output-filepath",
            type=str,
            default=None,
            help="path to output file (if not given, will write to standard output)")
    args = parser.parse_args()

    # From here on pages are 0-based indices, which is what getPage() takes.
    first_page = args.first_page - 1
    try:
        if args.last_page.startswith("+"):
            extra_pages = args.last_page[1:].replace(" ", "")
            if not extra_pages:
                sys.exit("Need to specify number of pages")
            last_page = first_page + int(extra_pages)
        else:
            last_page = int(args.last_page) - 1
    except ValueError:
        # Report a malformed LAST-PAGE as a usage error, not a traceback.
        parser.error("LAST-PAGE must be a page number or '+N': %r" % args.last_page)

    try:
        pdf_in = pyPdf.PdfFileReader(args.src_pdf)
        num_pages = pdf_in.getNumPages()
        # A negative index would silently wrap around to the end of the
        # document, and an index past the end would raise IndexError.
        if first_page < 0 or last_page >= num_pages:
            sys.exit("Requested pages are out of range: document has %d page(s)" % num_pages)

        pdf_out = pyPdf.PdfFileWriter()
        for pg_num in range(first_page, last_page + 1):
            pdf_out.addPage(pdf_in.getPage(pg_num))

        if args.output_filepath:
            out_path = os.path.expandvars(os.path.expanduser(args.output_filepath))
            with open(out_path, "wb") as out_stream:
                pdf_out.write(out_stream)
        else:
            # Use the binary layer of stdout where there is one (Python 3),
            # and flush rather than close: stdout belongs to the interpreter.
            out_stream = getattr(sys.stdout, "buffer", sys.stdout)
            pdf_out.write(out_stream)
            out_stream.flush()
    finally:
        # argparse.FileType opened the source file; release it on every path.
        args.src_pdf.close()


if __name__ == '__main__':
    main()
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 16 09:21:50 2017
@author: P.Doulgeridis
"""
import sys
import os
import argparse
import pyPdf
# Extract one page from a PDF into "output2.pdf".
# Usage: <script> INPUT-PDF PAGE-INDEX
# PAGE-INDEX is handed to getPage() after integer conversion, so it is a
# zero-based index.  Each stage prints a status line; a failed stage prints
# its message and stops the script with exit status 1, since none of the
# later stages can work without it.
try:
    from PyPDF2 import PdfFileReader, PdfFileWriter
except ImportError:
    print("Could not load module")
    sys.exit(1)
else:
    print("Pdf Handling module loaded successfully")

file_in = sys.argv[1]
page_in = sys.argv[2]

try:
    pdf_stream = open(file_in, "rb")
except (IOError, OSError):
    print("Could not assign FileWriter and FileReader objects")
    sys.exit(1)

# The source stream stays open until the output has been written, matching
# the original flow (which never closed it), and is closed on every exit path.
with pdf_stream:
    try:
        output = PdfFileWriter()
        pdfIn = PdfFileReader(pdf_stream)
    except Exception:
        # Top-level boundary: any parse failure is reported the same way.
        print("Could not assign FileWriter and FileReader objects")
        sys.exit(1)
    else:
        print("FileWriter and FileReader objects successfully assigned")

    try:
        # The command-line argument is a string; getPage() needs an integer.
        output.addPage(pdfIn.getPage(int(page_in)))
    except Exception:
        # Covers a non-numeric argument, an out-of-range index and read errors.
        print("Could not extract page from pdf")
        sys.exit(1)
    else:
        print("Page succesfully extracted")

    try:
        with open(r"output2.pdf", "wb") as outputStream:
            output.write(outputStream)
    except Exception:
        print("Could not copy extracted page to new pdf")
        sys.exit(1)
    else:
        print("Page copied successfully")
def fnPDF_ExtractPages(xFileNameOriginal, xFileNameOutput, xPageStart, xPageEnd):
    """
    Copy a run of pages from one PDF file into a new PDF file.

    xFileNameOriginal : path of the PDF to read
    xFileNameOutput   : path of the PDF to create
    xPageStart        : zero-based index of the first page to copy
    xPageEnd          : zero-based index at which to stop (not copied itself)
    """
    from pyPdf import PdfFileReader, PdfFileWriter

    output = PdfFileWriter()
    # open() replaces the Python-2-only file() builtin, and the context
    # managers close both handles even if reading or writing raises.
    with open(xFileNameOriginal, "rb") as inputStream:
        pdfOne = PdfFileReader(inputStream)
        for i in range(xPageStart, xPageEnd):
            output.addPage(pdfOne.getPage(i))
        # Write while the source is still open, as the original did.
        with open(xFileNameOutput, "wb") as outputStream:
            output.write(outputStream)
def fnPDF_FindText(xFile, xString):
    """
    Return the zero-based index of the first page whose text matches xString.

    xFile   : the PDF file in which to look
    xString : the string to look for; it is used as a regular expression
              via re.search

    Each page's extracted text is reduced to ASCII and lower-cased before
    matching, so xString has to be written in lower case to match.
    Returns -1 when no page matches.
    """
    import pyPdf
    import re

    # open() replaces the Python-2-only file() builtin; the context manager
    # closes the handle once the search is over.
    with open(xFile, "rb") as pdfStream:
        pdfDoc = pyPdf.PdfFileReader(pdfStream)
        for i in range(pdfDoc.getNumPages()):
            content = pdfDoc.getPage(i).extractText() + "\n"
            # Drop non-ASCII characters, then decode back to text so the
            # pattern and the subject are the same kind of string.
            content1 = content.encode('ascii', 'ignore').decode('ascii').lower()
            if re.search(xString, content1) is not None:
                return i
    return -1
def fnPDF_FindText(xFile, xString):
    """
    Return the zero-based index of the first page whose text matches xString.

    PYTHON 3 VERSION
    Module can be replaced by PyPDF2

    xFile   : the PDF file in which to look
    xString : the string to look for; it is used as a regular expression
              via re.search

    Each page's extracted text is reduced to ASCII and lower-cased before
    matching, so xString has to be written in lower case to match.
    Returns -1 when no page matches.
    """
    import pyPdf
    import re

    # The context manager closes the file handle once the search is over.
    with open(xFile, "rb") as pdfStream:
        pdfDoc = pyPdf.PdfFileReader(pdfStream)
        for i in range(pdfDoc.getNumPages()):
            content = pdfDoc.getPage(i).extractText() + "\n"
            # encode() yields bytes on Python 3, and re.search() rejects a
            # str pattern against bytes; decode back to str after stripping
            # the non-ASCII characters.
            content1 = content.encode('ascii', 'ignore').decode('ascii').lower()
            if re.search(xString, content1) is not None:
                return i
    return -1
# doc3333
import os
import sys
import PyPDF2
# Command line: INPUT-PDF OUTPUT-PDF, then any mix of single page numbers
# and "start-end" ranges.
file_in = sys.argv[1]
file_ot = sys.argv[2]

# Everything after the two file names is a page selector; a selector
# containing a dash is treated as a range, anything else as a single page.
ranges = [arg for arg in sys.argv[3:] if '-' in str(arg)]
pages = [arg for arg in sys.argv[3:] if '-' not in str(arg)]

print(pages)
print(ranges)
def expand_ranges(range_in):
    """
    Expand a "start-end" string into the list of integers it covers.

    Both ends are included.  Only the first two dash-separated fields are
    read; a start greater than the end gives an empty list.
    """
    bounds = range_in.split("-")
    first = int(bounds[0])
    last = int(bounds[1])
    return list(range(first, last + 1))
# Fold every "start-end" selector into the flat list of pages, then make
# all entries integers (single pages arrive from the command line as strings).
for j in ranges:
    pages = pages + expand_ranges(j)
pages = [int(page) for page in pages]
print(pages)

# Set lookup instead of scanning the list once per page of the document.
wanted_pages = set(pages)

# Context managers close both files even if reading or writing raises.
with open(file_in, 'rb') as pdf1File:
    pdf1Reader = PyPDF2.PdfFileReader(pdf1File)
    pdfWriter = PyPDF2.PdfFileWriter()
    # Pages are matched by zero-based index and copied in document order;
    # requested indices beyond the end of the document are simply not found.
    for pageNum in range(pdf1Reader.numPages):
        if pageNum in wanted_pages:
            pdfWriter.addPage(pdf1Reader.getPage(pageNum))
    # Write while the source file is still open, as the original did.
    with open(file_ot, 'wb') as pdfOutputFile:
        pdfWriter.write(pdfOutputFile)