// Handy class for splitting or extracting pages from PDF documents.
package be.thon.util.pdf;
import static java.lang.Integer.valueOf;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.UUID;
import org.apache.log4j.Logger;
import com.lowagie.text.Document;
import com.lowagie.text.pdf.PdfCopy;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.RandomAccessFileOrArray;
import com.lowagie.text.pdf.SequenceList;
import com.lowagie.text.pdf.SimpleBookmark;
import com.lowagie.text.pdf.parser.PdfTextExtractor;
/**
* Handy class for splitting or extracting pages from PDF documents.
*
* @author nitanoc
* @version 1.0, Feb 26, 2014
*/
public class PdfPageExtractor {
/** Shared logger; static because it carries no per-instance state. */
private static final Logger log = Logger.getLogger(PdfPageExtractor.class);

/**
 * Source of the PDF document. Many times the source is a file in the local file system; if the PDF document has
 * some other source, one can extend this class and override {@link #getSourcePdfReader()}; this should be the
 * only method used by the extract methods in order to obtain the PDF content to extract from!
 */
private String source;

/** Destination file path; if null, {@link #rangeName} is used; if not absolute, destinationDirectory is also used. */
private String destination;

/** Directory that relative destinations are resolved against; if null, the current working directory is used. */
private String destinationDirectory;

/**
 * Strategy used to name output files when no destination is set; if null and the destination is also null then
 * FIRST_LAST_PAGE_RANGE_NAME is used!
 */
private PageRangeName rangeName = FIRST_LAST_PAGE_RANGE_NAME;
/**
* Creates an extractor without a source; the source, destination and range name can be set later through the
* fluent setters.
*/
public PdfPageExtractor() {
// allow to create page extractors but set their attributes later (maybe in order to re-use an instance, etc.)
}
/**
* Creates an extractor for the given source.
*
* @param source value handed to {@link #setSource(String)}; may be <code>null</code>
*/
public PdfPageExtractor(String source) {
// NOTE(review): setSource is public and overridable, so a subclass override runs before the subclass is
// fully constructed - confirm this is intended.
setSource(source);
}
/**
* @return the source currently configured, or <code>null</code> if none was set
*/
public String getSource() {
return source;
}
/**
* Sets the source that {@link #getSourcePdfReader()} opens.
*
* @param source the new source; <code>null</code> is stored as-is
* @return this extractor, to allow call chaining
*/
public PdfPageExtractor setSource(String source) {
this.source = source;
return this;
}
/**
* @return the explicit destination file path, or <code>null</code> if none was set
*/
public String getDestination() {
return destination;
}
/**
* Sets the explicit destination file path; when it is not <code>null</code>,
* {@link #getDestinationName(PdfReader, List)} uses it instead of the range name.
*
* @param destination absolute or relative file path; <code>null</code> is stored as-is
* @return this extractor, to allow call chaining
*/
public PdfPageExtractor setDestination(String destination) {
this.destination = destination;
return this;
}
/**
* @return the directory used for relative destinations and generated file names, or <code>null</code> if none
* was set
*/
public String getDestinationDirectory() {
return destinationDirectory;
}
/**
* Sets the directory that {@link #getDestinationName(PdfReader, List)} resolves relative destinations and
* generated file names against.
*
* @param destinationDirectory the directory path; <code>null</code> is stored as-is
* @return this extractor, to allow call chaining
*/
public PdfPageExtractor setDestinationDirectory(String destinationDirectory) {
this.destinationDirectory = destinationDirectory;
return this;
}
/**
* @return the naming strategy currently configured; may be <code>null</code> if it was set so
*/
public PageRangeName getRangeName() {
return rangeName;
}
/**
* Sets the strategy used to name output files when no explicit destination is set.
*
* @param rangeName the naming strategy; <code>null</code> is stored as-is, in which case
* {@link #getDestinationName(PdfReader, List)} falls back to FIRST_LAST_PAGE_RANGE_NAME
* @return this extractor, to allow call chaining
*/
public PdfPageExtractor setRangeName(PageRangeName rangeName) {
this.rangeName = rangeName;
return this;
}
/**
 * Splits the source document along the bookmarks returned by {@link SimpleBookmark#getBookmark(PdfReader)}: every
 * bookmark is handed to {@link #doExtractBookmark(PdfReader, HashMap, HashMap)} together with its successor
 * (<code>null</code> for the last one). A bookmark that fails is logged and skipped; the remaining ones are still
 * processed. Nothing is thrown to the caller.
 */
public void extractByBookmarks() {
    PdfReader reader = null;
    try {
        reader = getSourcePdfReader();
        if (reader == null) {
            log.error("cannot extract by bookmarks from a null document");
            return;
        }
        reader.consolidateNamedDestinations();
        @SuppressWarnings("unchecked")
        List<HashMap<String, Object>> outline = SimpleBookmark.getBookmark(reader);
        if (outline == null || outline.isEmpty()) {
            log.warn("no bookmarks found in the specified PDF document");
            return;
        }
        final int lastIndex = outline.size() - 1;
        for (int index = 0; index <= lastIndex; index++) {
            final boolean isLast = index == lastIndex;
            HashMap<String, Object> current = outline.get(index);
            // the successor marks where the current bookmark's page range ends
            HashMap<String, Object> following = isLast ? null : outline.get(index + 1);
            try {
                doExtractBookmark(reader, current, following);
            } catch (Throwable failure) {
                String prefix = isLast ? "cannot extract (last) bookmark " : "cannot extract bookmark ";
                Object title = current == null ? "(null)" : current.get("Title");
                log.error(prefix + title + "; skipping...", failure);
            }
        }
    } catch (Throwable failure) {
        log.error("cannot extract bookmarks from the specified PDF document", failure);
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (Throwable closeFailure) {
                log.warn("cannot close the PDF document; ignoring...", closeFailure);
            }
        }
    }
}
/**
 * Extracts every given range into its own output file. Ranges are expanded with
 * {@link SequenceList#expand(String, int)} against the page count of the source document.
 *
 * @param firstRange the first range; silently skipped when <code>null</code>
 * @param otherRanges further ranges; <code>null</code> entries are skipped with a warning
 */
@SuppressWarnings("unchecked")
public void extract(String firstRange, String... otherRanges) {
    PdfReader reader = null;
    try {
        reader = getSourcePdfReader();
        if (reader == null) {
            log.error("cannot extract pages from a null document");
            return;
        }
        final int pageCount = reader.getNumberOfPages();
        if (firstRange != null) {
            List<Integer> firstPages = SequenceList.expand(firstRange, pageCount);
            doExtract(reader, firstPages);
        }
        if (otherRanges == null) {
            return;
        }
        for (String range : otherRanges) {
            if (range == null) {
                log.warn("ignoring null range...");
                continue;
            }
            List<Integer> rangePages = SequenceList.expand(range, pageCount);
            doExtract(reader, rangePages);
        }
    } catch (Throwable failure) {
        log.error("cannot extract pages from the specified PDF document", failure);
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (Throwable closeFailure) {
                log.warn("cannot close the PDF document; ignoring...", closeFailure);
            }
        }
    }
}
/**
 * Convenience overload that collects the given page numbers into one list and delegates to
 * {@link #extract(List)}:
 *
 * <pre>
 * extractor.extract(2);
 * extractor.extract(1, 7, 10);
 * </pre>
 *
 * @param firstPage the first page number of the selection
 * @param otherPages any further page numbers; may be <code>null</code> or empty
 */
public void extract(int firstPage, int... otherPages) {
    final int extraCount = otherPages == null ? 0 : otherPages.length;
    List<Integer> selection = new ArrayList<Integer>(extraCount + 1);
    selection.add(valueOf(firstPage));
    for (int index = 0; index < extraCount; index++) {
        selection.add(valueOf(otherPages[index]));
    }
    extract(selection);
}
/**
 * Extracts the given pages into a single output file. Failures are logged, never thrown.
 *
 * @param pages the page numbers to copy, in output order; a <code>null</code> or empty list is rejected with an
 *        error log entry
 */
public void extract(List<Integer> pages) {
    final boolean nothingRequested = pages == null || pages.isEmpty();
    if (nothingRequested) {
        log.error("no valid page range specified: " + pages);
        return;
    }
    PdfReader reader = null;
    try {
        reader = getSourcePdfReader();
        doExtract(reader, pages);
    } catch (Throwable failure) {
        log.error("cannot extract pages from the specified PDF document", failure);
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (Throwable closeFailure) {
                log.warn("cannot close the PDF document; ignoring...", closeFailure);
            }
        }
    }
}
/**
 * Opens the source document, reads its page count and closes it again.
 *
 * @return the number of pages, or -1 if {@link #getSourcePdfReader()} returns <code>null</code> or an exception
 *         occurs
 */
public int getNumberOfPages() {
    int pageCount = -1;
    PdfReader reader = null;
    try {
        reader = getSourcePdfReader();
        if (reader != null) {
            pageCount = reader.getNumberOfPages();
        }
    } catch (Throwable failure) {
        log.error("cannot open the specified PDF document", failure);
        pageCount = -1;
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (Throwable closeFailure) {
                log.warn("cannot close the PDF document; ignoring...", closeFailure);
            }
        }
    }
    return pageCount;
}
/**
 * Opens a {@link PdfReader} on the configured source. The caller owns the returned reader and must close it.
 * <p>
 * Since this method is also used for operations that do not need the whole document (e.g. page counting), the
 * reader is created from a {@link RandomAccessFileOrArray} rather than from the file name directly; see
 * <a href="http://stackoverflow.com/questions/6026971/page-count-of-pdf-with-java">this discussion</a> for why
 * that is considered an efficient way of opening a document.
 * </p>
 * <p>
 * Subclasses whose PDF content does not come from the local file system can override this method.
 * </p>
 *
 * @return a new reader, or <code>null</code> when no source is set
 * @throws IOException if the source cannot be opened or read
 * @see <a href="http://stackoverflow.com/questions/6026971/page-count-of-pdf-with-java">Page count of PDF with
 *      Java</a>
 */
protected PdfReader getSourcePdfReader() throws IOException {
    if (source == null) {
        return null;
    }
    RandomAccessFileOrArray content = new RandomAccessFileOrArray(source);
    return new PdfReader(content, new byte[0]);
}
/**
 * Computes the absolute path of the file that receives the extracted pages:
 * <ol>
 * <li>If a {@link #getDestination() destination file path} is {@link #setDestination(String) set}, it is used;
 * a relative path is resolved against the destination directory. Appropriate for one-time / single range
 * extraction.</li>
 * <li>Otherwise, if a {@link #setRangeName(PageRangeName) range name is set}, it generates the file name, to
 * which <code>".pdf"</code> is appended. Appropriate for repeated extractions resulting in multiple output
 * files.</li>
 * <li>Otherwise, {@link #FIRST_LAST_PAGE_RANGE_NAME} generates the file name.</li>
 * </ol>
 * <p>
 * Should <strong>NOT!</strong> return <code>null</code>.
 * </p>
 *
 * @param pdfContent the document being extracted from; passed on to the range name
 * @param pages the pages being extracted; passed on to the range name
 * @return the absolute path of the output file
 * @throws Exception if the range name fails to produce a name
 */
protected String getDestinationName(PdfReader pdfContent, List<Integer> pages) throws Exception {
    if (destination == null) {
        // no explicit target: let the naming strategy decide, inside the destination directory (or ".")
        String parent = destinationDirectory == null ? "." : destinationDirectory;
        PageRangeName namer = rangeName == null ? FIRST_LAST_PAGE_RANGE_NAME : rangeName;
        String fileName = namer.getName(source, pdfContent, pages) + ".pdf";
        return new File(parent, fileName).getAbsolutePath();
    }
    File target = new File(destination);
    if (target.isAbsolute()) {
        return target.getAbsolutePath();
    }
    return new File(destinationDirectory, target.getPath()).getAbsolutePath();
}
/**
 * Extracts the pages belonging to one bookmark: from the page the bookmark points to up to (but excluding) the
 * page the next bookmark points to, or up to the end of the document when there is no next bookmark.
 * <p>
 * If the next bookmark does not start on a later page (e.g. both bookmarks point into the same page), only the
 * bookmark's own page is extracted, so that the output never ends up without pages.
 * </p>
 *
 * @param pdf the document to extract from; <code>null</code> is logged and ignored
 * @param currentBookmark the bookmark to extract; <code>null</code> is logged and ignored
 * @param nextBookmark the bookmark that follows, or <code>null</code> if <code>currentBookmark</code> is the
 *        last one
 * @throws IllegalArgumentException if a bookmark has no <code>"Page"</code> entry
 * @throws NumberFormatException if a <code>"Page"</code> entry does not start with a page number
 * @throws Exception if copying the pages fails
 */
protected void doExtractBookmark(PdfReader pdf, HashMap<String, Object> currentBookmark,
        HashMap<String, Object> nextBookmark) throws Exception {
    if (pdf == null) {
        log.error("cannot extract bookmarks from a null document");
        return;
    }
    if (currentBookmark == null) {
        log.error("cannot extract a null bookmark");
        return;
    }
    int firstPage = bookmarkPage(currentBookmark);
    // lastPage is exclusive, hence the "+ 1" for the last bookmark
    int lastPage = nextBookmark == null ? pdf.getNumberOfPages() + 1 : bookmarkPage(nextBookmark);
    if (lastPage <= firstPage) {
        log.warn("bookmark " + currentBookmark.get("Title") + " is not followed by a bookmark on a later page; "
                + "extracting only page " + firstPage);
        lastPage = firstPage + 1;
    }
    List<Integer> pages = new ArrayList<Integer>(lastPage - firstPage);
    for (int page = firstPage; page < lastPage; page++) {
        pages.add(valueOf(page));
    }
    doExtract(pdf, pages);
}

/** Reads the page number from the first whitespace-separated token of a bookmark's "Page" entry. */
private static int bookmarkPage(HashMap<String, Object> bookmark) {
    Object pageEntry = bookmark.get("Page");
    if (pageEntry == null) {
        // happens for bookmarks that carry no page destination; fail with a message instead of a bare NPE
        throw new IllegalArgumentException("bookmark " + bookmark.get("Title") + " has no \"Page\" entry");
    }
    return Integer.parseInt(pageEntry.toString().trim().split("\\s+")[0]);
}
/**
 * Copies the given pages of <code>pdf</code> into a new document whose path is computed by
 * {@link #getDestinationName(PdfReader, List)}.
 * <p>
 * Page numbers are 1-based. Entries that are <code>null</code>, smaller than 1 or larger than the page count of
 * <code>pdf</code> are logged and skipped; if no valid page remains, nothing is written. The destination name
 * is computed from the valid pages only, so it reflects what the output actually contains.
 * </p>
 *
 * @param pdf the document to copy from; <code>null</code> is logged and ignored
 * @param pages the page numbers to copy, in output order; a <code>null</code> or empty list is logged and
 *        ignored
 * @throws Exception if the destination cannot be created or a page cannot be copied
 */
protected void doExtract(PdfReader pdf, List<Integer> pages) throws Exception {
    if (pdf == null) {
        log.error("cannot extract pages from a null document");
        return;
    }
    if (pages == null || pages.isEmpty()) {
        log.error("no valid page range specified: " + pages);
        return;
    }
    // Validate up front so that no output file is created when there is nothing to copy.
    final int numberOfPages = pdf.getNumberOfPages();
    List<Integer> validPages = new ArrayList<Integer>(pages.size());
    for (Integer page : pages) {
        if (page == null || page.intValue() < 1 || page.intValue() > numberOfPages) {
            log.error("cannot copy page with invalid number: " + page + " (valid pages are 1 to " + numberOfPages
                    + "); ignoring...");
            continue;
        }
        validPages.add(page);
    }
    if (validPages.isEmpty()) {
        log.error("none of the requested pages exists in the document: " + pages);
        return;
    }
    String destinationName = getDestinationName(pdf, validPages);
    FileOutputStream output = null;
    Document document = null;
    PdfCopy pdfCopy = null;
    boolean completed = false;
    try {
        output = new FileOutputStream(destinationName);
        document = new Document();
        pdfCopy = new PdfCopy(document, output);
        document.open();
        for (Integer page : validPages) {
            pdfCopy.addPage(pdfCopy.getImportedPage(pdf, page.intValue()));
        }
        // Closing the document finishes the output; a failure here is a real error and must propagate.
        document.close();
        completed = true;
        log.info(destinationName + " PDF document created");
    } finally {
        // Cleanup is best effort: it must neither mask the original exception nor skip a later close.
        if (!completed && document != null) {
            try {
                document.close();
            } catch (Throwable throwable) {
                log.warn("cannot close the PDF document; ignoring...", throwable);
            }
        }
        if (pdfCopy != null) {
            try {
                pdfCopy.close();
            } catch (Throwable throwable) {
                log.warn("cannot close the PDF copy; ignoring...", throwable);
            }
        }
        if (output != null) {
            try {
                output.close(); // also covers the case where PdfCopy could not be created
            } catch (Throwable throwable) {
                log.warn("cannot close the output file; ignoring...", throwable);
            }
        }
    }
}
/**
* Used to provide a meaningful name for a range of pages (not necessarily consecutive) that can be utilized, for
* example, to name a file containing the extracted pages.
* <p>
* {@link PdfPageExtractor#getDestinationName(PdfReader, List)} appends <code>".pdf"</code> to the returned name,
* so implementations should return the name without that extension.
* </p>
*
* @author nitanoc
* @version 1.0, Feb 26, 2014
*/
public interface PageRangeName { // a nested interface is automatically "static"!
/**
* Computes the name for the given range of pages.
* <p>
* Should <strong>NOT!</strong> return <code>null</code>.
* </p>
*
* @param pdfName the source of the PDF document as configured on the extractor; may be <code>null</code>
* @param pdfContent the opened PDF document the pages are taken from
* @param pages the page numbers in the range
* @return the name for the range
* @throws Exception if the name cannot be computed
*/
String getName(String pdfName, PdfReader pdfContent, List<Integer> pages) throws Exception;
}
/**
 * {@link PageRangeName} that {@link PageRangeName#getName(String, PdfReader, List) generates} a random name: at
 * most the first 8 characters of the source file name (without directories and without a trailing
 * <code>.pdf</code> extension, matched case-insensitively), a dash and a random non-negative number. When no
 * source is given, <code>"pdf-part"</code> is used as the base name.
 */
public static final PageRangeName RANDOM_PAGE_RANGE_NAME = new PageRangeName() {
    @Override
    public String getName(String pdfSource, PdfReader pdfReader, List<Integer> pages) {
        String baseName = "pdf-part";
        if (pdfSource != null) {
            // only the file name: a directory prefix would put path separators into the generated name
            baseName = new File(pdfSource).getName();
            int extensionStart = baseName.length() - ".pdf".length();
            if (extensionStart >= 0 && baseName.regionMatches(true, extensionStart, ".pdf", 0, ".pdf".length())) {
                baseName = baseName.substring(0, extensionStart);
            }
        }
        // masking the sign bit is always non-negative, unlike Math.abs(Long.MIN_VALUE)
        long random = UUID.randomUUID().getMostSignificantBits() & Long.MAX_VALUE;
        return baseName.substring(0, Math.min(8, baseName.length())) + "-" + random;
    }
};
/**
 * {@link PageRangeName} that {@link PageRangeName#getName(String, PdfReader, List) generates} a name based on the
 * first and last page numbers in the specified range: at most the first 25 characters of the source file name
 * (without directories and without a trailing <code>.pdf</code> extension, matched case-insensitively) followed
 * by <code>-p&lt;first&gt;~p&lt;last&gt;</code>. A <code>null</code> or empty range yields the suffix
 * <code>-p</code> only. When no source is given, <code>"pdf-part"</code> is used as the base name.
 */
public static final PageRangeName FIRST_LAST_PAGE_RANGE_NAME = new PageRangeName() {
    @Override
    public String getName(String pdfSource, PdfReader pdfReader, List<Integer> pages) {
        String baseName = "pdf-part";
        if (pdfSource != null) {
            baseName = new File(pdfSource).getName();
            // strip only a trailing extension; ".pdf" elsewhere in the name is part of the name
            int extensionStart = baseName.length() - ".pdf".length();
            if (extensionStart >= 0 && baseName.regionMatches(true, extensionStart, ".pdf", 0, ".pdf".length())) {
                baseName = baseName.substring(0, extensionStart);
            }
        }
        String suffix = "-p";
        int pageCount = pages == null ? 0 : pages.size();
        if (pageCount > 0) {
            suffix += pages.get(0) + "~p" + pages.get(pageCount - 1);
        }
        return baseName.substring(0, Math.min(25, baseName.length())) + suffix;
    }
};
/**
 * Sample usage: splits a report by its bookmarks and names every part after the proposal number and acronym found
 * on the first page of the part.
 *
 * @param args optional: <code>args[0]</code> is the PDF document to split and <code>args[1]</code> the
 *        directory receiving the parts; both default to locations on the current user's desktop
 */
public static void main(String[] args) {
    // build paths with File so that the platform's separator is used
    File desktop = new File(System.getProperty("user.home"), "Desktop");
    String fileToSplit = args != null && args.length > 0
            ? args[0]
            : new File(desktop, "ALL_Ethics_Issues_Table_PoC_2014.pdf").getPath();
    String destinationDirectory = args != null && args.length > 1
            ? args[1]
            : new File(desktop, "Temp").getPath();
    new PdfPageExtractor(fileToSplit).setDestinationDirectory(destinationDirectory)
            .setRangeName(new PageRangeName() {
                private final Logger log = Logger.getLogger(PdfPageExtractor.class);

                @Override
                public String getName(String pdfName, PdfReader pdfContent, List<Integer> pages) {
                    if (pdfContent == null) {
                        log.error("cannot compute the range name for a null PDF document");
                        return "";
                    }
                    if (pages == null || pages.isEmpty()) {
                        log.error("cannot compute the range name for an empty range");
                        return "";
                    }
                    if (pdfName == null) {
                        pdfName = "the PDF document";
                    }
                    try {
                        String[] lines =
                                new PdfTextExtractor(pdfContent).getTextFromPage(pages.get(0)).split("\\r?\\n");
                        // MODIFY BELOW WHEN REPORT DESIGN CHANGES!
                        //
                        // Currently, 'line' 4 in the PDF structure contains (only) the proposal number and
                        // 'line' 6 - (only) the proposal acronym.
                        String proposalNumber = "error";
                        if (lines.length > 4 && lines[4] != null) {
                            proposalNumber = lines[4].trim().split("\\s+")[0];
                        }
                        String acronym = "";
                        if (lines.length > 6 && lines[6] != null) {
                            acronym = lines[6].trim().split("\\s+")[0];
                        }
                        // no ".pdf" here: getDestinationName appends the extension to the range name
                        return proposalNumber + "_Ethics_Issues_Table_" + acronym;
                    } catch (IOException exception) {
                        log.error("cannot extract text from the first page in range " + pages + " in " + pdfName,
                                exception);
                        return "";
                    }
                }
            }).extractByBookmarks();
}
}