octavian-nita
9/10/2014 - 6:54 AM

Handy class for splitting or extracting pages from PDF documents

Handy class for splitting or extracting pages from PDF documents

package be.thon.util.pdf;

import static java.lang.Integer.valueOf;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.UUID;

import org.apache.log4j.Logger;

import com.lowagie.text.Document;
import com.lowagie.text.pdf.PdfCopy;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.RandomAccessFileOrArray;
import com.lowagie.text.pdf.SequenceList;
import com.lowagie.text.pdf.SimpleBookmark;
import com.lowagie.text.pdf.parser.PdfTextExtractor;

/**
 * Handy class for splitting or extracting pages from PDF documents.
 * <p>
 * An extractor is configured through its (chainable) setters and then asked to extract pages by
 * {@link #extractByBookmarks() bookmarks}, by {@link #extract(String, String...) textual ranges} or by
 * {@link #extract(List) explicit page numbers}. Page numbers are 1-based. The public extraction methods never
 * throw: failures are logged and the operation is abandoned (best effort).
 * </p>
 * <p>
 * Instances are mutable and not synchronized.
 * </p>
 *
 * @author nitanoc
 * @version 1.0, Feb 26, 2014
 */
public class PdfPageExtractor {

    // One logger for the class is enough; it is also used by the anonymous classes declared below.
    private static final Logger log = Logger.getLogger(PdfPageExtractor.class);

    // Many times the source is a file in the local file system; if the PDF document has some other source, one can
    // extend this class and override #getSourcePdfReader(); this should be the only method used by extract methods
    // in order to obtain the PDF content to extract from!
    private String source;

    private String destination; // if null, rangeName is used; if not absolute, destinationDirectory is also used

    private String destinationDirectory; // if null, the current working directory is used

    // if null and the destination is also null then FIRST_LAST_PAGE_RANGE_NAME is used!
    private PageRangeName rangeName = FIRST_LAST_PAGE_RANGE_NAME;

    /**
     * Creates an extractor whose attributes are set later (e.g. in order to re-use an instance).
     */
    public PdfPageExtractor() {
        // nothing to initialize
    }

    /**
     * Creates an extractor reading from the specified source.
     *
     * @param source path of the PDF document to extract from; may be <code>null</code>
     */
    public PdfPageExtractor(String source) {
        setSource(source);
    }

    /**
     * @return the path of the PDF document to extract from; may be <code>null</code>
     */
    public String getSource() {
        return source;
    }

    /**
     * @param source path of the PDF document to extract from
     * @return this extractor, to allow call chaining
     */
    public PdfPageExtractor setSource(String source) {
        this.source = source;
        return this;
    }

    /**
     * @return the explicit destination file path, or <code>null</code> if the destination is computed from the
     *         {@link #getRangeName() range name}
     */
    public String getDestination() {
        return destination;
    }

    /**
     * @param destination explicit destination file path; when relative it is resolved against the
     *            {@link #getDestinationDirectory() destination directory}
     * @return this extractor, to allow call chaining
     */
    public PdfPageExtractor setDestination(String destination) {
        this.destination = destination;
        return this;
    }

    /**
     * @return the directory in which extracted files are created, or <code>null</code>
     */
    public String getDestinationDirectory() {
        return destinationDirectory;
    }

    /**
     * @param destinationDirectory directory in which extracted files are created
     * @return this extractor, to allow call chaining
     */
    public PdfPageExtractor setDestinationDirectory(String destinationDirectory) {
        this.destinationDirectory = destinationDirectory;
        return this;
    }

    /**
     * @return the strategy used to name extracted files when no explicit destination is set; may be
     *         <code>null</code>
     */
    public PageRangeName getRangeName() {
        return rangeName;
    }

    /**
     * @param rangeName strategy used to name extracted files when no explicit destination is set
     * @return this extractor, to allow call chaining
     */
    public PdfPageExtractor setRangeName(PageRangeName rangeName) {
        this.rangeName = rangeName;
        return this;
    }

    /**
     * Extracts one file per bookmark: each file holds the pages from the bookmark's page up to (but not including)
     * the page of the following bookmark; the last bookmark runs to the end of the document.
     * <p>
     * Only the entries of the list returned by {@link SimpleBookmark#getBookmark(PdfReader)} are visited; nested
     * entries, if any, are not descended into. A bookmark that cannot be extracted is logged and skipped.
     * </p>
     */
    public void extractByBookmarks() {
        PdfReader pdf = null;
        try {
            pdf = getSourcePdfReader();
            if (pdf == null) {
                log.error("cannot extract by bookmarks from a null document");
                return;
            }

            pdf.consolidateNamedDestinations();
            @SuppressWarnings("unchecked")
            List<HashMap<String, Object>> bookmarksList = SimpleBookmark.getBookmark(pdf);
            if (bookmarksList == null || bookmarksList.isEmpty()) {
                log.warn("no bookmarks found in the specified PDF document");
                return;
            }

            // Walk the bookmarks pairwise: the page of the next bookmark marks the end of the current one.
            Iterator<HashMap<String, Object>> bookmarks = bookmarksList.iterator();
            HashMap<String, Object> currentBookmark = bookmarks.next();
            boolean last;
            do {
                last = !bookmarks.hasNext();
                HashMap<String, Object> nextBookmark = last ? null : bookmarks.next();
                extractBookmarkQuietly(pdf, currentBookmark, nextBookmark, last);
                currentBookmark = nextBookmark;
            } while (!last);
        } catch (Throwable throwable) {
            // deliberate best effort: this method is documented never to throw
            log.error("cannot extract bookmarks from the specified PDF document", throwable);
        } finally {
            closeQuietly(pdf);
        }
    }

    /**
     * Each of the specified ranges will be extracted in a new file. <code>null</code> ranges are ignored (with a
     * warning); a failure while extracting one range abandons the remaining ones.
     *
     * @param firstRange first range, in the syntax understood by {@link SequenceList#expand(String, int)}
     * @param otherRanges further ranges, same syntax; may be <code>null</code> or empty
     */
    public void extract(String firstRange, String... otherRanges) {
        PdfReader pdf = null;
        try {
            pdf = getSourcePdfReader();
            if (pdf == null) {
                log.error("cannot extract pages from a null document");
                return;
            }
            int maxNumberOfPages = pdf.getNumberOfPages();

            extractRange(pdf, firstRange, maxNumberOfPages);
            if (otherRanges != null) {
                for (String otherRange : otherRanges) {
                    extractRange(pdf, otherRange, maxNumberOfPages);
                }
            }
        } catch (Throwable throwable) {
            // deliberate best effort: this method is documented never to throw
            log.error("cannot extract pages from the specified PDF document", throwable);
        } finally {
            closeQuietly(pdf);
        }
    }

    /**
     * Convenience method to extract an arbitrary number of pages (into a single file):
     *
     * <pre>
     * // ...
     * extractor.extract(2);
     * // ...
     * extractor.extract(1, 7, 10);
     * </pre>
     *
     * @param firstPage first (1-based) page number to extract
     * @param otherPages further (1-based) page numbers; may be <code>null</code> or empty
     */
    public void extract(int firstPage, int... otherPages) {
        List<Integer> pages = new ArrayList<Integer>();
        pages.add(valueOf(firstPage));
        if (otherPages != null) {
            for (int otherPage : otherPages) {
                pages.add(valueOf(otherPage));
            }
        }
        extract(pages);
    }

    /**
     * Extracts the specified pages, in the given order, into a single file.
     *
     * @param pages (1-based) page numbers to extract; a <code>null</code> or empty list is reported and ignored
     */
    public void extract(List<Integer> pages) {
        if (pages == null || pages.isEmpty()) {
            log.error("no valid page range specified: " + pages);
            return;
        }

        PdfReader pdf = null;
        try {
            pdf = getSourcePdfReader();
            doExtract(pdf, pages);
        } catch (Throwable throwable) {
            // deliberate best effort: this method is documented never to throw
            log.error("cannot extract pages from the specified PDF document", throwable);
        } finally {
            closeQuietly(pdf);
        }
    }

    /**
     * @return -1 if {@link #getSourcePdfReader()} return <code>null</code> or an exception occurs
     */
    public int getNumberOfPages() {
        PdfReader pdf = null;
        try {
            pdf = getSourcePdfReader();
            return pdf == null ? -1 : pdf.getNumberOfPages();
        } catch (Throwable throwable) {
            log.error("cannot open the specified PDF document", throwable);
            return -1;
        } finally {
            closeQuietly(pdf);
        }
    }

    /**
     * Opens the PDF document to extract from; this is the only way the extract methods obtain the PDF content, so
     * subclasses reading from something else than a local file only need to override this method.
     * <p>
     * The returned {@link PdfReader} should be closed by calling methods.
     * </p>
     * <p>
     * Since this method might be used for operations not necessarily involving full PDF parsing (e.g. page counting),
     * opening the {@link PdfReader} should be as efficient as possible (i.e. parse as little as possible); the
     * discussion linked below describes the approach used here.
     * </p>
     *
     * @return a reader on the {@link #getSource() source}, or <code>null</code> if no source is set
     * @throws IOException if the source cannot be opened or read
     * @see <a href="http://stackoverflow.com/questions/6026971/page-count-of-pdf-with-java">Page count of PDF with
     *      Java (Stack Overflow)</a>
     */
    protected PdfReader getSourcePdfReader() throws IOException {
        return source == null ? null : new PdfReader(new RandomAccessFileOrArray(source), new byte[0]);
    }

    /**
     * Computes the absolute path of the file the specified pages are written to:
     * <ol>
     * <li>If a {@link #getDestination() destination file path} is {@link #setDestination(String) set}, that is used
     * as the target for the extracted pages; appropriate for one-time / single range extraction</li>
     * <li>Otherwise, if a {@link #setRangeName(PageRangeName) range name is set}, that is used to compute the file
     * path of the destination; appropriate for repeated extractions resulting in multiple output files</li>
     * <li>Otherwise, a {@link #FIRST_LAST_PAGE_RANGE_NAME default file name based on the first and last page numbers
     * in the specified range} is used as the file path of the destination</li>
     * </ol>
     * <p>
     * In the last two cases the <code>.pdf</code> extension is appended to the computed name, so range names should
     * not include it.
     * </p>
     * <p>
     * Should <strong>NOT!</strong> return <code>null</code>.
     * </p>
     *
     * @param pdfContent the document pages are extracted from
     * @param pages the (1-based) page numbers about to be written
     * @return the absolute path of the destination file
     * @throws Exception if the {@link PageRangeName} in use fails
     */
    protected String getDestinationName(PdfReader pdfContent, List<Integer> pages) throws Exception {
        if (destination != null) {
            File target = new File(destination);
            if (!target.isAbsolute()) {
                // a null destinationDirectory is accepted by File and resolves against the working directory
                target = new File(destinationDirectory, target.getPath());
            }
            return target.getAbsolutePath();
        }

        PageRangeName effectiveRangeName = rangeName != null ? rangeName : FIRST_LAST_PAGE_RANGE_NAME;
        String directory = destinationDirectory == null ? "." : destinationDirectory;
        return new File(directory, effectiveRangeName.getName(source, pdfContent, pages) + ".pdf").getAbsolutePath();
    }

    /**
     * Extracts the pages belonging to one bookmark: from the page the bookmark points to up to, but not including,
     * the page the next bookmark points to (or up to the end of the document when there is no next bookmark).
     *
     * @param pdf the document to extract from; a <code>null</code> document is reported and ignored
     * @param currentBookmark the bookmark to extract; a <code>null</code> bookmark is reported and ignored
     * @param nextBookmark the bookmark following the current one, or <code>null</code> for the last bookmark
     * @throws Exception if a bookmark does not designate a page or the pages cannot be written
     */
    protected void doExtractBookmark(PdfReader pdf, HashMap<String, Object> currentBookmark,
        HashMap<String, Object> nextBookmark) throws Exception {

        if (pdf == null) {
            log.error("cannot extract bookmarks from a null document");
            return;
        }
        if (currentBookmark == null) {
            log.error("cannot extract a null bookmark");
            return;
        }

        int firstPage = pageNumberOf(currentBookmark);
        int lastPage = nextBookmark == null ? pdf.getNumberOfPages() + 1 : pageNumberOf(nextBookmark);

        List<Integer> pages = new ArrayList<Integer>();
        for (int i = firstPage; i < lastPage; i++) {
            pages.add(valueOf(i));
        }
        doExtract(pdf, pages);
    }

    /**
     * Copies the specified pages, in the given order, to the file designated by
     * {@link #getDestinationName(PdfReader, List)}.
     * <p>
     * Page numbers that are <code>null</code>, lower than 1 or greater than the number of pages of the document
     * are reported and skipped; if no valid page remains nothing is written at all. The destination name is computed
     * from the valid pages only.
     * </p>
     *
     * @param pdf the document to extract from; a <code>null</code> document is reported and ignored
     * @param pages the (1-based) page numbers to copy
     * @throws Exception if the destination cannot be created or a page cannot be copied
     */
    protected void doExtract(PdfReader pdf, List<Integer> pages) throws Exception {
        if (pdf == null) {
            log.error("cannot extract pages from a null document");
            return;
        }

        // Validate before touching the file system: an output without pages is useless (and iText is known to
        // fail when closing a page-less document), and an invalid page would otherwise abort a half-written file.
        List<Integer> validPages = selectValidPages(pages, pdf.getNumberOfPages());
        if (validPages.isEmpty()) {
            log.error("no valid page to extract in range: " + pages + "; nothing written");
            return;
        }

        String destinationName = getDestinationName(pdf, validPages);
        FileOutputStream output = new FileOutputStream(destinationName);
        Document document = new Document();
        PdfCopy pdfCopy = null;
        try {
            pdfCopy = new PdfCopy(document, output);
            document.open();
            for (Integer page : validPages) {
                pdfCopy.addPage(pdfCopy.getImportedPage(pdf, page.intValue()));
            }
        } finally {
            try {
                document.close(); // a failure here means the output is not usable, so it is allowed to propagate
            } finally {
                if (pdfCopy != null) {
                    try {
                        pdfCopy.close();
                    } catch (Throwable throwable) {
                        log.warn("cannot close the PDF copy; ignoring...", throwable);
                    }
                }
                // Make sure the file handle is released even if the PdfCopy could not be created.
                try {
                    output.close();
                } catch (Throwable throwable) {
                    log.warn("cannot close " + destinationName + "; ignoring...", throwable);
                }
            }
        }

        // only reached when the document has been written and closed successfully
        log.info(destinationName + " PDF document created");
    }

    /** Extracts one bookmark, logging (instead of propagating) any failure so the other bookmarks still get a go. */
    private void extractBookmarkQuietly(PdfReader pdf, HashMap<String, Object> currentBookmark,
        HashMap<String, Object> nextBookmark, boolean last) {
        try {
            doExtractBookmark(pdf, currentBookmark, nextBookmark);
        } catch (Throwable throwable) {
            log.error(
                (last ? "cannot extract (last) bookmark " : "cannot extract bookmark ") +
                    (currentBookmark == null ? "(null)" : currentBookmark.get("Title")) + "; skipping...",
                throwable);
        }
    }

    /** Expands one textual range and extracts it to its own file; a null range is reported and ignored. */
    private void extractRange(PdfReader pdf, String range, int maxNumberOfPages) throws Exception {
        if (range == null) {
            log.warn("ignoring null range...");
            return;
        }
        doExtract(pdf, expandRange(range, maxNumberOfPages));
    }

    /** Typed wrapper around {@link SequenceList#expand(String, int)}, confining the unchecked conversion. */
    @SuppressWarnings("unchecked")
    private static List<Integer> expandRange(String range, int maxNumberOfPages) {
        return SequenceList.expand(range, maxNumberOfPages);
    }

    /** Returns the page number a bookmark points to, i.e. the first token of its "Page" entry. */
    private static int pageNumberOf(HashMap<String, Object> bookmark) {
        Object page = bookmark.get("Page");
        if (page == null) {
            throw new IllegalArgumentException("bookmark " + bookmark.get("Title") + " does not point to a page");
        }
        return valueOf(page.toString().split("\\s+")[0]).intValue();
    }

    /** Returns, in order, the pages lying within [1, numberOfPages]; every rejected entry is logged. */
    private static List<Integer> selectValidPages(List<Integer> pages, int numberOfPages) {
        List<Integer> validPages = new ArrayList<Integer>();
        if (pages == null) {
            return validPages;
        }
        for (Integer page : pages) {
            if (page == null || page.intValue() < 1 || page.intValue() > numberOfPages) {
                log.error("cannot copy page with invalid index: " + page + "; ignoring...");
                continue;
            }
            validPages.add(page);
        }
        return validPages;
    }

    /** Closes the reader, if any, logging (instead of propagating) any failure. */
    private static void closeQuietly(PdfReader pdf) {
        if (pdf == null) {
            return;
        }
        try {
            pdf.close();
        } catch (Throwable throwable) {
            log.warn("cannot close the PDF document; ignoring...", throwable);
        }
    }

    /** Returns the file name (directories dropped) of the source without ".pdf", or "pdf-part" for no source. */
    private static String baseNameOf(String pdfSource) {
        return pdfSource == null ? "pdf-part" : new File(pdfSource).getName().replace(".pdf", "");
    }

    /** Builds "&lt;base name, at most 25 chars&gt;-p&lt;first&gt;~p&lt;last&gt;"; just "...-p" when there are no pages. */
    private static String firstLastPageName(String pdfSource, List<Integer> pages) {
        String baseName = baseNameOf(pdfSource);
        StringBuilder name = new StringBuilder(baseName.substring(0, Math.min(25, baseName.length())));
        name.append("-p");
        if (pages != null && !pages.isEmpty()) {
            name.append(pages.get(0)).append("~p").append(pages.get(pages.size() - 1));
        }
        return name.toString();
    }

    /**
     * Used to provide a meaningful name for a range of pages (not necessarily consecutive) that can be utilized, for
     * example, to name a file containing the extracted pages.
     *
     * @author nitanoc
     * @version 1.0, Feb 26, 2014
     */
    public interface PageRangeName { // a nested interface is automatically "static"!

        /**
         * Should <strong>NOT!</strong> return <code>null</code>. When used by
         * {@link PdfPageExtractor#getDestinationName(PdfReader, List)} the returned name gets the <code>.pdf</code>
         * extension appended, so it should not include it.
         *
         * @param pdfName the source of the PDF document, as set on the extractor; may be <code>null</code>
         * @param pdfContent the PDF document the pages belong to
         * @param pages the (1-based) page numbers in the range
         * @return the name for the range
         * @throws Exception if the name cannot be computed
         */
        String getName(String pdfName, PdfReader pdfContent, List<Integer> pages) throws Exception;
    }

    /**
     * {@link PageRangeName} that {@link PageRangeName#getName(String, PdfReader, List) generates} a random name: at
     * most the first 8 characters of the source file name, followed by a random non-negative number.
     */
    public static final PageRangeName RANDOM_PAGE_RANGE_NAME = new PageRangeName() {

        @Override
        public String getName(String pdfSource, PdfReader pdfReader, List<Integer> pages) {
            String baseName = baseNameOf(pdfSource);
            // mask the sign bit rather than Math.abs(), which stays negative for Long.MIN_VALUE
            long random = UUID.randomUUID().getMostSignificantBits() & Long.MAX_VALUE;
            return baseName.substring(0, Math.min(8, baseName.length())) + "-" + random;
        }
    };

    /**
     * {@link PageRangeName} that {@link PageRangeName#getName(String, PdfReader, List) generates} a name based on the
     * first and last page numbers in the specified range.
     */
    public static final PageRangeName FIRST_LAST_PAGE_RANGE_NAME = new PageRangeName() {

        @Override
        public String getName(String pdfSource, PdfReader pdfReader, List<Integer> pages) {
            return firstLastPageName(pdfSource, pages);
        }
    };

    /**
     * Sample usage: splits a report found on the user's desktop by bookmarks, naming each part after the proposal
     * number and acronym read from the first page of the part.
     *
     * @param args ignored
     */
    public static void main(String[] args) {

        // Build the paths with File so that the platform's separator is used.
        File desktop = new File(System.getProperty("user.home"), "Desktop");
        String fileToSplit = new File(desktop, "ALL_Ethics_Issues_Table_PoC_2014.pdf").getPath();
        String destinationDirectory = new File(desktop, "Temp").getPath();

        new PdfPageExtractor(fileToSplit).setDestinationDirectory(destinationDirectory)
            .setRangeName(new PageRangeName() {

                // Whenever a name cannot be derived from the page content, fall back to the first/last page based
                // name rather than to an empty one, which would make all such ranges overwrite the same file.
                @Override
                public String getName(String pdfName, PdfReader pdfContent, List<Integer> pages) {
                    if (pdfContent == null) {
                        log.error("cannot compute the range name for a null PDF document");
                        return firstLastPageName(pdfName, pages);
                    }
                    if (pages == null || pages.isEmpty()) {
                        log.error("cannot compute the range name for an empty range");
                        return firstLastPageName(pdfName, pages);
                    }

                    try {
                        String[] lines =
                            new PdfTextExtractor(pdfContent).getTextFromPage(pages.get(0)).split("\\r?\\n");

                        // MODIFY BELOW WHEN REPORT DESIGN CHANGES!
                        //
                        // Currently, 'line' 4 in the PDF structure contains (only) the proposal number and 'line' 6 -
                        // (only) the proposal acronym.
                        String proposalNumber = "error";
                        if (lines.length > 4 && lines[4] != null) {
                            proposalNumber = lines[4].trim().split("\\s+")[0];
                        }
                        String acronym = "";
                        if (lines.length > 6 && lines[6] != null) {
                            acronym = lines[6].trim().split("\\s+")[0];
                        }

                        // no ".pdf" here: the extension is appended when the destination name is computed
                        return proposalNumber + "_Ethics_Issues_Table_" + acronym;
                    } catch (IOException exception) {
                        log.error("cannot extract text from the first page in range " + pages + " in " +
                            (pdfName == null ? "the PDF document" : pdfName), exception);
                        return firstLastPageName(pdfName, pages);
                    }
                }
            }).extractByBookmarks();
    }
}