retrography
2/4/2016 - 11:52 PM

PDF highlight and annotation extractor

PDF highlight and annotation extractor

#!/usr/bin/env python

__author__ = 'Mahmood S. Zargar'

import poppler
import sys
import urllib
import os

# Annotation types (capitalised enum nicks) whose text and notes are reported.
_REPORTED_ANNOT_TYPES = ('Underline', 'Highlight', 'Strike-out', 'Squiggly',
                         'Text', 'Free-text', 'Caret')


def _page_prompt(page_no, page_label):
    """Return the '[p. N](label): ' prefix printed before each annotation."""
    prompt = '[p. ' + page_no + ']'
    # The label is only shown when it differs from the 1-based page number.
    # The None guard avoids a TypeError on concatenation if the binding
    # reports no label for the page.
    if page_label is not None and page_no != page_label:
        prompt += '(' + page_label + ')'
    return prompt + ': \n'


def _selection_area(annot_mapping, page):
    """Return a copy of the annotation's area adjusted for text selection."""
    area = annot_mapping.area.copy()
    _, height = page.get_size()
    # Swap the corners, then mirror the y coordinates against the page height.
    # NOTE(review): presumably this converts between the coordinate origin used
    # by annotation mappings and the one expected by get_selected_text --
    # confirm against the poppler-glib documentation.
    area.x1, area.x2 = area.x2, area.x1
    area.y1, area.y2 = area.y2, area.y1
    area.y1 = height - area.y1
    area.y2 = height - area.y2
    return area


def main():
    """Print the annotations found in every PDF named on the command line.

    For each file the base name, the document title (when non-empty) and,
    for every annotation of a reported type, the text under the annotation
    and the annotation's own note are written to standard output, followed
    by the number of annotations found.

    Exits with status 1 (after printing a usage message) when no input
    file is given.
    """
    if len(sys.argv) < 2:
        print('Input file required. Please mention at least one.')
        print('Syntax: annotex input_file1.pdf [input_file2.pdf ...]')
        sys.exit(1)

    for file_name in sys.argv[1:]:
        print('')

        # poppler wants a URI, so the path is made absolute and URL-quoted.
        uri = 'file://%s' % urllib.pathname2url(os.path.abspath(file_name))
        document = poppler.document_new_from_file(uri, None)
        print(os.path.basename(file_name))
        doc_title = document.get_property('title')
        if doc_title:
            print('(' + doc_title + ')')
        print('-----\n')

        all_annots = 0

        for i in range(document.get_n_pages()):
            page = document.get_page(i)
            annot_mappings = page.get_annot_mapping()
            if not annot_mappings:
                continue

            # Page number, label and prompt are the same for every annotation
            # on the page, so they are computed once per page.
            page_no = str(page.get_index() + 1)
            page_label = page.props.label
            page_prompt = _page_prompt(page_no, page_label)

            for annot_mapping in annot_mappings:
                annot = annot_mapping.annot
                annot_type = annot.get_annot_type().value_nick
                annot_type = annot_type[0].upper() + annot_type[1:]
                if annot_type not in _REPORTED_ANNOT_TYPES:
                    continue

                all_annots += 1
                area = _selection_area(annot_mapping, page)

                # Check for None *before* stripping; calling strip() on a
                # None result would raise AttributeError.
                annot_text = page.get_selected_text('POPPLER_SELECTION_WORD', area)
                annot_text = annot_text.strip() if annot_text is not None else ''
                annot_cont = annot.get_contents()

                # Selections that are empty or merely repeat the page
                # number/label carry no information and are skipped.
                if annot_text not in ('', page_no, page_label):
                    print(annot_type + ' Text ' + page_prompt + annot_text + '\n')
                if annot_cont:
                    print(annot_type + ' Note ' + page_prompt + annot_cont + '\n')

        print('-----\n' + str(all_annots) + ' annotation(s) found' + '\n\n')

# Run the extractor only when this file is executed as a script, so that
# importing it as a module has no side effects.
if __name__ == '__main__':
    main()