largomst
8/16/2017 - 3:46 PM

convert_kindle_html_to_markdown.py

import sys
import os
from bs4 import BeautifulSoup


def remove_linefeed(text):
    """Return *text* with every line-feed character deleted.

    The pieces on either side of each line feed are joined directly;
    no separator (such as a space) is inserted in its place.
    """
    pieces = text.split('\n')
    return ''.join(pieces)


def convert_to_markdown(text, class_):
    """Turn the text of one ``div`` into a Markdown fragment.

    The formatting is selected by ``class_`` (the div's CSS class name):

    * ``bodyContainer`` / ``citation`` -> empty string (text is dropped)
    * ``notebookFor``    -> ``# text`` followed by a single space
    * ``bookTitle`` / ``noteText`` -> text followed by a blank line
    * ``authors``        -> bold text followed by a blank line
    * ``sectionHeading`` -> ``## text`` followed by a blank line
    * ``noteHeading``    -> italic text followed by a blank line; unless
      the part before the first ``-`` is ``Note`` or ``Bookmark``, a
      blockquote marker (``> ``) is appended as well
    * any other class    -> text returned unchanged
    """
    blank_line = '\n\n'

    if class_ in ('bodyContainer', 'citation'):
        return ''

    if class_ == 'notebookFor':
        # No line break here: only a trailing space is appended.
        return '# ' + text + ' '

    if class_ in ('bookTitle', 'noteText'):
        return text + blank_line

    if class_ == 'authors':
        return '**' + text + '**' + blank_line

    if class_ == 'sectionHeading':
        return '## ' + text + blank_line

    if class_ == 'noteHeading':
        heading_kind = text.partition('-')[0].strip()
        emphasized = '*' + text + '*' + blank_line
        if heading_kind in ('Note', 'Bookmark'):
            return emphasized
        # Other heading kinds open a blockquote, presumably so the text
        # emitted for the following div is rendered as a quote.
        return emphasized + '> '

    # Unrecognized classes pass through untouched.
    return text


def main():
    """Convert the HTML file named on the command line to Markdown.

    The input path is taken from ``sys.argv[1]``. The file is parsed with
    BeautifulSoup using the ``lxml`` parser, every ``div`` is converted by
    ``convert_to_markdown`` according to its first CSS class, and the
    concatenated result is written to a file with the same base name and
    a ``.md`` extension.

    Exits with a usage message (non-zero status) when no input path is
    supplied.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: {} <input.html>'.format(sys.argv[0]))

    file_name = sys.argv[1]
    prefix = os.path.splitext(file_name)[0]

    # Parse inside a context manager so the input handle is closed
    # deterministically rather than left to the garbage collector.
    with open(file_name, encoding='utf-8') as source:
        soup = BeautifulSoup(source, 'lxml')

    fragments = []
    for div in soup.select('div'):
        classes = div.get('class')
        if not classes:
            # A div without a class attribute has no formatting rule, and
            # indexing it would raise; skip it instead of aborting the
            # whole conversion.
            continue
        div_text = remove_linefeed(div.get_text().strip())
        fragments.append(convert_to_markdown(div_text, classes[0]))

    with open(prefix + '.md', 'w', encoding='utf-8') as f:
        f.write(''.join(fragments))


# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()