HTML 2 DOCX @ Python

7/4/2019 - 6:07 PM
# -*- coding: utf-8 -*-
import io

import requests
from bs4 import NavigableString, BeautifulSoup
from docx import Document as DocxDocument
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Cm, Pt, Mm
from docx.image.exceptions import UnrecognizedImageError
from lxml import etree

# Page geometry used when scaling pictures. convert_image_size() combines the
# page width and margins, and process_image() wraps the result in Mm(), so the
# page values are handled as millimetres.
A4_PAGE_HEIGHT = 297  # not referenced anywhere in the visible part of this module
# NOTE: the name is misspelled ("WIDHT"); it is read by convert_image_size(),
# so renaming it requires updating that function as well.
A4_PAGE_WIDHT = 210
DEFAULT_MARGINS = 25.4  # subtracted twice (both sides) from the page width
# Upper bound applied to an <img> width/height attribute value; a picture with
# this size is scaled to the full width between the margins.
MAX_PICTURE_SIZE = 602

def parser(html, output_path="output/teste.docx"):
    """Convert an HTML string into a .docx file on disk.

    The markup is parsed once, inside ``parsing``; the previous extra
    ``BeautifulSoup(html)`` call here produced values that were never used.

    Args:
        html: HTML markup to convert.
        output_path: Where the generated document is saved. Defaults to the
            path that used to be hard-coded, so existing callers are
            unaffected.
    """
    document = DocxDocument()
    parsing(html, document)
    document.save(output_path)

def parsing(html, document):
    """Parse ``html`` with the lxml parser and render its <body> into ``document``.

    Args:
        html: HTML markup to convert.
        document: Target document object; it is handed to ``process_node``.

    Returns:
        The parsed ``<body>`` element after whitespace-only top-level strings
        were removed, or ``None`` when the parser produced no ``<body>``
        (for example for empty input).
    """
    soup = BeautifulSoup(html, features="lxml")
    body = soup.body
    if body is None:
        # Nothing to render; previously this crashed with AttributeError
        # inside remove_newlines().
        return None

    remove_newlines(body)
    # NOTE(review): debugging output, kept to preserve current behaviour.
    print(body.prettify())
    process_node(body, document)

    return body


def process_node(node, document, parent=None, paragraph=None, style=None):
    """Recursively render one parsed HTML node into ``document``.

    Text nodes are recognised explicitly with ``isinstance`` instead of by
    provoking an ``AttributeError``. The old approach also swallowed genuine
    ``AttributeError``s raised by the helper functions.

    Args:
        node: A BeautifulSoup tag or text node.
        document: Target document; new paragraphs are added to it.
        parent: The tag that contains ``node`` (``None`` for the root).
        paragraph: Paragraph currently being filled, if any.
        style: Style keywords inherited from the ancestors (see
            ``process_style``). ``None`` means "no inherited styles".

    Returns:
        The paragraph that following siblings should continue writing to, or
        ``None`` after a list, formula, table, image or line break.
    """
    if style is None:
        style = []

    if isinstance(node, NavigableString):
        process_string(node, document, parent, paragraph, style)
        return paragraph

    # Elements that are rendered by a dedicated helper and end the paragraph.
    if node.name in ('ul', 'ol'):
        process_list(node, document)
        return None

    if node.name == 'math':
        process_math(node, document, paragraph)
        return None

    if node.name == 'table':
        process_table(document, node)
        return None

    if node.name in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        # Block-level text elements start a new paragraph and drop the
        # styles inherited from their ancestors.
        paragraph = document.add_paragraph('')
        style = []

    if node.get('style') is not None:
        # Build a new list so the caller's list is never modified.
        style = style + process_style(node['style'])

    for child in node.children:
        paragraph = process_node(child, document, node, paragraph, style)

    if node.name == 'img':
        process_image(node, document, paragraph, style)
        return None

    if node.name == 'br':
        document.add_paragraph('')
        return None

    return paragraph


def process_string(str_node, document, parent, paragraph, style=None):
    """Add one text node to a paragraph as a styled run.

    Args:
        str_node: The text to add. A lone newline or non-breaking space is
            skipped; other text is stripped of surrounding whitespace.
        document: Used only to create a paragraph when ``paragraph`` is None.
        parent: Tag that contains the text; ``strong`` / ``em`` parents make
            the run bold / italic. May be ``None``.
        paragraph: Paragraph that receives the run, or ``None`` to create one.
        style: Style keywords as produced by ``process_style``.

    The function stays best-effort: a failure while adding the run does not
    abort the conversion, but it is now logged instead of silently ignored.
    """
    import logging

    if style is None:
        style = []

    # NOTE(review): the paragraph is created before the skip check below, so
    # a skipped string with no current paragraph still yields an empty
    # paragraph. Kept as-is because it changes the generated document.
    if paragraph is None:
        paragraph = document.add_paragraph('')

    try:
        if str_node == '\n' or str_node == u'\xa0':
            return
        text = str_node.strip()

        run = paragraph.add_run(text)

        # parent can be None; previously that raised AttributeError here and
        # every style below was silently dropped.
        parent_name = getattr(parent, 'name', None)

        if parent_name == 'strong' or 'bold' in style:
            run.font.bold = True
        if parent_name == 'em' or 'italic' in style:
            run.font.italic = True
        if 'small' in style:
            run.font.size = Pt(8)
        if 'underline' in style:
            run.font.underline = True
        if 'left' in style:
            paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
        if 'right' in style:
            paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        if 'center' in style:
            paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        if 'justify' in style:
            paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY

        print(text)
    except Exception:
        logging.getLogger(__name__).exception(
            "process_string: could not add text %r", str_node)


def process_list(node, document):
    """Render a <ul> or <ol> element: one list-styled paragraph per <li>.

    ``ul`` uses the 'List Bullet' paragraph style, anything else uses
    'List Number'. Children that are not ``<li>`` elements are ignored.
    """
    if node.name == 'ul':
        list_style = 'List Bullet'
    else:
        list_style = 'List Number'

    for item in node.children:
        if item.name != 'li':
            continue
        item_paragraph = document.add_paragraph('', style=list_style)
        process_list_item(item, document, node, item_paragraph)


def process_list_item(node, document, parent, paragraph):
    """Render the content of a list item into ``paragraph``.

    Text nodes are written with ``process_string``, ``<math>`` elements with
    ``process_math``, and every other element is descended into recursively.
    Text nodes are detected with ``isinstance`` rather than by catching the
    ``AttributeError`` they used to trigger, so genuine errors raised while
    rendering are no longer hidden.

    Args:
        node: The ``<li>`` element, or one of its descendants.
        document: Target document, passed through to the helpers.
        parent: Tag that contains ``node``.
        paragraph: The list paragraph that receives all content.
    """
    if isinstance(node, NavigableString):
        process_string(node, document, parent, paragraph)
        return

    remove_newlines(node)

    for child in node.children:
        if child.name == 'math':
            process_math(child, document, paragraph)
            continue

        process_list_item(child, document, node, paragraph)


def process_image(img_node, document, paragraph, styles=None):
    """Download the picture referenced by an <img> tag and insert it.

    Args:
        img_node: The ``<img>`` element; ``src`` is required, ``width`` /
            ``height`` / ``style`` are optional.
        document: Object that receives the picture through ``add_picture``.
            ``process_table_cell`` passes a run here instead of the document.
        paragraph: Current paragraph; it is deleted when it is still empty.
        styles: Inherited style keywords; ``'center'`` centres the picture.

    Behaviour on bad input:
        * When the chosen dimension attribute is missing or not numeric, the
          picture is inserted without an explicit size instead of raising
          ``KeyError`` / ``ValueError``.
        * When the downloaded data is not a recognised image, a warning is
          logged and a placeholder text is written. The previous handler
          called ``rollbar``, which is never imported in the visible module,
          so it failed with ``NameError`` before writing the placeholder.
    """
    import logging

    if styles is None:
        styles = []

    if paragraph is not None and paragraph.text == '':
        delete_paragraph(paragraph)

    url = img_node['src']
    image = download_image(url)

    use_width = should_use_width(img_node)
    dimension = 'width' if use_width else 'height'

    try:
        if use_width:
            print("IMG: W #" + url)
        else:
            print("IMG: H #" + url)

        try:
            size = convert_image_size(img_node[dimension])
            size_kwargs = {dimension: Mm(size)}
        except (KeyError, TypeError, ValueError):
            # No usable dimension on the tag: let add_picture pick the size.
            size_kwargs = {}

        document.add_picture(image, **size_kwargs)

        if should_be_centered(img_node, styles):
            try:
                picture_paragraph = document.paragraphs[-1]
            except AttributeError:
                # The container has no ``paragraphs`` attribute (the run
                # passed by process_table_cell), so use its parent instead.
                picture_paragraph = document._parent
            picture_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    except UnrecognizedImageError:
        logging.getLogger(__name__).warning(
            'UnrecognizedImageError at html_parser.py (src=%s)', url)
        message = '### Imagem não pode ser incluida, entre em contato com o Lecionas e informe o problema'
        if hasattr(document, 'add_paragraph'):
            document.add_paragraph(message)
        else:
            # A run has no add_paragraph(); write the notice into the run.
            document.add_text(message)

def process_math(node, document, paragraph=None):
    """Convert a MathML <math> element to Office Math and append it.

    The element is serialised, transformed with the ``MML2OMML.XSL``
    stylesheet and the resulting root element is appended to the paragraph's
    XML element.

    The stylesheet is looked up at ``<project>/export/MML2OMML.XSL``, where
    ``<project>`` is ``settings.BASE_DIR`` without a trailing ``/main``. The
    path is now built with plain string operations: the former
    ``re.sub(...) % ...`` construction raised ``TypeError`` when ``BASE_DIR``
    did not end in ``/main`` or was not a string, and misbehaved when the
    path contained a ``%`` character.

    Args:
        node: The ``<math>`` element.
        document: Used to create a paragraph when ``paragraph`` is None.
        paragraph: Paragraph that receives the formula.
    """
    from django.conf import settings

    project_path = str(settings.BASE_DIR)
    app_suffix = '/main'
    if project_path.endswith(app_suffix):
        project_path = project_path[:-len(app_suffix)]
    mml2omml_stylesheet_path = project_path + "/export/MML2OMML.XSL"

    element_tree = etree.fromstring(str(node))
    xslt = etree.parse(mml2omml_stylesheet_path)
    transform = etree.XSLT(xslt)
    new_dom = transform(element_tree)

    if paragraph is None:
        paragraph = document.add_paragraph()

    paragraph._element.append(new_dom.getroot())

def should_be_centered(img_node, styles=None):
    """Tell whether an image should be centred.

    An image is centred when its ``style`` attribute sets both
    ``margin-left`` and ``margin-right`` to ``auto`` (with or without
    whitespace after the colon), or when ``'center'`` is among the inherited
    style keywords.

    Args:
        img_node: Mapping-like ``<img>`` element; ``style`` is optional.
        styles: Inherited style keywords, or ``None``.

    Returns:
        ``True`` when the image should be centred, otherwise ``False``.
    """
    import re

    try:
        style_str = img_node['style']
    except (KeyError, TypeError):
        # No style attribute (or no usable node): nothing to inspect.
        style_str = None

    centered = False
    if isinstance(style_str, str):
        centered = (
            re.search(r'margin-left:\s*auto', style_str) is not None
            and re.search(r'margin-right:\s*auto', style_str) is not None
        )

    if styles and 'center' in styles:
        centered = True

    return centered

def convert_image_size(orginal_size):
    """Scale an <img> dimension to a whole number for the document.

    The input is capped at ``MAX_PICTURE_SIZE`` and then mapped linearly so
    that ``MAX_PICTURE_SIZE`` corresponds to the page width minus both
    margins. ``process_image`` wraps the returned number in ``Mm``.

    Args:
        orginal_size: Dimension value accepted by ``float`` (number or
            numeric string).

    Returns:
        The scaled size, rounded to the nearest integer.
    """
    pixels = float(orginal_size)
    pixels = MAX_PICTURE_SIZE if pixels > MAX_PICTURE_SIZE else pixels

    usable_width = round(A4_PAGE_WIDHT - (DEFAULT_MARGINS * 2), 1)
    return round((pixels * usable_width) / MAX_PICTURE_SIZE)

def should_use_width(img_node):
    """Decide whether an image should be sized by its width or its height.

    Each attribute is parsed on its own, and a missing or non-integer value
    counts as 0. Previously both were parsed in one ``try`` block, so a
    missing ``width`` also discarded a valid ``height`` and the caller then
    failed looking up the absent ``width``.

    Args:
        img_node: Mapping-like ``<img>`` element.

    Returns:
        ``False`` when the height is larger than the width, else ``True``.
    """
    def _dimension(attribute):
        try:
            return int(img_node[attribute])
        except (KeyError, TypeError, ValueError):
            return 0

    return not _dimension('height') > _dimension('width')


def _table_cell_colspan(html_cell):
    """Return the ``colspan`` of an HTML cell as an integer >= 1."""
    try:
        span = int(html_cell.get('colspan', 1))
    except (TypeError, ValueError):
        return 1
    return span if span > 1 else 1


def process_table(document, table_node):
    """Render an HTML <table> as a table in ``document``.

    Every ``<tr>`` becomes a row and every ``<th>`` / ``<td>`` a cell. A cell
    with ``colspan`` is merged over the grid columns it spans, and the next
    HTML cell continues after the merged range.

    Fixes over the previous version:
        * The number of columns is the widest row measured in grid columns
          (sum of the colspans), not the number of HTML cells.
        * The column position advances by the colspan, so cells that follow
          a merged cell are no longer written into the cell that the merge
          swallowed.
        * A table without rows or cells is skipped instead of raising
          ``ValueError`` from ``max()`` on an empty list.

    ``rowspan`` is not handled.
    NOTE(review): rows and cells are collected with ``find_all``, which also
    returns those of nested tables - confirm whether nested tables occur.

    Args:
        document: Target document; the table is added to it.
        table_node: The ``<table>`` element.
    """
    html_rows = [html_row.find_all(["th", "td"])
                 for html_row in table_node.find_all("tr")]
    row_widths = [sum(_table_cell_colspan(html_cell) for html_cell in html_cells)
                  for html_cells in html_rows]
    max_columns = max(row_widths) if row_widths else 0
    if max_columns == 0:
        return

    table = document.add_table(rows=len(html_rows), cols=max_columns)

    for row_index, html_cells in enumerate(html_rows):
        # Taken before any merge in this row so that indexes keep referring
        # to grid columns.
        row_cells = table.rows[row_index].cells

        column_index = 0
        for html_cell in html_cells:
            span = _table_cell_colspan(html_cell)
            table_cell = row_cells[column_index]
            process_table_cell(document, html_cell, table_cell, [], True)
            if span > 1:
                table_cell.merge(row_cells[column_index + span - 1])
            column_index += span

def process_table_cell(document, node, cell, styles=None, skip_newlines=False):
    """Render the content of an HTML table cell into a document table cell.

    Text is appended to the last paragraph of ``cell``, formulas are appended
    with ``process_math``, images get a paragraph of their own followed by an
    empty one, and any other element is processed recursively.

    Args:
        document: Target document, passed through to the helpers.
        node: The ``<td>`` / ``<th>`` element or one of its descendants.
        cell: The document table cell that receives the content.
        styles: Inherited style keywords; ``None`` means none.
        skip_newlines: When true, whitespace-only strings that are direct
            children of ``node`` are ignored. The flag is not passed on to
            nested elements.
    """
    # still not considering new paragraphs inside table_cell
    # https://github.com/python-openxml/python-docx/issues/216
    if styles is None:
        styles = []

    if node.get('style') is not None:
        # Build a new list so the caller's list is never modified.
        styles = styles + process_style(node['style'])

    if not cell.paragraphs:
        paragraph = cell.add_paragraph()
    else:
        paragraph = cell.paragraphs[-1]

    for child in node.children:
        if isinstance(child, NavigableString):
            if skip_newlines and not child.strip():
                continue

            process_string(child, document, node, paragraph, styles)
        elif child.name == 'math':
            process_math(child, document, paragraph)
        elif child.name == 'img':
            # The picture is added to a run of a fresh paragraph; the run
            # takes the place of the document argument of process_image.
            paragraph = cell.add_paragraph()
            run = paragraph.add_run()
            process_image(child, run, None, styles)
            # NOTE(review): this trailing paragraph is not assigned to
            # ``paragraph``, so later text of this node still goes into the
            # picture's paragraph - confirm that this is intended.
            cell.add_paragraph()
        else:
            process_table_cell(document, child, cell, styles)

def process_style(style_str=''):
    """Extract the supported style keywords from an inline CSS string.

    Recognised declarations (whitespace after the colon is optional):
        * ``font-size: <n>pt`` with n below 11  -> ``'small'``
        * ``font-weight: 700|800|900|bold|bolder`` -> ``'bold'``
        * ``font-style: oblique|italic`` -> ``'italic'``
        * ``text-decoration: underline`` -> ``'underline'``
        * ``text-align: left|right|center|justify`` -> that value

    The patterns are raw strings now; the former ``'(\\d+)pt'`` literal relied
    on an invalid escape sequence.

    Args:
        style_str: Value of a ``style`` attribute, or ``None``.

    Returns:
        List of keywords, in the order listed above.
    """
    import re
    default_size = 11
    styles = []

    if style_str is None:
        return styles

    size_match = re.search(r'font-size:\s*(\d+)pt', style_str)
    if size_match and int(size_match.group(1)) < default_size:
        styles.append('small')

    if re.search(r'font-weight:\s*(700|800|900|bold(er)?)', style_str):
        styles.append('bold')

    if re.search(r'font-style:\s*(oblique|italic)', style_str):
        styles.append('italic')

    if re.search(r'text-decoration:\s*underline', style_str):
        styles.append('underline')

    align_match = re.search(r'text-align:\s*(left|right|center|justify)', style_str)
    if align_match:
        styles.append(align_match.group(1))

    return styles


def delete_paragraph(paragraph):
    """Remove ``paragraph`` from the document by detaching its XML element.

    NOTE(review): the two attributes cleared at the end are set on the XML
    element itself, not on the ``paragraph`` wrapper - confirm the intent.
    """
    element = paragraph._element
    container = element.getparent()
    container.remove(element)
    element._p = None
    element._element = None

def remove_newlines(node):
    """Remove whitespace-only strings from the direct children of ``node``.

    The loop runs over a copy of ``node.contents``. Extracting an element
    while iterating the live list made the loop skip the element that
    followed every removal.

    Args:
        node: Element whose ``contents`` list is cleaned in place.
    """
    for element in list(node.contents):
        if isinstance(element, NavigableString) and element.strip() == '':
            element.extract()

def download_image(url, timeout=30):
    """Fetch ``url`` and return the response body as an in-memory stream.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server could block the conversion indefinitely.

    Returns:
        ``io.BytesIO`` holding the downloaded bytes. The HTTP status is not
        checked here; ``process_image`` handles data that turns out not to be
        an image.
    """
    # ``stream=True`` was dropped: the whole body is read via ``.content``.
    response = requests.get(url, timeout=timeout)
    return io.BytesIO(response.content)

def remove_table_border(table):
    """Hide the borders of every cell of ``table``.

    A ``w:tcBorders`` element whose top, left, bottom and right borders have
    ``w:val="nil"`` is appended to the properties of each cell.

    ``qn`` was used without being imported, so the function raised
    ``NameError`` on its first cell; it is imported locally now.

    Args:
        table: The document table to modify in place.
    """
    from docx.oxml.ns import qn
    from docx.oxml.shared import OxmlElement  # Necessary Import

    # https://github.com/python-openxml/python-docx/issues/433
    tbl = table._tbl  # get xml element in table
    for cell in tbl.iter_tcs():
        # tcPr element, in which the style of the borders is defined.
        # NOTE(review): assumes every cell already has a tcPr element.
        tcPr = cell.tcPr
        tcBorders = OxmlElement('w:tcBorders')

        for edge in ('w:top', 'w:left', 'w:bottom', 'w:right'):
            border = OxmlElement(edge)
            border.set(qn('w:val'), 'nil')
            tcBorders.append(border)

        tcPr.append(tcBorders)
Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

HTML 2 DOCX @ Python