Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Table styling #57

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 140 additions & 9 deletions html2docx/html2docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
from html.parser import HTMLParser
from typing import Any, Dict, Iterator, List, Optional, Tuple

import webcolors
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls
from docx.oxml.shared import OxmlElement, qn
from docx.shared import Pt, RGBColor
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from tinycss2 import parse_declaration_list
Expand Down Expand Up @@ -70,6 +74,9 @@ def __init__(self, title: str):
self.doc.core_properties.title = title
self.list_style: List[str] = []
self.href = ""
self.anchor = ""
self.style = ""
self.tag: Optional[str] = None
self._reset()

def _reset(self) -> None:
Expand All @@ -96,11 +103,53 @@ def init_p(self, attrs: List[Tuple[str, Optional[str]]]) -> None:
elif style_decl["name"] == "padding-left" and style_decl["unit"] == "px":
self.padding_left = Pt(style_decl["value"])

def init_table(self, attrs: List[Tuple[str, Optional[str]]]) -> None:
    """Start collecting a new table.

    Replaces ``self.table_data`` with an empty list of rows; each row later
    holds ``(text, style)`` tuples, one per cell. ``attrs`` (the attributes
    of the ``<table>`` tag) is accepted but not used.
    """
    fresh_rows: List[List[Tuple[str, str]]] = []
    self.table_data = fresh_rows

def finish_p(self) -> None:
    """Close the current paragraph.

    Trailing whitespace is trimmed from the active run, if there is one,
    and then ``self._reset()`` is called.
    """
    current_run = self.r
    if current_run is not None:
        current_run.text = current_run.text.rstrip()
    self._reset()

def finish_table(self) -> None:
    """Render the rows collected in ``self.table_data`` as a docx table.

    ``self.table_data`` is a list of rows, each a list of ``(text, style)``
    tuples. A leading row without cells is dropped and the table is then
    treated as having no header; otherwise the runs of the first row are
    made bold. Per cell, a ``background`` style declaration becomes a cell
    shading and a ``color`` declaration becomes the font colour of the
    cell's runs. Both values are looked up with the ``webcolors``
    ``name_to_*`` helpers, so they are expected to be CSS colour names.

    Nothing is added to the document when there are no rows or no cells.
    Rows may have different lengths: the table is as wide as the longest
    row and missing cells are left empty.
    """
    if not self.table_data:
        return

    # remove empty header
    header = True
    if not self.table_data[0]:
        del self.table_data[0]
        header = False
        if not self.table_data:
            # The empty row was the only one; there is nothing to render.
            return

    # create table
    rows = len(self.table_data)
    # Size by the widest row, so that a short or empty last row can neither
    # truncate the table nor make indexing into longer/shorter rows fail.
    cols = max(len(row_cells) for row_cells in self.table_data)
    if cols == 0:
        return
    table = self.doc.add_table(rows=rows, cols=cols)

    # copy data
    for row, row_cells in enumerate(self.table_data):
        for col, (text, style) in enumerate(row_cells):
            cell = table.cell(row, col)
            cell.text = text
            if style:
                for style_decl in style_to_css(style):
                    if style_decl["name"] == "background":
                        # name_to_hex returns "#rrggbb"; w:fill wants it without "#".
                        rgb = webcolors.name_to_hex(style_decl["value"])[1:]
                        shading = parse_xml(
                            r'<w:shd {} w:fill="{}"/>'.format(nsdecls("w"), rgb)
                        )
                        cell._tc.get_or_add_tcPr().append(shading)

                    elif style_decl["name"] == "color":
                        rgb = webcolors.name_to_rgb(style_decl["value"])
                        for p in cell.paragraphs:
                            for r in p.runs:
                                r.font.color.rgb = RGBColor(*rgb)

            if header and row == 0:
                for p in cell.paragraphs:
                    for r in p.runs:
                        r.font.bold = True
Comment on lines +148 to +151
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not think everybody wants to have their header row in bold style. However, it would be great to support font-weight as a style attribute or the <b> markup.


def init_run(self, attrs: List[Tuple[str, Any]]) -> None:
self.attrs.append(attrs)
if attrs:
Expand All @@ -124,7 +173,67 @@ def add_text(self, data: str) -> None:
for attrs in self.attrs:
for font_attr, value in attrs:
setattr(self.r.font, font_attr, value)
self.r.add_text(data)
if self.href:
self.add_hyperlink(self.href, data)
elif self.anchor:
self.add_bookmark(self.anchor, data)
else:
self.r.add_text(data)

def add_hyperlink(self, href: str, text: str) -> None:
    """Emit ``text`` as a link to ``href``.

    A target starting with ``#`` becomes a ``w:hyperlink`` element whose
    ``w:anchor`` is the target without the ``#``. It holds a single
    underlined run coloured ``000080`` and is appended to the current
    paragraph, after which ``self.r`` is cleared. Any other target is not
    turned into a real link yet: the URL is written after the text into the
    current run.
    """
    is_internal = href.startswith("#")
    if not is_internal:  # TODO external links
        # Keep a trailing space at the very end when the text had one.
        suffix = (href + " ") if text.endswith(" ") else (" " + href)
        if self.r:
            self.r.add_text(text + suffix)
        return

    # Run properties: link colour and single underline.
    color = OxmlElement("w:color")
    color.set(qn("w:val"), "000080")
    underline = OxmlElement("w:u")
    underline.set(qn("w:val"), "single")

    run_props = OxmlElement("w:rPr")
    run_props.append(color)
    run_props.append(underline)

    link_run = OxmlElement("w:r")
    link_run.append(run_props)
    link_run.text = text

    link = OxmlElement("w:hyperlink")
    link.set(qn("w:anchor"), href[1:])
    link.append(link_run)

    if self.p:
        self.p._p.append(link)
    self.r = None

def add_bookmark(self, anchor: str, text: str) -> None:
    """Wrap the current run in a bookmark named ``anchor`` and add its text.

    A ``w:bookmarkStart`` is inserted before the run and a ``w:bookmarkEnd``
    after it; the run then receives ``anchor + " " + text``. Nothing happens
    when there is no current run.

    Start and end markers are paired through ``w:id``, so every bookmark
    gets its own id from a per-instance counter (0, 1, 2, ...) rather than
    all of them sharing ``"0"``.
    """
    run = self.r
    if run is None:
        # No run to attach the bookmark markers to.
        return

    # The counter is created lazily so this method does not depend on
    # __init__ having declared it.
    bookmark_id = getattr(self, "_next_bookmark_id", 0)
    self._next_bookmark_id = bookmark_id + 1

    start = OxmlElement("w:bookmarkStart")
    start.set(qn("w:id"), str(bookmark_id))
    start.set(qn("w:name"), anchor)
    run._r.addprevious(start)

    end = OxmlElement("w:bookmarkEnd")
    end.set(qn("w:id"), str(bookmark_id))
    run._r.addnext(end)

    # Use the parameter, not self.anchor, so the visible text always matches
    # the bookmark name written above.
    run.add_text(anchor + " " + text)

def add_code(self, data: str) -> None:
    """Add ``data`` as code text, one stripped line at a time.

    Each line is stripped of surrounding whitespace and passed to
    ``add_text``; a break is added to the current run between consecutive
    lines, but not after the last one.
    """
    code_lines = data.splitlines()
    last_index = len(code_lines) - 1
    for index, code_line in enumerate(code_lines):
        self.add_text(code_line.strip())
        # self.r is read after add_text on every pass, never cached.
        if index != last_index and self.r:
            self.r.add_break()

def add_list_style(self, name: str) -> None:
self.finish_p()
Expand All @@ -149,8 +258,10 @@ def add_picture(self, attrs: List[Tuple[str, Optional[str]]]) -> None:
run.add_picture(image_buffer, **size)

def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
self.tag = tag
if tag == "a":
self.href = get_attr(attrs, "href")
self.anchor = get_attr(attrs, "id")
self.init_run([])
elif tag in ["b", "strong"]:
self.init_run([("bold", True)])
Expand Down Expand Up @@ -183,28 +294,48 @@ def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> N
self.init_run([("underline", True)])
elif tag == "ul":
self.add_list_style("List Bullet")
elif tag == "table":
self.init_table(attrs)
elif tag == "tr":
self.table_data.append([])
elif tag == "td":
styles = [b for a, b in attrs if a == "style" and b]
if styles:
self.style = styles[0]
Comment on lines +302 to +304
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can use the get_attr helper.


def handle_data(self, data: str) -> None:
# Text callback: text seen while self.tag is "style" is ignored, text seen
# while self.tag is "td"/"th" is stored as a (text, style) cell in the last
# row of self.table_data, and any other text is whitespace-normalised and
# added to the document as run text.
#
# NOTE(review): indentation was lost in this listing, and it appears to show
# removed and added diff lines together -- the href-appending branch and the
# first bare add_text call look like the pre-change code, superseded by
# add_hyperlink and the code/add_text branch at the end. Confirm against the
# actual file before relying on the statement order below.
if self.tag == "style":
return
elif self.tag in ("td", "th"):
# Cell text is only recorded once at least one row exists.
if self.table_data:
self.table_data[-1].append((data, self.style))
return
if not self.pre:
# Outside <pre>, whitespace matched by WHITESPACE_RE becomes one space.
data = re.sub(WHITESPACE_RE, " ", data)
if self.collapse_space:
data = data.lstrip()
if data:
if self.href:
if data.endswith(" "):
data += self.href + " "
else:
data += " " + self.href
self.href = ""
# Remember whether this chunk ended in a space; the flag is what triggers
# the lstrip above on the next chunk.
self.collapse_space = data.endswith(" ")
self.add_text(data)

if self.tag == "code":
self.add_code(data)
else:
self.add_text(data)

def handle_endtag(self, tag: str) -> None:
    """Handle a closing tag and clear the state tied to it.

    * Inline tags finish the current run; ``</a>`` also clears the pending
      ``href`` and ``anchor``.
    * ``</td>`` and ``</tr>`` clear the pending cell style.
    * Block tags finish the current paragraph; ``</ol>``/``</ul>`` also pop
      the innermost list style and ``</pre>`` switches ``self.pre`` off.
    * ``</table>`` renders the collected table.

    ``self.tag`` is reset to ``None`` in every case.
    """
    if tag in ["a", "b", "code", "em", "i", "span", "strong", "sub", "sup", "u"]:
        self.finish_run()
        if tag == "a":
            self.href = ""
            self.anchor = ""
    elif tag in ["td", "tr"]:
        self.style = ""
    elif tag in ["h1", "h2", "h3", "h4", "h5", "h6", "li", "ol", "p", "pre", "ul"]:
        self.finish_p()
        if tag in ["ol", "ul"]:
            # A closing list tag without a matching opening tag must not
            # abort the conversion with an IndexError.
            if self.list_style:
                del self.list_style[-1]
        elif tag == "pre":
            self.pre = False
    elif tag == "table":
        self.finish_table()
    self.tag = None
5 changes: 5 additions & 0 deletions tests/data/code-multiline.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<pre><code>
value = get_value(arg)
do_something(value)
return
</code></pre>
11 changes: 11 additions & 0 deletions tests/data/code-multiline.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[
{
"text": "value = get_value(arg)\ndo_something(value)\nreturn",
"runs": [
{
"text": "value = get_value(arg)\ndo_something(value)\nreturn",
"name": "Mono"
}
]
}
]
1 change: 1 addition & 0 deletions tests/data/style.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<style>ignore this</style>
1 change: 1 addition & 0 deletions tests/data/style.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[]
15 changes: 15 additions & 0 deletions tests/links.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<p>
<a href="#1.1">link to bookmark</a>
</p>

<p>some text</p>
<p>some text</p>
<p>some text</p>
<p>some text</p>
<p>some text</p>

<p>
<a id="1.1"><h1>Bookmark<h1></a>
</p>

<p>more text</p>
5 changes: 5 additions & 0 deletions tests/table.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<table>
<tr><th>column1</th><th>column2</th></tr>
<tr><td>1</td><td>2</td></tr>
<tr><td>3</td><td>4</td></tr>
</table>
51 changes: 51 additions & 0 deletions tests/test_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import os

import docx

from html2docx import html2docx

from .utils import TEST_DIR


def strip_ns(x):
    """Return ``x`` without everything up to and including its first ``}``.

    A string without ``}`` is returned unchanged.
    """
    return x.split("}", 1)[-1]


def attrib(d, key):
    """Return the value of the first entry of ``d`` whose name ends in ``"}" + key``.

    Returns ``None`` when no entry matches.
    """
    suffix = "}" + key
    return next(
        (value for name, value in d.items() if name.endswith(suffix)),
        None,
    )


def test_links():
    """Check that ``#`` links become hyperlinks and ``<a id>`` becomes a bookmark."""
    html_path = os.path.join(TEST_DIR, "links.html")
    # Read through a context manager so the fixture's file handle is closed.
    with open(html_path, encoding="utf-8") as html_file:
        html = html_file.read()
    buf = html2docx(html, title="links")

    doc = docx.Document(buf)

    assert len(doc.paragraphs) == 9

    # check hyperlink
    hyperlink = doc.paragraphs[0]._p[1]
    assert strip_ns(hyperlink.tag) == "hyperlink"
    assert attrib(hyperlink.attrib, "anchor") == "1.1"

    # Iterating an element yields its children; getchildren() is deprecated.
    children = list(hyperlink)
    assert len(children) == 1

    wR = children[0]
    assert strip_ns(wR.tag) == "r"
    assert wR.text == "link to bookmark"

    # check bookmark: start marker, the run itself, end marker
    bookmark_start = doc.paragraphs[6]._p[1]
    assert strip_ns(bookmark_start.tag) == "bookmarkStart"

    bookmark_run = doc.paragraphs[6]._p[2]
    assert strip_ns(bookmark_run.tag) == "r"
    assert bookmark_run.text == "1.1 Bookmark"

    bookmark_end = doc.paragraphs[6]._p[3]
    assert strip_ns(bookmark_end.tag) == "bookmarkEnd"
31 changes: 31 additions & 0 deletions tests/test_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import os

import docx

from html2docx import html2docx

from .utils import TEST_DIR


def test_table():
    """Check that an HTML table is converted to a docx table with the same cells."""
    html_path = os.path.join(TEST_DIR, "table.html")
    # Read through a context manager so the fixture's file handle is closed.
    with open(html_path, encoding="utf-8") as html_file:
        html = html_file.read()
    buf = html2docx(html, title="table")

    doc = docx.Document(buf)

    assert len(doc.tables) == 1
    table = doc.tables[0]

    assert len(table.rows) == 3
    assert len(table.columns) == 2

    contents = [
        ("column1", "column2"),
        ("1", "2"),
        ("3", "4"),
    ]

    for r, row in enumerate(contents):
        for c, text in enumerate(row):
            assert table.cell(r, c).text == text
3 changes: 2 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ commands = pytest {posargs}
deps =
Pillow
pytest
webcolors

[testenv:black]
commands = black --target-version=py36 --check --diff .
Expand All @@ -24,7 +25,7 @@ deps = flake8
skip_install = true

[testenv:isort]
commands = isort --recursive --check-only --diff
commands = isort --check-only --diff --verbose .
deps = isort
skip_install = true

Expand Down