This repository has been archived by the owner on Jan 9, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 17
/
htmodel.py
405 lines (341 loc) · 15.8 KB
/
htmodel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
import re
import html as _html
from html.entities import codepoint2name as _entities
from io import StringIO
import textwrap
class Element:
    """ One node of an HTML tree.

    An Element has a tag name, a dict of attributes, an optional dict of
    CSS style properties, and a content list whose items are either strings
    (text) or other Elements. """

    __slots__ = ['name', 'attrs', 'style', 'content']

    def __init__(self, name, attrs, style, content):
        """ Build an element.

        name - The tag name, a string.
        attrs - A dict of attributes, or None (stored as an empty dict).
        style - A dict of style properties, or None.
        content - An iterable (but not a string) of strings and Elements;
            it is copied into a fresh list.
        """
        assert isinstance(name, str)
        assert attrs is None or isinstance(attrs, dict)
        assert style is None or isinstance(style, dict)
        # A bare string is iterable and would silently be split into
        # characters by list(), so reject it up front.
        assert not isinstance(content, str)

        self.name = name
        self.attrs = attrs if attrs else {}
        self.style = style
        self.content = list(content)
        assert all(isinstance(item, (str, Element)) for item in self.content)

    def is_block(self):
        """ True if this is a block element.

        Block elements have the property that extra whitespace before or
        after either the start tag or the end tag is ignored. Some block
        elements (like <section>) are expected to hold only other blocks;
        others (like <p>) hold inline elements and text. """
        kind = _tag_data[self.name][0]
        return kind == 'B'

    def to_html(self):
        """ Return this element serialized as an HTML string (non-strict mode). """
        buf = StringIO()
        write_html(buf, self, strict=False)
        return buf.getvalue()

    __repr__ = to_html

    def kids(self, name=None):
        """ Iterate over (index, child) pairs for the child Elements of self.

        Text children are skipped. If name is given, only children with that
        tag name are produced. It is safe to mutate self.content to the
        right of the index most recently yielded while iterating. """
        for index, child in enumerate(self.content):
            if not isinstance(child, Element):
                continue
            if name is not None and child.name != name:
                continue
            yield index, child

    def with_content(self, replaced_content):
        """ Return a new Element like self but holding replaced_content. """
        return Element(self.name, self.attrs, self.style, replaced_content)

    def with_content_slice(self, start, stop, replaced_content):
        """ Return a new Element like self with content[start:stop] replaced. """
        new_content = list(self.content)
        new_content[start:stop] = replaced_content
        return self.with_content(new_content)

    def with_(self, name=None, attrs=None, style=None, content=None):
        """ Return a new Element like self, overriding any field that is not None. """
        return Element(
            self.name if name is None else name,
            self.attrs if attrs is None else attrs,
            self.style if style is None else style,
            self.content if content is None else content)

    def find(self, matcher):
        """ Iterate over the elements in this tree that matcher accepts.

        matcher maps an Element to True (yield it), False (do not yield it,
        but still search below it), or None (skip it and everything below
        it). Descendants are produced before the element that contains them. """
        verdict = matcher(self)
        assert verdict is True or verdict is False or verdict is None
        if verdict is None:
            return
        for child in self.content:
            if isinstance(child, Element):
                yield from child.find(matcher)
        if verdict:
            yield self

    def find_replace(self, matcher, replacement):
        """ A sort of map() on htmodel content.

        self - An Element to transform.
        matcher - A function mapping an Element to True, False, or None.
        replacement - A function mapping an Element to a content list.

        Walk the whole tree under self. Wherever matcher(e) is True, the
        element (with its descendants already transformed) is passed to
        replacement and the returned content takes its place.

        Where matcher(e) is False or None, replacement is not called and the
        element stays in the result. False means e's descendants are still
        visited; None means they are skipped and left exactly as they were.

        Raise ValueError if the root ends up replaced by anything other than
        exactly one Element.

        self is left unmodified, but the result is not a deep copy: it may
        be self, or an Element that shares subtrees with self.
        """
        def transform(elt):
            # Returns the list of content items that stand in for elt.
            verdict = matcher(elt)
            assert verdict is True or verdict is False or verdict is None
            if verdict is None:
                return [elt]
            new_content = transform_all(elt.content)
            # Reuse elt itself when nothing beneath it changed.
            rebuilt = elt if new_content is elt.content else elt.with_content(new_content)
            return list(replacement(rebuilt)) if verdict else [rebuilt]

        def transform_all(children):
            # Returns the very same list object when no child was altered,
            # so that the caller can detect "unchanged" by identity.
            out = []
            dirty = False
            for child in children:
                if isinstance(child, str):
                    out.append(child)
                    continue
                pieces = transform(child)
                if not dirty and pieces != [child]:
                    dirty = True
                out.extend(pieces)
            return out if dirty else children

        pieces = transform(self)
        if len(pieces) != 1:
            raise ValueError("replaced root element with {} pieces of content".format(len(pieces)))
        root = pieces[0]
        if not isinstance(root, Element):
            raise ValueError("replaced root element with non-element content")
        return root

    def replace(self, name, replacement):
        """ Replace every element with the given tag name.

        self - An Element to transform.
        name - A tag name (string).
        replacement - A function taking one Element and returning a content
            list (a list of Elements and/or strings).

        This is find_replace() with a matcher that accepts exactly the
        elements whose name equals the given name; see find_replace() for
        the sharing and ValueError behavior.
        """
        assert isinstance(name, str)  # detect common bug

        def has_name(e):
            return e.name == name

        return self.find_replace(has_name, replacement)
def escape(s, quote=False):
    """ Return the string s escaped for HTML output.

    The HTML-special characters (& < >) are escaped, and so is every
    character other than newline and printable ASCII (space through ~).
    If 'quote' is true, the quote characters (' ") are escaped as well.

    A named character entity reference is used when one exists for the
    code point; otherwise a hexadecimal numeric reference is used.

    The output is encoded as UTF-8 anyway, so escaping non-ASCII text is
    not strictly required; it is done for robustness against broken tools.
    """
    def to_reference(match):
        codepoint = ord(match.group(0))
        entity_name = _entities.get(codepoint)
        if entity_name is not None:
            return '&' + entity_name + ';'
        return '&#x{:x};'.format(codepoint)

    # The stdlib handles & > < (and ' " when quote is true).
    markup_safe = _html.escape(s, quote)

    # What remains is anything outside newline + printable ASCII.
    return re.sub('[^\n -~]', to_reference, markup_safe)
# Presumably the tags that never have content or an end tag.
# NOTE(review): nothing else in the code visible here reads this set --
# confirm whether it is still needed.
empty_tags = {'meta', 'br', 'hr', 'link'}
# Tags whose block children write_html() emits at the parent's own
# indentation instead of one level deeper.
non_indenting_tags = {'html', 'body'}
def save_html(filename, ht, strict=True):
    """ Write the document ht to the file named filename, as UTF-8.

    filename - Path of the file to create or overwrite.
    ht - The root Element; its name must be 'html'.
    strict - Passed through to write_html().

    The output starts with an HTML5 doctype line, followed by the
    serialized tree.
    """
    assert ht.name == 'html'
    with open(filename, 'w', encoding='utf-8') as out:
        out.write("<!doctype html>\n")
        write_html(out, ht, strict=strict)
# Placeholder for space characters inside attribute values. When wrap_hack
# is true, write_html() substitutes this character for each space in an
# attribute value, so that a big chunk of HTML code can be word-wrapped
# without fear of breaking a line inside an attribute; after wrapping, the
# placeholder is turned back into a real space. The character is in the
# Unicode Private Use area.
NOT_A_SPACE = '\uF123'
def write_html(f, ht, indent='', strict=True, wrap_hack=False):
    """ Serialize the tree ht as HTML text and write it to f.

    f - A text file-like object with a write() method.
    ht - An Element, or (only when strict is false) a string of text.
    indent - The whitespace prefix for lines written at this nesting level.
    strict - If true, raise ValueError when content violates the
        block/inline rules recorded in _tag_data (text directly inside a
        blocks-only element, a block inside inline content, or a mix of
        block and inline children).
    wrap_hack - If true, spaces in attribute values are written as
        NOT_A_SPACE. This is used internally while building text that is
        about to be word-wrapped; the placeholders are turned back into
        spaces before that text reaches the real output.

    Paragraphs, headings, and list items that start with inline content are
    word-wrapped to WIDTH columns. Other block elements are written with
    each block child on its own line(s); inline elements are written
    without added line breaks.
    """
    WIDTH = 130

    def htmlify(name):
        """ Convert a pythonified tag name or attribute name back to HTML. """
        if name.endswith('_'):
            name = name[:-1]
        name = name.replace('_', '-')
        return name

    def escape_attr_value(v, wrap_hack):
        """ Escape v for use inside a double-quoted attribute value. """
        ev = escape(v, True)
        if wrap_hack:
            ev = ev.replace(" ", NOT_A_SPACE)
        return ev

    def start_tag(ht, wrap_hack):
        """ Return the start tag for ht, attributes sorted by name, style last. """
        attrs = ''.join(' {0}="{1}"'.format(htmlify(k), escape_attr_value(v, wrap_hack))
                        for k, v in sorted(ht.attrs.items()))
        if ht.style:
            assert 'style' not in ht.attrs
            style = '; '.join(name + ": " + value for name, value in sorted(ht.style.items()))
            # The style text goes through the same escaping as every other
            # attribute value: otherwise a quote or '&' in a property value
            # would corrupt the tag, and under wrap_hack the spaces after
            # ':' and ';' would give the word-wrapper a place to break
            # inside the attribute.
            attrs += ' style="{0}"'.format(escape_attr_value(style, wrap_hack))
        return '<{0}{1}>'.format(ht.name, attrs)

    def is_ht_inline(ht):
        """ True if ht is text or an inline element. """
        return isinstance(ht, str) or _tag_data[ht.name][0] == 'I'

    def write_inline_content(f, content, indent, allow_last_block, strict, strict_blame, wrap_hack):
        """ Write a run of inline content (text and inline elements) to f.

        If allow_last_block is true and the final item is an ol, ul, table,
        or figure element, that item is written after the inline run on
        lines of its own. strict_blame is the enclosing element; it is only
        used to build the error message when strict checking fails.
        """
        if (allow_last_block
                and isinstance(content[-1], Element)
                and content[-1].name in ('ol', 'ul', 'table', 'figure')):
            last = content[-1]
            content = content[:-1]
        else:
            last = None

        for kid in content:
            if isinstance(kid, str):
                f.write(escape(kid))
            else:
                if strict and not is_ht_inline(kid):
                    raise ValueError("block element <{}> can't appear in inline content:\n".format(kid.name)
                                     + repr(strict_blame))
                write_html(f, kid, indent, strict, wrap_hack)
        if last is not None:
            f.write('\n')
            write_html(f, last, indent, strict, wrap_hack)
            # Indent the parent's end tag, which the caller writes next.
            # NOTE(review): this drops two characters from indent, while the
            # call sites in this function extend indent by the literal shown
            # at each call -- confirm that the widths agree.
            f.write(indent[:-2])

    # Bare text is only tolerated at this level in non-strict mode.
    if isinstance(ht, str):
        assert not strict
        f.write(escape(ht))
        return

    info = _tag_data[ht.name]
    content = ht.content
    assert info[1] != '0' or len(content) == 0  # empty tag is empty

    if (ht.name in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')
            or (ht.name == 'li' and ht.content and is_ht_inline(ht.content[0]))):
        # Word-wrap the inline content in this block element.

        # First, find out if this is an li element with a trailing list.
        if ht.name == 'li' and isinstance(ht.content[-1], Element) and ht.content[-1].name in ('ol', 'ul', 'table'):
            content_to_wrap = content[:-1]
            last_block = content[-1]
        else:
            content_to_wrap = content
            last_block = None

        # Dump content_to_wrap to a temporary buffer.
        # We are going to word-wrap this later, so use wrap_hack=True
        # which makes sure no spaces appear in attributes.
        tmpf = StringIO()
        tmpf.write(start_tag(ht, True))
        write_inline_content(tmpf, content_to_wrap, indent + ' ',
                             allow_last_block=ht.name == 'li', strict=strict, strict_blame=ht,
                             wrap_hack=True)
        if last_block is None:
            tmpf.write('</{}>'.format(ht.name))
        text = tmpf.getvalue()

        # Write the output to f.
        if '\n' in text:
            # This is unexpected; don't word-wrap. Write it verbatim, with newlines.
            text = text.replace(NOT_A_SPACE, ' ')
            f.write(indent + text + "\n")
        else:
            # The usual case. Word-wrap and write.
            subsequent_indent = indent
            if ht.name != 'p':
                subsequent_indent += ' '
            text = textwrap.fill(text, WIDTH, expand_tabs=False, replace_whitespace=False,
                                 initial_indent=indent, subsequent_indent=subsequent_indent,
                                 break_long_words=False, break_on_hyphens=False)
            # Now that word wrapping is done, we can change the hacked spaces
            # back to real spaces.
            text = text.replace(NOT_A_SPACE, ' ')
            f.write(text + "\n")

        # If we had a trailing block, dump it now (and the end tag we skipped before).
        if last_block:
            write_html(f, last_block, indent + ' ', strict=strict,
                       wrap_hack=wrap_hack)
            f.write(indent + "</{}>\n".format(ht.name))
    elif info[0] == 'B':
        # Block.
        f.write(indent + start_tag(ht, wrap_hack))
        if info != 'B0':
            if content:
                # The first child decides whether the content is treated as
                # an inline run or as a sequence of blocks.
                if is_ht_inline(content[0]):
                    if strict and info[1] not in 'i?s':
                        if isinstance(content[0], str):
                            raise ValueError("<{}> element may only contain tags, not text".format(ht.name))
                        else:
                            raise ValueError("<{}> element may not contain inline element <{}>".format(ht.name, content[0].name))
                    write_inline_content(f, content, indent + ' ', ht.name == 'li', strict, strict_blame=ht, wrap_hack=wrap_hack)
                else:
                    if strict and info[1] not in 'b?':
                        raise ValueError("<{}> element may not contain block element <{}>".format(ht.name, content[0].name))
                    inner_indent = indent
                    if ht.name not in non_indenting_tags:
                        inner_indent += ' '
                    f.write('\n')
                    prev_needs_space = False
                    first = True
                    for kid in content:
                        if strict and is_ht_inline(kid):
                            if isinstance(kid, str):
                                raise ValueError("<{}> element may contain either text or block content, not both".format(ht.name))
                            else:
                                raise ValueError("<{}> element may contain either blocks (like <{}>) "
                                                 "or inline content (like <{}>), not both".format(
                                                     ht.name, content[0].name, kid.name))
                        # Paragraph-like children, and containers that hold
                        # blocks themselves, are set off by a blank line.
                        needs_space = ((strict or isinstance(kid, Element))
                                       and (kid.name in {'p', 'figure', 'section'}
                                            or (kid.name in {'div', 'li', 'td'}
                                                and kid.content and not is_ht_inline(kid.content[0]))))
                        if not first and (prev_needs_space or needs_space):
                            f.write('\n')
                        write_html(f, kid, inner_indent, strict,
                                   wrap_hack=wrap_hack)
                        prev_needs_space = needs_space
                        first = False
                    f.write(indent)
            f.write('</{}>'.format(ht.name))
        f.write('\n')
    else:
        # Inline. Content must be inline too.
        assert info in ('Ii', 'I0')
        f.write(start_tag(ht, wrap_hack))
        if info != 'I0':
            write_inline_content(f, content, indent + ' ', False, strict, ht, wrap_hack)
            f.write('</{}>'.format(ht.name))
# Maps each known tag name to a two-character info code, filled in by
# _init() below. The first character is 'B' (block) or 'I' (inline). The
# second describes the permitted content: 'b' blocks only, 'i' inline only,
# '?' either, '0' nothing, and 's' for style/script (which write_html()
# accepts wherever it accepts inline content).
_tag_data = {}


def _init(v):
    """ Register every known tag.

    v - A namespace dict (the module's vars()). For each tag name, a
        constructor function of that name is stored in v, and the tag's
        info code is recorded in _tag_data.
    """
    # Every tag is one of:
    #
    # - like section, table, tr, ol, ul: block containing only blocks
    #
    # - like p, h1: block containing only inline content
    #
    # - like span, em, strong, i, b: inline containing only inline content
    #
    # - like li or td: block containing either block or inline content (or in the
    #   unique case of li, inline followed by a list)
    #
    # - like hr, img: block containing nothing
    #
    # - like br, wbr: inline containing nothing
    #
    # The table is spelled as explicit (info, names) pairs. Long name lists
    # are split with adjacent string literals that carry their own trailing
    # space, so no two names can run together at a line break (a
    # backslash-continued line inside one big string joins 'ol'+'ul' or
    # 'samp'+'kbd' unless the next line happens to start with whitespace).
    tag_groups = (
        ('Bb', 'html head body section hgroup table tbody thead tfoot tr ol '
               'ul blockquote dl figure'),
        ('Bi', 'title p h1 h2 h3 h4 h5 h6 address figcaption pre'),
        ('Ii', 'a em strong small s cite q dfn abbr data time code var samp '
               'kbd sub sup i b u mark bdi bdo span'),
        ('I0', 'br wbr'),
        ('B?', 'div li td th noscript object dt dd'),
        ('B0', 'meta link hr img'),
        ('Bs', 'style script'),
    )

    def element_constructor(name):
        """ Return a function that builds an Element with the given tag name. """
        def construct(*content, **attrs):
            # 'class' is a Python keyword, so callers spell it class_.
            if 'class_' in attrs:
                attrs['class'] = attrs.pop('class_')
            return Element(name, attrs, None, list(content))
        construct.__name__ = name
        return construct

    for info, names in tag_groups:
        for name in names.split():
            v[name] = element_constructor(name)
            _tag_data[name] = info


_init(vars())
# Public names: the Element class plus every tag name recorded in _tag_data.
__all__ = ['Element'] + list(_tag_data.keys())