2 # Copyright 2013-2015 Bronto Software, Inc. and contributors
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
8 # http://www.apache.org/licenses/LICENSE-2.0
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
17 from __future__ import unicode_literals
18 from builtins import str
23 from xml.sax.saxutils import escape as html_escape
24 from bs4 import BeautifulSoup
# One table cell: its tag type ('td' or 'th'), its rowspan and colspan, and
# its already-rendered text contents.
Cell = collections.namedtuple(
    typename='Cell',
    field_names=['type', 'rowspan', 'colspan', 'contents'],
)
28 class Converter(object):
# Set up per-converter state and pre-compile the regular expressions that the
# other methods look up on `self`.
# NOTE(review): `convert` reads `self._parser`, but no assignment to it is
# visible in this block - confirm `parser` is stored on a line not shown here.
29 def __init__(self, parser):
# Starts empty; `_process` adds the name of every tag it has no dedicated
# branch for.
30 self._unknown_tags = set()
# A '..' line surrounded by blank lines. No use is visible here; presumably
# an empty reST comment used to terminate the preceding block - confirm.
31 self._clear = '\n\n..\n\n'
# Opening `<a name=...>` tag, with or without quotes; group 1 is the name.
# Used by `_preprocess_close_anchor_tags`.
34 self._preprocess_anchors = re.compile(r'<a\s+name\s*=\s*["\']?(.+?)["\']?\s*>')
# Lines consisting only of whitespace (MULTILINE makes ^/$ match per line).
35 self._post_process_empty_lines = re.compile(r'^\s+$', re.MULTILINE)
# Runs of three or more newlines; `convert` replaces them with two.
36 self._post_process_compress_lines = re.compile(r'\n{3,}')
# NOTE(review): `\s` already matches `\n`, so this pattern and `_whitespace`
# below match exactly the same text - confirm whether one of them was meant
# to leave newlines alone.
37 self._whitespace_with_newline = re.compile(r'[\s\n]+')
38 self._whitespace = re.compile(r'\s+')
# Shortest possible `<...>` span, i.e. a single tag.
39 self._html_tag = re.compile(r'<.*?>')
# One of the entities nbsp/lt/gt/amp followed by any character other than
# ';' (group 2). `[^;]` already covers `\n`, so the `|[\n]` alternative is
# redundant. Because a following character is required, an unterminated
# entity at the very end of the input is not matched.
41 self._preprocess_entity = re.compile(r'&(nbsp|lt|gt|amp)([^;]|[\n])')
44 # --------------------------------------------------------------------------
45 # ---- reST Utility Methods ----
# Return `s` as a `unicode` object; the visible fallback decodes `s` as UTF-8.
# NOTE(review): the branch taken when `s` already is `unicode` is not visible
# here (presumably it returns `s` unchanged - confirm).
# NOTE(review): `unicode` is a Python 2 builtin and nothing visible imports or
# defines it, so calling this on Python 3 would raise NameError - confirm
# whether the method is still used.
47 def _unicode(self, s):
48 if isinstance(s, unicode):
51 return unicode(s, 'utf8')
53 def _separate(self, s):
54 return u'\n\n' + s + u'\n\n'
56 def _escape_inline(self, s):
57 return '\\ ' + s + '\\ '
# Wrap `s` in the inline-markup delimiter `tag` on both sides (`_process`
# passes '**', '*' and '``') and escape the result with `_escape_inline`.
# NOTE(review): the statements between the comment below and the final
# return are not visible here; going by that comment, multi-line input is
# presumably returned without markup - confirm.
59 def _inline(self, tag, s):
60 # Seems fishy if our inline markup spans lines. We will instead just return
# Surrounding whitespace is stripped so the delimiters touch the text.
70 return self._escape_inline(tag + s.strip() + tag)
# Render an interpreted-text role, escaped with `_escape_inline`:
# ':role:`label <s>`' in the first form, ':role:`s`' in the second.
# NOTE(review): the condition selecting between the two returns is not
# visible here; presumably it tests whether `label` was given - confirm.
72 def _role(self, role, s, label=None):
74 return self._escape_inline(':%s:`%s <%s>`' % (role, label, s))
76 return self._escape_inline(':%s:`%s`' % (role, s))
# Render a reST directive: a '.. <directive>::' header surrounded by blank
# lines, followed by `body` re-indented with `_left_justify(body, 3)`.
# NOTE(review): `body` defaults to None, and `_left_justify` calls `s.split`
# on its argument, so the no-body case must be handled on lines that are not
# visible here - confirm.
78 def _directive(self, directive, body=None):
79 header = '\n\n.. %s::\n\n' % (directive,)
82 return header + self._left_justify(body, 3) + '\n\n'
86 def _hyperlink(self, target, label):
87 return self._escape_inline('`%s <%s>`_' % (label, target))
def _listing(self, marker, items):
    """Render ``items`` as a reST list whose entries start with ``marker``.

    Each item is first shifted by ``_left_justify`` to an indent of
    ``len(marker) + 1``; the first ``len(marker)`` characters of the
    result are then replaced by the marker itself. The joined entries are
    returned after a separated '..' line, both wrapped by ``_separate``.
    """
    marker_width = len(marker)
    entries = []
    for item in items:
        justified = self._left_justify(item, marker_width + 1)
        entries.append(marker + justified[marker_width:])
    return self._separate('..') + self._separate('\n'.join(entries))
# Shift the lines of `s` so that the least-indented non-empty line ends up
# with `indent` leading characters; the result is re-joined with '\n'.
# NOTE(review): the conditions choosing between the two returns, the
# definition of `prefix`, and any guard for input with no non-empty lines
# (where `min(indents)` would raise ValueError) are not visible here -
# confirm.
94 def _left_justify(self, s, indent=0):
# Trailing whitespace is dropped from every line.
95 lines = [l.rstrip() for l in s.split('\n')]
# Leading-whitespace width of each non-empty line.
96 indents = [len(l) - len(l.lstrip()) for l in lines if l]
# Positive: lines must move right; negative: lines must move left.
101 shift = indent - min(indents)
# Presumably the `shift < 0` case: cut `-shift` leading characters.
104 return '\n'.join(l[-shift:] for l in lines)
# Presumably the `shift >= 0` case: prepend `prefix` to every line.
107 return '\n'.join(prefix + l for l in lines)
# Replace every run of whitespace in `s` with `replace`, using one of the two
# patterns compiled in `__init__`.
# NOTE(review): the test on `newlines` is not visible here; presumably a true
# value selects `_whitespace_with_newline` - confirm.
# NOTE(review): the two patterns (r'[\s\n]+' and r'\s+') match the same text
# because `\s` already includes `\n`, so `newlines` appears to have no effect
# on the result - confirm the intent.
109 def _compress_whitespace(self, s, replace=' ', newlines=True):
111 return self._whitespace_with_newline.sub(replace, s)
113 return self._whitespace.sub(replace, s)
115 # --------------------------------------------------------------------------
116 # ---- DOM Tree Processing ----
# Collect the cells of every <tr> found under `table` as `Cell` tuples.
# Only children named 'td' or 'th' produce a cell. `rowspan` and `colspan`
# default to 1 and are passed through int(), so a non-numeric attribute
# value raises ValueError. Cell contents are rendered with
# `_process_children` and stripped; <th> cells in any row after the first
# are additionally wrapped in '**' by `_inline`.
# NOTE(review): `find_all('tr')` is called without `recursive=False`, unlike
# the <li> lookups in `_process`; presumably it therefore also returns rows
# of nested tables - confirm that is intended.
# NOTE(review): the end of the docstring, the statement that skips non-cell
# children, the per-row list setup and the return are not visible here.
118 def _process_table_cells(self, table):
119 """ Compile all the table cells.
121 Returns a list of rows. The rows may have different lengths because of
128 for i, tr in enumerate(table.find_all('tr')):
131 for c in tr.contents:
132 cell_type = getattr(c, 'name', None)
134 if cell_type not in ('td', 'th'):
137 rowspan = int(c.attrs.get('rowspan', 1))
138 colspan = int(c.attrs.get('colspan', 1))
139 contents = self._process_children(c).strip()
141 if cell_type == 'th' and i > 0:
142 contents = self._inline('**', contents)
144 row.append(Cell(cell_type, rowspan, colspan, contents))
# Render a table node as text: rows of cell text separated by '+---+' lines,
# with a '+===+' line after an all-<th> first row (this looks like a reST
# grid table - confirm against the lines not visible here).
# NOTE(review): several statements are not visible here, including the loops
# that bind `row`, `cell`, `c` and `j`, the setup of `lines` and `line`, and
# any guard for an empty `rows` (where the `max(...)` below would raise
# ValueError).
# NOTE(review): no visible line reads `rowspan`; confirm row spans are
# intentionally ignored.
150 def _process_table(self, node):
151 rows = self._process_table_cells(node)
# The table is as wide as its widest row, counting column spans.
156 table_num_columns = max(sum(c.colspan for c in row) for row in rows)
161 row_num_columns = sum(c.colspan for c in row)
# Pad a short row with one empty cell spanning the missing columns; it
# takes the type of the row's last cell, or 'td' for an empty row.
163 if row_num_columns < table_num_columns:
164 cell_type = row[-1].type if row else 'td'
165 row.append(Cell(cell_type, 1, table_num_columns - row_num_columns, ''))
167 col_widths = [0] * table_num_columns
168 row_heights = [0] * len(rows)
# Measure pass: grow column widths and row heights to fit every cell.
170 for i, row in enumerate(rows):
173 current_w = sum(col_widths[j:j + cell.colspan])
# `split` always yields at least one element, so this `max` is safe.
174 required_w = max(len(l) for l in cell.contents.split('\n'))
# Spread the missing width over the spanned columns: each later column
# gets the integer share, the first column also takes the remainder.
176 if required_w > current_w:
177 additional = required_w - current_w
178 col_widths[j] += additional - (cell.colspan - 1) * (additional // cell.colspan)
179 for jj in range(j + 1, j + cell.colspan):
180 col_widths[jj] += (additional // cell.colspan)
182 current_h = row_heights[i]
183 required_h = len(cell.contents.split('\n'))
185 if required_h > current_h:
186 row_heights[i] = required_h
# Each column occupies its width plus two characters between '+' marks.
190 row_sep = '+' + '+'.join('-' * (l + 2) for l in col_widths) + '+'
191 header_sep = '+' + '+'.join('=' * (l + 2) for l in col_widths) + '+'
# Render pass: one output line per text line of each row.
194 for i, row in enumerate(rows):
195 for y in range(0, row_heights[i]):
# Each spanned column contributes its width plus 3 (matching `row_sep`);
# the 2 subtracted are presumably this cell's own border and padding,
# added on lines not visible here - confirm.
199 w = sum(n + 3 for n in col_widths[j:j+c.colspan]) - 2
# Cells shorter than the row are padded with blank lines.
203 cell_lines = c.contents.split('\n')
204 content = cell_lines[y] if y < len(cell_lines) else ''
205 line.append(content.ljust(w))
210 lines.append(''.join(line))
# Presumably the two appends below are the branches of this test (the
# `else:` is not visible): '=' separator after a header row, '-' otherwise.
212 if i == 0 and all(c.type == 'th' for c in row):
213 lines.append(header_sep)
215 lines.append(row_sep)
217 return self._separate('\n'.join(lines))
# Render every entry of `node.contents` with `_process` and return the
# concatenation of the collected parts.
# NOTE(review): the setup of `parts` and the statements between the
# `_process` call and the `is_newline` update are not visible here;
# `is_newline` records whether the latest part ended with '\n', presumably
# to adjust leading whitespace of the next part - confirm.
219 def _process_children(self, node):
223 for c in node.contents:
224 part = self._process(c)
231 is_newline = part.endswith('\n')
233 return ''.join(parts)
235 def _process_text(self, node):
236 return ''.join(node.strings)
# Convert one parse-tree node to reST text.
# Plain strings get their whitespace compressed. Tags listed in `simple_tags`
# are rendered from their text content only (`_process_text`), so markup
# nested inside them is flattened. <pre>, <a>, <ul>, <ol>, <li> and <table>
# have dedicated branches. Any other tag name is recorded in
# `self._unknown_tags` and its children are processed as usual.
# NOTE(review): several statements are not visible here, including the
# `simple_tags = {` opener, the conditions guarding the paragraph-style
# return and the <a> branch, and most of the <li> branch.
238 def _process(self, node):
# `str` is the name imported from `builtins` at the top of the file.
239 if isinstance(node, str):
240 return self._compress_whitespace(node)
# Headings h1-h6 are rendered as bold inline text, not as section titles.
243 'b' : lambda s: self._inline('**', s),
244 'strong' : lambda s: self._inline('**', s),
245 'i' : lambda s: self._inline('*', s),
246 'em' : lambda s: self._inline('*', s),
247 'tt' : lambda s: self._inline('``', s),
248 'code' : lambda s: self._inline('``', s),
249 'h1' : lambda s: self._inline('**', s),
250 'h2' : lambda s: self._inline('**', s),
251 'h3' : lambda s: self._inline('**', s),
252 'h4' : lambda s: self._inline('**', s),
253 'h5' : lambda s: self._inline('**', s),
254 'h6' : lambda s: self._inline('**', s),
255 'sub' : lambda s: self._role('sub', s),
256 'sup' : lambda s: self._role('sup', s),
# <hr> produces only blank lines (`_separate('')`).
257 'hr' : lambda s: self._separate('') # Transitions not allowed
260 if node.name in simple_tags:
261 return simple_tags[node.name](self._process_text(node))
# Block-style output: processed children, stripped, between blank lines.
264 return self._separate(self._process_children(node).strip())
266 if node.name == 'pre':
267 return self._directive('parsed-literal', self._process_text(node))
# Anchors: a `name` attribute becomes a reST target ('.. _name:'); an
# `href` becomes a role or a hyperlink depending on its first character.
270 if 'name' in node.attrs:
271 return self._separate('.. _' + node['name'] + ':')
272 elif 'href' in node.attrs:
273 target = node['href']
274 label = self._compress_whitespace(self._process_text(node).strip('\n'))
# '#fragment' links become :ref: roles.
276 if target.startswith('#'):
277 return self._role('ref', target[1:], label)
# '@target' links (the form emitted by `_preprocess_replace_javadoc_link`)
# become :java:ref: roles.
278 elif target.startswith('@'):
279 return self._role('java:ref', target[1:], label)
281 return self._hyperlink(target, label)
# Lists: only direct <li> children are rendered as items.
283 if node.name == 'ul':
284 items = [self._process(n) for n in node.find_all('li', recursive=False)]
285 return self._listing('*', items)
287 if node.name == 'ol':
288 items = [self._process(n) for n in node.find_all('li', recursive=False)]
289 return self._listing('#.', items)
291 if node.name == 'li':
292 s = self._process_children(node)
295 # If it's multiline clear the end to correctly support nested lists
301 if node.name == 'table':
302 return self._process_table(node)
# Fallback: remember the unhandled tag name, then render its children.
304 self._unknown_tags.add(node.name)
306 return self._process_children(node)
308 # --------------------------------------------------------------------------
309 # ---- HTML Preprocessing ----
# Replace each occurrence of an inline Javadoc tag named `tag` in `s` with
# `f(contents)`, where `contents` is the stripped text between the tag opener
# and its balancing '}'. Returns the rebuilt string.
# NOTE(review): the definition of `start`, the search loop that binds `i`,
# the setup of `parts` and the try/except around the brace scan are not
# visible here.
311 def _preprocess_inline_javadoc_replace(self, tag, f, s):
315 start_length = len(start)
323 # Find a closing bracket such that the brackets are balanced between
324 # them. This is necessary since code examples containing { and } are
325 # commonly wrapped in {@code ...} tags
# `j` is one past the candidate closing '}'.
# NOTE(review): when no '}' follows, `find` returns -1 and `j` becomes 0;
# both counts over the empty range are then 0, so the loop below is
# skipped - confirm this case is handled on the lines not visible here.
328 j = s.find('}', i + start_length) + 1
329 while s.count('{', i, j) != s.count('}', i, j):
# `index` raises ValueError when no further '}' exists; presumably that
# is what is turned into the error below - confirm.
330 j = s.index('}', j) + 1
332 raise ValueError('Unbalanced {} brackets in ' + tag + ' tag')
# `j - 1` excludes the closing '}' itself.
334 parts.append(f(s[i + start_length:j - 1].strip()))
339 return ''.join(parts)
# Turn the contents of an inline Javadoc link tag into
# '<a href="@target">label</a>'. The '@' prefix is what `_process` later
# recognises and maps to the `java:ref` role.
# NOTE(review): the statements that bind `i`, `target` and `label` are not
# visible here.
341 def _preprocess_replace_javadoc_link(self, s):
342 s = self._compress_whitespace(s)
# Advance `i` from space to space until the parentheses before it balance,
# presumably so spaces inside a method signature do not split the target
# from the label.
# NOTE(review): `find` returns -1 when no space is left; confirm the lines
# not visible here guard against parentheses that never balance.
352 while s.count('(', 0, i) != s.count(')', 0, i):
353 i = s.find(' ', i + 1)
# Javadoc's 'Class#member' separator becomes '.', and all spaces are removed.
365 target = target.replace('#', '.').replace(' ', '').strip()
367 # Strip HTML tags from the target
368 target = self._html_tag.sub('', target)
370 label = label.strip()
372 return '<a href="@%s">%s</a>' % (target, label)
374 def _preprocess_close_anchor_tags(self, s):
375 # Add closing tags to all anchors so they are better handled by the parser
376 return self._preprocess_anchors.sub(r'<a name="\1"></a>', s)
378 def _preprocess_fix_entities(self, s):
379 return self._preprocess_entity.sub(r'&\1;\2', s)
# Rewrite Javadoc-specific markup in `s_html` into plain HTML before it is
# parsed. The inline tags are replaced in the order written below, then
# anchors are closed and unterminated entities repaired.
# NOTE(review): the return statement is not visible here; presumably the
# final `s_html` is returned - confirm.
381 def _preprocess(self, s_html):
# Builds a replacer that HTML-escapes the tag contents (`html_escape` is
# `xml.sax.saxutils.escape`) and wraps them in <t>...</t>.
382 to_tag = lambda t: lambda m: '<%s>%s</%s>' % (t, html_escape(m), t)
383 s_html = self._preprocess_inline_javadoc_replace('code', to_tag('code'), s_html)
384 s_html = self._preprocess_inline_javadoc_replace('literal', to_tag('span'), s_html)
# 'docRoot' occurrences are replaced by the empty string.
385 s_html = self._preprocess_inline_javadoc_replace('docRoot', lambda m: '', s_html)
386 s_html = self._preprocess_inline_javadoc_replace('linkplain', self._preprocess_replace_javadoc_link, s_html)
387 s_html = self._preprocess_inline_javadoc_replace('link', self._preprocess_replace_javadoc_link, s_html)
389 # Make sure all anchor tags are closed
390 s_html = self._preprocess_close_anchor_tags(s_html)
392 # Fix up some entities without closing ;
393 s_html = self._preprocess_fix_entities(s_html)
397 # --------------------------------------------------------------------------
398 # ---- Conversion entry point ----
# Convert an HTML/Javadoc fragment to reST text.
# The input is decoded as UTF-8 when it is not already `str`, run through
# `_preprocess`, parsed with BeautifulSoup using `self._parser`, rendered
# with `_process_children`, and finally tidied: whitespace-only lines are
# emptied, runs of three or more newlines collapse to two, and the result
# is stripped.
# NOTE(review): the body of the empty-input branch, the binding of `top` and
# the return statement are not visible here - confirm against the full file.
400 def convert(self, s_html):
# `str` is the name imported from `builtins` at the top of the file.
401 if not isinstance(s_html, str):
402 s_html = str(s_html, 'utf8')
404 s_html = self._preprocess(s_html)
# Whitespace-only input is special-cased (the branch body is not visible).
406 if not s_html.strip():
409 soup = BeautifulSoup(s_html, self._parser)
412 result = self._process_children(top)
415 result = self._post_process_empty_lines.sub('', result)
416 result = self._post_process_compress_lines.sub('\n\n', result)
417 result = result.strip()