2 # Copyright 2013-2015 Bronto Software, Inc. and contributors
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
8 # http://www.apache.org/licenses/LICENSE-2.0
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
17 from __future__ import unicode_literals
18 from builtins import str
23 from xml.sax.saxutils import escape as html_escape
24 from bs4 import BeautifulSoup
# Immutable record describing one table cell: the tag it came from, how many
# rows/columns it spans, and its already-converted text.
Cell = collections.namedtuple(
    'Cell',
    ('type', 'rowspan', 'colspan', 'contents'),
)
28 class Converter(object):
# Set up per-converter state: the set that collects names of unrecognised
# tags, the `_clear` separator text, and the compiled regular expressions
# used by the pre- and post-processing helpers.
29 def __init__(self, parser):
30 self._unknown_tags = set()
31 self._clear = '\n\n..\n\n'
# An opening <a name=...> tag (value optionally quoted); group 1 is the name.
34 self._preprocess_anchors = re.compile(r'<a\s+name\s*=\s*["\']?(.+?)["\']?\s*>')
# Whitespace-only text running from a line start to a line end.
35 self._post_process_empty_lines = re.compile(r'^\s+$', re.MULTILINE)
# Three or more consecutive newlines.
36 self._post_process_compress_lines = re.compile(r'\n{3,}')
# NOTE(review): `\s` already matches '\n', so the next two patterns match
# exactly the same text.
37 self._whitespace_with_newline = re.compile(r'[\s\n]+')
38 self._whitespace = re.compile(r'\s+')
# Shortest possible '<...>' span, used to strip markup.
39 self._html_tag = re.compile(r'<.*?>')
# An nbsp/lt/gt/amp entity name followed by a character other than ';'
# (group 2 keeps that following character).
41 self._preprocess_entity = re.compile(r'&(nbsp|lt|gt|amp)([^;]|[\n])')
44 # --------------------------------------------------------------------------
45 # ---- reST Utility Methods ----
# Coerce `s` to text: checks whether `s` is already `unicode` and decodes it
# as UTF-8 via `unicode(s, 'utf8')`.
# NOTE(review): `unicode` is a Python 2 builtin; confirm it is defined when
# this runs on Python 3.
47 def _unicode(self, s):
48 if isinstance(s, unicode):
50 return unicode(s, 'utf8')
52 def _separate(self, s):
53 return u'\n\n' + s + u'\n\n'
55 def _escape_inline(self, s):
56 return '\\ ' + s + '\\ '
# Inline markup helper: the visible return wraps `s` (stripped of surrounding
# whitespace) in the delimiter `tag` on both sides and escapes the result
# with _escape_inline.
58 def _inline(self, tag, s):
59 # Seems fishy if our inline markup spans lines. We will instead just return
69 return self._escape_inline(tag + s.strip() + tag)
# Build interpreted-text role markup for `s`, escaped with _escape_inline.
# Two shapes are produced: ":role:`label <s>`" and ":role:`s`" -- presumably
# chosen by whether `label` was supplied.
71 def _role(self, role, s, label=None):
73 return self._escape_inline(':%s:`%s <%s>`' % (role, label, s))
74 return self._escape_inline(':%s:`%s`' % (role, s))
# Emit a directive block: a ".. <directive>::" header line surrounded by
# blank lines, and (in the visible return) the body re-indented to three
# columns via _left_justify and followed by a blank line.
76 def _directive(self, directive, body=None):
77 header = '\n\n.. %s::\n\n' % (directive,)
80 return header + self._left_justify(body, 3) + '\n\n'
83 def _hyperlink(self, target, label):
84 return self._escape_inline('`%s <%s>`_' % (label, target))
86 def _listing(self, marker, items):
87 items = [self._left_justify(item, len(marker) + 1) for item in items]
88 items = [marker + item[len(marker):] for item in items]
89 return self._separate('..') + self._separate('\n'.join(items))
# Re-indent the lines of `s` relative to `indent`, measured against the
# smallest indentation found among its non-empty lines.
91 def _left_justify(self, s, indent=0):
# Trailing whitespace is removed from every line.
92 lines = [l.rstrip() for l in s.split('\n')]
# Width of the leading whitespace of each non-empty line.
93 indents = [len(l) - len(l.lstrip()) for l in lines if l]
# Distance between the requested indent and the current smallest indent.
98 shift = indent - min(indents)
# Drops the first `-shift` characters of every line (presumably the branch
# taken when the text must move left, i.e. shift is negative).
101 return '\n'.join(l[-shift:] for l in lines)
# Otherwise every line gets `prefix` prepended.
104 return '\n'.join(prefix + l for l in lines)
# Replace every run of whitespace in `s` with `replace` (one space by
# default), using either the `_whitespace_with_newline` or the `_whitespace`
# pattern -- presumably selected by `newlines`.
# NOTE(review): both patterns match newlines (`\s` includes '\n'), so the two
# returns look equivalent; confirm whether `newlines=False` is meant to keep
# line breaks.
106 def _compress_whitespace(self, s, replace=' ', newlines=True):
108 return self._whitespace_with_newline.sub(replace, s)
109 return self._whitespace.sub(replace, s)
111 # --------------------------------------------------------------------------
112 # ---- DOM Tree Processing ----
# Collect the cells of `table` as Cell records. Every <tr> found under the
# table is visited; among its direct children only 'td'/'th' elements are
# turned into cells. For each one, rowspan and colspan are read from the
# attributes (default 1) and the contents are the stripped result of
# _process_children. Contents of 'th' cells in any row after the first are
# additionally wrapped in '**' via _inline.
114 def _process_table_cells(self, table):
115 """ Compile all the table cells.
117 Returns a list of rows. The rows may have different lengths because of
124 for i, tr in enumerate(table.find_all('tr')):
127 for c in tr.contents:
128 cell_type = getattr(c, 'name', None)
130 if cell_type not in ('td', 'th'):
133 rowspan = int(c.attrs.get('rowspan', 1))
134 colspan = int(c.attrs.get('colspan', 1))
135 contents = self._process_children(c).strip()
137 if cell_type == 'th' and i > 0:
138 contents = self._inline('**', contents)
140 row.append(Cell(cell_type, rowspan, colspan, contents))
# Render a <table> node as a text table drawn with '+', '-' and '='
# separator lines, returned as a separated block.
146 def _process_table(self, node):
147 rows = self._process_table_cells(node)
# Table width in columns = widest row, counting each cell's colspan.
152 table_num_columns = max(sum(c.colspan for c in row) for row in rows)
157 row_num_columns = sum(c.colspan for c in row)
# Short rows are padded with one empty cell spanning the missing columns.
159 if row_num_columns < table_num_columns:
160 cell_type = row[-1].type if row else 'td'
161 row.append(Cell(cell_type, 1, table_num_columns - row_num_columns, ''))
163 col_widths = [0] * table_num_columns
164 row_heights = [0] * len(rows)
# Measure pass: grow column widths and row heights until every cell fits.
166 for i, row in enumerate(rows):
169 current_w = sum(col_widths[j:j + cell.colspan])
170 required_w = max(len(l) for l in cell.contents.split('\n'))
# Missing width is shared between the spanned columns; the first spanned
# column also absorbs the remainder of the integer division.
172 if required_w > current_w:
173 additional = required_w - current_w
174 col_widths[j] += additional - (cell.colspan - 1) * (additional // cell.colspan)
175 for jj in range(j + 1, j + cell.colspan):
176 col_widths[jj] += (additional // cell.colspan)
# Row height is the largest line count among the row's cells.
178 current_h = row_heights[i]
179 required_h = len(cell.contents.split('\n'))
181 if required_h > current_h:
182 row_heights[i] = required_h
# Each column is drawn two characters wider than its content width.
186 row_sep = '+' + '+'.join('-' * (l + 2) for l in col_widths) + '+'
187 header_sep = '+' + '+'.join('=' * (l + 2) for l in col_widths) + '+'
# Render pass: one text line per (row, line-within-row) pair.
190 for i, row in enumerate(rows):
191 for y in range(0, row_heights[i]):
# Width the cell text is padded to, across all columns it spans.
195 w = sum(n + 3 for n in col_widths[j:j+c.colspan]) - 2
# Cells with fewer lines than the row height are filled with blanks.
199 cell_lines = c.contents.split('\n')
200 content = cell_lines[y] if y < len(cell_lines) else ''
201 line.append(content.ljust(w))
206 lines.append(''.join(line))
# A first row made up entirely of 'th' cells is closed with the '='
# separator; `row_sep` is the regular separator.
208 if i == 0 and all(c.type == 'th' for c in row):
209 lines.append(header_sep)
211 lines.append(row_sep)
213 return self._separate('\n'.join(lines))
# Convert every child of `node` with _process and return the collected
# parts concatenated into one string.
215 def _process_children(self, node):
219 for c in node.contents:
220 part = self._process(c)
# Records whether the most recent part ended with a newline.
227 is_newline = part.endswith('\n')
229 return ''.join(parts)
231 def _process_text(self, node):
232 return ''.join(node.strings)
# Convert a single parsed node to reST text, dispatching on whether it is a
# plain string and otherwise on its tag name.
234 def _process(self, node):
# Plain strings only get their whitespace compressed.
235 if isinstance(node, str):
236 return self._compress_whitespace(node)
# Tags that are converted from the node's text alone.
# Bold and italic map to '**' and '*'.
239 'b' : lambda s: self._inline('**', s),
240 'strong' : lambda s: self._inline('**', s),
241 'i' : lambda s: self._inline('*', s),
242 'em' : lambda s: self._inline('*', s),
# Monospace maps to double backquotes.
243 'tt' : lambda s: self._inline('``', s),
244 'code' : lambda s: self._inline('``', s),
# Headings are rendered as bold inline text.
245 'h1' : lambda s: self._inline('**', s),
246 'h2' : lambda s: self._inline('**', s),
247 'h3' : lambda s: self._inline('**', s),
248 'h4' : lambda s: self._inline('**', s),
249 'h5' : lambda s: self._inline('**', s),
250 'h6' : lambda s: self._inline('**', s),
251 'sub' : lambda s: self._role('sub', s),
252 'sup' : lambda s: self._role('sup', s),
253 'hr' : lambda s: self._separate('') # Transitions not allowed
256 if node.name in simple_tags:
257 return simple_tags[node.name](self._process_text(node))
260 return self._separate(self._process_children(node).strip())
# Preformatted text becomes a parsed-literal directive.
262 if node.name == 'pre':
263 return self._directive('parsed-literal', self._process_text(node))
# A named anchor becomes an explicit target (".. _name:").
266 if 'name' in node.attrs:
267 return self._separate('.. _' + node['name'] + ':')
# Links: '#...' becomes a :ref: role, '@...' (the prefix written by
# _preprocess_replace_javadoc_link) a :java:ref: role, anything else an
# external hyperlink.
268 if 'href' in node.attrs:
269 target = node['href']
270 label = self._compress_whitespace(self._process_text(node).strip('\n'))
272 if target.startswith('#'):
273 return self._role('ref', target[1:], label)
274 if target.startswith('@'):
275 return self._role('java:ref', target[1:], label)
276 return self._hyperlink(target, label)
# Lists: only direct <li> children are converted, so nested lists are
# handled when their own parent item is processed.
278 if node.name == 'ul':
279 items = [self._process(n) for n in node.find_all('li', recursive=False)]
280 return self._listing('*', items)
282 if node.name == 'ol':
283 items = [self._process(n) for n in node.find_all('li', recursive=False)]
284 return self._listing('#.', items)
286 if node.name == 'li':
287 s = self._process_children(node)
290 # If it's multiline clear the end to correctly support nested lists
296 if node.name == 'table':
297 return self._process_table(node)
# Any other tag is remembered in `_unknown_tags`; its children are still
# converted.
299 self._unknown_tags.add(node.name)
301 return self._process_children(node)
303 # --------------------------------------------------------------------------
304 # ---- HTML Preprocessing ----
# Replace inline Javadoc tags of kind `tag` found in `s`: the stripped text
# between the tag opener and its matching '}' is passed to `f`, and the
# collected pieces are returned joined into one string.
306 def _preprocess_inline_javadoc_replace(self, tag, f, s):
310 start_length = len(start)
318 # Find a closing bracket such that the brackets are balanced between
319 # them. This is necessary since code examples containing { and } are
320 # commonly wrapped in {@code ...} tags
# `j` ends up one past the candidate closing brace; it is pushed to later
# '}' characters until the '{' and '}' counts in s[i:j] agree.
323 j = s.find('}', i + start_length) + 1
324 while s.count('{', i, j) != s.count('}', i, j):
325 j = s.index('}', j) + 1
# Presumably raised when no balancing '}' exists (s.index itself raises
# ValueError once no further '}' is found).
327 raise ValueError('Unbalanced {} brackets in ' + tag + ' tag')
# `j - 1` excludes the closing brace from the text handed to `f`.
329 parts.append(f(s[i + start_length:j - 1].strip()))
334 return ''.join(parts)
# Turn the content of a Javadoc link tag into an HTML anchor of the form
# <a href="@target">label</a>.
336 def _preprocess_replace_javadoc_link(self, s):
337 s = self._compress_whitespace(s)
# Move `i` to later spaces until the parentheses before it are balanced, so
# a space inside an argument list is not taken as the split point.
347 while s.count('(', 0, i) != s.count(')', 0, i):
348 i = s.find(' ', i + 1)
# '#' becomes '.', and all spaces are removed from the target.
360 target = target.replace('#', '.').replace(' ', '').strip()
362 # Strip HTML tags from the target
363 target = self._html_tag.sub('', target)
365 label = label.strip()
367 return '<a href="@%s">%s</a>' % (target, label)
369 def _preprocess_close_anchor_tags(self, s):
370 # Add closing tags to all anchors so they are better handled by the parser
371 return self._preprocess_anchors.sub(r'<a name="\1"></a>', s)
373 def _preprocess_fix_entities(self, s):
374 return self._preprocess_entity.sub(r'&\1;\2', s)
# Prepare raw Javadoc HTML for parsing: inline Javadoc tags are rewritten,
# anchor tags are closed, and entities lacking ';' are repaired.
376 def _preprocess(self, s_html):
# to_tag(t) builds a replacer that wraps the HTML-escaped text in <t>...</t>.
377 to_tag = lambda t: lambda m: '<%s>%s</%s>' % (t, html_escape(m), t)
378 s_html = self._preprocess_inline_javadoc_replace('code', to_tag('code'), s_html)
379 s_html = self._preprocess_inline_javadoc_replace('literal', to_tag('span'), s_html)
# docRoot tags are replaced with the empty string.
380 s_html = self._preprocess_inline_javadoc_replace('docRoot', lambda m: '', s_html)
# NOTE(review): 'linkplain' is handled before 'link', presumably so the
# shorter name cannot match the start of a linkplain tag -- confirm.
381 s_html = self._preprocess_inline_javadoc_replace('linkplain', self._preprocess_replace_javadoc_link, s_html)
382 s_html = self._preprocess_inline_javadoc_replace('link', self._preprocess_replace_javadoc_link, s_html)
384 # Make sure all anchor tags are closed
385 s_html = self._preprocess_close_anchor_tags(s_html)
387 # Fix up some entities without closing ;
388 s_html = self._preprocess_fix_entities(s_html)
392 # --------------------------------------------------------------------------
393 # ---- Conversion entry point ----
# Entry point: convert an HTML string to reST text. The input is decoded,
# preprocessed, parsed with BeautifulSoup, converted node by node, and the
# resulting text is tidied.
395 def convert(self, s_html):
# Non-text input is decoded as UTF-8.
396 if not isinstance(s_html, str):
397 s_html = str(s_html, 'utf8')
399 s_html = self._preprocess(s_html)
# Input that is blank after preprocessing is special-cased here.
401 if not s_html.strip():
404 soup = BeautifulSoup(s_html, self._parser)
407 result = self._process_children(top)
# Tidy-up: blank out whitespace-only lines, collapse runs of three or more
# newlines into a single blank line, then trim both ends.
410 result = self._post_process_empty_lines.sub('', result)
411 result = self._post_process_compress_lines.sub('\n\n', result)
412 result = result.strip()