docs/source/_ext/javasphinx/htmlrst.py

   1 #
   2 # Copyright 2013-2015 Bronto Software, Inc. and contributors
   3 #
   4 # Licensed under the Apache License, Version 2.0 (the "License");
   5 # you may not use this file except in compliance with the License.
   6 # You may obtain a copy of the License at
   7 #
   8 #     http://www.apache.org/licenses/LICENSE-2.0
   9 #
  10 # Unless required by applicable law or agreed to in writing, software
  11 # distributed under the License is distributed on an "AS IS" BASIS,
  12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 # See the License for the specific language governing permissions and
  14 # limitations under the License.
  15 #
  16
  17 from __future__ import unicode_literals
  18 from builtins import str
  19
  20 import collections
  21 import re
  22
  23 from xml.sax.saxutils import escape as html_escape
  24 from bs4 import BeautifulSoup
  25
  26 Cell = collections.namedtuple('Cell', ['type', 'rowspan', 'colspan', 'contents'])
  27
  28 class Converter(object):
  29     def __init__(self, parser):
  30         self._unknown_tags = set()
  31         self._clear = '\n\n..\n\n'
  32
  33         # Regular expressions
  34         self._preprocess_anchors = re.compile(r'<a\s+name\s*=\s*["\']?(.+?)["\']?\s*>')
  35         self._post_process_empty_lines = re.compile(r'^\s+$', re.MULTILINE)
  36         self._post_process_compress_lines = re.compile(r'\n{3,}')
  37         self._whitespace_with_newline = re.compile(r'[\s\n]+')
  38         self._whitespace = re.compile(r'\s+')
  39         self._html_tag = re.compile(r'<.*?>')
  40
  41         self._preprocess_entity = re.compile(r'&(nbsp|lt|gt|amp)([^;]|[\n])')
  42         self._parser = parser
  43
  44     # --------------------------------------------------------------------------
  45     # ---- reST Utility Methods ----
  46
  47     def _unicode(self, s):
  48         if isinstance(s, unicode):
  49             return s
  50         else:
  51             return unicode(s, 'utf8')
  52
  53     def _separate(self, s):
  54         return u'\n\n' + s + u'\n\n'
  55
  56     def _escape_inline(self, s):
  57         return '\\ ' + s + '\\ '
  58
  59     def _inline(self, tag, s):
  60         # Seems fishy if our inline markup spans lines. We will instead just return
  61         # the string as is
  62         if '\n' in s:
  63             return s
  64
  65         s = s.strip()
  66
  67         if not s:
  68             return s
  69
  70         return self._escape_inline(tag + s.strip() + tag)
  71
  72     def _role(self, role, s, label=None):
  73         if label:
  74             return self._escape_inline(':%s:`%s <%s>`' % (role, label, s))
  75         else:
  76             return self._escape_inline(':%s:`%s`' % (role, s))
  77
  78     def _directive(self, directive, body=None):
  79         header = '\n\n.. %s::\n\n' % (directive,)
  80
  81         if body:
  82             return header + self._left_justify(body, 3) + '\n\n'
  83         else:
  84             return header + '\n'
  85
  86     def _hyperlink(self, target, label):
  87         return self._escape_inline('`%s <%s>`_' % (label, target))
  88
  89     def _listing(self, marker, items):
  90         items = [self._left_justify(item, len(marker) + 1) for item in items]
  91         items = [marker + item[len(marker):] for item in items]
  92         return self._separate('..') + self._separate('\n'.join(items))
  93
  94     def _left_justify(self, s, indent=0):
  95         lines = [l.rstrip() for l in s.split('\n')]
  96         indents = [len(l) - len(l.lstrip()) for l in lines if l]
  97
  98         if not indents:
  99             return s
 100
 101         shift = indent - min(indents)
 102
 103         if shift < 0:
 104             return '\n'.join(l[-shift:] for l in lines)
 105         else:
 106             prefix = ' ' * shift
 107             return '\n'.join(prefix + l for l in lines)
 108
 109     def _compress_whitespace(self, s, replace=' ', newlines=True):
 110         if newlines:
 111             return self._whitespace_with_newline.sub(replace, s)
 112         else:
 113             return self._whitespace.sub(replace, s)
 114
 115     # --------------------------------------------------------------------------
 116     # ---- DOM Tree Processing ----
 117
 118     def _process_table_cells(self, table):
 119         """ Compile all the table cells.
 120
 121         Returns a list of rows. The rows may have different lengths because of
 122         column spans.
 123
 124         """
 125
 126         rows = []
 127
 128         for i, tr in enumerate(table.find_all('tr')):
 129             row = []
 130
 131             for c in tr.contents:
 132                 cell_type = getattr(c, 'name', None)
 133
 134                 if cell_type not in ('td', 'th'):
 135                     continue
 136
 137                 rowspan = int(c.attrs.get('rowspan', 1))
 138                 colspan = int(c.attrs.get('colspan', 1))
 139                 contents = self._process_children(c).strip()
 140
 141                 if cell_type == 'th' and i > 0:
 142                     contents = self._inline('**', contents)
 143
 144                 row.append(Cell(cell_type, rowspan, colspan, contents))
 145
 146             rows.append(row)
 147
 148         return rows
 149
 150     def _process_table(self, node):
 151         rows = self._process_table_cells(node)
 152
 153         if not rows:
 154             return ''
 155
 156         table_num_columns = max(sum(c.colspan for c in row) for row in rows)
 157
 158         normalized = []
 159
 160         for row in rows:
 161             row_num_columns = sum(c.colspan for c in row)
 162
 163             if row_num_columns < table_num_columns:
 164                 cell_type = row[-1].type if row else 'td'
 165                 row.append(Cell(cell_type, 1, table_num_columns - row_num_columns, ''))
 166
 167         col_widths = [0] * table_num_columns
 168         row_heights = [0] * len(rows)
 169
 170         for i, row in enumerate(rows):
 171             j = 0
 172             for cell in row:
 173                 current_w = sum(col_widths[j:j + cell.colspan])
 174                 required_w = max(len(l) for l in cell.contents.split('\n'))
 175
 176                 if required_w > current_w:
 177                     additional = required_w - current_w
 178                     col_widths[j] += additional - (cell.colspan - 1) * (additional // cell.colspan)
 179                     for jj in range(j + 1, j + cell.colspan):
 180                         col_widths[jj] += (additional // cell.colspan)
 181
 182                 current_h = row_heights[i]
 183                 required_h = len(cell.contents.split('\n'))
 184
 185                 if required_h > current_h:
 186                     row_heights[i] = required_h
 187
 188                 j += cell.colspan
 189
 190         row_sep = '+' + '+'.join('-' * (l + 2) for l in col_widths) + '+'
 191         header_sep = '+' + '+'.join('=' * (l + 2) for l in col_widths) + '+'
 192         lines = [row_sep]
 193
 194         for i, row in enumerate(rows):
 195             for y in range(0, row_heights[i]):
 196                 line = []
 197                 j = 0
 198                 for c in row:
 199                     w = sum(n + 3 for n in col_widths[j:j+c.colspan]) - 2
 200                     h = row_heights[i]
 201
 202                     line.append('| ')
 203                     cell_lines = c.contents.split('\n')
 204                     content = cell_lines[y] if y < len(cell_lines) else ''
 205                     line.append(content.ljust(w))
 206
 207                     j += c.colspan
 208
 209                 line.append('|')
 210                 lines.append(''.join(line))
 211
 212             if i == 0 and all(c.type == 'th' for c in row):
 213                 lines.append(header_sep)
 214             else:
 215                 lines.append(row_sep)
 216
 217         return self._separate('\n'.join(lines))
 218
 219     def _process_children(self, node):
 220         parts = []
 221         is_newline = False
 222
 223         for c in node.contents:
 224             part = self._process(c)
 225
 226             if is_newline:
 227                 part = part.lstrip()
 228
 229             if part:
 230                 parts.append(part)
 231                 is_newline = part.endswith('\n')
 232
 233         return ''.join(parts)
 234
 235     def _process_text(self, node):
 236         return ''.join(node.strings)
 237
 238     def _process(self, node):
 239         if isinstance(node, str):
 240             return self._compress_whitespace(node)
 241
 242         simple_tags = {
 243             'b'      : lambda s: self._inline('**', s),
 244             'strong' : lambda s: self._inline('**', s),
 245             'i'      : lambda s: self._inline('*', s),
 246             'em'     : lambda s: self._inline('*', s),
 247             'tt'     : lambda s: self._inline('``', s),
 248             'code'   : lambda s: self._inline('``', s),
 249             'h1'     : lambda s: self._inline('**', s),
 250             'h2'     : lambda s: self._inline('**', s),
 251             'h3'     : lambda s: self._inline('**', s),
 252             'h4'     : lambda s: self._inline('**', s),
 253             'h5'     : lambda s: self._inline('**', s),
 254             'h6'     : lambda s: self._inline('**', s),
 255             'sub'    : lambda s: self._role('sub', s),
 256             'sup'    : lambda s: self._role('sup', s),
 257             'hr'     : lambda s: self._separate('') # Transitions not allowed
 258             }
 259
 260         if node.name in simple_tags:
 261             return simple_tags[node.name](self._process_text(node))
 262
 263         if node.name == 'p':
 264             return self._separate(self._process_children(node).strip())
 265
 266         if node.name == 'pre':
 267             return self._directive('parsed-literal', self._process_text(node))
 268
 269         if node.name == 'a':
 270             if 'name' in node.attrs:
 271                 return self._separate('.. _' + node['name'] + ':')
 272             elif 'href' in node.attrs:
 273                 target = node['href']
 274                 label = self._compress_whitespace(self._process_text(node).strip('\n'))
 275
 276                 if target.startswith('#'):
 277                     return self._role('ref', target[1:], label)
 278                 elif target.startswith('@'):
 279                     return self._role('java:ref', target[1:], label)
 280                 else:
 281                     return self._hyperlink(target, label)
 282
 283         if node.name == 'ul':
 284             items = [self._process(n) for n in node.find_all('li', recursive=False)]
 285             return self._listing('*', items)
 286
 287         if node.name == 'ol':
 288             items = [self._process(n) for n in node.find_all('li', recursive=False)]
 289             return self._listing('#.', items)
 290
 291         if node.name == 'li':
 292             s = self._process_children(node)
 293             s = s.strip()
 294
 295             # If it's multiline clear the end to correcly support nested lists
 296             if '\n' in s:
 297                 s = s + '\n\n'
 298
 299             return s
 300
 301         if node.name == 'table':
 302             return self._process_table(node)
 303
 304         self._unknown_tags.add(node.name)
 305
 306         return self._process_children(node)
 307
 308     # --------------------------------------------------------------------------
 309     # ---- HTML Preprocessing ----
 310
 311     def _preprocess_inline_javadoc_replace(self, tag, f, s):
 312         parts = []
 313
 314         start = '{@' + tag
 315         start_length = len(start)
 316
 317         i = s.find(start)
 318         j = 0
 319
 320         while i != -1:
 321             parts.append(s[j:i])
 322
 323             # Find a closing bracket such that the brackets are balanced between
 324             # them. This is necessary since code examples containing { and } are
 325             # commonly wrapped in {@code ...} tags
 326
 327             try:
 328                 j = s.find('}', i + start_length) + 1
 329                 while s.count('{', i, j) != s.count('}', i, j):
 330                     j = s.index('}', j) + 1
 331             except ValueError:
 332                 raise ValueError('Unbalanced {} brackets in ' + tag + ' tag')
 333
 334             parts.append(f(s[i + start_length:j - 1].strip()))
 335             i = s.find(start, j)
 336
 337         parts.append(s[j:])
 338
 339         return ''.join(parts)
 340
 341     def _preprocess_replace_javadoc_link(self, s):
 342         s = self._compress_whitespace(s)
 343
 344         target = None
 345         label = ''
 346
 347         if ' ' not in s:
 348             target = s
 349         else:
 350             i = s.find(' ')
 351
 352             while s.count('(', 0, i) != s.count(')', 0, i):
 353                 i = s.find(' ', i + 1)
 354
 355                 if i == -1:
 356                     i = len(s)
 357                     break
 358
 359             target = s[:i]
 360             label = s[i:]
 361
 362         if target[0] == '#':
 363             target = target[1:]
 364
 365         target = target.replace('#', '.').replace(' ', '').strip()
 366
 367         # Strip HTML tags from the target
 368         target = self._html_tag.sub('', target)
 369
 370         label = label.strip()
 371
 372         return '<a href="@%s">%s</a>' % (target, label)
 373
 374     def _preprocess_close_anchor_tags(self, s):
 375         # Add closing tags to all anchors so they are better handled by the parser
 376         return self._preprocess_anchors.sub(r'<a name="\1"></a>', s)
 377
 378     def _preprocess_fix_entities(self, s):
 379         return self._preprocess_entity.sub(r'&\1;\2', s)
 380
 381     def _preprocess(self, s_html):
 382         to_tag = lambda t: lambda m: '<%s>%s</%s>' % (t, html_escape(m), t)
 383         s_html = self._preprocess_inline_javadoc_replace('code', to_tag('code'), s_html)
 384         s_html = self._preprocess_inline_javadoc_replace('literal', to_tag('span'), s_html)
 385         s_html = self._preprocess_inline_javadoc_replace('docRoot', lambda m: '', s_html)
 386         s_html = self._preprocess_inline_javadoc_replace('linkplain', self._preprocess_replace_javadoc_link, s_html)
 387         s_html = self._preprocess_inline_javadoc_replace('link', self._preprocess_replace_javadoc_link, s_html)
 388
 389         # Make sure all anchor tags are closed
 390         s_html = self._preprocess_close_anchor_tags(s_html)
 391
 392         # Fix up some entitities without closing ;
 393         s_html = self._preprocess_fix_entities(s_html)
 394
 395         return s_html
 396
 397     # --------------------------------------------------------------------------
 398     # ---- Conversion entry point ----
 399
 400     def convert(self, s_html):
 401         if not isinstance(s_html, str):
 402             s_html = str(s_html, 'utf8')
 403
 404         s_html = self._preprocess(s_html)
 405
 406         if not s_html.strip():
 407             return ''
 408
 409         soup = BeautifulSoup(s_html, self._parser)
 410         top = soup.html.body
 411
 412         result = self._process_children(top)
 413
 414         # Post processing
 415         result = self._post_process_empty_lines.sub('', result)
 416         result = self._post_process_compress_lines.sub('\n\n', result)
 417         result = result.strip()
 418
 419         return result