docs/source/_ext/javasphinx/javasphinx/htmlrst.py

   1 #
   2 # Copyright 2013-2015 Bronto Software, Inc. and contributors
   3 #
   4 # Licensed under the Apache License, Version 2.0 (the "License");
   5 # you may not use this file except in compliance with the License.
   6 # You may obtain a copy of the License at
   7 #
   8 #     http://www.apache.org/licenses/LICENSE-2.0
   9 #
  10 # Unless required by applicable law or agreed to in writing, software
  11 # distributed under the License is distributed on an "AS IS" BASIS,
  12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 # See the License for the specific language governing permissions and
  14 # limitations under the License.
  15 #
  16
  17 from __future__ import unicode_literals
  18 from builtins import str
  19
  20 import collections
  21 import re
  22
  23 from xml.sax.saxutils import escape as html_escape
  24 from bs4 import BeautifulSoup
  25
  26 Cell = collections.namedtuple('Cell', ['type', 'rowspan', 'colspan', 'contents'])
  27
  28 class Converter(object):
  29     def __init__(self, parser):
  30         self._unknown_tags = set()
  31         self._clear = '\n\n..\n\n'
  32
  33         # Regular expressions
  34         self._preprocess_anchors = re.compile(r'<a\s+name\s*=\s*["\']?(.+?)["\']?\s*>')
  35         self._post_process_empty_lines = re.compile(r'^\s+$', re.MULTILINE)
  36         self._post_process_compress_lines = re.compile(r'\n{3,}')
  37         self._whitespace_with_newline = re.compile(r'[\s\n]+')
  38         self._whitespace = re.compile(r'\s+')
  39         self._html_tag = re.compile(r'<.*?>')
  40
  41         self._preprocess_entity = re.compile(r'&(nbsp|lt|gt|amp)([^;]|[\n])')
  42         self._parser = parser
  43
  44     # --------------------------------------------------------------------------
  45     # ---- reST Utility Methods ----
  46
  47     def _unicode(self, s):
  48         if isinstance(s, unicode):
  49             return s
  50         return unicode(s, 'utf8')
  51
  52     def _separate(self, s):
  53         return u'\n\n' + s + u'\n\n'
  54
  55     def _escape_inline(self, s):
  56         return '\\ ' + s + '\\ '
  57
  58     def _inline(self, tag, s):
  59         # Seems fishy if our inline markup spans lines. We will instead just return
  60         # the string as is
  61         if '\n' in s:
  62             return s
  63
  64         s = s.strip()
  65
  66         if not s:
  67             return s
  68
  69         return self._escape_inline(tag + s.strip() + tag)
  70
  71     def _role(self, role, s, label=None):
  72         if label:
  73             return self._escape_inline(':%s:`%s <%s>`' % (role, label, s))
  74         return self._escape_inline(':%s:`%s`' % (role, s))
  75
  76     def _directive(self, directive, body=None):
  77         header = '\n\n.. %s::\n\n' % (directive,)
  78
  79         if body:
  80             return header + self._left_justify(body, 3) + '\n\n'
  81         return header + '\n'
  82
  83     def _hyperlink(self, target, label):
  84         return self._escape_inline('`%s <%s>`_' % (label, target))
  85
  86     def _listing(self, marker, items):
  87         items = [self._left_justify(item, len(marker) + 1) for item in items]
  88         items = [marker + item[len(marker):] for item in items]
  89         return self._separate('..') + self._separate('\n'.join(items))
  90
  91     def _left_justify(self, s, indent=0):
  92         lines = [l.rstrip() for l in s.split('\n')]
  93         indents = [len(l) - len(l.lstrip()) for l in lines if l]
  94
  95         if not indents:
  96             return s
  97
  98         shift = indent - min(indents)
  99
 100         if shift < 0:
 101             return '\n'.join(l[-shift:] for l in lines)
 102
 103         prefix = ' ' * shift
 104         return '\n'.join(prefix + l for l in lines)
 105
 106     def _compress_whitespace(self, s, replace=' ', newlines=True):
 107         if newlines:
 108             return self._whitespace_with_newline.sub(replace, s)
 109         return self._whitespace.sub(replace, s)
 110
 111     # --------------------------------------------------------------------------
 112     # ---- DOM Tree Processing ----
 113
 114     def _process_table_cells(self, table):
 115         """ Compile all the table cells.
 116
 117         Returns a list of rows. The rows may have different lengths because of
 118         column spans.
 119
 120         """
 121
 122         rows = []
 123
 124         for i, tr in enumerate(table.find_all('tr')):
 125             row = []
 126
 127             for c in tr.contents:
 128                 cell_type = getattr(c, 'name', None)
 129
 130                 if cell_type not in ('td', 'th'):
 131                     continue
 132
 133                 rowspan = int(c.attrs.get('rowspan', 1))
 134                 colspan = int(c.attrs.get('colspan', 1))
 135                 contents = self._process_children(c).strip()
 136
 137                 if cell_type == 'th' and i > 0:
 138                     contents = self._inline('**', contents)
 139
 140                 row.append(Cell(cell_type, rowspan, colspan, contents))
 141
 142             rows.append(row)
 143
 144         return rows
 145
 146     def _process_table(self, node):
 147         rows = self._process_table_cells(node)
 148
 149         if not rows:
 150             return ''
 151
 152         table_num_columns = max(sum(c.colspan for c in row) for row in rows)
 153
 154         normalized = []
 155
 156         for row in rows:
 157             row_num_columns = sum(c.colspan for c in row)
 158
 159             if row_num_columns < table_num_columns:
 160                 cell_type = row[-1].type if row else 'td'
 161                 row.append(Cell(cell_type, 1, table_num_columns - row_num_columns, ''))
 162
 163         col_widths = [0] * table_num_columns
 164         row_heights = [0] * len(rows)
 165
 166         for i, row in enumerate(rows):
 167             j = 0
 168             for cell in row:
 169                 current_w = sum(col_widths[j:j + cell.colspan])
 170                 required_w = max(len(l) for l in cell.contents.split('\n'))
 171
 172                 if required_w > current_w:
 173                     additional = required_w - current_w
 174                     col_widths[j] += additional - (cell.colspan - 1) * (additional // cell.colspan)
 175                     for jj in range(j + 1, j + cell.colspan):
 176                         col_widths[jj] += (additional // cell.colspan)
 177
 178                 current_h = row_heights[i]
 179                 required_h = len(cell.contents.split('\n'))
 180
 181                 if required_h > current_h:
 182                     row_heights[i] = required_h
 183
 184                 j += cell.colspan
 185
 186         row_sep = '+' + '+'.join('-' * (l + 2) for l in col_widths) + '+'
 187         header_sep = '+' + '+'.join('=' * (l + 2) for l in col_widths) + '+'
 188         lines = [row_sep]
 189
 190         for i, row in enumerate(rows):
 191             for y in range(0, row_heights[i]):
 192                 line = []
 193                 j = 0
 194                 for c in row:
 195                     w = sum(n + 3 for n in col_widths[j:j+c.colspan]) - 2
 196                     h = row_heights[i]
 197
 198                     line.append('| ')
 199                     cell_lines = c.contents.split('\n')
 200                     content = cell_lines[y] if y < len(cell_lines) else ''
 201                     line.append(content.ljust(w))
 202
 203                     j += c.colspan
 204
 205                 line.append('|')
 206                 lines.append(''.join(line))
 207
 208             if i == 0 and all(c.type == 'th' for c in row):
 209                 lines.append(header_sep)
 210             else:
 211                 lines.append(row_sep)
 212
 213         return self._separate('\n'.join(lines))
 214
 215     def _process_children(self, node):
 216         parts = []
 217         is_newline = False
 218
 219         for c in node.contents:
 220             part = self._process(c)
 221
 222             if is_newline:
 223                 part = part.lstrip()
 224
 225             if part:
 226                 parts.append(part)
 227                 is_newline = part.endswith('\n')
 228
 229         return ''.join(parts)
 230
 231     def _process_text(self, node):
 232         return ''.join(node.strings)
 233
 234     def _process(self, node):
 235         if isinstance(node, str):
 236             return self._compress_whitespace(node)
 237
 238         simple_tags = {
 239             'b'      : lambda s: self._inline('**', s),
 240             'strong' : lambda s: self._inline('**', s),
 241             'i'      : lambda s: self._inline('*', s),
 242             'em'     : lambda s: self._inline('*', s),
 243             'tt'     : lambda s: self._inline('``', s),
 244             'code'   : lambda s: self._inline('``', s),
 245             'h1'     : lambda s: self._inline('**', s),
 246             'h2'     : lambda s: self._inline('**', s),
 247             'h3'     : lambda s: self._inline('**', s),
 248             'h4'     : lambda s: self._inline('**', s),
 249             'h5'     : lambda s: self._inline('**', s),
 250             'h6'     : lambda s: self._inline('**', s),
 251             'sub'    : lambda s: self._role('sub', s),
 252             'sup'    : lambda s: self._role('sup', s),
 253             'hr'     : lambda s: self._separate('') # Transitions not allowed
 254             }
 255
 256         if node.name in simple_tags:
 257             return simple_tags[node.name](self._process_text(node))
 258
 259         if node.name == 'p':
 260             return self._separate(self._process_children(node).strip())
 261
 262         if node.name == 'pre':
 263             return self._directive('parsed-literal', self._process_text(node))
 264
 265         if node.name == 'a':
 266             if 'name' in node.attrs:
 267                 return self._separate('.. _' + node['name'] + ':')
 268             if 'href' in node.attrs:
 269                 target = node['href']
 270                 label = self._compress_whitespace(self._process_text(node).strip('\n'))
 271
 272                 if target.startswith('#'):
 273                     return self._role('ref', target[1:], label)
 274                 if target.startswith('@'):
 275                     return self._role('java:ref', target[1:], label)
 276                 return self._hyperlink(target, label)
 277
 278         if node.name == 'ul':
 279             items = [self._process(n) for n in node.find_all('li', recursive=False)]
 280             return self._listing('*', items)
 281
 282         if node.name == 'ol':
 283             items = [self._process(n) for n in node.find_all('li', recursive=False)]
 284             return self._listing('#.', items)
 285
 286         if node.name == 'li':
 287             s = self._process_children(node)
 288             s = s.strip()
 289
 290             # If it's multiline clear the end to correcly support nested lists
 291             if '\n' in s:
 292                 s = s + '\n\n'
 293
 294             return s
 295
 296         if node.name == 'table':
 297             return self._process_table(node)
 298
 299         self._unknown_tags.add(node.name)
 300
 301         return self._process_children(node)
 302
 303     # --------------------------------------------------------------------------
 304     # ---- HTML Preprocessing ----
 305
 306     def _preprocess_inline_javadoc_replace(self, tag, f, s):
 307         parts = []
 308
 309         start = '{@' + tag
 310         start_length = len(start)
 311
 312         i = s.find(start)
 313         j = 0
 314
 315         while i != -1:
 316             parts.append(s[j:i])
 317
 318             # Find a closing bracket such that the brackets are balanced between
 319             # them. This is necessary since code examples containing { and } are
 320             # commonly wrapped in {@code ...} tags
 321
 322             try:
 323                 j = s.find('}', i + start_length) + 1
 324                 while s.count('{', i, j) != s.count('}', i, j):
 325                     j = s.index('}', j) + 1
 326             except ValueError:
 327                 raise ValueError('Unbalanced {} brackets in ' + tag + ' tag')
 328
 329             parts.append(f(s[i + start_length:j - 1].strip()))
 330             i = s.find(start, j)
 331
 332         parts.append(s[j:])
 333
 334         return ''.join(parts)
 335
 336     def _preprocess_replace_javadoc_link(self, s):
 337         s = self._compress_whitespace(s)
 338
 339         target = None
 340         label = ''
 341
 342         if ' ' not in s:
 343             target = s
 344         else:
 345             i = s.find(' ')
 346
 347             while s.count('(', 0, i) != s.count(')', 0, i):
 348                 i = s.find(' ', i + 1)
 349
 350                 if i == -1:
 351                     i = len(s)
 352                     break
 353
 354             target = s[:i]
 355             label = s[i:]
 356
 357         if target[0] == '#':
 358             target = target[1:]
 359
 360         target = target.replace('#', '.').replace(' ', '').strip()
 361
 362         # Strip HTML tags from the target
 363         target = self._html_tag.sub('', target)
 364
 365         label = label.strip()
 366
 367         return '<a href="@%s">%s</a>' % (target, label)
 368
 369     def _preprocess_close_anchor_tags(self, s):
 370         # Add closing tags to all anchors so they are better handled by the parser
 371         return self._preprocess_anchors.sub(r'<a name="\1"></a>', s)
 372
 373     def _preprocess_fix_entities(self, s):
 374         return self._preprocess_entity.sub(r'&\1;\2', s)
 375
 376     def _preprocess(self, s_html):
 377         to_tag = lambda t: lambda m: '<%s>%s</%s>' % (t, html_escape(m), t)
 378         s_html = self._preprocess_inline_javadoc_replace('code', to_tag('code'), s_html)
 379         s_html = self._preprocess_inline_javadoc_replace('literal', to_tag('span'), s_html)
 380         s_html = self._preprocess_inline_javadoc_replace('docRoot', lambda m: '', s_html)
 381         s_html = self._preprocess_inline_javadoc_replace('linkplain', self._preprocess_replace_javadoc_link, s_html)
 382         s_html = self._preprocess_inline_javadoc_replace('link', self._preprocess_replace_javadoc_link, s_html)
 383
 384         # Make sure all anchor tags are closed
 385         s_html = self._preprocess_close_anchor_tags(s_html)
 386
 387         # Fix up some entitities without closing ;
 388         s_html = self._preprocess_fix_entities(s_html)
 389
 390         return s_html
 391
 392     # --------------------------------------------------------------------------
 393     # ---- Conversion entry point ----
 394
 395     def convert(self, s_html):
 396         if not isinstance(s_html, str):
 397             s_html = str(s_html, 'utf8')
 398
 399         s_html = self._preprocess(s_html)
 400
 401         if not s_html.strip():
 402             return ''
 403
 404         soup = BeautifulSoup(s_html, self._parser)
 405         top = soup.html.body
 406
 407         result = self._process_children(top)
 408
 409         # Post processing
 410         result = self._post_process_empty_lines.sub('', result)
 411         result = self._post_process_compress_lines.sub('\n\n', result)
 412         result = result.strip()
 413
 414         return result