.mutt/markdown2html

   1 #!/usr/bin/python3
   2 #
   3 # markdown2html.py — simple Markdown-to-HTML converter for use with Mutt
   4 #
   5 # Mutt recently learnt [how to compose `multipart/alternative`
   6 # emails][1]. This script assumes a message has been composed using Markdown
   7 # (with a lot of pandoc extensions enabled), and translates it to `text/html`
   8 # for Mutt to tie into such a `multipart/alternative` message.
   9 #
  10 # [1]: https://gitlab.com/muttmua/mutt/commit/0e566a03725b4ad789aa6ac1d17cdf7bf4e7e354
  11 #
  12 # Configuration:
  13 #   muttrc:
  14 #     set send_multipart_alternative=yes
  15 #     set send_multipart_alternative_filter=/path/to/markdown2html.py
  16 #
  17 # Optionally, custom CSS styles will be read from `~/.mutt/markdown2html.css`,
  18 # if present.
  19 #
  20 # Requirements:
  21 #   - python3
  22 #   - PyPandoc (and pandoc installed, or downloaded)
  23 #   - Pynliner
  24 #
  25 # Optional:
  26 #   - Pygments, if installed, then syntax highlighting is enabled
  27 #
  28 # Latest version:
  29 #   https://git.madduck.net/etc/mutt.git/blob_plain/HEAD:/.mutt/markdown2html
  30 #
  31 # Copyright © 2019 martin f. krafft <madduck@madduck.net>
  32 # Released under the GPL-2+ licence, just like Mutt itself.
  33 #
  34
  35 import pypandoc
  36 import pynliner
  37 import re
  38 import os
  39 import sys
  40
# Pygments is optional. When it can be imported, the stylesheet starts out
# with the style definitions of its 'default' style, scoped to the
# '.sourceCode' selector (presumably the class pandoc puts on highlighted
# code blocks -- verify against pandoc's HTML output). Without Pygments the
# stylesheet starts out empty and only the rules appended further down apply.
try:
    from pygments.formatters import get_formatter_by_name
    formatter = get_formatter_by_name('html', style='default')
    DEFAULT_CSS = formatter.get_style_defs('.sourceCode')

except ImportError:
    DEFAULT_CSS = ""
  48
  49
# Built-in stylesheet, appended to whatever Pygments contributed above.
# `.quotelead` and `.quote` are the classes attached by _reformat_quotes(),
# `.signature` is the wrapper used by SIGNATURE_HTML, and `.quotechar` belongs
# to the (currently disabled) quote-character span in _preprocess_markdown().
# The remaining selectors style plain HTML elements and classes that are
# presumably emitted by pandoc itself (tables, footnotes) -- TODO confirm.
# _apply_styling() later inlines all of these rules into the HTML tags.
DEFAULT_CSS += '''
.block {
    padding: 0 0.5em;
    margin: 0;
    border-left: 2px solid #eee;
}
.quote, blockquote {
    padding: 0 0.5em;
    margin: 0;
    font-style: italic;
    border-left: 2px solid #666;
    color: #666;
    font-size: 80%;
}
.quotelead {
    margin-bottom: -1em;
    font-size: 80%;
}
.quotechar { display: none; }
.footnote-ref, .footnote-back { text-decoration: none;}
.signature {
    color: #999;
    font-family: monospace;
    white-space: pre;
    margin: 1em 0 0 0;
    font-size: 80%;
}
table, th, td {
    border-collapse: collapse;
    border: 1px solid #999;
}
th, td { padding: 0.5em; }
.header {
    background: #eee;
}
.even { background: #eee; }
h1, h2, h3, h4, h5, h6 {
    color: #666;
    background-color: #eee;
    padding-left: 0.5em
}
h1 { font-size: 130%; }
h2 { font-size: 120%; }
h3 { font-size: 110%; }
h4 { font-size: 107%; }
h5 { font-size: 103%; }
h6 { font-size: 100%; }
p { padding: 0 0.5em; }
'''
  99
 100 STYLESHEET = os.path.join(os.path.expanduser('~/.mutt'),
 101                           'markdown2html.css')
 102 if os.path.exists(STYLESHEET):
 103     DEFAULT_CSS += open(STYLESHEET).read()
 104
 105 SIGNATURE_HTML = \
 106         '<div class="signature"><span class="leader">-- </span>{sig}</div>'
 107
 108
def _preprocess_signature(sig):
    '''
    Preprocess the signature before markdown processing.

    This is currently a no-op hook: the signature text is returned
    unchanged. convert_markdown_to_html() calls it on the text following the
    signature separator, before that text goes through the generic Markdown
    preprocessing and pandoc.
    '''
    return sig
 114
 115 def _preprocess_markdown(mdwn):
 116     '''
 117     Preprocess Markdown for handling by the converter.
 118     '''
 119     # convert hard line breaks within paragraphs to 2 trailing spaces, which
 120     # is the markdown way of representing hard line breaks. Note how the
 121     # regexp will not match between paragraphs.
 122     ret = re.sub(r'(\S)\n(\s*\S)', r'\g<1>  \n\g<2>', mdwn, flags=re.MULTILINE)
 123
 124     # Clients like Thunderbird need the leading '>' to be able to properly
 125     # create nested quotes, so we duplicate the symbol, the first instance
 126     # will tell pandoc to create a blockquote, while the second instance will
 127     # be a <span> containing the character, along with a class that causes CSS
 128     # to actually hide it from display. However, this does not work with the
 129     # text-mode HTML2text converters, and so it's left commented for now.
 130     #ret = re.sub(r'\n>', r'  \n>[>]{.quotechar}', ret, flags=re.MULTILINE)
 131
 132     # With the autolink_bare_uris extension, we do not need to put links into
 133     # angle brackets to have them converted, so let's conserve the brackets
 134     # when used around email addresses. Note that this needs a postprocessing
 135     # hack because the pandoc autolink converted includes the ambersand
 136     # (https://github.com/jgm/pandoc/issues/7398).
 137     ret = re.sub(r'<([^@]+@.+\.[^>]+)>', r'&lt;\g<1> -PANDOC_BUG_7398-&gt;', ret)
 138
 139     return ret
 140
 141
 142 def _identify_quotes_for_later(mdwn):
 143     '''
 144     Email quoting such as:
 145
 146     ```
 147     On 1970-01-01, you said:
 148     > The Flat Earth Society has members all around the globe.
 149     ```
 150
 151     isn't really properly handled by Markdown, so let's do our best to
 152     identify the individual elements, and mark them, using a syntax similar to
 153     what pandoc uses already in some cases. As pandoc won't actually use these
 154     data (yet?), we call `self._reformat_quotes` later to use these markers
 155     to slap the appropriate classes on the HTML tags.
 156     '''
 157
 158     def generate_lines_with_context(mdwn):
 159         '''
 160         Iterates the input string line-wise, returning a triplet of
 161         previous, current, and next line, the first and last of which
 162         will be None on the first and last line of the input data
 163         respectively.
 164         '''
 165         prev = cur = nxt = None
 166         lines = iter(mdwn.splitlines())
 167         cur = next(lines)
 168         for nxt in lines:
 169             yield prev, cur, nxt
 170             prev = cur
 171             cur = nxt
 172         yield prev, cur, None
 173
 174     ret = []
 175     for prev, cur, nxt in generate_lines_with_context(mdwn):
 176
 177         # The lead-in to a quote is a single line immediately preceding the
 178         # quote, and ending with ':'. Note that there could be multiple of
 179         # these:
 180         if re.match(r'^.+:\s*$', cur) and nxt.startswith('>'):
 181             ret.append(f'{{.quotelead}}{cur.strip()}')
 182             # pandoc needs an empty line before the blockquote, so
 183             # we enter one for the purpose of HTML rendition:
 184             ret.append('')
 185             continue
 186
 187         # The first blockquote after such a lead-in gets marked as the
 188         # "initial" quote:
 189         elif prev and re.match(r'^.+:\s*$', prev) and cur.startswith('>'):
 190             ret.append(re.sub(r'^(\s*>\s*)+(.+)',
 191                               r'\g<1>{.quoteinitial}\g<2>',
 192                               cur, flags=re.MULTILINE))
 193
 194         # All other occurrences of blockquotes get the "subsequent" marker:
 195         elif cur.startswith('>') and prev and not prev.startswith('>'):
 196             ret.append(re.sub(r'^((?:\s*>\s*)+)(.+)',
 197                               r'\g<1>{.quotesubsequent}\g<2>',
 198                               cur, flags=re.MULTILINE))
 199
 200         else: # pass through everything else.
 201             ret.append(cur)
 202
 203     return '\n'.join(ret)
 204
 205
 206 def _reformat_quotes(html):
 207     '''
 208     Earlier in the pipeline, we marked email quoting, using markers, which we
 209     now need to turn into HTML classes, so that we can use CSS to style them.
 210     '''
 211     ret = html.replace('<p>{.quotelead}', '<p class="quotelead">')
 212     ret = re.sub(r'<blockquote>\n((?:<blockquote>\n)*)<p>(?:\{\.quote(\w+)\})',
 213                  r'<blockquote class="quote \g<2>">\n\g<1><p>', ret, flags=re.MULTILINE)
 214     return ret
 215
 216
 217
 218 def _convert_with_pandoc(mdwn, inputfmt='markdown', outputfmt='html5',
 219                          ext_enabled=None, ext_disabled=None,
 220                          standalone=True, selfcontained=True, title=None):
 221     '''
 222     Invoke pandoc to do the actual conversion of Markdown to HTML5.
 223     '''
 224     if not ext_enabled:
 225         ext_enabled = [ 'backtick_code_blocks',
 226                        'line_blocks',
 227                        'fancy_lists',
 228                        'startnum',
 229                        'definition_lists',
 230                        'example_lists',
 231                        'table_captions',
 232                        'simple_tables',
 233                        'multiline_tables',
 234                        'grid_tables',
 235                        'pipe_tables',
 236                        'all_symbols_escapable',
 237                        'intraword_underscores',
 238                        'strikeout',
 239                        'superscript',
 240                        'subscript',
 241                        'fenced_divs',
 242                        'bracketed_spans',
 243                        'footnotes',
 244                        'inline_notes',
 245                        'emoji',
 246                        'tex_math_double_backslash',
 247                        'autolink_bare_uris'
 248                       ]
 249     if not ext_disabled:
 250         ext_disabled = [ 'tex_math_single_backslash',
 251                          'tex_math_dollars',
 252                          'smart',
 253                          'raw_html'
 254                        ]
 255
 256     enabled = '+'.join(ext_enabled)
 257     disabled = '-'.join(ext_disabled)
 258     inputfmt = f'{inputfmt}+{enabled}-{disabled}'
 259
 260     args = []
 261     if standalone:
 262         args.append('--standalone')
 263     if selfcontained:
 264         args.append('--self-contained')
 265     if title:
 266         args.append(f'--metadata=pagetitle:"{title}"')
 267
 268     return pypandoc.convert_text(mdwn, format=inputfmt, to=outputfmt,
 269                                  extra_args=args)
 270
 271
 272 def _apply_styling(html):
 273     '''
 274     Inline all styles defined and used into the individual HTML tags.
 275     '''
 276     return pynliner.Pynliner().from_string(html).with_cssString(DEFAULT_CSS).run()
 277
 278
 279 def _postprocess_html(html):
 280     '''
 281     Postprocess the generated and styled HTML.
 282     '''
 283
 284     # Preprocessing leaves a sentinel to work around
 285     # https://github.com/jgm/pandoc/issues/7398, and so we need to remove it:
 286     html = html.replace('</a> -PANDOC_BUG_7398-&gt;', '</a>&gt;')
 287     return html
 288
 289
 290 def convert_markdown_to_html(mdwn):
 291     '''
 292     Converts the input Markdown to HTML, handling separately the body, as well
 293     as an optional signature.
 294     '''
 295     parts = re.split(r'^-- $', mdwn, 1, flags=re.MULTILINE)
 296     body = parts[0]
 297     if len(parts) == 2:
 298         sig = parts[1]
 299     else:
 300         sig = None
 301
 302     html=''
 303     if body:
 304         body = _preprocess_markdown(body)
 305         body = _identify_quotes_for_later(body)
 306         html = _convert_with_pandoc(body, standalone=True, selfcontained=True,
 307                                     title=None)
 308         html = html.replace('<title>Untitled</title>\n','')
 309         html = _reformat_quotes(html)
 310
 311     if sig:
 312         sig = _preprocess_signature(sig)
 313         sig = _preprocess_markdown(sig)
 314         sig = _convert_with_pandoc(sig, standalone=False, selfcontained=False)
 315         sig = SIGNATURE_HTML.format(sig='<br/>'.join(sig.splitlines()))
 316         eob = html.find('</body>')
 317         html = f'{html[:eob]}{sig}\n{html[eob:]}'
 318
 319     html = _apply_styling(html)
 320     html = _postprocess_html(html)
 321
 322     return html
 323
 324
 325 def main():
 326     '''
 327     Convert text on stdin to HTML, and print it to stdout, like mutt would
 328     expect.
 329     '''
 330     html = convert_markdown_to_html(sys.stdin.read())
 331     if html:
 332         # mutt expects the content type in the first line, so:
 333         print(f'text/html\n\n{html}')
 334
 335
# Only run the conversion when executed as a script (as mutt's filter would),
# not when the module is imported.
if __name__ == '__main__':
    main()