.mutt/markdown2html

   1 #!/usr/bin/python3
   2 #
   3 # markdown2html.py — simple Markdown-to-HTML converter for use with Mutt
   4 #
   5 # Mutt recently learnt [how to compose `multipart/alternative`
   6 # emails][1]. This script assumes a message has been composed using Markdown
   7 # (with a lot of pandoc extensions enabled), and translates it to `text/html`
   8 # for Mutt to tie into such a `multipart/alternative` message.
   9 #
  10 # [1]: https://gitlab.com/muttmua/mutt/commit/0e566a03725b4ad789aa6ac1d17cdf7bf4e7e354
  11 #
  12 # Configuration:
  13 #   muttrc:
  14 #     set send_multipart_alternative=yes
  15 #     set send_multipart_alternative_filter=/path/to/markdown2html.py
  16 #
  17 # Optionally, custom CSS styles will be read from `~/.mutt/markdown2html.css`,
  18 # if present.
  19 #
  20 # Requirements:
  21 #   - python3
  22 #   - PyPandoc (and pandoc installed, or downloaded)
  23 #   - Pynliner
  24 #
  25 # Optional:
  26 #   - Pygments, if installed, then syntax highlighting is enabled
  27 #
  28 # Latest version:
  29 #   https://git.madduck.net/etc/mutt.git/blob_plain/HEAD:/.mutt/markdown2html
  30 #
  31 # Copyright © 2019 martin f. krafft <madduck@madduck.net>
  32 # Released under the GPL-2+ licence, just like Mutt itself.
  33 #
  34
  35 import pypandoc
  36 import pynliner
  37 import re
  38 import os
  39 import sys
  40
  41 try:
  42     from pygments.formatters import get_formatter_by_name
  43     formatter = get_formatter_by_name('html', style='default')
  44     DEFAULT_CSS = formatter.get_style_defs('.sourceCode')
  45
  46 except ImportError:
  47     DEFAULT_CSS = ""
  48
  49
  50 DEFAULT_CSS += '''
  51 .quote {
  52     padding: 0 0.5em;
  53     margin: 0;
  54     font-style: italic;
  55     border-left: 2px solid #ccc;
  56     color: #999;
  57     font-size: 80%;
  58 }
  59 .quotelead {
  60     font-style: italic;
  61     margin-bottom: -1em;
  62     color: #999;
  63     font-size: 80%;
  64 }
  65 .footnote-ref, .footnote-back { text-decoration: none;}
  66 .signature {
  67     color: #999;
  68     font-family: monospace;
  69     white-space: pre;
  70     margin: 1em 0 0 0;
  71     font-size: 80%;
  72 }'''
  73
  74 STYLESHEET = os.path.join(os.path.expanduser('~/.mutt'),
  75                           'markdown2html.css')
  76 if os.path.exists(STYLESHEET):
  77     DEFAULT_CSS += open(STYLESHEET).read()
  78
  79 HTML_DOCUMENT = '''<!DOCTYPE html>
  80 <html><head>
  81 <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
  82 <meta charset="utf-8"/>
  83 <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes"/>
  84 <title>HTML E-Mail</title>
  85 </head><body class="email">
  86 {htmlbody}
  87 </body></html>'''
  88
  89
  90 SIGNATURE_HTML = \
  91         '<div class="signature"><span class="leader">-- </span>{sig}</div>'
  92
  93
  94 def _preprocess_markdown(mdwn):
  95     '''
  96     Preprocess Markdown for handling by the converter.
  97     '''
  98     # convert hard line breaks within paragraphs to 2 trailing spaces, which
  99     # is the markdown way of representing hard line breaks. Note how the
 100     # regexp will not match between paragraphs.
 101     ret = re.sub(r'(\S)\n(\s*\S)', r'\g<1>  \n\g<2>', mdwn, re.MULTILINE)
 102
 103     return ret
 104
 105
 106 def _identify_quotes_for_later(mdwn):
 107     '''
 108     Email quoting such as:
 109
 110     ```
 111     On 1970-01-01, you said:
 112     > The Flat Earth Society has members all around the globe.
 113     ```
 114
 115     isn't really properly handled by Markdown, so let's do our best to
 116     identify the individual elements, and mark them, using a syntax similar to
 117     what pandoc uses already in some cases. As pandoc won't actually use these
 118     data (yet?), we call `self._reformat_quotes` later to use these markers
 119     to slap the appropriate classes on the HTML tags.
 120     '''
 121
 122     def generate_lines_with_context(mdwn):
 123         '''
 124         Iterates the input string line-wise, returning a triplet of
 125         previous, current, and next line, the first and last of which
 126         will be None on the first and last line of the input data
 127         respectively.
 128         '''
 129         prev = cur = nxt = None
 130         lines = iter(mdwn.splitlines())
 131         cur = next(lines)
 132         for nxt in lines:
 133             yield prev, cur, nxt
 134             prev = cur
 135             cur = nxt
 136         yield prev, cur, None
 137
 138     ret = []
 139     for prev, cur, nxt in generate_lines_with_context(mdwn):
 140
 141         # The lead-in to a quote is a single line immediately preceding the
 142         # quote, and ending with ':'. Note that there could be multiple of
 143         # these:
 144         if re.match(r'^.+:\s*$', cur) and nxt.startswith('>'):
 145             ret.append(f'{{.quotelead}}{cur.strip()}')
 146             # pandoc needs an empty line before the blockquote, so
 147             # we enter one for the purpose of HTML rendition:
 148             ret.append('')
 149             continue
 150
 151         # The first blockquote after such a lead-in gets marked as the
 152         # "initial" quote:
 153         elif prev and re.match(r'^.+:\s*$', prev) and cur.startswith('>'):
 154             ret.append(re.sub(r'^(\s*>\s*)+(.+)',
 155                               r'\g<1>{.quoteinitial}\g<2>',
 156                               cur, re.MULTILINE))
 157
 158         # All other occurrences of blockquotes get the "subsequent" marker:
 159         elif cur.startswith('>') and not prev.startswith('>'):
 160             ret.append(re.sub(r'^((?:\s*>\s*)+)(.+)',
 161                               r'\g<1>{.quotesubsequent}\g<2>',
 162                               cur, re.MULTILINE))
 163
 164         else: # pass through everything else.
 165             ret.append(cur)
 166
 167     return '\n'.join(ret)
 168
 169
 170 def _reformat_quotes(html):
 171     '''
 172     Earlier in the pipeline, we marked email quoting, using markers, which we
 173     now need to turn into HTML classes, so that we can use CSS to style them.
 174     '''
 175     ret = html.replace('<p>{.quotelead}', '<p class="quotelead">')
 176     ret = re.sub(r'<blockquote>\n((?:<blockquote>\n)*)<p>(?:\{\.quote(\w+)\})',
 177                  r'<blockquote class="quote \g<2>">\n\g<1><p>', ret, re.MULTILINE)
 178     return ret
 179
 180
 181
 182 def _convert_with_pandoc(mdwn, inputfmt='markdown', outputfmt='html5',
 183                          ext_enabled=None, ext_disabled=None,
 184                          standalone=True, title="HTML E-Mail"):
 185     '''
 186     Invoke pandoc to do the actual conversion of Markdown to HTML5.
 187     '''
 188     if not ext_enabled:
 189         ext_enabled = [ 'backtick_code_blocks',
 190                        'line_blocks',
 191                        'fancy_lists',
 192                        'startnum',
 193                        'definition_lists',
 194                        'example_lists',
 195                        'table_captions',
 196                        'simple_tables',
 197                        'multiline_tables',
 198                        'grid_tables',
 199                        'pipe_tables',
 200                        'all_symbols_escapable',
 201                        'intraword_underscores',
 202                        'strikeout',
 203                        'superscript',
 204                        'subscript',
 205                        'fenced_divs',
 206                        'bracketed_spans',
 207                        'footnotes',
 208                        'inline_notes',
 209                        'emoji',
 210                        'tex_math_double_backslash',
 211                       ]
 212     if not ext_disabled:
 213         ext_disabled = [ 'tex_math_single_backslash',
 214                          'tex_math_dollars',
 215                          'raw_html'
 216                        ]
 217
 218     enabled = '+'.join(ext_enabled)
 219     disabled = '-'.join(ext_disabled)
 220     inputfmt = f'{inputfmt}+{enabled}-{disabled}'
 221
 222     args = []
 223     if standalone:
 224         args.append('--standalone')
 225     if title:
 226         args.append(f'--metadata=pagetitle:"{title}"')
 227
 228     return pypandoc.convert_text(mdwn, format=inputfmt, to=outputfmt,
 229                                  extra_args=args)
 230
 231
 232 def _apply_styling(html):
 233     '''
 234     Inline all styles defined and used into the individual HTML tags.
 235     '''
 236     return pynliner.Pynliner().from_string(html).with_cssString(DEFAULT_CSS).run()
 237
 238
 239 def _postprocess_html(html):
 240     '''
 241     Postprocess the generated and styled HTML.
 242     '''
 243     return html
 244
 245
 246 def convert_markdown_to_html(mdwn):
 247     '''
 248     Converts the input Markdown to HTML, handling separately the body, as well
 249     as an optional signature.
 250     '''
 251     parts = re.split(r'^-- $', mdwn, 1, flags=re.MULTILINE)
 252     body = parts[0]
 253     if len(parts) == 2:
 254         sig = parts[1]
 255     else:
 256         sig = None
 257
 258     html=''
 259     if body:
 260         body = _preprocess_markdown(body)
 261         body = _identify_quotes_for_later(body)
 262         html = _convert_with_pandoc(body, standalone=False)
 263         html = _reformat_quotes(html)
 264
 265     if sig:
 266         sig = _preprocess_markdown(sig)
 267         html += SIGNATURE_HTML.format(sig='<br/>'.join(sig.splitlines()))
 268
 269     html = HTML_DOCUMENT.format(htmlbody=html)
 270     html = _apply_styling(html)
 271     html = _postprocess_html(html)
 272
 273     return html
 274
 275
 276 def main():
 277     '''
 278     Convert text on stdin to HTML, and print it to stdout, like mutt would
 279     expect.
 280     '''
 281     html = convert_markdown_to_html(sys.stdin.read())
 282     if html:
 283         # mutt expects the content type in the first line, so:
 284         print(f'text/html\n\n{html}')
 285
 286
# Only run the conversion when executed as a script, not when imported.
if __name__ == '__main__':
    main()