.mutt/markdown2html

   1 #!/usr/bin/python3
   2 #
   3 # markdown2html.py — simple Markdown-to-HTML converter for use with Mutt
   4 #
   5 # Mutt recently learnt [how to compose `multipart/alternative`
   6 # emails][1]. This script assumes a message has been composed using Markdown
   7 # (with a lot of pandoc extensions enabled), and translates it to `text/html`
   8 # for Mutt to tie into such a `multipart/alternative` message.
   9 #
  10 # [1]: https://gitlab.com/muttmua/mutt/commit/0e566a03725b4ad789aa6ac1d17cdf7bf4e7e354
  11 #
  12 # Configuration:
  13 #   muttrc:
  14 #     set send_multipart_alternative=yes
  15 #     set send_multipart_alternative_filter=/path/to/markdown2html.py
  16 #
  17 # Optionally, custom CSS styles will be read from `~/.mutt/markdown2html.css`,
  18 # if present.
  19 #
  20 # Requirements:
  21 #   - python3
  22 #   - PyPandoc (and pandoc installed, or downloaded)
  23 #   - Pynliner
  24 #
  25 # Optional:
  26 #   - Pygments, if installed, then syntax highlighting is enabled
  27 #
  28 # Latest version:
  29 #   https://git.madduck.net/etc/mutt.git/blob_plain/HEAD:/.mutt/markdown2html
  30 #
  31 # Copyright © 2019 martin f. krafft <madduck@madduck.net>
  32 # Released under the GPL-2+ licence, just like Mutt itself.
  33 #
  34
  35 import pypandoc
  36 import pynliner
  37 import re
  38 import os
  39 import sys
  40
  41 try:
  42     from pygments.formatters import get_formatter_by_name
  43     formatter = get_formatter_by_name('html', style='default')
  44     DEFAULT_CSS = formatter.get_style_defs('.sourceCode')
  45
  46 except ImportError:
  47     DEFAULT_CSS = ""
  48
  49
  50 DEFAULT_CSS += '''
  51 .quote, blockquote {
  52     padding: 0 0.5em;
  53     margin: 0;
  54     font-style: italic;
  55     border-left: 2px solid #666;
  56     color: #666;
  57     font-size: 80%;
  58 }
  59 .quotelead {
  60     margin-bottom: -1em;
  61     font-size: 80%;
  62 }
  63 .quotechar { display: none; }
  64 .footnote-ref, .footnote-back { text-decoration: none;}
  65 .signature {
  66     color: #999;
  67     font-family: monospace;
  68     white-space: pre;
  69     margin: 1em 0 0 0;
  70     font-size: 80%;
  71 }
  72 table, th, td {
  73     border-collapse: collapse;
  74     border: 1px solid #999;
  75 }
  76 th, td { padding: 0.5em; }
  77 .header {
  78     background: #eee;
  79 }
  80 .even { background: #eee; }
  81 h1, h2, h3, h4, h5, h6 {
  82     color: #666;
  83     background-color: #eee;
  84     padding-left: 0.5em
  85 }
  86 h1 { font-size: 130%; }
  87 h2 { font-size: 120%; }
  88 h3 { font-size: 110%; }
  89 h4 { font-size: 107%; }
  90 h5 { font-size: 103%; }
  91 h6 { font-size: 100%; }
  92 p { padding: 0 0.5em; }
  93 '''
  94
  95 STYLESHEET = os.path.join(os.path.expanduser('~/.mutt'),
  96                           'markdown2html.css')
  97 if os.path.exists(STYLESHEET):
  98     DEFAULT_CSS += open(STYLESHEET).read()
  99
 100 HTML_DOCUMENT = '''<!DOCTYPE html>
 101 <html><head>
 102 <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
 103 <meta charset="utf-8"/>
 104 <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes"/>
 105 <title>HTML E-Mail</title>
 106 </head><body class="email">
 107 {htmlbody}
 108 </body></html>'''
 109
 110
 111 SIGNATURE_HTML = \
 112         '<div class="signature"><span class="leader">-- </span>{sig}</div>'
 113
 114
 115 def _preprocess_markdown(mdwn):
 116     '''
 117     Preprocess Markdown for handling by the converter.
 118     '''
 119     # convert hard line breaks within paragraphs to 2 trailing spaces, which
 120     # is the markdown way of representing hard line breaks. Note how the
 121     # regexp will not match between paragraphs.
 122     ret = re.sub(r'(\S)\n(\s*\S)', r'\g<1>  \n\g<2>', mdwn, flags=re.MULTILINE)
 123
 124     # Clients like Thunderbird need the leading '>' to be able to properly
 125     # create nested quotes, so we duplicate the symbol, the first instance
 126     # will tell pandoc to create a blockquote, while the second instance will
 127     # be a <span> containing the character, along with a class that causes CSS
 128     # to actually hide it from display. However, this does not work with the
 129     # text-mode HTML2text converters, and so it's left commented for now.
 130     #ret = re.sub(r'\n>', r'  \n>[>]{.quotechar}', ret, flags=re.MULTILINE)
 131
 132     return ret
 133
 134
 135 def _identify_quotes_for_later(mdwn):
 136     '''
 137     Email quoting such as:
 138
 139     ```
 140     On 1970-01-01, you said:
 141     > The Flat Earth Society has members all around the globe.
 142     ```
 143
 144     isn't really properly handled by Markdown, so let's do our best to
 145     identify the individual elements, and mark them, using a syntax similar to
 146     what pandoc uses already in some cases. As pandoc won't actually use these
 147     data (yet?), we call `self._reformat_quotes` later to use these markers
 148     to slap the appropriate classes on the HTML tags.
 149     '''
 150
 151     def generate_lines_with_context(mdwn):
 152         '''
 153         Iterates the input string line-wise, returning a triplet of
 154         previous, current, and next line, the first and last of which
 155         will be None on the first and last line of the input data
 156         respectively.
 157         '''
 158         prev = cur = nxt = None
 159         lines = iter(mdwn.splitlines())
 160         cur = next(lines)
 161         for nxt in lines:
 162             yield prev, cur, nxt
 163             prev = cur
 164             cur = nxt
 165         yield prev, cur, None
 166
 167     ret = []
 168     for prev, cur, nxt in generate_lines_with_context(mdwn):
 169
 170         # The lead-in to a quote is a single line immediately preceding the
 171         # quote, and ending with ':'. Note that there could be multiple of
 172         # these:
 173         if re.match(r'^.+:\s*$', cur) and nxt.startswith('>'):
 174             ret.append(f'{{.quotelead}}{cur.strip()}')
 175             # pandoc needs an empty line before the blockquote, so
 176             # we enter one for the purpose of HTML rendition:
 177             ret.append('')
 178             continue
 179
 180         # The first blockquote after such a lead-in gets marked as the
 181         # "initial" quote:
 182         elif prev and re.match(r'^.+:\s*$', prev) and cur.startswith('>'):
 183             ret.append(re.sub(r'^(\s*>\s*)+(.+)',
 184                               r'\g<1>{.quoteinitial}\g<2>',
 185                               cur, flags=re.MULTILINE))
 186
 187         # All other occurrences of blockquotes get the "subsequent" marker:
 188         elif cur.startswith('>') and prev and not prev.startswith('>'):
 189             ret.append(re.sub(r'^((?:\s*>\s*)+)(.+)',
 190                               r'\g<1>{.quotesubsequent}\g<2>',
 191                               cur, flags=re.MULTILINE))
 192
 193         else: # pass through everything else.
 194             ret.append(cur)
 195
 196     return '\n'.join(ret)
 197
 198
 199 def _reformat_quotes(html):
 200     '''
 201     Earlier in the pipeline, we marked email quoting, using markers, which we
 202     now need to turn into HTML classes, so that we can use CSS to style them.
 203     '''
 204     ret = html.replace('<p>{.quotelead}', '<p class="quotelead">')
 205     ret = re.sub(r'<blockquote>\n((?:<blockquote>\n)*)<p>(?:\{\.quote(\w+)\})',
 206                  r'<blockquote class="quote \g<2>">\n\g<1><p>', ret, flags=re.MULTILINE)
 207     return ret
 208
 209
 210
 211 def _convert_with_pandoc(mdwn, inputfmt='markdown', outputfmt='html5',
 212                          ext_enabled=None, ext_disabled=None,
 213                          standalone=True, title="HTML E-Mail"):
 214     '''
 215     Invoke pandoc to do the actual conversion of Markdown to HTML5.
 216     '''
 217     if not ext_enabled:
 218         ext_enabled = [ 'backtick_code_blocks',
 219                        'line_blocks',
 220                        'fancy_lists',
 221                        'startnum',
 222                        'definition_lists',
 223                        'example_lists',
 224                        'table_captions',
 225                        'simple_tables',
 226                        'multiline_tables',
 227                        'grid_tables',
 228                        'pipe_tables',
 229                        'all_symbols_escapable',
 230                        'intraword_underscores',
 231                        'strikeout',
 232                        'superscript',
 233                        'subscript',
 234                        'fenced_divs',
 235                        'bracketed_spans',
 236                        'footnotes',
 237                        'inline_notes',
 238                        'emoji',
 239                        'tex_math_double_backslash',
 240                        'autolink_bare_uris'
 241                       ]
 242     if not ext_disabled:
 243         ext_disabled = [ 'tex_math_single_backslash',
 244                          'tex_math_dollars',
 245                          'smart',
 246                          'raw_html'
 247                        ]
 248
 249     enabled = '+'.join(ext_enabled)
 250     disabled = '-'.join(ext_disabled)
 251     inputfmt = f'{inputfmt}+{enabled}-{disabled}'
 252
 253     args = []
 254     if standalone:
 255         args.append('--standalone')
 256     if title:
 257         args.append(f'--metadata=pagetitle:"{title}"')
 258
 259     return pypandoc.convert_text(mdwn, format=inputfmt, to=outputfmt,
 260                                  extra_args=args)
 261
 262
 263 def _apply_styling(html):
 264     '''
 265     Inline all styles defined and used into the individual HTML tags.
 266     '''
 267     return pynliner.Pynliner().from_string(html).with_cssString(DEFAULT_CSS).run()
 268
 269
 270 def _postprocess_html(html):
 271     '''
 272     Postprocess the generated and styled HTML.
 273     '''
 274     return html
 275
 276
 277 def convert_markdown_to_html(mdwn):
 278     '''
 279     Converts the input Markdown to HTML, handling separately the body, as well
 280     as an optional signature.
 281     '''
 282     parts = re.split(r'^-- $', mdwn, 1, flags=re.MULTILINE)
 283     body = parts[0]
 284     if len(parts) == 2:
 285         sig = parts[1]
 286     else:
 287         sig = None
 288
 289     html=''
 290     if body:
 291         body = _preprocess_markdown(body)
 292         body = _identify_quotes_for_later(body)
 293         html = _convert_with_pandoc(body, standalone=False)
 294         html = _reformat_quotes(html)
 295
 296     if sig:
 297         sig = _preprocess_markdown(sig)
 298         html += SIGNATURE_HTML.format(sig='<br/>'.join(sig.splitlines()))
 299
 300     html = HTML_DOCUMENT.format(htmlbody=html)
 301     html = _apply_styling(html)
 302     html = _postprocess_html(html)
 303
 304     return html
 305
 306
 307 def main():
 308     '''
 309     Convert text on stdin to HTML, and print it to stdout, like mutt would
 310     expect.
 311     '''
 312     html = convert_markdown_to_html(sys.stdin.read())
 313     if html:
 314         # mutt expects the content type in the first line, so:
 315         print(f'text/html\n\n{html}')
 316
 317
 318 if __name__ == '__main__':
 319     main()