All patches and comments are welcome. Please squash your changes into logical
commits before using git-format-patch and git-send-email to send them to
patches@git.madduck.net.
If you'd read over the Git project's submission guidelines and adhered to them,
I'd be especially grateful.
3 # markdown2html.py — simple Markdown-to-HTML converter for use with Mutt
5 # Mutt recently learnt [how to compose `multipart/alternative`
6 # emails][1]. This script assumes a message has been composed using Markdown
7 # (with a lot of pandoc extensions enabled), and translates it to `text/html`
8 # for Mutt to tie into such a `multipart/alternative` message.
10 # [1]: https://gitlab.com/muttmua/mutt/commit/0e566a03725b4ad789aa6ac1d17cdf7bf4e7e354
14 # set send_multipart_alternative=yes
15 # set send_multipart_alternative_filter=/path/to/markdown2html.py
17 # Optionally, custom CSS styles will be read from `~/.config/mutt/markdown2html.css`,
22 # - PyPandoc (and pandoc installed, or downloaded)
26 # - Pygments; if installed, syntax highlighting is enabled
29 # https://git.madduck.net/etc/mutt.git/blob_plain/HEAD:/.mutt/markdown2html
31 # Copyright © 2019 martin f. krafft <madduck@madduck.net>
32 # Released under the GPL-2+ licence, just like Mutt itself.
42 from pygments.formatters import get_formatter_by_name
43 formatter = get_formatter_by_name('html', style='default')
44 DEFAULT_CSS = formatter.get_style_defs('.sourceCode')
54 border-left: 2px solid #eee;
60 border-left: 2px solid #666;
68 .quotechar { display: none; }
69 .footnote-ref, .footnote-back { text-decoration: none;}
72 font-family: monospace;
78 border-collapse: collapse;
79 border: 1px solid #999;
81 th, td { padding: 0.5em; }
85 .even { background: #eee; }
86 h1, h2, h3, h4, h5, h6 {
88 background-color: #eee;
91 h1 { font-size: 130%; }
92 h2 { font-size: 120%; }
93 h3 { font-size: 110%; }
94 h4 { font-size: 107%; }
95 h5 { font-size: 103%; }
96 h6 { font-size: 100%; }
97 p { padding: 0 0.5em; }
98 pre { padding: 0 1em; }
101 STYLESHEET = os.path.join(os.path.expanduser('~/.config/mutt'),
103 if os.path.exists(STYLESHEET):
104 DEFAULT_CSS += open(STYLESHEET).read()
107 '<div class="signature"><span class="leader">-- </span>{sig}</div>'
110 def _preprocess_signature(sig):
112 Preprocess the signature before markdown processing.
116 def _preprocess_markdown(mdwn):
118 Preprocess Markdown for handling by the converter.
120 # convert hard line breaks within paragraphs to 2 trailing spaces, which
121 # is the markdown way of representing hard line breaks. Note how the
122 # regexp will not match between paragraphs.
123 ret = re.sub(r'(\S)\n(\s*\S)', r'\g<1> \n\g<2>', mdwn, flags=re.MULTILINE)
125 # Clients like Thunderbird need the leading '>' to be able to properly
126 # create nested quotes, so we duplicate the symbol, the first instance
127 # will tell pandoc to create a blockquote, while the second instance will
128 # be a <span> containing the character, along with a class that causes CSS
129 # to actually hide it from display. However, this does not work with the
130 # text-mode HTML2text converters, and so it's left commented for now.
131 #ret = re.sub(r'\n>', r' \n>[>]{.quotechar}', ret, flags=re.MULTILINE)
133 # With the autolink_bare_uris extension, we do not need to put links into
134 # angle brackets to have them converted, so let's conserve the brackets
135 # when used around email addresses. Note that this needs a postprocessing
136 # hack because the pandoc autolink converter includes the ampersand
137 # (https://github.com/jgm/pandoc/issues/7398).
138 ret = re.sub(r'<([^@]+@\S+)>', r'<\g<1> PANDOC_BUG_7398 >', ret)
142 def _identify_quotes_for_later(mdwn):
144 Email quoting such as:
147 On 1970-01-01, you said:
148 > The Flat Earth Society has members all around the globe.
151 isn't really properly handled by Markdown, so let's do our best to
152 identify the individual elements, and mark them, using a syntax similar to
153 what pandoc uses already in some cases. As pandoc won't actually use these
154 data (yet?), we call `self._reformat_quotes` later to use these markers
155 to slap the appropriate classes on the HTML tags.
158 def generate_lines_with_context(mdwn):
160 Iterates the input string line-wise, returning a triplet of
161 previous, current, and next line, the first and last of which
162 will be None on the first and last line of the input data
165 prev = cur = nxt = None
166 lines = iter(mdwn.splitlines())
172 yield prev, cur, None
175 for prev, cur, nxt in generate_lines_with_context(mdwn):
177 # The lead-in to a quote is a single line immediately preceding the
178 # quote, and ending with ':'. Note that there could be multiple of
180 if re.match(r'^[^>]+.*:\s*$', cur) and nxt.startswith('>'):
181 ret.append(f'{{.quotelead}}{cur.strip()}')
182 # pandoc needs an empty line before the blockquote, so
183 # we enter one for the purpose of HTML rendition:
187 # The first blockquote after such a lead-in gets marked as the
189 elif prev and re.match(r'^[^>]+.*:\s*$', prev) and cur.startswith('>'):
190 ret.append(re.sub(r'^(\s*>\s*)+(.+)',
191 r'\g<1>{.quoteinitial}\g<2>',
192 cur, flags=re.MULTILINE))
194 # All other occurrences of blockquotes get the "subsequent" marker:
195 elif cur.startswith('>') and prev is not None and not prev.startswith('>'):
196 ret.append(re.sub(r'^((?:\s*>\s*)+)(.+)',
197 r'\g<1>{.quotesubsequent}\g<2>',
198 cur, flags=re.MULTILINE))
200 else: # pass through everything else.
203 return '\n'.join(ret)
206 def _reformat_quotes(html):
208 Earlier in the pipeline, we marked email quoting, using markers, which we
209 now need to turn into HTML classes, so that we can use CSS to style them.
211 ret = html.replace('<p>{.quotelead}', '<p class="quotelead">')
212 ret = re.sub(r'<blockquote>\n((?:<blockquote>\n)*)<p>(?:\{\.quote(\w+)\})',
213 r'<blockquote class="quote \g<2>">\n\g<1><p>', ret, flags=re.MULTILINE)
218 def _convert_with_pandoc(mdwn, inputfmt='markdown', outputfmt='html5',
219 ext_enabled=None, ext_disabled=None,
220 standalone=True, selfcontained=True, title=None):
222 Invoke pandoc to do the actual conversion of Markdown to HTML5.
225 ext_enabled = [ 'backtick_code_blocks',
236 'all_symbols_escapable',
237 'intraword_underscores',
246 'tex_math_double_backslash',
250 ext_disabled = [ 'tex_math_single_backslash',
256 enabled = '+'.join(ext_enabled)
257 disabled = '-'.join(ext_disabled)
258 inputfmt = f'{inputfmt}+{enabled}-{disabled}'
262 args.append('--standalone')
264 args.append('--self-contained')
266 args.append(f'--metadata=pagetitle:"{title}"')
268 return pypandoc.convert_text(mdwn, format=inputfmt, to=outputfmt,
272 def _apply_styling(html):
274 Inline all styles defined and used into the individual HTML tags.
276 return pynliner.Pynliner().from_string(html).with_cssString(DEFAULT_CSS).run()
279 def _postprocess_html(html):
281 Postprocess the generated and styled HTML.
284 # Preprocessing leaves a sentinel to work around
285 # https://github.com/jgm/pandoc/issues/7398, and so we need to remove it:
286 html = html.replace(' PANDOC_BUG_7398 ', '')
290 def convert_markdown_to_html(mdwn):
292 Converts the input Markdown to HTML, handling separately the body, as well
293 as an optional signature.
295 parts = re.split(r'^-- $', mdwn, 1, flags=re.MULTILINE)
304 body = _preprocess_markdown(body)
305 body = _identify_quotes_for_later(body)
306 html = _convert_with_pandoc(body, standalone=True, selfcontained=True,
308 html = html.replace('<title>Untitled</title>\n','')
309 html = _reformat_quotes(html)
312 sig = _preprocess_signature(sig)
313 sig = _preprocess_markdown(sig)
314 sig = _convert_with_pandoc(sig, standalone=False, selfcontained=False)
315 sig = SIGNATURE_HTML.format(sig='<br/>'.join(sig.splitlines()))
316 eob = html.find('</body>')
317 html = f'{html[:eob]}{sig}\n{html[eob:]}'
319 html = _apply_styling(html)
320 html = _postprocess_html(html)
327 Convert text on stdin to HTML, and print it to stdout, like mutt would
330 html = convert_markdown_to_html(sys.stdin.read())
332 # mutt expects the content type in the first line, so:
333 print(f'text/html\n\n{html}')
336 if __name__ == '__main__':