All patches and comments are welcome. Please squash your changes into logical
commits before using git-format-patch and git-send-email to send them to
patches@git.madduck.net.
If you'd read over the Git project's submission guidelines and adhered to them,
I'd be especially grateful.
3 # markdown2html.py — simple Markdown-to-HTML converter for use with Mutt
5 # Mutt recently learnt [how to compose `multipart/alternative`
6 # emails][1]. This script assumes a message has been composed using Markdown
7 # (with a lot of pandoc extensions enabled), and translates it to `text/html`
8 # for Mutt to tie into such a `multipart/alternative` message.
10 # [1]: https://gitlab.com/muttmua/mutt/commit/0e566a03725b4ad789aa6ac1d17cdf7bf4e7e354
14 # set send_multipart_alternative=yes
15 # set send_multipart_alternative_filter=/path/to/markdown2html.py
17 # Optionally, custom CSS styles will be read from `~/.config/mutt/markdown2html.css`,
22 # - PyPandoc (and pandoc installed, or downloaded)
26 # - Pygments, if installed, then syntax highlighting is enabled
29 # https://git.madduck.net/etc/mutt.git/blob_plain/HEAD:/.mutt/markdown2html
31 # Copyright © 2019 martin f. krafft <madduck@madduck.net>
32 # Released under the GPL-2+ licence, just like Mutt itself.
# Seed DEFAULT_CSS from Pygments: fetch the 'html' formatter with the
# 'default' style and ask it for its style definitions, passing '.sourceCode'.
42 from pygments.formatters import get_formatter_by_name
43 formatter = get_formatter_by_name('html', style='default')
44 DEFAULT_CSS = formatter.get_style_defs('.sourceCode')
54 border-left: 2px solid #eee;
60 border-left: 2px solid #666;
68 .quotechar { display: none; }
69 .footnote-ref, .footnote-back { text-decoration: none;}
72 font-family: monospace;
78 border-collapse: collapse;
79 border: 1px solid #999;
81 th, td { padding: 0.5em; }
85 .even { background: #eee; }
86 h1, h2, h3, h4, h5, h6 {
88 background-color: #eee;
91 h1 { font-size: 130%; }
92 h2 { font-size: 120%; }
93 h3 { font-size: 110%; }
94 h4 { font-size: 107%; }
95 h5 { font-size: 103%; }
96 h6 { font-size: 100%; }
97 p { padding: 0 0.5em; }
98 pre { padding: 0 1em; }
# Optional user stylesheet: when the file exists, its contents are appended to
# DEFAULT_CSS. (The second argument of the os.path.join() call is on a line
# not shown in this view.) The string literal that closes this block is
# presumably the SIGNATURE_HTML template that convert_markdown_to_html() fills
# in through str.format(sig=...).
101 STYLESHEET = os.path.join(os.path.expanduser('~/.config/mutt'),
103 if os.path.exists(STYLESHEET):
# NOTE(review): the file object is never closed explicitly and no encoding is
# passed, so the platform default applies; a `with open(..., encoding=...)`
# block would be more robust.
104 DEFAULT_CSS += open(STYLESHEET).read()
107 '<div class="signature"><span class="leader">-- </span>{sig}</div>'
# Signature-specific preprocessing hook: convert_markdown_to_html() calls it
# on the signature text before handing that text to _preprocess_markdown().
# The function body is not shown in this view.
110 def _preprocess_signature(sig):
112 Preprocess the signature before markdown processing.
# Rewrites the raw message text before pandoc sees it: forces hard line breaks
# inside paragraphs and tags <address@host> spans with a sentinel that
# _postprocess_html() removes again after conversion.
116 def _preprocess_markdown(mdwn):
118 Preprocess Markdown for handling by the converter.
120 # convert hard line breaks within paragraphs to 2 trailing spaces, which
121 # is the markdown way of representing hard line breaks. Note how the
122 # regexp will not match between paragraphs.
# NOTE(review): `\s*` also matches newlines, so the pattern below can in fact
# span a blank line between paragraphs; there it only adds trailing whitespace
# to the last line of the first paragraph. Also, the replacement as shown here
# inserts one space before the newline, not the two the comment describes --
# confirm against the real file (whitespace may have been collapsed here).
123 ret = re.sub(r'(\S)\n(\s*\S)', r'\g<1> \n\g<2>', mdwn, flags=re.MULTILINE)
125 # Clients like Thunderbird need the leading '>' to be able to properly
126 # create nested quotes, so we duplicate the symbol, the first instance
127 # will tell pandoc to create a blockquote, while the second instance will
128 # be a <span> containing the character, along with a class that causes CSS
129 # to actually hide it from display. However, this does not work with the
130 # text-mode HTML2text converters, and so it's left commented for now.
131 #ret = re.sub(r'\n>', r' \n>[>]{.quotechar}', ret, flags=re.MULTILINE)
133 # With the autolink_bare_uris extension, we do not need to put links into
134 # angle brackets to have them converted, so let's conserve the brackets
135 # when used around email addresses. Note that this needs a postprocessing
136 # hack because the pandoc autolink converter includes the ampersand
137 # (https://github.com/jgm/pandoc/issues/7398).
138 ret = re.sub(r'<([^@]+@.+\.[^>]+)>', r'<\g<1> -PANDOC_BUG_7398->', ret)
# Marks e-mail quote structure in the Markdown text with "{.quotelead}",
# "{.quoteinitial}" and "{.quotesubsequent}" tokens so that the module-level
# _reformat_quotes() can turn them into CSS classes after pandoc has run (the
# `self._reformat_quotes` spelling in the description below refers to that
# function). The result is the processed lines joined with newlines.
143 def _identify_quotes_for_later(mdwn):
145 Email quoting such as:
148 On 1970-01-01, you said:
149 > The Flat Earth Society has members all around the globe.
152 isn't really properly handled by Markdown, so let's do our best to
153 identify the individual elements, and mark them, using a syntax similar to
154 what pandoc uses already in some cases. As pandoc won't actually use these
155 data (yet?), we call `self._reformat_quotes` later to use these markers
156 to slap the appropriate classes on the HTML tags.
159 def generate_lines_with_context(mdwn):
161 Iterates the input string line-wise, returning a triplet of
162 previous, current, and next line, the first and last of which
163 will be None on the first and last line of the input data
166 prev = cur = nxt = None
167 lines = iter(mdwn.splitlines())
173 yield prev, cur, None
176 for prev, cur, nxt in generate_lines_with_context(mdwn):
178 # The lead-in to a quote is a single line immediately preceding the
179 # quote, and ending with ':'. Note that there could be multiple of
# NOTE(review): on the final input line nxt is None (the generator ends with
# `yield prev, cur, None`), so `nxt.startswith` raises AttributeError when
# that line matches the lead-in pattern, unless a line elided from this view
# guards against it.
181 if re.match(r'^[^>]+.*:\s*$', cur) and nxt.startswith('>'):
182 ret.append(f'{{.quotelead}}{cur.strip()}')
183 # pandoc needs an empty line before the blockquote, so
184 # we enter one for the purpose of HTML rendition:
188 # The first blockquote after such a lead-in gets marked as the
190 elif prev and re.match(r'^[^>]+.*:\s*$', prev) and cur.startswith('>'):
# NOTE(review): `(\s*>\s*)+` repeats a capturing group, so \g<1> holds only
# the last repetition and nested markers such as "> > text" lose a level; the
# "subsequent" branch below avoids this by capturing `((?:\s*>\s*)+)`.
191 ret.append(re.sub(r'^(\s*>\s*)+(.+)',
192 r'\g<1>{.quoteinitial}\g<2>',
193 cur, flags=re.MULTILINE))
195 # All other occurrences of blockquotes get the "subsequent" marker:
196 elif cur.startswith('>') and prev is not None and not prev.startswith('>'):
197 ret.append(re.sub(r'^((?:\s*>\s*)+)(.+)',
198 r'\g<1>{.quotesubsequent}\g<2>',
199 cur, flags=re.MULTILINE))
201 else: # pass through everything else.
204 return '\n'.join(ret)
# Turns the "{.quote...}" tokens left in the pandoc output by
# _identify_quotes_for_later() into class attributes on the generated tags.
207 def _reformat_quotes(html):
209 Earlier in the pipeline, we marked email quoting, using markers, which we
210 now need to turn into HTML classes, so that we can use CSS to style them.
212 ret = html.replace('<p>{.quotelead}', '<p class="quotelead">')
# Group 1 keeps any further nested <blockquote> openers; group 2 is the part
# of the token after "quote" (e.g. "initial" or "subsequent"), which becomes a
# second class on the outermost <blockquote> while the token itself is
# dropped from the paragraph.
213 ret = re.sub(r'<blockquote>\n((?:<blockquote>\n)*)<p>(?:\{\.quote(\w+)\})',
214 r'<blockquote class="quote \g<2>">\n\g<1><p>', ret, flags=re.MULTILINE)
# Thin wrapper around pypandoc.convert_text(): assembles the input-format
# string from the enabled/disabled extension lists and collects extra
# command-line flags according to standalone, selfcontained and title. Large
# parts of the extension lists and of the argument handling are not shown in
# this view.
219 def _convert_with_pandoc(mdwn, inputfmt='markdown', outputfmt='html5',
220 ext_enabled=None, ext_disabled=None,
221 standalone=True, selfcontained=True, title=None):
223 Invoke pandoc to do the actual conversion of Markdown to HTML5.
226 ext_enabled = [ 'backtick_code_blocks',
237 'all_symbols_escapable',
238 'intraword_underscores',
247 'tex_math_double_backslash',
251 ext_disabled = [ 'tex_math_single_backslash',
# Build pandoc's format specifier: the base format followed by "+ext" for
# every enabled extension and "-ext" for every disabled one.
257 enabled = '+'.join(ext_enabled)
258 disabled = '-'.join(ext_disabled)
259 inputfmt = f'{inputfmt}+{enabled}-{disabled}'
263 args.append('--standalone')
# NOTE(review): newer pandoc releases deprecate --self-contained in favour of
# --embed-resources together with --standalone -- confirm against the pandoc
# version in use.
265 args.append('--self-contained')
267 args.append(f'--metadata=pagetitle:"{title}"')
269 return pypandoc.convert_text(mdwn, format=inputfmt, to=outputfmt,
# Runs pynliner over the HTML document with DEFAULT_CSS supplied as an
# additional CSS string and returns what pynliner produces.
273 def _apply_styling(html):
275 Inline all styles defined and used into the individual HTML tags.
277 return pynliner.Pynliner().from_string(html).with_cssString(DEFAULT_CSS).run()
# Final clean-up pass over the styled HTML: wherever the ' -PANDOC_BUG_7398->'
# sentinel (inserted by _preprocess_markdown) directly follows a closing </a>,
# it is replaced by a plain '>'.
280 def _postprocess_html(html):
282 Postprocess the generated and styled HTML.
285 # Preprocessing leaves a sentinel to work around
286 # https://github.com/jgm/pandoc/issues/7398, and so we need to remove it:
287 html = html.replace('</a> -PANDOC_BUG_7398->', '</a>>')
# Top-level pipeline: split the message into body and signature, preprocess
# and convert each with pandoc, splice the signature into the body document,
# then inline the CSS and run the final clean-up. The handling of `parts`
# (how body and sig are derived from it) is not shown in this view.
291 def convert_markdown_to_html(mdwn):
293 Converts the input Markdown to HTML, handling separately the body, as well
294 as an optional signature.
296 parts = re.split(r'^-- $', mdwn, 1, flags=re.MULTILINE)
# The re.split() call above cuts at the first line consisting of exactly
# "-- "; with a maxsplit of 1, any later such line stays in the second part.
# NOTE(review): maxsplit is passed positionally there, which Python 3.13
# deprecates for re.split(); spelling it `maxsplit=1` avoids the warning.
305 body = _preprocess_markdown(body)
306 body = _identify_quotes_for_later(body)
307 html = _convert_with_pandoc(body, standalone=True, selfcontained=True,
309 html = html.replace('<title>Untitled</title>\n','')
310 html = _reformat_quotes(html)
313 sig = _preprocess_signature(sig)
314 sig = _preprocess_markdown(sig)
315 sig = _convert_with_pandoc(sig, standalone=False, selfcontained=False)
316 sig = SIGNATURE_HTML.format(sig='<br/>'.join(sig.splitlines()))
# Splice the rendered signature in just before the closing </body> tag.
# NOTE(review): find() returns -1 when '</body>' is absent, in which case the
# slices below would insert the signature before the last character instead.
317 eob = html.find('</body>')
318 html = f'{html[:eob]}{sig}\n{html[eob:]}'
320 html = _apply_styling(html)
321 html = _postprocess_html(html)
328 Convert text on stdin to HTML, and print it to stdout, like mutt would
331 html = convert_markdown_to_html(sys.stdin.read())
333 # mutt expects the content type in the first line, so:
334 print(f'text/html\n\n{html}')
337 if __name__ == '__main__':