#!/usr/bin/python3
#
# markdown2html.py — simple Markdown-to-HTML converter for use with Mutt
#
# Mutt recently learnt [how to compose `multipart/alternative`
# emails][1]. This script assumes a message has been composed using Markdown
# (with a lot of pandoc extensions enabled), and translates it to `text/html`
# for Mutt to tie into such a `multipart/alternative` message.
#
# [1]: https://gitlab.com/muttmua/mutt/commit/0e566a03725b4ad789aa6ac1d17cdf7bf4e7e354
#
# Configuration:
#   muttrc:
#     set send_multipart_alternative=yes
#     set send_multipart_alternative_filter=/path/to/markdown2html.py
#
# Optionally, custom CSS styles will be read from
# `~/.config/mutt/markdown2html.css`, if present.
#
# Requirements:
#   - python3
#   - PyPandoc (and pandoc installed, or downloaded)
#   - Pynliner
#
# Optional:
#   - Pygments, if installed, then syntax highlighting is enabled
#
# Latest version:
#   https://git.madduck.net/etc/mutt.git/blob_plain/HEAD:/.mutt/markdown2html
#
# Copyright © 2019 martin f. krafft
# Released under the GPL-2+ licence, just like Mutt itself.
#

import os
import re
import sys

import pynliner
import pypandoc

# Syntax highlighting is optional: when Pygments is available, seed the
# stylesheet with its rules for pandoc's `.sourceCode` blocks.
try:
    from pygments.formatters import get_formatter_by_name
    formatter = get_formatter_by_name('html', style='default')
    DEFAULT_CSS = formatter.get_style_defs('.sourceCode')

except ImportError:
    DEFAULT_CSS = ""

DEFAULT_CSS += '''
.block {
    padding: 0 0.5em;
    margin: 0;
    border-left: 2px solid #eee;
}
.quote, blockquote {
    padding: 0 0.5em;
    margin: 0;
    font-style: italic;
    border-left: 2px solid #666;
    color: #666;
    font-size: 80%;
}
.quotelead {
    margin-bottom: -1em;
    font-size: 80%;
}
.quotechar { display: none; }
.footnote-ref, .footnote-back { text-decoration: none;}
.signature {
    color: #999;
    font-family: monospace;
    white-space: pre;
    margin: 1em 0 0 0;
    font-size: 80%;
}
table, th, td {
    border-collapse: collapse;
    border: 1px solid #999;
}
th, td { padding: 0.5em; }
.header { background: #eee; }
.even { background: #eee; }
h1, h2, h3, h4, h5, h6 {
    color: #666;
    background-color: #eee;
    padding-left: 0.5em
}
h1 { font-size: 130%; }
h2 { font-size: 120%; }
h3 { font-size: 110%; }
h4 { font-size: 107%; }
h5 { font-size: 103%; }
h6 { font-size: 100%; }
p { padding: 0 0.5em; }
pre { padding: 0 1em; }
'''

# User-supplied rules are appended last, so that they can override the
# defaults above.
STYLESHEET = os.path.join(os.path.expanduser('~/.config/mutt'),
                          'markdown2html.css')
if os.path.exists(STYLESHEET):
    with open(STYLESHEET) as stylesheet:
        DEFAULT_CSS += stylesheet.read()

# Wrapper for the rendered signature; `{sig}` receives the signature HTML.
# NOTE(review): the element markup of this literal was reconstructed from the
# `.signature` CSS rule above — confirm the exact tags against upstream.
SIGNATURE_HTML = \
        '<div class="signature"><span class="leader">-- </span>{sig}</div>'

# A line that does not start with '>' and ends with ':' (optionally followed
# by whitespace) is treated as the lead-in to a quote.
_QUOTE_LEAD_RE = re.compile(r'^[^>]+.*:\s*$')

# Splits a quoted line into *all* of its leading '>' markers (group 1) and
# the remaining text (group 2).
_QUOTE_PREFIX_RE = re.compile(r'^((?:\s*>\s*)+)(.+)', flags=re.MULTILINE)


def _preprocess_signature(sig):
    '''
    Preprocess the signature before markdown processing.

    The signature is wrapped into a fenced code block, so that the converter
    leaves its contents alone.
    '''
    return f"```\n{sig}\n```"


def _preprocess_markdown(mdwn):
    '''
    Preprocess Markdown for handling by the converter.

    Returns the input with hard line breaks made explicit, and with
    angle-bracketed email addresses tagged with a sentinel that
    `_postprocess_html` removes again.
    '''
    # convert hard line breaks within paragraphs to 2 trailing spaces, which
    # is the markdown way of representing hard line breaks. Note how the
    # regexp will not match between paragraphs.
    ret = re.sub(r'(\S)\n(\s*\S)', r'\g<1>  \n\g<2>', mdwn, flags=re.MULTILINE)

    # Clients like Thunderbird need the leading '>' to be able to properly
    # create nested quotes, so we duplicate the symbol, the first instance
    # will tell pandoc to create a blockquote, while the second instance will
    # be a span containing the character, along with a class that causes CSS
    # to actually hide it from display. However, this does not work with the
    # text-mode HTML2text converters, and so it's left commented for now.
    #ret = re.sub(r'\n>', r' \n>[>]{.quotechar}', ret, flags=re.MULTILINE)

    # With the autolink_bare_uris extension, we do not need to put links into
    # angle brackets to have them converted, so let's conserve the brackets
    # when used around email addresses. Note that this needs a postprocessing
    # hack because the pandoc autolink converter includes the ampersand
    # (https://github.com/jgm/pandoc/issues/7398).
    ret = re.sub(r'<([^@]+@\S+)>', r'<\g<1> PANDOC_BUG_7398 >', ret)

    return ret


def _identify_quotes_for_later(mdwn):
    '''
    Email quoting such as:

    ```
    On 1970-01-01, you said:
    > The Flat Earth Society has members all around the globe.
    ```

    isn't really properly handled by Markdown, so let's do our best to
    identify the individual elements, and mark them, using a syntax similar to
    what pandoc uses already in some cases. As pandoc won't actually use these
    data (yet?), we call `_reformat_quotes` later to use these markers to slap
    the appropriate classes on the HTML tags.

    Returns the marked-up text, with lines joined by a single newline.
    '''

    def generate_lines_with_context(mdwn):
        '''
        Iterates the input string line-wise, returning a triplet of
        previous, current, and next line, the first and last of which
        will be None on the first and last line of the input data
        respectively. Yields nothing for input without any lines.
        '''
        lines = iter(mdwn.splitlines())
        prev = None
        cur = next(lines, None)
        if cur is None:
            # A bare next() would leak StopIteration out of the generator,
            # which Python turns into a RuntimeError.
            return
        for nxt in lines:
            yield prev, cur, nxt
            prev, cur = cur, nxt
        yield prev, cur, None

    ret = []
    for prev, cur, nxt in generate_lines_with_context(mdwn):

        # The lead-in to a quote is a single line immediately preceding the
        # quote, and ending with ':'. Note that there could be multiple of
        # these. The last line of the input has no successor (nxt is None),
        # and hence can never be a lead-in:
        if (_QUOTE_LEAD_RE.match(cur) and nxt is not None
                and nxt.startswith('>')):
            ret.append(f'{{.quotelead}}{cur.strip()}')
            # pandoc needs an empty line before the blockquote, so
            # we enter one for the purpose of HTML rendition:
            ret.append('')

        # The first blockquote after such a lead-in gets marked as the
        # "initial" quote. The whole run of '>' markers must be preserved,
        # or else nested quotes would lose levels:
        elif prev and _QUOTE_LEAD_RE.match(prev) and cur.startswith('>'):
            ret.append(_QUOTE_PREFIX_RE.sub(r'\g<1>{.quoteinitial}\g<2>', cur))

        # All other occurrences of blockquotes get the "subsequent" marker:
        elif (cur.startswith('>') and prev is not None
                and not prev.startswith('>')):
            ret.append(_QUOTE_PREFIX_RE.sub(r'\g<1>{.quotesubsequent}\g<2>',
                                            cur))

        else:  # pass through everything else.
            ret.append(cur)

    return '\n'.join(ret)


def _reformat_quotes(html):
    '''
    Earlier in the pipeline, we marked email quoting, using markers, which we
    now need to turn into HTML classes, so that we can use CSS to style them.

    Returns the HTML with the `{.quote…}` markers replaced by class
    attributes.
    '''
    # NOTE(review): the tag names in the three literals below were
    # reconstructed from the marker syntax and the `.quotelead` / `.quote`
    # CSS rules — confirm them against upstream.
    ret = html.replace('<p>{.quotelead}', '<p class="quotelead">')
    # Group 1 holds any additional nesting levels, which are passed through
    # unchanged; group 2 is the marker suffix ("initial" or "subsequent"),
    # which becomes a class on the outermost blockquote.
    ret = re.sub(r'<blockquote>\n((?:<blockquote>\n)*)<p>(?:\{\.quote(\w+)\})',
                 r'<blockquote class="quote \g<2>">\n\g<1><p>',
                 ret, flags=re.MULTILINE)
    return ret


def _convert_with_pandoc(mdwn, inputfmt='markdown', outputfmt='html5',
                         ext_enabled=None, ext_disabled=None,
                         standalone=True, selfcontained=True, title='Untitled'):
    '''
    Invoke pandoc to do the actual conversion of Markdown to HTML5.

    `ext_enabled` and `ext_disabled` are lists of pandoc extension names to
    switch on and off for the input format; when not given (or empty), the
    defaults below apply. `standalone`, `selfcontained`, and `title` map onto
    the corresponding pandoc options. Returns what `pypandoc.convert_text`
    returns.
    '''
    if not ext_enabled:
        ext_enabled = [
            'backtick_code_blocks',
            'line_blocks',
            'fancy_lists',
            'startnum',
            'definition_lists',
            'example_lists',
            'table_captions',
            'simple_tables',
            'multiline_tables',
            'grid_tables',
            'pipe_tables',
            'all_symbols_escapable',
            'intraword_underscores',
            'strikeout',
            'superscript',
            'subscript',
            'fenced_divs',
            'bracketed_spans',
            'footnotes',
            'inline_notes',
            'emoji',
            'tex_math_double_backslash',
            'autolink_bare_uris',
        ]
    if not ext_disabled:
        ext_disabled = [
            'tex_math_single_backslash',
            'tex_math_dollars',
            'smart',
            'raw_html',
        ]

    # pandoc takes extensions as suffixes to the format name, e.g.
    # markdown+footnotes-smart:
    enabled = '+'.join(ext_enabled)
    disabled = '-'.join(ext_disabled)
    inputfmt = f'{inputfmt}+{enabled}-{disabled}'

    args = ['--metadata=document-css:false']
    if standalone:
        args.append('--standalone')
    if selfcontained:
        # NOTE(review): newer pandoc releases appear to deprecate this flag
        # in favour of --embed-resources — confirm for the installed version.
        args.append('--self-contained')
    if title:
        args.append(f'--metadata=title:{title}')

    return pypandoc.convert_text(mdwn, format=inputfmt, to=outputfmt,
                                 extra_args=args)


def _apply_styling(html):
    '''
    Inline all styles defined and used into the individual HTML tags.
    '''
    inliner = pynliner.Pynliner().from_string(html)
    return inliner.with_cssString(DEFAULT_CSS).run()


def _postprocess_html(html):
    '''
    Postprocess the generated and styled HTML.
    '''
    # Preprocessing leaves a sentinel to work around
    # https://github.com/jgm/pandoc/issues/7398, and so we need to remove it:
    html = html.replace(' PANDOC_BUG_7398 ', '')
    return html


def convert_markdown_to_html(mdwn):
    '''
    Converts the input Markdown to HTML, handling separately the body, as well
    as an optional signature.

    The signature is whatever follows the first line consisting of exactly
    "-- ". Returns the styled HTML as a string.
    '''
    # maxsplit is passed by keyword, as passing it positionally is deprecated
    # in recent Python versions.
    parts = re.split(r'^-- \n', mdwn, maxsplit=1, flags=re.MULTILINE)
    body = parts[0]
    sig = parts[1] if len(parts) == 2 else None

    html = ''
    if body:
        body = _preprocess_markdown(body)
        body = _identify_quotes_for_later(body)
        html = _convert_with_pandoc(body, standalone=True, selfcontained=True,
                                    title=None)
        html = _reformat_quotes(html)

    if sig:
        sig = _preprocess_signature(sig)
        sig = _preprocess_markdown(sig)
        sig = _convert_with_pandoc(sig, standalone=False, selfcontained=False,
                                   title=None)
        sig = SIGNATURE_HTML.format(sig=sig)
        # The signature goes right before the end of the document body.
        # NOTE(review): the closing tag searched for was reconstructed from
        # the variable name ("end of body") — confirm against upstream.
        eob = html.find('</body>')
        if eob == -1:
            # No enclosing document (e.g. there was no message body), so
            # simply append, rather than slicing with a negative index.
            eob = len(html)
        html = f'{html[:eob]}{sig}\n{html[eob:]}'

    html = _apply_styling(html)
    html = _postprocess_html(html)

    return html


def main():
    '''
    Convert text on stdin to HTML, and print it to stdout, like mutt would
    expect. If a file name is given as first argument, then that file is read
    instead of stdin.
    '''
    with open(sys.argv[1], 'r') if len(sys.argv) > 1 else sys.stdin as f:
        html = convert_markdown_to_html(f.read())
        if html:
            # mutt expects the content type in the first line, so:
            print(f'text/html\n\n{html}')


if __name__ == '__main__':
    main()