Untitled

#!/usr/bin/python3
#
# markdown2html.py — simple Markdown-to-HTML converter for use with Mutt
#
# Mutt recently learnt [how to compose `multipart/alternative`
# emails][1]. This script assumes a message has been composed using Markdown
# (with a lot of pandoc extensions enabled), and translates it to `text/html`
# for Mutt to tie into such a `multipart/alternative` message.
#
# [1]: https://gitlab.com/muttmua/mutt/commit/0e566a03725b4ad789aa6ac1d17cdf7bf4e7e354)
#
# Configuration:
#   muttrc:
#     set send_multipart_alternative=yes
#     set send_multipart_alternative_filter=/path/to/markdown2html.py
#
# Optionally, Custom CSS styles will be read from `~/.mutt/markdown2html.css`,
# if present.
#
# Requirements:
#   - python3
#   - PyPandoc (and pandoc installed, or downloaded)
#   - Pynliner
#
# Optional:
#   - Pygments, if installed, then syntax highlighting is enabled
#
# Latest version:
#   https://git.madduck.net/etc/mutt.git/blob_plain/HEAD:/.mutt/markdown2html
#
# Copyright © 2019 martin f. krafft <madduck@madduck.net>
# Released under the GPL-2+ licence, just like Mutt itself.
#

import pypandoc
import pynliner
import re
import os
import sys

try:
    from pygments.formatters import get_formatter_by_name
    formatter = get_formatter_by_name('html', style='default')
    DEFAULT_CSS = formatter.get_style_defs('.sourceCode')

except ImportError:
    DEFAULT_CSS = ""


DEFAULT_CSS += '''
.block {
    padding: 0 0.5em;
    margin: 0;
    border-left: 2px solid #eee;
}
.quote, blockquote {
    padding: 0 0.5em;
    margin: 0;
    font-style: italic;
    border-left: 2px solid #666;
    color: #666;
    font-size: 80%;
}
.quotelead {
    margin-bottom: -1em;
    font-size: 80%;
}
.quotechar { display: none; }
.footnote-ref, .footnote-back { text-decoration: none;}
.signature {
    color: #999;
    font-family: monospace;
    white-space: pre;
    margin: 1em 0 0 0;
    font-size: 80%;
}
table, th, td {
    border-collapse: collapse;
    border: 1px solid #999;
}
th, td { padding: 0.5em; }
.header {
    background: #eee;
}
.even { background: #eee; }
h1, h2, h3, h4, h5, h6 {
    color: #666;
    background-color: #eee;
    padding-left: 0.5em
}
h1 { font-size: 130%; }
h2 { font-size: 120%; }
h3 { font-size: 110%; }
h4 { font-size: 107%; }
h5 { font-size: 103%; }
h6 { font-size: 100%; }
p { padding: 0 0.5em; }
'''

STYLESHEET = os.path.join(os.path.expanduser('~/.mutt'),
                          'markdown2html.css')
if os.path.exists(STYLESHEET):
    DEFAULT_CSS += open(STYLESHEET).read()

SIGNATURE_HTML = \
        '<div class="signature"><span class="leader">-- </span>{sig}</div>'


def _preprocess_signature(sig):
    '''
    Preprocess the signature before markdown processing.
    '''
    return sig

def _preprocess_markdown(mdwn):
    '''
    Preprocess Markdown for handling by the converter.
    '''
    # convert hard line breaks within paragraphs to 2 trailing spaces, which
    # is the markdown way of representing hard line breaks. Note how the
    # regexp will not match between paragraphs.
    ret = re.sub(r'(\S)\n(\s*\S)', r'\g<1>  \n\g<2>', mdwn, flags=re.MULTILINE)

    # Clients like Thunderbird need the leading '>' to be able to properly
    # create nested quotes, so we duplicate the symbol, the first instance
    # will tell pandoc to create a blockquote, while the second instance will
    # be a <span> containing the character, along with a class that causes CSS
    # to actually hide it from display. However, this does not work with the
    # text-mode HTML2text converters, and so it's left commented for now.
    #ret = re.sub(r'\n>', r'  \n>[>]{.quotechar}', ret, flags=re.MULTILINE)

    # With the autolink_bare_uris extension, we do not need to put links into
    # angle brackets to have them converted, so let's conserve the brackets
    # when used around email addresses. Note that this needs a postprocessing
    # hack because the pandoc autolink converted includes the ambersand
    # (https://github.com/jgm/pandoc/issues/7398).
    ret = re.sub(r'<([^@]+@.+\.[^>]+)>', r'&lt;\g<1> -PANDOC_BUG_7398-&gt;', ret)

    return ret


def _identify_quotes_for_later(mdwn):
    '''
    Email quoting such as:

    ```
    On 1970-01-01, you said:
    > The Flat Earth Society has members all around the globe.
    ```

    isn't really properly handled by Markdown, so let's do our best to
    identify the individual elements, and mark them, using a syntax similar to
    what pandoc uses already in some cases. As pandoc won't actually use these
    data (yet?), we call `self._reformat_quotes` later to use these markers
    to slap the appropriate classes on the HTML tags.
    '''

    def generate_lines_with_context(mdwn):
        '''
        Iterates the input string line-wise, returning a triplet of
        previous, current, and next line, the first and last of which
        will be None on the first and last line of the input data
        respectively.
        '''
        prev = cur = nxt = None
        lines = iter(mdwn.splitlines())
        cur = next(lines)
        for nxt in lines:
            yield prev, cur, nxt
            prev = cur
            cur = nxt
        yield prev, cur, None

    ret = []
    for prev, cur, nxt in generate_lines_with_context(mdwn):

        # The lead-in to a quote is a single line immediately preceding the
        # quote, and ending with ':'. Note that there could be multiple of
        # these:
        if re.match(r'^.+:\s*$', cur) and nxt.startswith('>'):
            ret.append(f'{{.quotelead}}{cur.strip()}')
            # pandoc needs an empty line before the blockquote, so
            # we enter one for the purpose of HTML rendition:
            ret.append('')
            continue

        # The first blockquote after such a lead-in gets marked as the
        # "initial" quote:
        elif prev and re.match(r'^.+:\s*$', prev) and cur.startswith('>'):
            ret.append(re.sub(r'^(\s*>\s*)+(.+)',
                              r'\g<1>{.quoteinitial}\g<2>',
                              cur, flags=re.MULTILINE))

        # All other occurrences of blockquotes get the "subsequent" marker:
        elif cur.startswith('>') and prev and not prev.startswith('>'):
            ret.append(re.sub(r'^((?:\s*>\s*)+)(.+)',
                              r'\g<1>{.quotesubsequent}\g<2>',
                              cur, flags=re.MULTILINE))

        else: # pass through everything else.
            ret.append(cur)

    return '\n'.join(ret)


def _reformat_quotes(html):
    '''
    Earlier in the pipeline, we marked email quoting, using markers, which we
    now need to turn into HTML classes, so that we can use CSS to style them.
    '''
    ret = html.replace('<p>{.quotelead}', '<p class="quotelead">')
    ret = re.sub(r'<blockquote>\n((?:<blockquote>\n)*)<p>(?:\{\.quote(\w+)\})',
                 r'<blockquote class="quote \g<2>">\n\g<1><p>', ret, flags=re.MULTILINE)
    return ret


def _convert_with_pandoc(mdwn, inputfmt='markdown', outputfmt='html5',
                         ext_enabled=None, ext_disabled=None,
                         standalone=True, selfcontained=True, title=None):
    '''
    Invoke pandoc to do the actual conversion of Markdown to HTML5.
    '''
    if not ext_enabled:
        ext_enabled = [ 'backtick_code_blocks',
                       'line_blocks',
                       'fancy_lists',
                       'startnum',
                       'definition_lists',
                       'example_lists',
                       'table_captions',
                       'simple_tables',
                       'multiline_tables',
                       'grid_tables',
                       'pipe_tables',
                       'all_symbols_escapable',
                       'intraword_underscores',
                       'strikeout',
                       'superscript',
                       'subscript',
                       'fenced_divs',
                       'bracketed_spans',
                       'footnotes',
                       'inline_notes',
                       'emoji',
                       'tex_math_double_backslash',
                       'autolink_bare_uris'
                      ]
    if not ext_disabled:
        ext_disabled = [ 'tex_math_single_backslash',
                         'tex_math_dollars',
                         'smart',
                         'raw_html'
                       ]

    enabled = '+'.join(ext_enabled)
    disabled = '-'.join(ext_disabled)
    inputfmt = f'{inputfmt}+{enabled}-{disabled}'

    args = []
    if standalone:
        args.append('--standalone')
    if selfcontained:
        args.append('--self-contained')
    if title:
        args.append(f'--metadata=pagetitle:"{title}"')

    return pypandoc.convert_text(mdwn, format=inputfmt, to=outputfmt,
                                 extra_args=args)


def _apply_styling(html):
    '''
    Inline all styles defined and used into the individual HTML tags.
    '''
    return pynliner.Pynliner().from_string(html).with_cssString(DEFAULT_CSS).run()


def _postprocess_html(html):
    '''
    Postprocess the generated and styled HTML.
    '''

    # Preprocessing leaves a sentinel to work around
    # https://github.com/jgm/pandoc/issues/7398, and so we need to remove it:
    html = html.replace('</a> -PANDOC_BUG_7398-&gt;', '</a>&gt;')
    return html


def convert_markdown_to_html(mdwn):
    '''
    Converts the input Markdown to HTML, handling separately the body, as well
    as an optional signature.
    '''
    parts = re.split(r'^-- $', mdwn, 1, flags=re.MULTILINE)
    body = parts[0]
    if len(parts) == 2:
        sig = parts[1]
    else:
        sig = None

    html=''
    if body:
        body = _preprocess_markdown(body)
        body = _identify_quotes_for_later(body)
        html = _convert_with_pandoc(body, standalone=True, selfcontained=True,
                                    title=None)
        html = html.replace('<title>Untitled</title>\n','')
        html = _reformat_quotes(html)

    if sig:
        sig = _preprocess_signature(sig)
        sig = _preprocess_markdown(sig)
        sig = _convert_with_pandoc(sig, standalone=False, selfcontained=False)
        sig = SIGNATURE_HTML.format(sig='<br/>'.join(sig.splitlines()))
        eob = html.find('</body>')
        html = f'{html[:eob]}{sig}\n{html[eob:]}'

    html = _apply_styling(html)
    html = _postprocess_html(html)

    return html


def main():
    '''
    Convert text on stdin to HTML, and print it to stdout, like mutt would
    expect.
    '''
    html = convert_markdown_to_html(sys.stdin.read())
    if html:
        # mutt expects the content type in the first line, so:
        print(f'text/html\n\n{html}')


if __name__ == '__main__':
    main()