git.madduck.net Git - etc/mutt.git/blob - .mutt/markdown2html

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

540428f1d80f6485eb39357f2a92a976d179bf7a
[etc/mutt.git] / .mutt / markdown2html
1 #!/usr/bin/python3
2 #
3 # markdown2html.py — simple Markdown-to-HTML converter for use with Mutt
4 #
5 # Mutt recently learnt [how to compose `multipart/alternative`
6 # emails][1]. This script assumes a message has been composed using Markdown
7 # (with a lot of pandoc extensions enabled), and translates it to `text/html`
8 # for Mutt to tie into such a `multipart/alternative` message.
9 #
10 # [1]: https://gitlab.com/muttmua/mutt/commit/0e566a03725b4ad789aa6ac1d17cdf7bf4e7e354
11 #
12 # Configuration:
13 #   muttrc:
14 #     set send_multipart_alternative=yes
15 #     set send_multipart_alternative_filter=/path/to/markdown2html.py
16 #
17 # Optionally, custom CSS styles will be read from `~/.mutt/markdown2html.css`,
18 # if present.
19 #
20 # Requirements:
21 #   - python3
22 #   - PyPandoc (and pandoc installed, or downloaded)
23 #   - Pynliner
24 #
25 # Optional:
26 #   - Pygments, if installed, then syntax highlighting is enabled
27 #
28 # Latest version:
29 #   https://git.madduck.net/etc/mutt.git/blob_plain/HEAD:/.mutt/markdown2html
30 #
31 # Copyright © 2019 martin f. krafft <madduck@madduck.net>
32 # Released under the GPL-2+ licence, just like Mutt itself.
33 #
34
35 import pypandoc
36 import pynliner
37 import re
38 import os
39 import sys
40
# Pygments is optional: when it can be imported, the style definitions of its
# 'default' HTML style, scoped to the `.sourceCode` selector, seed the
# stylesheet. Without Pygments we simply start from an empty stylesheet.
try:
    from pygments.formatters import get_formatter_by_name
    formatter = get_formatter_by_name('html', style='default')
    DEFAULT_CSS = formatter.get_style_defs('.sourceCode')

except ImportError:
    DEFAULT_CSS = ""


# Built-in rules for the classes this script attaches to quotes, quote
# lead-ins, footnote links and the signature block.
DEFAULT_CSS += '''
.quote {
    padding: 0 0.5em;
    margin: 0;
    font-style: italic;
    border-left: 2px solid #ccc;
    color: #999;
    font-size: 80%;
}
.quotelead {
    font-style: italic;
    margin-bottom: -1em;
    color: #999;
    font-size: 80%;
}
.footnote-ref, .footnote-back { text-decoration: none;}
.signature {
    color: #999;
    font-family: monospace;
    white-space: pre;
    margin: 1em 0 0 0;
    font-size: 80%;
}'''

# Optional user stylesheet; its rules are appended after the built-in ones.
STYLESHEET = os.path.join(os.path.expanduser('~/.mutt'),
                          'markdown2html.css')
if os.path.exists(STYLESHEET):
    # Use a context manager so the handle is closed deterministically, and
    # read as UTF-8 to agree with the charset declared in HTML_DOCUMENT
    # rather than depending on the locale's default encoding.
    with open(STYLESHEET, encoding='utf-8') as _stylesheet_file:
        DEFAULT_CSS += _stylesheet_file.read()

# Skeleton of the generated e-mail; `{htmlbody}` is filled in with
# str.format() by convert_markdown_to_html().
HTML_DOCUMENT = '''<!DOCTYPE html>
<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<meta charset="utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes"/>
<title>HTML E-Mail</title>
</head><body class="email">
{htmlbody}
</body></html>'''


# Wrapper for the signature; `{sig}` receives the signature lines, already
# joined with `<br/>`.
SIGNATURE_HTML = \
        '<div class="signature"><span class="leader">-- </span>{sig}</div>'
92
93
94 def _preprocess_markdown(mdwn):
95     '''
96     Preprocess Markdown for handling by the converter.
97     '''
98     # convert hard line breaks within paragraphs to 2 trailing spaces, which
99     # is the markdown way of representing hard line breaks. Note how the
100     # regexp will not match between paragraphs.
101     ret = re.sub(r'(\S)\n(\s*\S)', r'\g<1>  \n\g<2>', mdwn, re.MULTILINE)
102
103     return ret
104
105
106 def _identify_quotes_for_later(mdwn):
107     '''
108     Email quoting such as:
109
110     ```
111     On 1970-01-01, you said:
112     > The Flat Earth Society has members all around the globe.
113     ```
114
115     isn't really properly handled by Markdown, so let's do our best to
116     identify the individual elements, and mark them, using a syntax similar to
117     what pandoc uses already in some cases. As pandoc won't actually use these
118     data (yet?), we call `self._reformat_quotes` later to use these markers
119     to slap the appropriate classes on the HTML tags.
120     '''
121
122     def generate_lines_with_context(mdwn):
123         '''
124         Iterates the input string line-wise, returning a triplet of
125         previous, current, and next line, the first and last of which
126         will be None on the first and last line of the input data
127         respectively.
128         '''
129         prev = cur = nxt = None
130         lines = iter(mdwn.splitlines())
131         cur = next(lines)
132         for nxt in lines:
133             yield prev, cur, nxt
134             prev = cur
135             cur = nxt
136         yield prev, cur, None
137
138     ret = []
139     for prev, cur, nxt in generate_lines_with_context(mdwn):
140
141         # The lead-in to a quote is a single line immediately preceding the
142         # quote, and ending with ':'. Note that there could be multiple of
143         # these:
144         if re.match(r'^.+:\s*$', cur) and nxt.startswith('>'):
145             ret.append(f'{{.quotelead}}{cur.strip()}')
146             # pandoc needs an empty line before the blockquote, so
147             # we enter one for the purpose of HTML rendition:
148             ret.append('')
149             continue
150
151         # The first blockquote after such a lead-in gets marked as the
152         # "initial" quote:
153         elif prev and re.match(r'^.+:\s*$', prev) and cur.startswith('>'):
154             ret.append(re.sub(r'^(\s*>\s*)+(.+)',
155                               r'\g<1>{.quoteinitial}\g<2>',
156                               cur, re.MULTILINE))
157
158         # All other occurrences of blockquotes get the "subsequent" marker:
159         elif cur.startswith('>') and prev and not prev.startswith('>'):
160             ret.append(re.sub(r'^((?:\s*>\s*)+)(.+)',
161                               r'\g<1>{.quotesubsequent}\g<2>',
162                               cur, re.MULTILINE))
163
164         else: # pass through everything else.
165             ret.append(cur)
166
167     return '\n'.join(ret)
168
169
170 def _reformat_quotes(html):
171     '''
172     Earlier in the pipeline, we marked email quoting, using markers, which we
173     now need to turn into HTML classes, so that we can use CSS to style them.
174     '''
175     ret = html.replace('<p>{.quotelead}', '<p class="quotelead">')
176     ret = re.sub(r'<blockquote>\n((?:<blockquote>\n)*)<p>(?:\{\.quote(\w+)\})',
177                  r'<blockquote class="quote \g<2>">\n\g<1><p>', ret, re.MULTILINE)
178     return ret
179
180
181
def _convert_with_pandoc(mdwn, inputfmt='markdown', outputfmt='html5',
                         ext_enabled=None, ext_disabled=None,
                         standalone=True, title="HTML E-Mail"):
    '''
    Invoke pandoc to do the actual conversion of Markdown to HTML5.

    Parameters:
      mdwn         -- the source text to convert
      inputfmt     -- base name of the pandoc input format
      outputfmt    -- pandoc output format
      ext_enabled  -- list of pandoc extensions to switch on; None selects
                      the default set below, an empty list selects none
      ext_disabled -- list of pandoc extensions to switch off; None selects
                      the default set below, an empty list selects none
      standalone   -- pass `--standalone` to pandoc if true
      title        -- if non-empty, passed to pandoc as `pagetitle` metadata

    Returns whatever `pypandoc.convert_text` returns for the conversion.
    '''
    if ext_enabled is None:
        ext_enabled = ['backtick_code_blocks',
                       'line_blocks',
                       'fancy_lists',
                       'startnum',
                       'definition_lists',
                       'example_lists',
                       'table_captions',
                       'simple_tables',
                       'multiline_tables',
                       'grid_tables',
                       'pipe_tables',
                       'all_symbols_escapable',
                       'intraword_underscores',
                       'strikeout',
                       'superscript',
                       'subscript',
                       'fenced_divs',
                       'bracketed_spans',
                       'footnotes',
                       'inline_notes',
                       'emoji',
                       'tex_math_double_backslash',
                       ]
    if ext_disabled is None:
        ext_disabled = ['tex_math_single_backslash',
                        'tex_math_dollars',
                        'raw_html',
                        ]

    # Build e.g. 'markdown+ext1+ext2-ext3'. Prefixing each name individually
    # keeps the format string well-formed even if one of the lists is empty.
    inputfmt += ''.join(f'+{ext}' for ext in ext_enabled)
    inputfmt += ''.join(f'-{ext}' for ext in ext_disabled)

    args = []
    if standalone:
        args.append('--standalone')
    if title:
        # The arguments are handed over as a list, not through a shell, so
        # any quote characters written here would become part of the title.
        args.append(f'--metadata=pagetitle:{title}')

    return pypandoc.convert_text(mdwn, format=inputfmt, to=outputfmt,
                                 extra_args=args)
230
231
def _apply_styling(html):
    '''
    Run the HTML document through Pynliner together with the stylesheet
    collected in DEFAULT_CSS, and return the result of that run.
    '''
    inliner = pynliner.Pynliner().from_string(html)
    inliner = inliner.with_cssString(DEFAULT_CSS)
    return inliner.run()
237
238
239 def _postprocess_html(html):
240     '''
241     Postprocess the generated and styled HTML.
242     '''
243     return html
244
245
def convert_markdown_to_html(mdwn):
    '''
    Converts the input Markdown to HTML, handling separately the body, as well
    as an optional signature.

    The input is split at the first line consisting of exactly `-- `.
    Everything before it is the body, which is preprocessed, annotated with
    quote markers, converted by pandoc and has its quote markers turned into
    classes. Everything after it is the signature, which is not run through
    pandoc: its lines are HTML-escaped, joined with `<br/>` and wrapped in
    SIGNATURE_HTML. The result is embedded in HTML_DOCUMENT and styled.

    Takes the message text as a string and returns the HTML document.
    '''
    # Imported locally and by name, so as not to require a change to the
    # module's import block.
    from html import escape

    parts = re.split(r'^-- $', mdwn, maxsplit=1, flags=re.MULTILINE)
    body = parts[0]
    sig = parts[1] if len(parts) == 2 else None

    htmlbody = ''
    if body:
        body = _preprocess_markdown(body)
        body = _identify_quotes_for_later(body)
        htmlbody = _convert_with_pandoc(body, standalone=False)
        htmlbody = _reformat_quotes(htmlbody)

    if sig:
        sig = _preprocess_markdown(sig)
        # The signature is plain text; without escaping, something like
        # "Name <user@example.org>" would be taken for an HTML tag and the
        # address would vanish from the rendered message.
        siglines = (escape(line, quote=False) for line in sig.splitlines())
        htmlbody += SIGNATURE_HTML.format(sig='<br/>'.join(siglines))

    document = HTML_DOCUMENT.format(htmlbody=htmlbody)
    document = _apply_styling(document)
    document = _postprocess_html(document)

    return document
274
275
def main():
    '''
    Read the message from stdin, convert it, and write the result to stdout
    in the form mutt expects.
    '''
    document = convert_markdown_to_html(sys.stdin.read())
    if not document:
        return

    # mutt expects the content type in the first line, so:
    print(f'text/html\n\n{document}')


if __name__ == '__main__':
    main()