git.madduck.net Git - etc/mutt.git/blob - .mutt/markdown2html

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access, can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

d4f5b13302bc8c917eae1702ca71169f9ad5ef1a
[etc/mutt.git] / .mutt / markdown2html
1 #!/usr/bin/python3
2 #
3 # markdown2html.py — simple Markdown-to-HTML converter for use with Mutt
4 #
5 # Mutt recently learnt [how to compose `multipart/alternative`
6 # emails][1]. This script assumes a message has been composed using Markdown
7 # (with a lot of pandoc extensions enabled), and translates it to `text/html`
8 # for Mutt to tie into such a `multipart/alternative` message.
9 #
10 # [1]: https://gitlab.com/muttmua/mutt/commit/0e566a03725b4ad789aa6ac1d17cdf7bf4e7e354
11 #
12 # Configuration:
13 #   muttrc:
14 #     set send_multipart_alternative=yes
15 #     set send_multipart_alternative_filter=/path/to/markdown2html.py
16 #
17 # Optionally, custom CSS styles will be read from `~/.mutt/markdown2html.css`,
18 # if present.
19 #
20 # Requirements:
21 #   - python3
22 #   - PyPandoc (and pandoc installed, or downloaded)
23 #   - Pynliner
24 #
25 # Optional:
26 #   - Pygments, if installed, then syntax highlighting is enabled
27 #
28 # Latest version:
29 #   https://git.madduck.net/etc/mutt.git/blob_plain/HEAD:/.mutt/markdown2html
30 #
31 # Copyright © 2019 martin f. krafft <madduck@madduck.net>
32 # Released under the GPL-2+ licence, just like Mutt itself.
33 #
34
35 import pypandoc
36 import pynliner
37 import re
38 import os
39 import sys
40
# Syntax highlighting is optional: when Pygments can be imported, the
# stylesheet starts out with its 'default' style rules, scoped to the
# `.sourceCode` selector; otherwise it starts out empty.
try:
    from pygments.formatters import get_formatter_by_name
    formatter = get_formatter_by_name('html', style='default')
    DEFAULT_CSS = formatter.get_style_defs('.sourceCode')

except ImportError:
    DEFAULT_CSS = ""


# Built-in rules for the classes and elements used in the generated HTML
# (quotes, quote lead-ins, signature, tables, headings, paragraphs).
DEFAULT_CSS += '''
.quote, blockquote {
    padding: 0 0.5em;
    margin: 0;
    font-style: italic;
    border-left: 2px solid #666;
    color: #666;
    font-size: 80%;
}
.quotelead {
    margin-bottom: -1em;
    font-size: 80%;
}
.quotechar { display: none; }
.footnote-ref, .footnote-back { text-decoration: none;}
.signature {
    color: #999;
    font-family: monospace;
    white-space: pre;
    margin: 1em 0 0 0;
    font-size: 80%;
}
table, th, td {
    border-collapse: collapse;
    border: 1px solid #999;
}
th, td { padding: 0.5em; }
.header {
    background: #eee;
}
.even { background: #eee; }
h1, h2, h3, h4, h5, h6 {
    color: #666;
    background-color: #eee;
    padding-left: 0.5em
}
h1 { font-size: 130%; }
h2 { font-size: 120%; }
h3 { font-size: 110%; }
h4 { font-size: 107%; }
h5 { font-size: 103%; }
h6 { font-size: 100%; }
p { padding: 0 0.5em; }
'''

# Optional per-user stylesheet; its contents are appended after the built-in
# rules above.
STYLESHEET = os.path.join(os.path.expanduser('~/.mutt'),
                          'markdown2html.css')
if os.path.exists(STYLESHEET):
    # Read through a context manager so that the file handle is closed right
    # away instead of being left for the garbage collector.
    with open(STYLESHEET) as _stylesheet_file:
        DEFAULT_CSS += _stylesheet_file.read()
99
# Skeleton of the HTML document wrapped around the converted message; the
# `{htmlbody}` placeholder is filled in with str.format().
HTML_DOCUMENT = '\n'.join((
    '<!DOCTYPE html>',
    '<html><head>',
    '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>',
    '<meta charset="utf-8"/>',
    '<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes"/>',
    '<title>HTML E-Mail</title>',
    '</head><body class="email">',
    '{htmlbody}',
    '</body></html>',
))
109
110
# Markup wrapped around the signature; `{sig}` is filled in with str.format().
SIGNATURE_HTML = (
    '<div class="signature"><span class="leader">-- </span>{sig}</div>'
)
113
114
115 def _preprocess_markdown(mdwn):
116     '''
117     Preprocess Markdown for handling by the converter.
118     '''
119     # convert hard line breaks within paragraphs to 2 trailing spaces, which
120     # is the markdown way of representing hard line breaks. Note how the
121     # regexp will not match between paragraphs.
122     ret = re.sub(r'(\S)\n(\s*\S)', r'\g<1>  \n\g<2>', mdwn, flags=re.MULTILINE)
123
124     # Clients like Thunderbird need the leading '>' to be able to properly
125     # create nested quotes, so we duplicate the symbol, the first instance
126     # will tell pandoc to create a blockquote, while the second instance will
127     # be a <span> containing the character, along with a class that causes CSS
128     # to actually hide it from display. However, this does not work with the
129     # text-mode HTML2text converters, and so it's left commented for now.
130     #ret = re.sub(r'\n>', r'  \n>[>]{.quotechar}', ret, flags=re.MULTILINE)
131
132     return ret
133
134
135 def _identify_quotes_for_later(mdwn):
136     '''
137     Email quoting such as:
138
139     ```
140     On 1970-01-01, you said:
141     > The Flat Earth Society has members all around the globe.
142     ```
143
144     isn't really properly handled by Markdown, so let's do our best to
145     identify the individual elements, and mark them, using a syntax similar to
146     what pandoc uses already in some cases. As pandoc won't actually use these
147     data (yet?), we call `self._reformat_quotes` later to use these markers
148     to slap the appropriate classes on the HTML tags.
149     '''
150
151     def generate_lines_with_context(mdwn):
152         '''
153         Iterates the input string line-wise, returning a triplet of
154         previous, current, and next line, the first and last of which
155         will be None on the first and last line of the input data
156         respectively.
157         '''
158         prev = cur = nxt = None
159         lines = iter(mdwn.splitlines())
160         cur = next(lines)
161         for nxt in lines:
162             yield prev, cur, nxt
163             prev = cur
164             cur = nxt
165         yield prev, cur, None
166
167     ret = []
168     for prev, cur, nxt in generate_lines_with_context(mdwn):
169
170         # The lead-in to a quote is a single line immediately preceding the
171         # quote, and ending with ':'. Note that there could be multiple of
172         # these:
173         if re.match(r'^.+:\s*$', cur) and nxt.startswith('>'):
174             ret.append(f'{{.quotelead}}{cur.strip()}')
175             # pandoc needs an empty line before the blockquote, so
176             # we enter one for the purpose of HTML rendition:
177             ret.append('')
178             continue
179
180         # The first blockquote after such a lead-in gets marked as the
181         # "initial" quote:
182         elif prev and re.match(r'^.+:\s*$', prev) and cur.startswith('>'):
183             ret.append(re.sub(r'^(\s*>\s*)+(.+)',
184                               r'\g<1>{.quoteinitial}\g<2>',
185                               cur, flags=re.MULTILINE))
186
187         # All other occurrences of blockquotes get the "subsequent" marker:
188         elif cur.startswith('>') and prev and not prev.startswith('>'):
189             ret.append(re.sub(r'^((?:\s*>\s*)+)(.+)',
190                               r'\g<1>{.quotesubsequent}\g<2>',
191                               cur, flags=re.MULTILINE))
192
193         else: # pass through everything else.
194             ret.append(cur)
195
196     return '\n'.join(ret)
197
198
199 def _reformat_quotes(html):
200     '''
201     Earlier in the pipeline, we marked email quoting, using markers, which we
202     now need to turn into HTML classes, so that we can use CSS to style them.
203     '''
204     ret = html.replace('<p>{.quotelead}', '<p class="quotelead">')
205     ret = re.sub(r'<blockquote>\n((?:<blockquote>\n)*)<p>(?:\{\.quote(\w+)\})',
206                  r'<blockquote class="quote \g<2>">\n\g<1><p>', ret, flags=re.MULTILINE)
207     return ret
208
209
210
def _convert_with_pandoc(mdwn, inputfmt='markdown', outputfmt='html5',
                         ext_enabled=None, ext_disabled=None,
                         standalone=True, title="HTML E-Mail"):
    '''
    Invoke pandoc to do the actual conversion of Markdown to HTML5.

    Parameters:
      mdwn          the source text to convert
      inputfmt      base name of the pandoc input format
      outputfmt     name of the pandoc output format
      ext_enabled   list of pandoc extensions to switch on; the built-in
                    list below is used when this is None or empty
      ext_disabled  list of pandoc extensions to switch off; the built-in
                    list below is used when this is None or empty
      standalone    if true, pass `--standalone` to pandoc
      title         if non-empty, passed to pandoc as `pagetitle` metadata

    Returns whatever `pypandoc.convert_text` returns for the conversion.
    '''
    if not ext_enabled:
        ext_enabled = [ 'backtick_code_blocks',
                       'line_blocks',
                       'fancy_lists',
                       'startnum',
                       'definition_lists',
                       'example_lists',
                       'table_captions',
                       'simple_tables',
                       'multiline_tables',
                       'grid_tables',
                       'pipe_tables',
                       'all_symbols_escapable',
                       'intraword_underscores',
                       'strikeout',
                       'superscript',
                       'subscript',
                       'fenced_divs',
                       'bracketed_spans',
                       'footnotes',
                       'inline_notes',
                       'emoji',
                       'tex_math_double_backslash',
                       'autolink_bare_uris'
                      ]
    if not ext_disabled:
        ext_disabled = [ 'tex_math_single_backslash',
                         'tex_math_dollars',
                         'smart',
                         'raw_html'
                       ]

    # pandoc's format specifier is the base format followed by one
    # `+extension` per enabled and one `-extension` per disabled extension.
    enabled = ''.join(f'+{ext}' for ext in ext_enabled)
    disabled = ''.join(f'-{ext}' for ext in ext_disabled)
    inputfmt = f'{inputfmt}{enabled}{disabled}'

    args = []
    if standalone:
        args.append('--standalone')
    if title:
        # The argument is handed over as one argv element without any shell
        # in between, so the value must not be wrapped in quotes: they would
        # end up as literal characters in the page title.
        args.append(f'--metadata=pagetitle:{title}')

    return pypandoc.convert_text(mdwn, format=inputfmt, to=outputfmt,
                                 extra_args=args)
261
262
def _apply_styling(html):
    '''
    Run the HTML through Pynliner together with the rules collected in
    DEFAULT_CSS, and return the result, in which the styles are inlined into
    the individual HTML tags.
    '''
    inliner = pynliner.Pynliner().from_string(html)
    inliner = inliner.with_cssString(DEFAULT_CSS)
    return inliner.run()
268
269
270 def _postprocess_html(html):
271     '''
272     Postprocess the generated and styled HTML.
273     '''
274     return html
275
276
def convert_markdown_to_html(mdwn):
    '''
    Converts the input Markdown to HTML, handling separately the body, as well
    as an optional signature.

    The input is split at the first line consisting of exactly `-- `
    (dash, dash, space). Everything before it is the body, which is
    converted by pandoc; everything after it is the signature, which is not
    run through pandoc but HTML-escaped and wrapped into SIGNATURE_HTML with
    its lines joined by `<br/>`.

    Returns the complete HTML document with all styles inlined.
    '''
    # Imported locally under a private name, as this is the only user.
    from html import escape as _escape_html

    # `maxsplit` is passed by keyword: passing it positionally is deprecated
    # as of Python 3.13.
    parts = re.split(r'^-- $', mdwn, maxsplit=1, flags=re.MULTILINE)
    body = parts[0]
    sig = parts[1] if len(parts) == 2 else None

    htmlbody = ''
    if body:
        body = _preprocess_markdown(body)
        body = _identify_quotes_for_later(body)
        htmlbody = _convert_with_pandoc(body, standalone=False)
        htmlbody = _reformat_quotes(htmlbody)

    if sig:
        sig = _preprocess_markdown(sig)
        # The signature is plain text that goes into the document verbatim,
        # so '<', '>' and '&' have to be escaped; otherwise something like
        # "Jane Doe <jane@example.org>" would be taken for an HTML tag.
        sig_lines = (_escape_html(line, quote=False)
                     for line in sig.splitlines())
        htmlbody += SIGNATURE_HTML.format(sig='<br/>'.join(sig_lines))

    document = HTML_DOCUMENT.format(htmlbody=htmlbody)
    document = _apply_styling(document)
    document = _postprocess_html(document)

    return document
305
306
def main():
    '''
    Read Markdown from stdin, convert it to HTML, and write the result to
    stdout in the form mutt expects. Nothing is printed if the conversion
    yields an empty result.
    '''
    markdown = sys.stdin.read()
    html = convert_markdown_to_html(markdown)
    if not html:
        return

    # mutt expects the content type in the first line, so:
    print(f'text/html\n\n{html}')


if __name__ == '__main__':
    main()