git.madduck.net Git - etc/neomutt.git/blob - .config/neomutt/viewmailattachments

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access, can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

buildhtmltree.py: First part of test tree still original
[etc/neomutt.git] / .config / neomutt / viewmailattachments
1 #! /usr/bin/python3
2
3 # Source: https://raw.githubusercontent.com/akkana/scripts/master/viewmailattachments
4
5 # Take an mbox HTML message (e.g. from mutt), split it
6 # and rewrite it so all of its attachments can be viewed in a browser
7 # (perhaps after being converted to HTML from DOC or whatever first).
8 #
9 # Can be run from within a mailer like mutt, or independently
10 # on a single message file.
11 #
12 # Grew out of a simpler script called viewhtmlmail.
13 #
14 # Copyright 2015 by Akkana Peck. Share and enjoy under the GPL v2 or later.
15 # Changes:
16 #   Holger Klawitter 2014: create a secure temp file and avoid temp mbox
17
18 # To use it from mutt, put the following lines in your .muttrc:
19 # macro  index  <F10>  "<pipe-message>~/bin/viewmailattachments\n" "View attachments in browser"
20 # macro  pager  <F10>  "<pipe-message>~/bin/viewmailattachments\n" "View attachments in browser"
21
22 import os, sys
23 import re
24 import time
25 import shutil
26 import email, email.header, mimetypes
27 import tempfile
28 import subprocess
29 from bs4 import BeautifulSoup
30
################################################
# Some prefs:

# Convert legacy .doc attachments with wvHtml instead of unoconv?
USE_WVHTML_FOR_DOC = False

# Default extra browser arguments; overridden per browser below.
BROWSER_ARGS = []

# Parent directory for the per-run scratch directory.
# Fall back to the system temp dir when $TMPDIR is unset: os.getenv()
# would return None and os.path.join(None, ...) raises TypeError.
TMPDIR = os.path.join(os.getenv('TMPDIR') or tempfile.gettempdir(),
                      'volatile')

# How many seconds do we need to wait for unoconv?
# It defaults to 6, but on a 64-bit machine that's not enough.
# Kept as a string because it is passed as a command-line argument.
UNOCONV_STARTUP_TIME = "10"

# Does the browser need a one-time argument for bringing up an initial window,
# like Firefox's -private-window -new-instance ?
# NOTE: singular spelling; the rest of this file only reads
# BROWSER_FIRST_ARGS (set below). Retained so the name stays defined.
BROWSER_FIRST_ARG = []

# What browser to use:
USE_QUICKBROWSE = False

if USE_QUICKBROWSE:
    BROWSER = "quickbrowse"

    # Arguments for the very first invocation:
    BROWSER_FIRST_ARGS = []
    # Browser argument to precede new tabs:
    BROWSER_ARGS = ["--new-tab"]

    # Will the browser block when first run until its window is closed?
    # If so, we have to run it in the background.
    BROWSER_BACKGROUND = False

    # Should we convert PDF to HTML? Depends on BROWSER:
    # Firefox has a built-in PDF viewer, but quickbrowse doesn't.
    CONVERT_PDF_TO_HTML = False

else:    # Firefox in private browsing mode
    BROWSER = "firefox"

    # Not clear what to do here: Firefox has a built-in PDF viewer,
    # but for some mime types it can't figure out that it should use it.
    BROWSER_FIRST_ARGS = ["-private-window"]
    BROWSER_ARGS = ["-new-tab", "-private-window"]

    # Firefox doesn't run in the background.
    BROWSER_BACKGROUND = True

    CONVERT_PDF_TO_HTML = False

# End global prefs
################################################
77
78 # Temporary for debugging:
class mysubprocess:
    """Tracing wrapper around subprocess: print each command, then run it."""

    @staticmethod
    def call(arr):
        """Print the argument list arr, run it and wait for it to finish.

        Returns the command's exit status (as subprocess.call does) so
        callers can detect a failed conversion or browser launch.
        """
        print("\n\n================\n=== Calling: %s" % str(arr))
        return subprocess.call(arr)

    @staticmethod
    def call_bg(arr):
        """Print the argument list arr and start it without waiting.

        The child inherits this process's stdin/stdout/stderr.
        Returns the subprocess.Popen object so callers may poll or wait.
        """
        print("\n\n================\n=== Calling in background: %s" % str(arr))
        return subprocess.Popen(arr, shell=False,
                                stdin=None, stdout=None, stderr=None)
90
def view_message_attachments(fp, tmpdir):
    '''View message attachments coming from the file-like object fp.

    The message is split into its parts, each viewable part is written
    (and, for office documents, converted) into tmpdir, and the browser
    configured in the global prefs is launched on the results.

    fp:     file-like object; fp.read() must return the whole message,
            either as str or as bytes.
    tmpdir: path of an existing directory used for all temporary files.
            It is removed (shutil.rmtree) before this function returns,
            so it cannot be reused for a second message.
    '''
    # Local import: only needed here, to build valid file:// URLs.
    from urllib.parse import quote

    raw = fp.read()
    if isinstance(raw, bytes):
        msg = email.message_from_bytes(raw)
    else:
        msg = email.message_from_string(raw)

    subfiles = []     # {'filename', 'Content-Id'} for each inline image
    subparts = []     # the message parts behind subfiles, in the same order
    htmlfiles = []    # files to show in the browser, in display order

    def file_url(path):
        '''Return a file:// URL for path, percent-encoding characters
        (spaces, '#', '?', quotes ...) that would otherwise break the URL.
        '''
        return "file://" + quote(path)

    def tmp_file_name(part):
        '''Return a path inside tmpdir to store part in, or None if the
        part's filename looks like a path-traversal attempt.
        '''
        partfile = part.get_filename()
        if partfile:
            # Only the first fragment of an RFC 2047 encoded name is used.
            n, enc = email.header.decode_header(partfile)[0]
            if isinstance(n, bytes):
                try:
                    n = n.decode(enc or 'ascii', 'replace')
                except LookupError:
                    # Charset name unknown to Python
                    n = n.decode('ascii', 'replace')
            if n:
                partfile = n

        # Applications should really sanitize the given filename so that an
        # email message can't be used to overwrite important files.
        # As a first step, warn about ../
        if partfile and '../' in partfile:
            print("Eek! Possible security problem in filename %s" % partfile)
            return None

        if partfile:
            # Keep only the last path component: os.path.join() would
            # otherwise let an absolute filename such as "/etc/passwd"
            # replace tmpdir altogether.
            partfile = os.path.basename(partfile)
            if partfile in ('.', '..'):
                partfile = ''

        # Make a filename in the tmp dir:
        if not partfile:
            # Use a generic bag-of-bits extension if the type is unknown
            ext = mimetypes.guess_extension(part.get_content_type()) or '.bin'
            fd, path = tempfile.mkstemp(dir=tmpdir, suffix=ext,
                                        prefix='part-')
            # Only the name is needed; don't leak the descriptor.
            os.close(fd)
            return path

        return os.path.join(tmpdir, partfile)

    def save_tmp_file(part):
        '''Saves this part's payload to a tmp file, returning the new
        filename, or None if no safe filename could be made.
        '''
        partfile = tmp_file_name(part)
        if partfile is None:
            return None

        # get_payload(decode=True) returns None for container parts.
        payload = part.get_payload(decode=True) or b""
        with open(partfile, "wb") as tmpfile:
            tmpfile.write(payload)
        return partfile

    # Walk through the message a first, preliminary time
    # to separate out any images that might be referred to by
    # an HTML part.
    # The top-level message is deliberately NOT skipped: when the message
    # itself is multipart/related its images are direct children and
    # would otherwise never be matched against cid: URLs.
    for part in msg.walk():
        if part.get_content_type() != "multipart/related":
            continue

        # Walk the subtree looking for image children.
        for child in part.walk():
            # At least for now, only save images as parts of multipart.
            if child.get_content_maintype() != "image":
                continue

            # Nested multipart/related containers are walked more than
            # once; don't save the same image twice.
            if any(child is seen for seen in subparts):
                continue

            # Header lookup on a Message is case-insensitive, so this
            # finds Content-Id, Content-ID and other capitalizations.
            content_id = child.get('Content-Id')
            if content_id is None:
                # Can't be referenced from HTML; the second pass will
                # show it as an ordinary image attachment.
                continue

            # Remove angle brackets, if present.
            content_id = str(content_id).strip()
            if content_id.startswith('<') and content_id.endswith('>'):
                content_id = content_id[1:-1]

            filename = save_tmp_file(child)
            if filename is None:
                continue

            subfiles.append({'filename': filename,
                             'Content-Id': content_id})
            subparts.append(child)

    # Call up the browser window right away,
    # so the user can see something is happening.
    # Firefox, alas, has no way from the commandline of calling up
    # a new private window with content, then replacing that content.
    # So we'll create a file that refreshes, so that when content is ready,
    # it can redirect to the first content page.
    def write_to_index(outfile, text, timeout_secs, redirect_url):
        '''Write a small HTML page to outfile that shows text and, after
        timeout_secs seconds, loads redirect_url (or reloads itself if
        redirect_url is empty).
        '''
        if not redirect_url:
            redirect_url = file_url(outfile)
        with open(outfile, "w") as ofp:
            ofp.write('''<html><head>
<meta content="utf-8" http-equiv="encoding">
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<meta http-equiv="refresh" content="%d;URL=%s">
</head><body>
<br><br><br><br><br><br><big><big>%s</big></big>
</body></html>
''' % (timeout_secs, redirect_url, text))

    redirect_timeout = 3
    pleasewait_file = os.path.join(tmpdir, "index.html")
    write_to_index(pleasewait_file, "Please wait ...", redirect_timeout, None)

    cmd = [BROWSER] + list(BROWSER_FIRST_ARGS or [])
    cmd.append(file_url(pleasewait_file))
    print("Calling: %s" % ' '.join(cmd))
    if BROWSER_BACKGROUND:
        mysubprocess.call_bg(cmd)
    else:
        mysubprocess.call(cmd)

    # It would be nicer to use JS to close or replace the "please wait"
    # tab once we have content to show, but setTimeout() doesn't work
    # at all in newly popped up private windows.

    word_subtypes = (
        "msword",
        "vnd.openxmlformats-officedocument.wordprocessingml.document",
    )
    presentation_subtypes = (
        "vnd.ms-powerpoint",
        "vnd.openxmlformats-officedocument.presentationml.presentation",
    )

    # Now walk through looking for the real parts:
    # HTML, doc and docx.
    for part in msg.walk():

        # part has, for example:
        # items: [('Content-Type', 'image/jpeg'),
        #         ('Content-Transfer-Encoding', 'base64'),
        #         ('Content-ID', '<14.3631871432@web82503.mail.mud.yahoo.com>'),
        #         ('Content-Disposition',
        #          'attachment; filename="ATT0001414.jpg"')]

        # multipart/* are just containers
        if part.is_multipart() or part.get_content_type() == 'message/rfc822':
            continue

        maintype = part.get_content_maintype()
        subtype = part.get_content_subtype()

        if maintype == "application":
            partfile = save_tmp_file(part)
            if partfile is None:
                continue
            basename = os.path.splitext(partfile)[0]
            htmlfilename = basename + ".html"

            if subtype == "msword" and USE_WVHTML_FOR_DOC:
                mysubprocess.call(["wvHtml", partfile, htmlfilename])
                htmlfiles.append(htmlfilename)

            elif subtype in word_subtypes:
                mysubprocess.call(["unoconv", "-f", "html",
                                   "-T", UNOCONV_STARTUP_TIME,
                                   "-o", htmlfilename, partfile])
                htmlfiles.append(htmlfilename)

            # unoconv conversions from powerpoint to HTML drop all images.
            # Try converting to PDF instead:
            elif subtype in presentation_subtypes:
                pdffile = basename + ".pdf"
                mysubprocess.call(["unoconv", "-f", "pdf",
                                   "-o", pdffile, partfile])
                htmlfiles.append(pdffile)

            elif subtype == "pdf":
                if CONVERT_PDF_TO_HTML:
                    mysubprocess.call(["pdftohtml", "-s", partfile])

                    # But pdftohtml is idiotic about output filename
                    # and won't let you override it:
                    htmlfiles.append(basename + "-html.html")
                else:
                    htmlfiles.append(partfile)

            # Other application/* types are saved but not displayed.

        elif maintype == "text" and subtype == 'html':
            htmlfile = tmp_file_name(part)
            if htmlfile is None:
                continue

            htmlsrc = part.get_payload(decode=True) or b""

            # Tell the parser which charset the mail headers declare (if
            # any) rather than relying purely on its own detection.
            soup = BeautifulSoup(htmlsrc, "lxml",
                                 from_encoding=part.get_content_charset())

            # Substitute filenames for CIDs.
            # Search the whole document: soup.body can be None.
            cid_to_file = {sf['Content-Id']: sf['filename']
                           for sf in subfiles}
            for tag in soup.find_all("img", src=True):
                src = tag['src']
                if src.lower().startswith("cid:") and src[4:] in cid_to_file:
                    tag['src'] = file_url(cid_to_file[src[4:]])

            # If it's HTML, we may need to add a meta charset tag. Sigh.
            # The file is always written out as UTF-8 below, so that is
            # the charset that has to be declared.
            charset = "UTF-8"
            head = soup.find("head")
            if head is None:
                head = soup.new_tag("head")
                root = soup.find("html")
                if root is not None:
                    root.insert(0, head)
                else:
                    soup.insert(0, head)

            if not head.find_all("meta", attrs={"http-equiv": "encoding"}) \
               and not head.find_all("meta",
                                     attrs={"http-equiv": "content-type"}):
                meta = soup.new_tag("meta")
                meta["content"] = charset
                meta["http-equiv"] = "encoding"
                head.insert(0, meta)
                meta = soup.new_tag("meta")
                meta["http-equiv"] = "content-type"
                meta["content"] = "text/html; charset=%s" % charset
                head.insert(0, meta)

            with open(htmlfile, 'wb') as out:
                out.write(soup.prettify().encode("utf-8",
                                                 "xmlcharrefreplace"))

            htmlfiles.append(htmlfile)

        elif maintype == "image" and \
             not any(part is inline for inline in subparts):
            # A stand-alone image (not already saved as an inline one).
            partfile = save_tmp_file(part)
            if partfile is not None:
                htmlfiles.append(partfile)

    # Done processing attachments. Call the browser for everything.
    if htmlfiles:
        # For the first URL, just put a redirect in
        first_url = file_url(htmlfiles[0])
        write_to_index(pleasewait_file,
                       "Redirecting to " + first_url,
                       0, first_url)

        for f in htmlfiles[1:]:
            # If we don't wait for the new window to pop up before
            # calling new-tab, bad things will happen: the document
            # may load in a new tab in the old window and THEN pop up
            # an unwanted third window. Go firefox.
            # Not clear whether this is true for all browsers.
            time.sleep(1)
            mysubprocess.call([BROWSER] + list(BROWSER_ARGS or [])
                              + [file_url(f)])

    # Wait a while to make sure the browser has loaded the images,
    # then clean up.
    time.sleep(6)
    shutil.rmtree(tmpdir)
371
if __name__ == '__main__':
    # mkdtemp() does not create missing parent directories, so make sure
    # the scratch parent exists (private to this user).
    os.makedirs(TMPDIR, mode=0o700, exist_ok=True)

    if len(sys.argv) > 1:
        # One message file per argument.
        for f in sys.argv[1:]:
            # view_message_attachments() removes its tmpdir when it is
            # done, so every message needs a fresh directory.
            with open(f) as fp:
                view_message_attachments(fp, tempfile.mkdtemp(dir=TMPDIR))
    else:
        # No arguments: read a single message from stdin (e.g. mutt's
        # <pipe-message>).
        view_message_attachments(sys.stdin, tempfile.mkdtemp(dir=TMPDIR))