+#! /usr/bin/env python
+
+# Source: https://raw.githubusercontent.com/akkana/scripts/master/viewmailattachments
+
+# Take an mbox HTML message (e.g. from mutt), split it
+# and rewrite it so all of its attachments can be viewed in a browser
+# (perhaps after being converted to HTML from DOC or whatever first).
+#
+# Can be run from within a mailer like mutt, or independently
+# on a single message file.
+#
+# Grew out of a simpler script called viewhtmlmail.
+#
+# Copyright 2015 by Akkana Peck. Share and enjoy under the GPL v2 or later.
+# Changes:
+# Holger Klawitter 2014: create a secure temp file and avoid temp mbox
+
+# To use it from mutt, put the following lines in your .muttrc:
+# macro index <F10> "<pipe-message>~/bin/viewmailattachments\n" "View attachments in browser"
+# macro pager <F10> "<pipe-message>~/bin/viewmailattachments\n" "View attachments in browser"
+
+import os, sys
+import re
+import time
+import shutil
+import email, email.header, mimetypes
+import tempfile
+import subprocess
+from bs4 import BeautifulSoup
+
################################################
# Some prefs:

# Convert .doc attachments with wvHtml instead of unoconv?
USE_WVHTML_FOR_DOC = False

# Extra browser arguments used when opening every page after the first.
# Overridden below according to the browser that is selected.
BROWSER_ARGS = []

# Parent directory in which the per-run scratch directory is created.
# $TMPDIR is optional in the environment: os.getenv() returns None when it
# is unset (or it may be empty), which os.path.join() cannot handle, so fall
# back to the platform's default temp directory in that case.
TMPDIR = os.path.join(os.getenv('TMPDIR') or tempfile.gettempdir(), 'volatile')

# How many seconds do we need to wait for unoconv?
# It defaults to 6, but on a 64-bit machine that's not enough.
# Kept as a string because it is passed straight through as a
# command-line argument (unoconv -T).
UNOCONV_STARTUP_TIME = "10"

# Does the browser need a one-time argument for bringing up an initial window,
# like Firefox's -private-window -new-instance ?
# NOTE(review): the code visible in this file reads BROWSER_FIRST_ARGS
# (plural, set below), not this name; it is kept only in case something
# else refers to it.
BROWSER_FIRST_ARG = []

# What browser to use:
USE_QUICKBROWSE = False

if USE_QUICKBROWSE:
    BROWSER = "quickbrowse"

    # Arguments for the very first invocation (none needed) ...
    BROWSER_FIRST_ARGS = []
    # ... and the browser argument to precede new tabs:
    BROWSER_ARGS = ["--new-tab"]

    # Will the browser block when first run until its window is closed?
    # If so, we have to run it in the background.
    BROWSER_BACKGROUND = False

    # Should we convert PDF to HTML? Depends on BROWSER:
    # Firefox has a built-in PDF viewer, but quickbrowse doesn't.
    # NOTE(review): given the comment above one would expect True here;
    # left as found -- confirm the intended value.
    CONVERT_PDF_TO_HTML = False

else:    # Firefox in private browsing mode
    BROWSER = "firefox"

    # Arguments for the very first invocation, and for each later tab:
    BROWSER_FIRST_ARGS = ["-private-window"]
    BROWSER_ARGS = ["-new-tab", "-private-window"]

    # Firefox doesn't put itself in the background, so we have to.
    BROWSER_BACKGROUND = True

    # Not clear what to do here: Firefox has a built-in PDF viewer,
    # but for some mime types it can't figure out that it should use it.
    CONVERT_PDF_TO_HTML = False

# End global prefs
################################################
+
+# Temporary for debugging:
class mysubprocess:
    '''Debugging stand-in for the subprocess module.

    Each helper prints the command it is about to run and then hands it
    to subprocess.  Commands are argument lists and are never passed
    through a shell.
    '''

    @staticmethod
    def call(arr):
        '''Run the command list arr, wait for it, and return its exit status.'''
        print("\n\n================\n=== Calling: %s" % str(arr))
        # Hand the status back so callers can tell a failed conversion
        # from a successful one; callers that ignore it are unaffected.
        return subprocess.call(arr)

    @staticmethod
    def call_bg(arr):
        '''Start the command list arr without waiting for it.

        Returns the subprocess.Popen object so the caller can wait() on
        or poll() the child if it wants to.  The child inherits this
        process's stdin, stdout and stderr.
        '''
        print("\n\n================\n=== Calling in background: %s" % str(arr))
        return subprocess.Popen(arr, shell=False,
                                stdin=None, stdout=None, stderr=None)
+
def view_message_attachments(fp, tmpdir):
    '''View message attachments coming from the file-like object fp.

    The message is parsed and every part that can be displayed is written
    into tmpdir: Word documents are converted to HTML (wvHtml or unoconv),
    presentations to PDF (unoconv), PDFs optionally to HTML (pdftohtml),
    and cid: image references inside HTML parts are rewritten to point at
    the saved inline images.  BROWSER is started on a "please wait" page
    straight away; that page is later rewritten to redirect to the first
    result, and every further result is opened with BROWSER_ARGS.

    fp:     file-like object whose read() returns the whole message as text.
    tmpdir: existing scratch directory to write into.  It is removed,
            together with everything in it, before this function returns.

    Returns None.  The function sleeps for a second per extra page and for
    six more seconds at the end to give the browser time to load the files
    before they are deleted.
    '''
    # Imported here so the function carries its own dependency.
    import pathlib

    msg = email.message_from_string(fp.read())

    subfiles = []     # {'filename', 'Content-Id'} for each saved inline image
    subparts = []     # the message parts those inline images came from
    htmlfiles = []    # files to show in the browser, in message order

    index_name = "index.html"    # the please-wait / redirect page

    def file_url(path):
        '''Return a correctly quoted file:// URL for path.

        Plain concatenation breaks on names containing #, ?, % or spaces.
        '''
        return pathlib.Path(os.path.abspath(path)).as_uri()

    def decode_filename(raw):
        '''Decode any RFC 2047 encoded-words in an attachment filename.'''
        pieces = []
        for chunk, enc in email.header.decode_header(raw):
            # decode_header() yields str for a plain header but bytes
            # (possibly with no charset) once encoded-words are involved.
            if isinstance(chunk, bytes):
                try:
                    chunk = chunk.decode(enc or 'ascii', 'replace')
                except LookupError:
                    # Charset label Python does not know about.
                    chunk = chunk.decode('ascii', 'replace')
            pieces.append(chunk)
        return ''.join(pieces)

    def tmp_file_name(part):
        '''Return a path inside tmpdir, not yet in use, to save part to.'''
        partfile = part.get_filename()
        if partfile:
            partfile = decode_filename(partfile).replace('\0', '')

            # The name comes from the message and is untrusted: an
            # absolute path or ../ would let it write outside tmpdir.
            # Keep only the final path component.
            safe = os.path.basename(partfile.replace('\\', '/'))
            if safe != partfile:
                print("Eek! Possible security problem in filename %s"
                      % partfile)
            partfile = '' if safe in ('.', '..') else safe

        if partfile:
            candidate = os.path.join(tmpdir, partfile)
            # Use the attachment's own name unless that would overwrite
            # an earlier attachment or the redirect page.
            if partfile != index_name and not os.path.exists(candidate):
                return candidate
            prefix, ext = os.path.splitext(partfile)
            prefix += '-'
        else:
            prefix = 'part-'
            ext = mimetypes.guess_extension(part.get_content_type())
            if not ext:
                # Use a generic bag-of-bits extension
                ext = '.bin'

        fd, path = tempfile.mkstemp(dir=tmpdir, suffix=ext, prefix=prefix)
        os.close(fd)    # only the name is wanted; don't leak the descriptor
        return path

    def save_tmp_file(part):
        '''Saves this part's payload to a tmp file, returning the new filename.
        '''
        partfile = tmp_file_name(part)
        payload = part.get_payload(decode=True)
        with open(partfile, "wb") as tmpfile:
            # get_payload(decode=True) gives None for container parts.
            tmpfile.write(payload or b"")
        return partfile

    def write_to_index(outfile, text, timeout_secs, redirect_url):
        '''Write a page to outfile that shows text and, after timeout_secs
        seconds, loads redirect_url (or reloads itself if that is empty).
        '''
        if not redirect_url:
            redirect_url = file_url(outfile)
        with open(outfile, "w") as ofp:
            ofp.write('''<html><head>
<meta content="utf-8" http-equiv="encoding">
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<meta http-equiv="refresh" content="%d;URL=%s">
</head><body>
<br><br><br><br><br><br><big><big>%s</big></big>
</body></html>
''' % (timeout_secs, redirect_url, text))

    # Walk through the message a first, preliminary time
    # to separate out any images that might be referred to by
    # an HTML part.  The top-level message is included: it is quite
    # commonly the multipart/related container itself.
    for part in msg.walk():
        if part.get_content_type() != "multipart/related":
            continue

        # Walk the subtree looking for image children.
        for child in part.walk():
            # At least for now, only save images as parts of multipart.
            if child.get_content_maintype() != "image":
                continue

            # A nested multipart/related is reached both through its
            # parent's walk and its own; handle each image only once.
            if child in subparts:
                continue

            # Header lookup on a Message is case-insensitive, so this
            # finds Content-Id, Content-ID and any other capitalization.
            content_id = child.get('Content-Id')
            if not content_id:
                # Nothing can refer to it by cid:, so leave it for the
                # second pass, which shows it as an ordinary image.
                continue

            # Remove angle brackets, if present.
            content_id = str(content_id).strip()
            if content_id.startswith('<') and content_id.endswith('>'):
                content_id = content_id[1:-1]

            subfiles.append({'filename': save_tmp_file(child),
                             'Content-Id': content_id})
            subparts.append(child)

    # cid -> file URL, first occurrence winning.
    cid_urls = {}
    for sf in subfiles:
        cid_urls.setdefault(sf['Content-Id'], file_url(sf['filename']))

    # Call up the browser window right away,
    # so the user can see something is happening.
    # Firefox, alas, has no way from the commandline of calling up
    # a new private window with content, then replacing that content.
    # So we'll create a file that refreshes, so that when content is ready,
    # it can redirect to the first content page.
    redirect_timeout = 3
    pleasewait_file = os.path.join(tmpdir, index_name)
    write_to_index(pleasewait_file, "Please wait ...", redirect_timeout, None)

    cmd = [BROWSER] + list(BROWSER_FIRST_ARGS) + [file_url(pleasewait_file)]
    print("Calling: %s" % ' '.join(cmd))
    if BROWSER_BACKGROUND:
        mysubprocess.call_bg(cmd)
    else:
        mysubprocess.call(cmd)

    docx_subtype = \
        "vnd.openxmlformats-officedocument.wordprocessingml.document"
    pptx_subtype = \
        "vnd.openxmlformats-officedocument.presentationml.presentation"

    # Now walk through looking for the real parts:
    # HTML, doc and docx, presentations, PDF and stand-alone images.
    for part in msg.walk():
        # multipart/* and attached messages are just containers;
        # walk() visits their children separately.
        if part.is_multipart() or \
           part.get_content_type() == 'message/rfc822':
            continue

        maintype = part.get_content_maintype()
        subtype = part.get_content_subtype()

        if maintype == "application":
            partfile = save_tmp_file(part)
            base = os.path.splitext(partfile)[0]
            htmlfilename = base + ".html"

            if subtype == "msword" and USE_WVHTML_FOR_DOC:
                mysubprocess.call(["wvHtml", partfile, htmlfilename])
                htmlfiles.append(htmlfilename)

            elif subtype in (docx_subtype, "msword"):
                mysubprocess.call(["unoconv", "-f", "html",
                                   "-T", UNOCONV_STARTUP_TIME,
                                   "-o", htmlfilename, partfile])
                htmlfiles.append(htmlfilename)

            # unoconv conversions from powerpoint to HTML drop all images.
            # Try converting to PDF instead:
            elif subtype in ("vnd.ms-powerpoint", pptx_subtype):
                pdffile = base + ".pdf"
                mysubprocess.call(["unoconv", "-f", "pdf",
                                   "-o", pdffile, partfile])
                htmlfiles.append(pdffile)

            elif subtype == "pdf":
                if CONVERT_PDF_TO_HTML:
                    mysubprocess.call(["pdftohtml", "-s", partfile])

                    # But pdftohtml is idiotic about output filename
                    # and won't let you override it:
                    htmlfiles.append(base + "-html.html")
                else:
                    htmlfiles.append(partfile)

            # Any other application/* part is saved but not displayed.

        elif maintype == "text" and subtype == "html":
            htmlfile = tmp_file_name(part)

            # Tell BeautifulSoup which charset the MIME headers declare
            # (None just makes it guess, as before).
            soup = BeautifulSoup(part.get_payload(decode=True), "lxml",
                                 from_encoding=part.get_content_charset())

            # Substitute filenames for CIDs.  Search the whole document:
            # soup.body is None when the part has no body at all.
            for tag in soup.find_all("img", src=True):
                src = tag['src']
                if src.lower().startswith("cid:") and src[4:] in cid_urls:
                    tag['src'] = cid_urls[src[4:]]

            # We may need to add a meta charset tag. Sigh.
            # The file is always written out as UTF-8 below, so that is
            # the charset to declare, whatever the mail itself used.
            charset = "UTF-8"
            head = soup.find("head")
            if not head:
                head = soup.new_tag("head")
                html_tag = soup.find("html")
                if html_tag:
                    html_tag.insert(0, head)
                else:
                    soup.insert(0, head)

            if not head.find_all("meta", attrs={"http-equiv": "encoding"}) \
               and not head.find_all("meta",
                                     attrs={"http-equiv": "content-type"}):
                meta = soup.new_tag("meta")
                meta["content"] = charset
                meta["http-equiv"] = "encoding"
                head.insert(0, meta)
                meta = soup.new_tag("meta")
                meta["http-equiv"] = "content-type"
                meta["content"] = "text/html; charset=%s" % charset
                head.insert(0, meta)

            with open(htmlfile, 'wb') as ofp:
                ofp.write(soup.prettify().encode("utf-8",
                                                 "xmlcharrefreplace"))

            htmlfiles.append(htmlfile)

        elif maintype == "image" and part not in subparts:
            # An image that is not an inline part of some HTML.
            htmlfiles.append(save_tmp_file(part))

    # Done processing attachments. Call the browser for everything.
    if htmlfiles:
        # For the first URL, just put a redirect in
        first_url = file_url(htmlfiles[0])
        write_to_index(pleasewait_file, "Redirecting to " + first_url,
                       0, first_url)

        for f in htmlfiles[1:]:
            # If we don't wait for the new window to pop up before
            # calling new-tab, bad things will happen: the document
            # may load in a new tab in the old window and THEN pop up
            # an unwanted third window. Go firefox.
            # Not clear whether this is true for all browsers.
            time.sleep(1)
            mysubprocess.call([BROWSER] + list(BROWSER_ARGS) + [file_url(f)])

    # Wait a while to make sure the browser has loaded the images,
    # then clean up.
    time.sleep(6)
    shutil.rmtree(tmpdir)
+
if __name__ == '__main__':
    # mkdtemp() does not create missing parent directories.
    os.makedirs(TMPDIR, exist_ok=True)

    # view_message_attachments() deletes the directory it is given once it
    # is done, so every message needs a scratch directory of its own;
    # sharing one made the second and later messages fail.
    if len(sys.argv) > 1:
        # Each argument is a file holding a single message.
        for f in sys.argv[1:]:
            with open(f) as fp:
                view_message_attachments(fp, tempfile.mkdtemp(dir=TMPDIR))
    else:
        # No arguments: the message is piped in, e.g. from mutt.
        view_message_attachments(sys.stdin, tempfile.mkdtemp(dir=TMPDIR))