#! /usr/bin/env python
# Source: https://raw.githubusercontent.com/akkana/scripts/master/viewmailattachments

# Take an mbox HTML message (e.g. from mutt), split it
# and rewrite it so all of its attachments can be viewed in a browser
# (perhaps after being converted to HTML from DOC or whatever first).
#
# Can be run from within a mailer like mutt, or independently
# on a single message file.
#
# Grew out of a simpler script called viewhtmlmail.
#
# Copyright 2015 by Akkana Peck. Share and enjoy under the GPL v2 or later.
# Changes:
#   Holger Klawitter 2014: create a secure temp file and avoid temp mbox

# To use it from mutt, pipe the current message to this script from a
# macro in your .muttrc (the script reads the message from stdin when it
# is given no arguments). Replace <key> with the key you want to bind:
# macro index <key> "<pipe-message>~/bin/viewmailattachments\n" "View attachments in browser"
# macro pager <key> "<pipe-message>~/bin/viewmailattachments\n" "View attachments in browser"

import os
import sys
import re
import time
import shutil
import email
import email.header
import mimetypes
import tempfile
import subprocess

from bs4 import BeautifulSoup

################################################
# Some prefs:
USE_WVHTML_FOR_DOC = False

BROWSER_ARGS = []

# Parent directory for the per-message temp dirs.
# Fall back to the system temp dir when $TMPDIR isn't set, rather than
# crashing at import time on os.path.join(None, ...).
TMPDIR = os.path.join(os.getenv('TMPDIR') or tempfile.gettempdir(),
                      'volatile')

# How many seconds do we need to wait for unoconv?
# It defaults to 6, but on a 64-bit machine that's not enough.
UNOCONV_STARTUP_TIME = "10"

# Does the browser need a one-time argument for bringing up an initial window,
# like Firefox's -private-window -new-instance ?
BROWSER_FIRST_ARGS = []

# What browser to use:
USE_QUICKBROWSE = False

if USE_QUICKBROWSE:
    BROWSER = "quickbrowse"

    # Browser argument to precede new tabs:
    BROWSER_FIRST_ARGS = []
    BROWSER_ARGS = ["--new-tab"]

    # Will the browser block when first run until its window is closed?
    # If so, we have to run it in the background.
    BROWSER_BACKGROUND = False

    # Should we convert PDF to HTML? Depends on BROWSER:
    # Firefox has a built-in PDF viewer, but quickbrowse doesn't.
    CONVERT_PDF_TO_HTML = False

else:    # Firefox in private browsing mode
    BROWSER = "firefox"

    # Not clear what to do here: Firefox has a built-in PDF viewer,
    # but for some mime types it can't figure out that it should use it.
    BROWSER_FIRST_ARGS = ["-private-window"]
    BROWSER_ARGS = ["-new-tab", "-private-window"]

    # Firefox doesn't run in the background.
    BROWSER_BACKGROUND = True

    CONVERT_PDF_TO_HTML = False

# End global prefs
################################################


# Temporary for debugging:
class mysubprocess:
    '''Thin wrappers around subprocess that print each command first.'''

    @staticmethod
    def call(arr):
        '''Print the argument list arr, then run it and wait for it.'''
        print("\n\n================\n=== Calling: %s" % str(arr))
        subprocess.call(arr)

    @staticmethod
    def call_bg(arr):
        '''Print the argument list arr, then start it without waiting.'''
        print("\n\n================\n=== Calling in background: %s" % str(arr))
        subprocess.Popen(arr, shell=False,
                         stdin=None, stdout=None, stderr=None)


def _decode_filename(raw):
    '''Decode a possibly RFC 2047-encoded attachment filename to text.

       All fragments returned by decode_header are joined, and byte
       fragments are decoded with their declared charset (ASCII when
       none is declared or the charset is unknown).
    '''
    pieces = []
    for chunk, enc in email.header.decode_header(raw):
        if isinstance(chunk, bytes):
            try:
                chunk = chunk.decode(enc or 'ascii', 'replace')
            except LookupError:
                # Unknown charset name in the header.
                chunk = chunk.decode('ascii', 'replace')
        pieces.append(chunk)
    return ''.join(pieces)


def _tmp_file_name(part, tmpdir):
    '''Return a path inside tmpdir to which this part can be saved.

       The filename suggested by the message is reduced to its final
       path component, so that an email message can't be used to write
       outside tmpdir (via "../" or an absolute path). Parts without
       a usable filename get a freshly created "part-*" temp file whose
       extension is guessed from the content type.
    '''
    partfile = None
    suggested = part.get_filename()
    if suggested:
        decoded = _decode_filename(suggested)
        candidate = os.path.basename(decoded.replace('\\', '/'))
        candidate = candidate.replace('\0', '')
        if candidate != decoded:
            print("Eek! Possible security problem in filename %s" % decoded)
        if candidate not in ('', '.', '..'):
            partfile = candidate

    if partfile:
        return os.path.join(tmpdir, partfile)

    # Make a filename in the tmp dir:
    ext = mimetypes.guess_extension(part.get_content_type())
    if not ext:
        # Use a generic bag-of-bits extension
        ext = '.bin'
    # mkstemp returns an open file descriptor; close it so it isn't leaked.
    fd, path = tempfile.mkstemp(dir=tmpdir, suffix=ext, prefix='part-')
    os.close(fd)
    return path


def _save_tmp_file(part, tmpdir):
    '''Save this part's decoded payload to a file in tmpdir,
       returning the new filename.
    '''
    partfile = _tmp_file_name(part, tmpdir)
    payload = part.get_payload(decode=True)
    if payload is None:
        payload = b''
    with open(partfile, "wb") as tmpfile:
        tmpfile.write(payload)
    return partfile


def _write_to_index(outfile, msg, timeout_secs, redirect_url):
    '''Write to outfile an HTML page that shows msg and, after
       timeout_secs seconds, redirects to redirect_url.
       If redirect_url is empty, the page reloads itself.
    '''
    if not redirect_url:
        redirect_url = "file://" + outfile
    with open(outfile, "w") as ofp:
        ofp.write('''<html><head>
<meta content="utf-8" http-equiv="encoding"/>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<meta http-equiv="refresh" content="%d;URL=%s">
</head><body>
<br><br><br><br><br><big><big>%s</big></big>
</body></html>
''' % (timeout_secs, redirect_url, msg))


def _save_related_images(msg, tmpdir):
    '''Save the images found inside multipart/related containers.

       Returns (subfiles, subparts): subfiles is a list of dicts with
       'filename' and 'Content-Id' keys, one per saved image that has a
       Content-ID header; subparts is the list of those same image parts.
    '''
    subfiles = []
    subparts = []

    for part in msg.walk():
        # walk() includes the top-level message
        # NOTE(review): this also skips a message whose top level is itself
        # multipart/related -- confirm that is intended.
        if part == msg:
            continue

        if part.get_content_type() != "multipart/related":
            continue

        # It's multipart. Walk the subtree looking for image children.
        for child in part.walk():
            # At least for now, only save images as parts of multipart.
            if child.get_content_maintype() != "image":
                continue

            filename = _save_tmp_file(child, tmpdir)

            # Rewrite image and other inline URLs in terms of content-id.
            # Header lookup on a Message is case-insensitive, so this
            # matches Content-Id, Content-ID and other capitalizations.
            content_id = child.get('Content-ID')
            if content_id is None:
                continue

            # Remove angle brackets, if present.
            content_id = str(content_id).strip()
            if content_id.startswith('<') and content_id.endswith('>'):
                content_id = content_id[1:-1]

            subfiles.append({'filename': filename,
                             'Content-Id': content_id})
            subparts.append(child)

    return subfiles, subparts


def _convert_application_part(part, tmpdir):
    '''Save an application/* part and convert it for viewing if we know how.

       Returns the filename the browser should open, or None when the
       part was saved but is of a subtype we don't display.
    '''
    partfile = _save_tmp_file(part, tmpdir)
    basename = os.path.splitext(partfile)[0]
    htmlfilename = basename + ".html"
    subtype = part.get_content_subtype()

    if subtype == "msword" and USE_WVHTML_FOR_DOC:
        mysubprocess.call(["wvHtml", partfile, htmlfilename])
        return htmlfilename

    if subtype in ("vnd.openxmlformats-officedocument"
                   ".wordprocessingml.document",
                   "msword"):
        mysubprocess.call(["unoconv", "-f", "html",
                           "-T", UNOCONV_STARTUP_TIME,
                           "-o", htmlfilename, partfile])
        return htmlfilename

    # unoconv conversions from powerpoint to HTML drop all images.
    # Try converting to PDF instead:
    if subtype in ("vnd.ms-powerpoint",
                   "vnd.openxmlformats-officedocument"
                   ".presentationml.presentation"):
        pdffile = basename + ".pdf"
        mysubprocess.call(["unoconv", "-f", "pdf", "-o", pdffile, partfile])
        return pdffile

    if subtype == "pdf":
        if CONVERT_PDF_TO_HTML:
            mysubprocess.call(["pdftohtml", "-s", partfile])
            # But pdftohtml is idiotic about output filename
            # and won't let you override it:
            return basename + "-html.html"
        return partfile

    return None


def _save_html_part(part, subfiles, tmpdir):
    '''Save a text/html part to tmpdir, pointing cid: image references
       at the files listed in subfiles and making sure the document
       declares its charset. Returns the new filename.
    '''
    htmlfile = _tmp_file_name(part, tmpdir)
    htmlsrc = part.get_payload(decode=True)
    soup = BeautifulSoup(htmlsrc, "lxml")

    # Map each content-id to its saved file; the first one listed wins.
    cid_to_file = {}
    for sf in subfiles:
        cid_to_file.setdefault(sf['Content-Id'], sf['filename'])

    # Substitute filenames for CIDs.
    # Search the whole document so a missing <body> doesn't crash us.
    for tag in soup.find_all("img", src=True):
        src = tag['src']
        if src.lower().startswith("cid:"):
            target = cid_to_file.get(src[4:])
            if target:
                tag['src'] = "file://" + target

    # If it's HTML, we may need to add a meta charset tag. Sigh.
    # The file is written out as UTF-8 below, so that's what we declare.
    charset = "UTF-8"

    head = soup.find("head")
    if not head:
        head = soup.new_tag("head")
        html_tag = soup.find("html")
        if html_tag:
            html_tag.insert(0, head)
        else:
            soup.insert(0, head)

    if not head.find_all("meta", attrs={"http-equiv": "encoding"}) and \
       not head.find_all("meta", attrs={"http-equiv": "content-type"}):
        meta = soup.new_tag("meta")
        meta["content"] = charset
        meta["http-equiv"] = "encoding"
        head.insert(0, meta)
        meta = soup.new_tag("meta")
        meta["http-equiv"] = "content-type"
        meta["content"] = "text/html; charset=%s" % charset
        head.insert(0, meta)

    with open(htmlfile, 'wb') as htmlfp:
        htmlfp.write(soup.prettify().encode("utf-8", "xmlcharrefreplace"))

    return htmlfile


def view_message_attachments(fp, tmpdir):
    '''View message attachments coming from the file-like object fp.

       fp is read as text and parsed as an email message. Its parts are
       saved (and where necessary converted) into the existing directory
       tmpdir, and the browser is called on each viewable file.

       tmpdir is removed before returning, even if processing fails,
       so every call needs its own fresh directory.
    '''
    try:
        msg = email.message_from_string(fp.read())

        # Walk through the message a first, preliminary time
        # to separate out any images that might be referred to by
        # an HTML part.
        subfiles, subparts = _save_related_images(msg, tmpdir)

        # Call up the browser window right away,
        # so the user can see something is happening.
        # Firefox, alas, has no way from the commandline of calling up
        # a new private window with content, then replacing that content.
        # So we'll create a file that refreshes, so that when content is
        # ready, it can redirect to the first content page.
        redirect_timeout = 3
        pleasewait_file = tmpdir + "/index.html"
        _write_to_index(pleasewait_file, "Please wait ...",
                        redirect_timeout, None)

        cmd = [BROWSER]
        if BROWSER_FIRST_ARGS:
            cmd += BROWSER_FIRST_ARGS

        cmd.append("file://" + pleasewait_file)
        print("Calling: %s" % ' '.join(cmd))
        if BROWSER_BACKGROUND:
            mysubprocess.call_bg(cmd)
        else:
            mysubprocess.call(cmd)

        # A "data:text/html,..." or "javascript:" URL showing
        # "Translating documents, please wait ..." would be nicer.
        # Use JS if we can figure out how to close or replace
        # the "please wait" tab once we have content to show.
        # But for now, setTimeout() doesn't work at all
        # in newly popped up private windows.

        # Now walk through looking for the real parts:
        # HTML, doc and docx.
        htmlfiles = []
        for part in msg.walk():
            # part has, for example:
            # items: [('Content-Type', 'image/jpeg'),
            #         ('Content-Transfer-Encoding', 'base64'),
            #         ('Content-ID', '<14.3631871432@web82503.mail.mud.yahoo.com>'),
            #         ('Content-Disposition',
            #          'attachment; filename="ATT0001414.jpg"')]

            # multipart/* are just containers
            if part.is_multipart() or \
               part.get_content_type() == 'message/rfc822':
                continue

            maintype = part.get_content_maintype()

            if maintype == "application":
                viewable = _convert_application_part(part, tmpdir)
                if viewable:
                    htmlfiles.append(viewable)

            elif maintype == "text" and part.get_content_subtype() == 'html':
                htmlfiles.append(_save_html_part(part, subfiles, tmpdir))

            elif maintype == "image" and \
                    not any(part is saved for saved in subparts):
                htmlfiles.append(_save_tmp_file(part, tmpdir))

        # Done processing attachments. Call the browser for everything.
        if htmlfiles:
            # For the first URL, just put a redirect in
            _write_to_index(pleasewait_file,
                            "Redirecting to file://" + htmlfiles[0],
                            0, "file://" + htmlfiles[0])

            for f in htmlfiles[1:]:
                # If we don't wait for the new window to pop up before
                # calling new-tab, bad things will happen: the document
                # may load in a new tab in the old window and THEN pop up
                # an unwanted third window. Go firefox.
                # Not clear whether this is true for all browsers.
                time.sleep(1)
                mysubprocess.call([BROWSER] + BROWSER_ARGS + ["file://" + f])

        # Wait a while to make sure the browser has loaded the images,
        # then clean up.
        time.sleep(6)
    finally:
        shutil.rmtree(tmpdir)


if __name__ == '__main__':
    # mkdtemp needs its parent directory to exist.
    if not os.path.isdir(TMPDIR):
        os.makedirs(TMPDIR, 0o700)

    if len(sys.argv) > 1:
        for f in sys.argv[1:]:
            # view_message_attachments removes its tmpdir when done,
            # so each message needs a fresh one.
            tmpdir = tempfile.mkdtemp(dir=TMPDIR)
            with open(f) as fp:
                view_message_attachments(fp, tmpdir)
    else:
        tmpdir = tempfile.mkdtemp(dir=TMPDIR)
        view_message_attachments(sys.stdin, tmpdir)