+++ /dev/null
-#! /usr/bin/python3
-
-# Source: https://raw.githubusercontent.com/akkana/scripts/master/viewmailattachments
-
-# Take an mbox HTML message (e.g. from mutt), split it
-# and rewrite it so all of its attachments can be viewed in a browser
-# (perhaps after being converted to HTML from DOC or whatever first).
-#
-# Can be run from within a mailer like mutt, or independently
-# on a single message file.
-#
-# Grew out of a simpler script called viewhtmlmail.
-#
-# Copyright 2015 by Akkana Peck. Share and enjoy under the GPL v2 or later.
-# Changes:
-# Holger Klawitter 2014: create a secure temp file and avoid temp mbox
-
-# To use it from mutt, put the following lines in your .muttrc:
-# macro index <F10> "<pipe-message>~/bin/viewmailattachments\n" "View attachments in browser"
-# macro pager <F10> "<pipe-message>~/bin/viewmailattachments\n" "View attachments in browser"
-
-import os, sys
-import re
-import time
-import shutil
-import email, email.header, mimetypes
-import tempfile
-import subprocess
-from bs4 import BeautifulSoup
-
-################################################
-# Some prefs:
# Convert application/msword attachments with wvHtml instead of unoconv?
USE_WVHTML_FOR_DOC = False
BROWSER_ARGS = []

# Parent directory for the per-message scratch directories:
# "volatile" under $TMPDIR.  Fall back to the system temp directory when
# $TMPDIR is unset; os.getenv() would otherwise return None and
# os.path.join() would raise TypeError as soon as the script is loaded.
TMPDIR = os.path.join(os.getenv('TMPDIR') or tempfile.gettempdir(),
                      'volatile')

# How many seconds do we need to wait for unoconv?
# It defaults to 6, but on a 64-bit machine that's not enough.
# Kept as a string because it is passed straight through as a command
# line argument.
UNOCONV_STARTUP_TIME = "10"

# Does the browser need a one-time argument for bringing up an initial window,
# like Firefox's -private-window -new-instance ?
# NOTE(review): the launch code reads BROWSER_FIRST_ARGS (plural), which is
# set in both branches below; this singular name looks like a leftover and
# is kept only so anything that refers to it keeps working.
BROWSER_FIRST_ARG = []

# What browser to use:
USE_QUICKBROWSE = False

if USE_QUICKBROWSE:
    BROWSER = "quickbrowse"

    # Browser arguments for the first window, and to precede new tabs:
    BROWSER_FIRST_ARGS = []
    BROWSER_ARGS = ["--new-tab"]

    # Will the browser block when first run until its window is closed?
    # If so, we have to run it in the background.
    BROWSER_BACKGROUND = False

    # Should we convert PDF to HTML? Depends on BROWSER:
    # Firefox has a built-in PDF viewer, but quickbrowse doesn't.
    CONVERT_PDF_TO_HTML = False

else:    # Firefox in private browsing mode
    BROWSER = "firefox"

    # Not clear what to do here: Firefox has a built-in PDF viewer,
    # but for some mime types it can't figure out that it should use it.
    BROWSER_FIRST_ARGS = ["-private-window"]
    BROWSER_ARGS = ["-new-tab", "-private-window"]

    # Firefox doesn't put itself in the background,
    # so we have to launch it in the background ourselves.
    BROWSER_BACKGROUND = True

    CONVERT_PDF_TO_HTML = False
-
-# End global prefs
-################################################
-
-# Temporary for debugging:
class mysubprocess:
    '''Debugging stand-in for the subprocess module.

    Both helpers print the argument list they were given before running it.
    Neither returns anything.
    '''

    @staticmethod
    def call(arr):
        '''Echo arr, then run it with subprocess.call().'''
        banner = "\n\n================\n=== Calling: " + str(arr)
        print(banner)
        subprocess.call(arr)

    @staticmethod
    def call_bg(arr):
        '''Echo arr, then start it with subprocess.Popen() and return
        without waiting for it.
        '''
        banner = "\n\n================\n=== Calling in background: " + str(arr)
        print(banner)
        subprocess.Popen(arr, stdin=None, stdout=None, stderr=None,
                         shell=False)
-
def view_message_attachments(fp, tmpdir):
    '''View message attachments coming from the file-like object fp.

    The message is parsed from fp.read().  Its parts are saved under
    tmpdir: application/* parts are converted with wvHtml, unoconv or
    pdftohtml according to the global prefs, text/html parts have their
    cid: image references rewritten to point at the saved inline images,
    and remaining images are saved as they are.  Every resulting file is
    then opened in BROWSER.

    fp     -- file-like object whose read() returns the message as a string.
    tmpdir -- existing directory to write into.  It is removed with
              shutil.rmtree() before this function returns, so it cannot
              be reused for another message.

    Returns None.
    '''
    # Function-scope import so this block does not depend on the
    # file-level import list.
    from urllib.parse import quote

    msg = email.message_from_string(fp.read())

    # Inline images referenced from HTML: dicts with 'filename' and
    # 'Content-Id' keys, plus the message parts they came from.
    subfiles = []
    subparts = []
    # Files to show in the browser, in the order they were found.
    htmlfiles = []

    def file_url(path):
        '''Return a file:// URL for path.

        The path is percent-encoded: an attachment name containing a
        space, '#' or '?' would otherwise yield a URL the browser
        truncates or misreads.
        '''
        return "file://" + quote(path)

    def tmp_file_name(part):
        '''Return a path inside tmpdir to save part to, or None if the
        filename given in the message looks like an attempt to write
        outside tmpdir.
        '''
        partfile = part.get_filename()
        if partfile:
            # Decode every RFC 2047 chunk of the name, not just the first.
            try:
                partfile = str(email.header.make_header(
                    email.header.decode_header(partfile)))
            except (LookupError, UnicodeError):
                # Unknown or broken charset: keep the raw header value.
                pass

        # An email message must not be able to overwrite important files.
        # Refuse parent-directory references and absolute paths
        # (os.path.join() would discard tmpdir for the latter).
        if partfile and ('../' in partfile or os.path.isabs(partfile)):
            print("Eek! Possible security problem in filename %s" % partfile)
            return None

        if partfile:
            # Drop any remaining directory components.
            partfile = os.path.basename(partfile)
            if partfile in ('.', '..'):
                partfile = None

        # Make a filename in the tmp dir:
        if not partfile:
            ext = mimetypes.guess_extension(part.get_content_type())
            if not ext:
                # Use a generic bag-of-bits extension
                ext = '.bin'
            fd, path = tempfile.mkstemp(dir=tmpdir, suffix=ext,
                                        prefix='part-')
            # mkstemp() hands back an open descriptor; only the name is used.
            os.close(fd)
            return path

        path = os.path.join(tmpdir, partfile)
        if os.path.exists(path):
            # Two parts with the same name, or a part called index.html:
            # pick a fresh name rather than overwrite the earlier file.
            stem, ext = os.path.splitext(partfile)
            fd, path = tempfile.mkstemp(dir=tmpdir, suffix=ext,
                                        prefix=stem + '-')
            os.close(fd)
        return path

    def save_tmp_file(part):
        '''Save this part's payload to a tmp file, returning the new
        filename, or None if no safe filename could be chosen.
        '''
        partfile = tmp_file_name(part)
        if partfile is None:
            return None

        payload = part.get_payload(decode=True)
        with open(partfile, "wb") as tmpfile:
            tmpfile.write(payload or b'')
        return partfile

    # Walk through the message a first, preliminary time
    # to separate out any images that might be referred to by
    # an HTML part.
    # The top-level message is deliberately not skipped: it may itself
    # be the multipart/related container.
    for part in msg.walk():
        if part.get_content_type() != "multipart/related":
            continue

        # Walk the subtree looking for image children.
        for child in part.walk():
            # At least for now, only save images as parts of multipart.
            if child.get_content_maintype() != "image":
                continue

            # With nested multipart/related containers the same image
            # is reached more than once.
            if any(child is seen for seen in subparts):
                continue

            # Header lookup on a message is case-insensitive, so this
            # finds Content-Id, Content-ID and other capitalizations.
            content_id = child['Content-ID']
            if content_id is None:
                # Nothing can refer to it; the second pass will show it
                # as a standalone image instead.
                continue

            # Remove angle brackets, if present.
            content_id = str(content_id).strip()
            if content_id.startswith('<') and content_id.endswith('>'):
                content_id = content_id[1:-1]

            filename = save_tmp_file(child)
            if filename is None:
                continue

            subfiles.append({'filename': filename,
                             'Content-Id': content_id})
            subparts.append(child)

    # Call up the browser window right away,
    # so the user can see something is happening.
    # Firefox, alas, has no way from the commandline of calling up
    # a new private window with content, then replacing that content.
    # So we'll create a file that refreshes, so that when content is ready,
    # it can redirect to the first content page.
    def write_to_index(outfile, message_text, timeout_secs, redirect_url):
        '''Write a small HTML page to outfile that displays message_text
        and refreshes to redirect_url after timeout_secs seconds.
        With no redirect_url the page refreshes to itself.
        '''
        if not redirect_url:
            redirect_url = file_url(outfile)
        # The page declares itself UTF-8, so write it as UTF-8.
        with open(outfile, "w", encoding="utf-8") as ofp:
            ofp.write('''<html><head>
<meta content="utf-8" http-equiv="encoding">
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<meta http-equiv="refresh" content="%d;URL=%s">
</head><body>
<br><br><br><br><br><br><big><big>%s</big></big>
</body></html>
''' % (timeout_secs, redirect_url, message_text))

    redirect_timeout = 3
    pleasewait_file = os.path.join(tmpdir, "index.html")
    write_to_index(pleasewait_file, "Please wait ...", redirect_timeout, None)

    cmd = [BROWSER]
    if BROWSER_FIRST_ARGS:
        cmd = cmd + list(BROWSER_FIRST_ARGS)
    cmd.append(file_url(pleasewait_file))

    print("Calling: %s" % ' '.join(cmd))
    if BROWSER_BACKGROUND:
        mysubprocess.call_bg(cmd)
    else:
        mysubprocess.call(cmd)

    # JavaScript would be a nicer way to replace the "please wait" tab
    # once there is content to show, but setTimeout() doesn't work at all
    # in newly popped up private windows.

    docx_subtype = \
        "vnd.openxmlformats-officedocument.wordprocessingml.document"
    pptx_subtype = \
        "vnd.openxmlformats-officedocument.presentationml.presentation"

    # Now walk through looking for the real parts:
    # HTML, doc and docx.
    for part in msg.walk():

        # part has, for example:
        # items: [('Content-Type', 'image/jpeg'),
        #         ('Content-Transfer-Encoding', 'base64'),
        #         ('Content-ID', '<14.3631871432@web82503.mail.mud.yahoo.com>'),
        #         ('Content-Disposition',
        #          'attachment; filename="ATT0001414.jpg"')]

        # multipart/* are just containers.
        # get_content_type is a method: without the call parentheses the
        # comparison could never be true.
        if part.is_multipart() or \
           part.get_content_type() == 'message/rfc822':
            continue

        maintype = part.get_content_maintype()
        subtype = part.get_content_subtype()

        if maintype == "application":
            partfile = save_tmp_file(part)
            if partfile is None:
                continue
            stem = os.path.splitext(partfile)[0]
            htmlfilename = stem + ".html"

            if subtype == "msword" and USE_WVHTML_FOR_DOC:
                mysubprocess.call(["wvHtml", partfile, htmlfilename])
                htmlfiles.append(htmlfilename)

            elif subtype == docx_subtype or subtype == "msword":
                mysubprocess.call(["unoconv", "-f", "html",
                                   "-T", UNOCONV_STARTUP_TIME,
                                   "-o", htmlfilename, partfile])
                htmlfiles.append(htmlfilename)

            # unoconv conversions from powerpoint to HTML drop all images.
            # Try converting to PDF instead:
            elif subtype == "vnd.ms-powerpoint" or subtype == pptx_subtype:
                pdffile = stem + ".pdf"
                mysubprocess.call(["unoconv", "-f", "pdf",
                                   "-o", pdffile, partfile])
                htmlfiles.append(pdffile)

            elif subtype == "pdf":
                if CONVERT_PDF_TO_HTML:
                    mysubprocess.call(["pdftohtml", "-s", partfile])

                    # But pdftohtml is idiotic about output filename
                    # and won't let you override it:
                    htmlfiles.append(stem + "-html.html")
                else:
                    htmlfiles.append(partfile)

        elif maintype == "text" and subtype == 'html':
            htmlfile = tmp_file_name(part)
            if htmlfile is None:
                continue

            htmlsrc = part.get_payload(decode=True)
            soup = BeautifulSoup(htmlsrc, "lxml")

            # Substitute filenames for CIDs.  Search the whole document:
            # soup.body is None when the parser found no body.
            for tag in soup.find_all("img", src=True):
                if not tag['src'].lower().startswith("cid:"):
                    continue
                cid = tag['src'][4:]
                for sf in subfiles:
                    if cid == sf['Content-Id']:
                        tag['src'] = file_url(sf['filename'])
                        break

            # We may need to add a meta charset tag. Sigh.
            # The file is written out as UTF-8 below, so that is the
            # charset to declare whatever the mail part itself used.
            charset = "UTF-8"
            head = soup.find("head")
            if not head:
                head = soup.new_tag("head")
                html_tag = soup.find("html")
                if html_tag:
                    html_tag.insert(0, head)
                else:
                    soup.insert(0, head)

            if not head.find_all("meta", attrs={"http-equiv": "encoding"}) \
               and not head.find_all("meta",
                                     attrs={"http-equiv": "content-type"}):
                meta = soup.new_tag("meta")
                meta["content"] = charset
                meta["http-equiv"] = "encoding"
                head.insert(0, meta)
                meta = soup.new_tag("meta")
                meta["http-equiv"] = "content-type"
                meta["content"] = "text/html; charset=%s" % charset
                head.insert(0, meta)

            with open(htmlfile, 'wb') as ofp:
                ofp.write(soup.prettify().encode("utf-8",
                                                 "xmlcharrefreplace"))

            htmlfiles.append(htmlfile)

        elif maintype == "image" and part not in subparts:
            # A standalone image, not one already saved for an HTML part.
            partfile = save_tmp_file(part)
            if partfile is not None:
                htmlfiles.append(partfile)

    # Done processing attachments. Call the browser for everything.
    if htmlfiles:
        # For the first URL, just put a redirect in
        first_url = file_url(htmlfiles[0])
        write_to_index(pleasewait_file,
                       "Redirecting to " + first_url,
                       0, first_url)

        for f in htmlfiles[1:]:
            # If we don't wait for the new window to pop up before
            # calling new-tab, bad things will happen: the document
            # may load in a new tab in the old window and THEN pop up
            # an unwanted third window. Go firefox.
            # Not clear whether this is true for all browsers.
            time.sleep(1)
            mysubprocess.call([BROWSER] + list(BROWSER_ARGS) + [file_url(f)])

    # Wait a while to make sure the browser has loaded the images,
    # then clean up.
    time.sleep(6)
    shutil.rmtree(tmpdir)
-
if __name__ == '__main__':
    # Script entry point: view each message file named on the command
    # line, or a single message read from standard input.
    import io

    # tempfile.mkdtemp() needs its parent directory to exist.
    os.makedirs(TMPDIR, exist_ok=True)

    # Read the raw message as ASCII with surrogateescape, the same way
    # the email package's bytes parser does, so that 8-bit bodies neither
    # raise UnicodeDecodeError nor get mangled when decoded again later.
    if len(sys.argv) > 1:
        for f in sys.argv[1:]:
            # view_message_attachments() removes the directory it is given,
            # so every message needs a fresh one.
            tmpdir = tempfile.mkdtemp(dir=TMPDIR)
            with open(f, encoding='ascii', errors='surrogateescape') as fp:
                view_message_attachments(fp, tmpdir)
    else:
        tmpdir = tempfile.mkdtemp(dir=TMPDIR)
        stdin_text = io.TextIOWrapper(sys.stdin.buffer, encoding='ascii',
                                      errors='surrogateescape')
        view_message_attachments(stdin_text, tmpdir)