All patches and comments are welcome. Please squash your changes into logical
commits before using git-format-patch and git-send-email to send them to
patches@git.madduck.net.
If you'd read over the Git project's submission guidelines and adhered to them,
I'd be especially grateful.
   3 # Source: https://raw.githubusercontent.com/akkana/scripts/master/viewmailattachments
 
   5 # Take an mbox HTML message (e.g. from mutt), split it
 
   6 # and rewrite it so all of its attachments can be viewed in a browser
 
   7 # (perhaps after being converted to HTML from DOC or whatever first).
 
   9 # Can be run from within a mailer like mutt, or independently
 
  10 # on a single message file.
 
  12 # Grew out of a simpler script called viewhtmlmail.
 
  14 # Copyright 2015 by Akkana Peck. Share and enjoy under the GPL v2 or later.
 
  16 #   Holger Klawitter 2014: create a secure temp file and avoid temp mbox
 
  18 # To use it from mutt, put the following lines in your .muttrc:
 
  19 # macro  index  <F10>  "<pipe-message>~/bin/viewmailattachments\n" "View attachments in browser"
 
  20 # macro  pager  <F10>  "<pipe-message>~/bin/viewmailattachments\n" "View attachments in browser"
 
  26 import email, email.header, mimetypes
 
  29 from bs4 import BeautifulSoup
 
  31 ################################################
 
  33 USE_WVHTML_FOR_DOC = False
 
  35 TMPDIR=os.path.join(os.getenv('TMPDIR'), 'volatile')
 
  37 # How many seconds do we need to wait for unoconv?
 
  38 # It defaults to 6, but on a 64-bit machine that's not enough.
 
  39 UNOCONV_STARTUP_TIME = "10"
 
  41 # Does the browser need a one-time argument for bringing up an initial window,
 
  42 # like Firefox's -private-window -new-instance ?
 
  43 BROWSER_FIRST_ARG = []
 
  45 # What browser to use:
 
  46 USE_QUICKBROWSE = False
 
  49     BROWSER = "quickbrowse"
 
  51     # Browser argument to precede new tabs:
 
  52     BROWSER_FIRST_ARGS = []
 
  53     BROWSER_ARGS = [ "--new-tab" ]
 
  55     # Will the browser block when first run until its window is closed?
 
  56     # If so, we have to run it in the background.
 
  57     BROWSER_BACKGROUND = False
 
  59     # Should we convert PDF to HTML? Depends on BROWSER:
 
  60     # Firefox has a built-in PDF viewer, but quickbrowse doesn't.
 
  61     CONVERT_PDF_TO_HTML = False
 
  63 else:    # Firefox in private browsing mode
 
  66     # Not clear what to do here: Firefox has a built-in PDF viewer,
 
  67     # but for some mime types it can't figure out that it should use it.
 
  68     BROWSER_FIRST_ARGS = [ "-private-window" ]
 
  69     BROWSER_ARGS = [ "-new-tab", "-private-window" ]
 
  70     # Firefox doesn't run in the background.
 
  71     BROWSER_BACKGROUND = True
 
  73     CONVERT_PDF_TO_HTML = False
 
  76 ################################################
 
  78 # Temporary for debugging:
 
  82         print("\n\n================\n=== Calling: %s" % str(arr))
 
  87         print("\n\n================\n=== Calling in background: %s" % str(arr))
 
  88         subprocess.Popen(arr, shell=False,
 
  89                          stdin=None, stdout=None, stderr=None)
 
  91 def view_message_attachments(fp, tmpdir):
 
  92     '''View message attachments coming from the file-like object fp.
 
  95     msg = email.message_from_string(fp.read())
 
 104     def tmp_file_name(part):
 
 105         partfile=part.get_filename()
 
 107             n, enc = email.header.decode_header(partfile)[0]
 
 109                 partfile = n.decode(enc) if enc else n
 
 111         # Applications should really sanitize the given filename so that an
 
 112         # email message can't be used to overwrite important files.
 
 113         # As a first step, warn about ../
 
 114         if partfile and '../' in partfile:
 
 115             print("Eek! Possible security problem in filename %s" % partfile)
 
 118         # Make a filename in the tmp dir:
 
 120             ext = mimetypes.guess_extension(part.get_content_type())
 
 122                 # Use a generic bag-of-bits extension
 
 124             return tempfile.mkstemp(dir=tmpdir, suffix=ext, prefix='part-')[1]
 
 126             return os.path.join(tmpdir, partfile)
 
 128     def save_tmp_file(part):
 
 129         '''Saves this part's payload to a tmp file, returning the new filename.
 
 131         partfile = tmp_file_name(part)
 
 133         tmpfile = open(partfile, "wb")
 
 134         tmpfile.write(part.get_payload(decode=True))
 
 138     # Walk through the message a first, preliminary time
 
 139     # to separate out any images that might be referred to by
 
 141     for part in msg.walk():
 
 142         # walk() includes the top-level message
 
 144             # print "  Skipping the top-level message"
 
 147         if part.get_content_type() != "multipart/related":
 
 150         # It's multipart. Walk the subtree looking for image children.
 
 151         for child in part.walk():
 
 152             # print " ", child.get_content_type()
 
 154             # At least for now, only save images as parts of multipart.
 
 155             if child.get_content_maintype() != "image":
 
 158             filename = save_tmp_file(child)
 
 159             # print "    Saved to", filename
 
 161             # Rewrite image and other inline URLs in terms of content-id.
 
 162             # Mailers may use Content-Id or Content-ID (or, presumably,
 
 163             # other capitalizations). So we can't just look it up simply.
 
 165             for k in list(child.keys()):
 
 166                 if k.lower() == 'content-id':
 
 167                     # Remove angle brackets, if present.
 
  168                     # child['Content-Id'] is immutable: attempts to change it
 
 169                     # are just ignored. Copy it to a local mutable string.
 
 170                     content_id = child[k]
 
 171                     if content_id.startswith('<') and \
 
 172                        content_id.endswith('>'):
 
 173                         content_id = content_id[1:-1]
 
 175                     subfiles.append({ 'filename': filename,
 
 176                                       'Content-Id': content_id })
 
 177                     subparts.append(child)
 
 179                     fp = open(filename, 'wb')
 
 180                     fp.write(child.get_payload(decode=True))
 
 182                     break     # no need to look at other keys
 
 185             #     print filename, "doesn't have a Content-Id, not saving"
 
 186             #     # print "keys:", child.keys()
 
 189     # for sf in subfiles:
 
 192     # Call up the browser window right away,
 
 193     # so the user can see something is happening.
 
 194     # Firefox, alas, has no way from the commandline of calling up
 
 195     # a new private window with content, then replacing that content.
 
 196     # So we'll create a file that refreshes, so that when content is ready,
 
 197     # it can redirect to the first content page.
 
 198     def write_to_index(outfile, msg, timeout_secs, redirect_url):
 
 200             redirect_url = "file://" + outfile
 
 201         ofp = open(outfile, "w")
 
 202         ofp.write('''<html><head>
 
 203 <meta content="utf-8" http-equiv="encoding">
 
 204 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
 
 205 <meta http-equiv="refresh" content="%d;URL=%s">
 
 207 <br><br><br><br><br><br><big><big>%s</big></big>
 
 209 ''' % (timeout_secs, redirect_url, msg))
 
 213     pleasewait_file = tmpdir + "/index.html"
 
 214     write_to_index(pleasewait_file, "Please wait ...", redirect_timeout, None)
 
 217     if BROWSER_FIRST_ARGS:
 
 218         cmd += BROWSER_FIRST_ARGS
 
 220     cmd.append("file://" + pleasewait_file)
 
 221     print("Calling: %s" % ' '.join(cmd))
 
 222     if BROWSER_BACKGROUND:
 
 223         mysubprocess.call_bg(cmd)
 
 225         mysubprocess.call(cmd)
 
 227                        # "data:text/html,<br><br><br><br><br><h1>Translating documents, please wait ..."
 
 228                        # Use JS if we can figure out how to close or replace
 
 229                        # the "please wait" tab once we have content to show.
 
 230                        # But for now, setTimeout() doesn't work at all
 
 231                        # in newly popped up private windows.
 
 232                        # "javascript:document.writeln('<br><br><br><br><br><h1>Translating documents, please wait ...');setTimeout(function(){alert('hi');}, 500);"
 
 235     # Now walk through looking for the real parts:
 
 236     # HTML, doc and docx.
 
 237     for part in msg.walk():
 
 239         # part has, for example:
 
 240         # items: [('Content-Type', 'image/jpeg'),
 
 241         #         ('Content-Transfer-Encoding', 'base64'),
 
 242         #         ('Content-ID', '<14.3631871432@web82503.mail.mud.yahoo.com>'),
 
 243         #         ('Content-Disposition',
 
 244         #          'attachment; filename="ATT0001414.jpg"')]
 
 245         # keys: ['Content-Type', 'Content-Transfer-Encoding',
 
 246         #        'Content-ID', 'Content-Disposition']
 
 247         # values: ['image/jpeg', 'base64',
 
 248         #          '<14.3631871432@web82503.mail.mud.yahoo.com>',
 
 249         # 'attachment; filename="ATT0001414.jpg"']
 
 251         # multipart/* are just containers
 
 252         #if part.get_content_maintype() == 'multipart':
 
 253         if part.is_multipart() or part.get_content_type == 'message/rfc822':
 
 256         if part.get_content_maintype() == "application":
 
 257             partfile = save_tmp_file(part)
 
 258             fileparts = os.path.splitext(partfile)
 
 259             htmlfilename = fileparts[0] + ".html"
 
 261             if part.get_content_subtype() == "msword" and USE_WVHTML_FOR_DOC:
 
 262                 mysubprocess.call(["wvHtml", partfile, htmlfilename])
 
 263                 htmlfiles.append(htmlfilename)
 
 265             elif part.get_content_subtype() == \
 
 266                  "vnd.openxmlformats-officedocument.wordprocessingml.document" \
 
 267                  or part.get_content_subtype() == "msword":
 
 268                 mysubprocess.call(["unoconv", "-f", "html",
 
 269                                    "-T", UNOCONV_STARTUP_TIME,
 
 270                                    "-o", htmlfilename, partfile])
 
 272                 htmlfilename = os.path.join(fileparts[0] + ".html")
 
 273                 htmlfiles.append(htmlfilename)
 
 275             # unoconv conversions from powerpoint to HTML drop all images.
 
 276             # Try converting to PDF instead:
 
 277             elif part.get_content_subtype() == "vnd.ms-powerpoint" \
 
 278                  or part.get_content_subtype() == \
 
 279                     "vnd.openxmlformats-officedocument.presentationml.presentation" :
 
 280                 pdffile = fileparts[0] + ".pdf"
 
 281                 mysubprocess.call(["unoconv", "-f", "pdf",
 
 282                                    "-o", pdffile, partfile])
 
 283                 htmlfiles.append(pdffile)
 
 285             elif part.get_content_subtype() == "pdf":
 
 286                 if CONVERT_PDF_TO_HTML:
 
 287                     mysubprocess.call(["pdftohtml", "-s", partfile])
 
 289                     # But pdftohtml is idiotic about output filename
 
 290                     # and won't let you override it:
 
 291                     htmlfiles.append(fileparts[0] + "-html.html")
 
 293                     htmlfiles.append(partfile)
 
 295         elif part.get_content_maintype() == "text" and \
 
 296              part.get_content_subtype() == 'html':
 
 298             htmlfile = tmp_file_name(part)
 
 300             fp = open(htmlfile, 'wb')
 
 301             htmlsrc = part.get_payload(decode=True)
 
 303             soup = BeautifulSoup(htmlsrc, "lxml")
 
 305             # Substitute filenames for CIDs:
 
 306             for tag in soup.body.find_all("img", src=True):
 
 307                 if tag['src'].lower().startswith("cid:"):
 
 309                         if tag['src'][4:] == sf['Content-Id']:
 
 310                             tag['src'] = "file://" + sf['filename']
 
 311             # for sf in subfiles:
 
 312             #     htmlsrc = re.sub('cid: ?' + sf['Content-Id'],
 
 313             #                      'file://' + sf['filename'],
 
 314             #                      htmlsrc, flags=re.IGNORECASE)
 
 316             # If it's HTML, we may need to add a meta charset tag. Sigh.
 
 317             # If it's text/plain, there's nothing we can do to fix charset.
 
 318             charset = part.get_charset()
 
 321             head = soup.find("head")
 
 323                 head = soup.new_tag("head")
 
 324                 html = soup.find("html")
 
 330             if not head.findAll("meta", attrs={"http-equiv": "encoding"}) and \
 
 331                not head.findAll("meta", attrs={"http-equiv": "content-type"}):
 
 332                 meta = soup.new_tag("meta")
 
 333                 meta["content"] = charset
 
 334                 meta["http-equiv"] = "encoding"
 
 336                 meta = soup.new_tag("meta")
 
 337                 meta["http-equiv"] = "content-type"
 
 338                 meta["content"] = "text/html; charset=%s" % charset
 
 341             fp.write(soup.prettify().encode("utf-8", "xmlcharrefreplace"))
 
 344             htmlfiles.append(htmlfile)
 
 345         elif part.get_content_maintype() == "image" and part not in subparts:
 
 346             partfile = save_tmp_file(part)
 
 347             htmlfiles.append(partfile)
 
 349     # Done processing attachments. Call the browser for everything.
 
 351         # For the first URL, just put a redirect in
 
 352         write_to_index(pleasewait_file,
 
 353                        "Redirecting to file://" + htmlfiles[0],
 
 354                        0, "file://" + htmlfiles[0])
 
 356         for f in htmlfiles[1:]:
 
 357             # If we don't wait for the new window to pop up before
 
 358             # calling new-tab, bad things will happen: the document
 
 359             # may load in a new tab in the old window and THEN pop up
 
 360             # an unwanted third window. Go firefox.
 
 361             # Not clear whether this is true for all browsers.
 
 364                 mysubprocess.call([BROWSER] + BROWSER_ARGS + ["file://" + f])
 
 366                 mysubprocess.call([BROWSER, "file://" + f])
 
  368     # Wait a while to make sure the browser has loaded the images, then clean up.
 
 370     shutil.rmtree(tmpdir)
 
 372 if __name__ == '__main__':
 
 373     tmpdir = tempfile.mkdtemp(dir=TMPDIR)
 
 375     if len(sys.argv) > 1:
 
 376         for f in sys.argv[1:]:
 
 378             view_message_attachments(fp, tmpdir)
 
 381         view_message_attachments(sys.stdin, tmpdir)