.mutt/viewmailattachments

   1 #! /usr/bin/python3
   2
   3 # Source: https://raw.githubusercontent.com/akkana/scripts/master/viewmailattachments
   4
   5 # Take an mbox HTML message (e.g. from mutt), split it
   6 # and rewrite it so all of its attachments can be viewed in a browser
   7 # (perhaps after being converted to HTML from DOC or whatever first).
   8 #
   9 # Can be run from within a mailer like mutt, or independently
  10 # on a single message file.
  11 #
  12 # Grew out of a simpler script called viewhtmlmail.
  13 #
  14 # Copyright 2015 by Akkana Peck. Share and enjoy under the GPL v2 or later.
  15 # Changes:
  16 #   Holger Klawitter 2014: create a secure temp file and avoid temp mbox
  17
  18 # To use it from mutt, put the following lines in your .muttrc:
  19 # macro  index  <F10>  "<pipe-message>~/bin/viewmailattachments\n" "View attachments in browser"
  20 # macro  pager  <F10>  "<pipe-message>~/bin/viewmailattachments\n" "View attachments in browser"
  21
  22 import os, sys
  23 import re
  24 import time
  25 import shutil
  26 import email, email.header, mimetypes
  27 import tempfile
  28 import subprocess
  29 from bs4 import BeautifulSoup
  30
  31 ################################################
  32 # Some prefs:
  33 USE_WVHTML_FOR_DOC = False
  34 BROWSER_ARGS = []
  35 TMPDIR=os.path.join(os.getenv('TMPDIR'), 'volatile')
  36
  37 # How many seconds do we need to wait for unoconv?
  38 # It defaults to 6, but on a 64-bit machine that's not enough.
  39 UNOCONV_STARTUP_TIME = "10"
  40
  41 # Does the browser need a one-time argument for bringing up an initial window,
  42 # like Firefox's -private-window -new-instance ?
  43 BROWSER_FIRST_ARG = []
  44
  45 # What browser to use:
  46 USE_QUICKBROWSE = False
  47
  48 if USE_QUICKBROWSE:
  49     BROWSER = "quickbrowse"
  50
  51     # Browser argument to precede new tabs:
  52     BROWSER_FIRST_ARGS = []
  53     BROWSER_ARGS = [ "--new-tab" ]
  54
  55     # Will the browser block when first run until its window is closed?
  56     # If so, we have to run it in the background.
  57     BROWSER_BACKGROUND = False
  58
  59     # Should we convert PDF to HTML? Depends on BROWSER:
  60     # Firefox has a built-in PDF viewer, but quickbrowse doesn't.
  61     CONVERT_PDF_TO_HTML = False
  62
  63 else:    # Firefox in private browsing mode
  64     BROWSER = "firefox"
  65
  66     # Not clear what to do here: Firefox has a built-in PDF viewer,
  67     # but for some mime types it can't figure out that it should use it.
  68     BROWSER_FIRST_ARGS = [ "-private-window" ]
  69     BROWSER_ARGS = [ "-new-tab", "-private-window" ]
  70     # Firefox doesn't run in the background.
  71     BROWSER_BACKGROUND = True
  72
  73     CONVERT_PDF_TO_HTML = False
  74
  75 # End global prefs
  76 ################################################
  77
  78 # Temporary for debugging:
  79 class mysubprocess:
  80     @staticmethod
  81     def call(arr):
  82         print("\n\n================\n=== Calling: %s" % str(arr))
  83         subprocess.call(arr)
  84
  85     @staticmethod
  86     def call_bg(arr):
  87         print("\n\n================\n=== Calling in background: %s" % str(arr))
  88         subprocess.Popen(arr, shell=False,
  89                          stdin=None, stdout=None, stderr=None)
  90
  91 def view_message_attachments(fp, tmpdir):
  92     '''View message attachments coming from the file-like object fp.
  93     '''
  94
  95     msg = email.message_from_string(fp.read())
  96
  97     html_part = None
  98     counter = 1
  99     subfiles = []
 100     subparts = []
 101     htmlfiles = []
 102     htmlparts = []
 103
 104     def tmp_file_name(part):
 105         partfile=part.get_filename()
 106         if partfile:
 107             n, enc = email.header.decode_header(partfile)[0]
 108             if n:
 109                 partfile = n.decode(enc) if enc else n
 110
 111         # Applications should really sanitize the given filename so that an
 112         # email message can't be used to overwrite important files.
 113         # As a first step, warn about ../
 114         if partfile and '../' in partfile:
 115             print("Eek! Possible security problem in filename %s" % partfile)
 116             return None
 117
 118         # Make a filename in the tmp dir:
 119         if not partfile:
 120             ext = mimetypes.guess_extension(part.get_content_type())
 121             if not ext:
 122                 # Use a generic bag-of-bits extension
 123                 ext = '.bin'
 124             return tempfile.mkstemp(dir=tmpdir, suffix=ext, prefix='part-')[1]
 125         else:
 126             return os.path.join(tmpdir, partfile)
 127
 128     def save_tmp_file(part):
 129         '''Saves this part's payload to a tmp file, returning the new filename.
 130         '''
 131         partfile = tmp_file_name(part)
 132
 133         tmpfile = open(partfile, "wb")
 134         tmpfile.write(part.get_payload(decode=True))
 135         tmpfile.close()
 136         return partfile
 137
 138     # Walk through the message a first, preliminary time
 139     # to separate out any images that might be referred to by
 140     # an HTML part.
 141     for part in msg.walk():
 142         # walk() includes the top-level message
 143         if part == msg:
 144             # print "  Skipping the top-level message"
 145             continue
 146
 147         if part.get_content_type() != "multipart/related":
 148             continue
 149
 150         # It's multipart. Walk the subtree looking for image children.
 151         for child in part.walk():
 152             # print " ", child.get_content_type()
 153
 154             # At least for now, only save images as parts of multipart.
 155             if child.get_content_maintype() != "image":
 156                 continue
 157
 158             filename = save_tmp_file(child)
 159             # print "    Saved to", filename
 160
 161             # Rewrite image and other inline URLs in terms of content-id.
 162             # Mailers may use Content-Id or Content-ID (or, presumably,
 163             # other capitalizations). So we can't just look it up simply.
 164             content_id = None
 165             for k in list(child.keys()):
 166                 if k.lower() == 'content-id':
 167                     # Remove angle brackets, if present.
 168                     # child['Content-Id'] is unmutable: attempts to change it
 169                     # are just ignored. Copy it to a local mutable string.
 170                     content_id = child[k]
 171                     if content_id.startswith('<') and \
 172                        content_id.endswith('>'):
 173                         content_id = content_id[1:-1]
 174
 175                     subfiles.append({ 'filename': filename,
 176                                       'Content-Id': content_id })
 177                     subparts.append(child)
 178                     counter += 1
 179                     fp = open(filename, 'wb')
 180                     fp.write(child.get_payload(decode=True))
 181                     fp.close()
 182                     break     # no need to look at other keys
 183
 184             # if not content_id:
 185             #     print filename, "doesn't have a Content-Id, not saving"
 186             #     # print "keys:", child.keys()
 187
 188     # print "Subfiles:"
 189     # for sf in subfiles:
 190     #     print sf
 191
 192     # Call up the browser window right away,
 193     # so the user can see something is happening.
 194     # Firefox, alas, has no way from the commandline of calling up
 195     # a new private window with content, then replacing that content.
 196     # So we'll create a file that refreshes, so that when content is ready,
 197     # it can redirect to the first content page.
 198     def write_to_index(outfile, msg, timeout_secs, redirect_url):
 199         if not redirect_url:
 200             redirect_url = "file://" + outfile
 201         ofp = open(outfile, "w")
 202         ofp.write('''<html><head>
 203 <meta content="utf-8" http-equiv="encoding">
 204 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
 205 <meta http-equiv="refresh" content="%d;URL=%s">
 206 </head><body>
 207 <br><br><br><br><br><br><big><big>%s</big></big>
 208 </body></html>
 209 ''' % (timeout_secs, redirect_url, msg))
 210         ofp.close()
 211
 212     redirect_timeout = 3
 213     pleasewait_file = tmpdir + "/index.html"
 214     write_to_index(pleasewait_file, "Please wait ...", redirect_timeout, None)
 215
 216     cmd = [ BROWSER ]
 217     if BROWSER_FIRST_ARGS:
 218         cmd += BROWSER_FIRST_ARGS
 219
 220     cmd.append("file://" + pleasewait_file)
 221     print("Calling: %s" % ' '.join(cmd))
 222     if BROWSER_BACKGROUND:
 223         mysubprocess.call_bg(cmd)
 224     else:
 225         mysubprocess.call(cmd)
 226
 227                        # "data:text/html,<br><br><br><br><br><h1>Translating documents, please wait ..."
 228                        # Use JS if we can figure out how to close or replace
 229                        # the "please wait" tab once we have content to show.
 230                        # But for now, setTimeout() doesn't work at all
 231                        # in newly popped up private windows.
 232                        # "javascript:document.writeln('<br><br><br><br><br><h1>Translating documents, please wait ...');setTimeout(function(){alert('hi');}, 500);"
 233                      # ])
 234
 235     # Now walk through looking for the real parts:
 236     # HTML, doc and docx.
 237     for part in msg.walk():
 238
 239         # part has, for example:
 240         # items: [('Content-Type', 'image/jpeg'),
 241         #         ('Content-Transfer-Encoding', 'base64'),
 242         #         ('Content-ID', '<14.3631871432@web82503.mail.mud.yahoo.com>'),
 243         #         ('Content-Disposition',
 244         #          'attachment; filename="ATT0001414.jpg"')]
 245         # keys: ['Content-Type', 'Content-Transfer-Encoding',
 246         #        'Content-ID', 'Content-Disposition']
 247         # values: ['image/jpeg', 'base64',
 248         #          '<14.3631871432@web82503.mail.mud.yahoo.com>',
 249         # 'attachment; filename="ATT0001414.jpg"']
 250
 251         # multipart/* are just containers
 252         #if part.get_content_maintype() == 'multipart':
 253         if part.is_multipart() or part.get_content_type == 'message/rfc822':
 254             continue
 255
 256         if part.get_content_maintype() == "application":
 257             partfile = save_tmp_file(part)
 258             fileparts = os.path.splitext(partfile)
 259             htmlfilename = fileparts[0] + ".html"
 260
 261             if part.get_content_subtype() == "msword" and USE_WVHTML_FOR_DOC:
 262                 mysubprocess.call(["wvHtml", partfile, htmlfilename])
 263                 htmlfiles.append(htmlfilename)
 264
 265             elif part.get_content_subtype() == \
 266                  "vnd.openxmlformats-officedocument.wordprocessingml.document" \
 267                  or part.get_content_subtype() == "msword":
 268                 mysubprocess.call(["unoconv", "-f", "html",
 269                                    "-T", UNOCONV_STARTUP_TIME,
 270                                    "-o", htmlfilename, partfile])
 271
 272                 htmlfilename = os.path.join(fileparts[0] + ".html")
 273                 htmlfiles.append(htmlfilename)
 274
 275             # unoconv conversions from powerpoint to HTML drop all images.
 276             # Try converting to PDF instead:
 277             elif part.get_content_subtype() == "vnd.ms-powerpoint" \
 278                  or part.get_content_subtype() == \
 279                     "vnd.openxmlformats-officedocument.presentationml.presentation" :
 280                 pdffile = fileparts[0] + ".pdf"
 281                 mysubprocess.call(["unoconv", "-f", "pdf",
 282                                    "-o", pdffile, partfile])
 283                 htmlfiles.append(pdffile)
 284
 285             elif part.get_content_subtype() == "pdf":
 286                 if CONVERT_PDF_TO_HTML:
 287                     mysubprocess.call(["pdftohtml", "-s", partfile])
 288
 289                     # But pdftohtml is idiotic about output filename
 290                     # and won't let you override it:
 291                     htmlfiles.append(fileparts[0] + "-html.html")
 292                 else:
 293                     htmlfiles.append(partfile)
 294
 295         elif part.get_content_maintype() == "text" and \
 296              part.get_content_subtype() == 'html':
 297
 298             htmlfile = tmp_file_name(part)
 299
 300             fp = open(htmlfile, 'wb')
 301             htmlsrc = part.get_payload(decode=True)
 302
 303             soup = BeautifulSoup(htmlsrc, "lxml")
 304
 305             # Substitute filenames for CIDs:
 306             for tag in soup.body.find_all("img", src=True):
 307                 if tag['src'].lower().startswith("cid:"):
 308                     for sf in subfiles:
 309                         if tag['src'][4:] == sf['Content-Id']:
 310                             tag['src'] = "file://" + sf['filename']
 311             # for sf in subfiles:
 312             #     htmlsrc = re.sub('cid: ?' + sf['Content-Id'],
 313             #                      'file://' + sf['filename'],
 314             #                      htmlsrc, flags=re.IGNORECASE)
 315
 316             # If it's HTML, we may need to add a meta charset tag. Sigh.
 317             # If it's text/plain, there's nothing we can do to fix charset.
 318             charset = part.get_charset()
 319             if not charset:
 320                 charset = "UTF-8"
 321             head = soup.find("head")
 322             if not head:
 323                 head = soup.new_tag("head")
 324                 html = soup.find("html")
 325                 if html:
 326                     html.insert(0, head)
 327                 else:
 328                     soup.insert(0, head)
 329
 330             if not head.findAll("meta", attrs={"http-equiv": "encoding"}) and \
 331                not head.findAll("meta", attrs={"http-equiv": "content-type"}):
 332                 meta = soup.new_tag("meta")
 333                 meta["content"] = charset
 334                 meta["http-equiv"] = "encoding"
 335                 head.insert(0, meta)
 336                 meta = soup.new_tag("meta")
 337                 meta["http-equiv"] = "content-type"
 338                 meta["content"] = "text/html; charset=%s" % charset
 339                 head.insert(0, meta)
 340
 341             fp.write(soup.prettify().encode("utf-8", "xmlcharrefreplace"))
 342             fp.close()
 343
 344             htmlfiles.append(htmlfile)
 345         elif part.get_content_maintype() == "image" and part not in subparts:
 346             partfile = save_tmp_file(part)
 347             htmlfiles.append(partfile)
 348
 349     # Done processing attachments. Call the browser for everything.
 350     if htmlfiles:
 351         # For the first URL, just put a redirect in
 352         write_to_index(pleasewait_file,
 353                        "Redirecting to file://" + htmlfiles[0],
 354                        0, "file://" + htmlfiles[0])
 355
 356         for f in htmlfiles[1:]:
 357             # If we don't wait for the new window to pop up before
 358             # calling new-tab, bad things will happen: the document
 359             # may load in a new tab in the old window and THEN pop up
 360             # an unwanted third window. Go firefox.
 361             # Not clear whether this is true for all browsers.
 362             time.sleep(1)
 363             if BROWSER_ARGS:
 364                 mysubprocess.call([BROWSER] + BROWSER_ARGS + ["file://" + f])
 365             else:
 366                 mysubprocess.call([BROWSER, "file://" + f])
 367
 368     # Wait a while to make sure the browser has loads the imgaes, then clean up.
 369     time.sleep(6)
 370     shutil.rmtree(tmpdir)
 371
 372 if __name__ == '__main__':
 373     tmpdir = tempfile.mkdtemp(dir=TMPDIR)
 374
 375     if len(sys.argv) > 1:
 376         for f in sys.argv[1:]:
 377             fp = open(f)
 378             view_message_attachments(fp, tmpdir)
 379             fp.close()
 380     else:
 381         view_message_attachments(sys.stdin, tmpdir)