All patches and comments are welcome. Please squash your changes into logical
commits before using git-format-patch and git-send-email to send them to
patches@git.madduck.net.
If you'd read over the Git project's submission guidelines and adhered to them,
I'd be especially grateful.
3 # Source: https://raw.githubusercontent.com/akkana/scripts/master/viewmailattachments
5 # Take an mbox HTML message (e.g. from mutt), split it
6 # and rewrite it so all of its attachments can be viewed in a browser
7 # (perhaps after being converted to HTML from DOC or whatever first).
9 # Can be run from within a mailer like mutt, or independently
10 # on a single message file.
12 # Grew out of a simpler script called viewhtmlmail.
14 # Copyright 2015 by Akkana Peck. Share and enjoy under the GPL v2 or later.
16 # Holger Klawitter 2014: create a secure temp file and avoid temp mbox
18 # To use it from mutt, put the following lines in your .muttrc:
19 # macro index <F10> "<pipe-message>~/bin/viewmailattachments\n" "View attachments in browser"
20 # macro pager <F10> "<pipe-message>~/bin/viewmailattachments\n" "View attachments in browser"
26 import email, email.header, mimetypes
29 from bs4 import BeautifulSoup
31 ################################################
# Convert application/msword attachments with wvHtml instead of unoconv?
USE_WVHTML_FOR_DOC = False

# Parent directory for the scratch directories: "volatile" under $TMPDIR.
# os.getenv() returns None when TMPDIR is unset, which would make
# os.path.join() raise TypeError as soon as the script is loaded, so fall
# back to the platform's default temporary directory in that case.
TMPDIR = os.path.join(os.getenv('TMPDIR') or tempfile.gettempdir(), 'volatile')

# How many seconds do we need to wait for unoconv?
# It defaults to 6, but on a 64-bit machine that's not enough.
# Kept as a string because it is passed as the value of unoconv's -T option.
UNOCONV_STARTUP_TIME = "10"

# Does the browser need a one-time argument for bringing up an initial window,
# like Firefox's -private-window -new-instance ?
# NOTE(review): the code that launches the browser reads BROWSER_FIRST_ARGS
# (set below), not this name; kept in case something else refers to it.
BROWSER_FIRST_ARG = []

# What browser to use:
USE_QUICKBROWSE = False

if USE_QUICKBROWSE:
    BROWSER = "quickbrowse"

    # Browser argument to precede new tabs:
    BROWSER_FIRST_ARGS = []
    BROWSER_ARGS = ["--new-tab"]

    # Will the browser block when first run until its window is closed?
    # If so, we have to run it in the background.
    BROWSER_BACKGROUND = False

    # Should we convert PDF to HTML? Depends on BROWSER:
    # Firefox has a built-in PDF viewer, but quickbrowse doesn't.
    CONVERT_PDF_TO_HTML = False

else:    # Firefox in private browsing mode
    # NOTE(review): the executable name for this branch was not visible in
    # the reviewed listing; "firefox" is inferred from the comments and the
    # -private-window arguments below. Confirm against the real file.
    BROWSER = "firefox"

    BROWSER_FIRST_ARGS = ["-private-window"]
    BROWSER_ARGS = ["-new-tab", "-private-window"]

    # Firefox doesn't run in the background.
    BROWSER_BACKGROUND = True

    # Not clear what to do here: Firefox has a built-in PDF viewer,
    # but for some mime types it can't figure out that it should use it.
    CONVERT_PDF_TO_HTML = False
76 ################################################
# Temporary for debugging:
class mysubprocess:
    '''Thin wrappers around subprocess that print each command before
    running it.
    '''

    @staticmethod
    def call(arr):
        '''Run the argument list arr in the foreground and wait for it.

        Returns the command's exit status.
        '''
        print("\n\n================\n=== Calling: %s" % str(arr))
        return subprocess.call(arr)

    @staticmethod
    def call_bg(arr):
        '''Start the argument list arr without waiting for it to finish.

        Returns the subprocess.Popen object of the started process.
        '''
        print("\n\n================\n=== Calling in background: %s" % str(arr))
        return subprocess.Popen(arr, shell=False,
                                stdin=None, stdout=None, stderr=None)
91 def view_message_attachments(fp, tmpdir):
92 '''View message attachments coming from the file-like object fp.
95 msg = email.message_from_string(fp.read())
def tmp_file_name(part):
    '''Return a path inside tmpdir at which to store this message part.

    The file name supplied in the message is used when there is one,
    reduced to its final path component so that neither an absolute path
    nor "../" sequences can place the file outside tmpdir. A part without
    a usable name gets a newly created, uniquely named file whose
    extension is guessed from the part's content type.
    '''
    partfile = part.get_filename()
    if partfile:
        # The name may be an encoded-word; only its first chunk is used.
        n, enc = email.header.decode_header(partfile)[0]
        if isinstance(n, bytes):
            try:
                n = n.decode(enc or 'utf-8', 'replace')
            except LookupError:
                # The header named a charset Python does not know.
                n = n.decode('utf-8', 'replace')
        if n:
            partfile = n

    # Applications should really sanitize the given filename so that an
    # email message can't be used to overwrite important files.
    # As a first step, warn about ../
    if partfile and '../' in partfile:
        print("Eek! Possible security problem in filename %s" % partfile)

    if partfile:
        # os.path.join() discards tmpdir when given an absolute path and
        # follows "..", so keep only the last component of the name.
        partfile = os.path.basename(partfile)
        if partfile in ('.', '..'):
            partfile = ''

    # Make a filename in the tmp dir:
    if not partfile:
        ext = mimetypes.guess_extension(part.get_content_type())
        if not ext:
            # Use a generic bag-of-bits extension
            ext = '.bin'
        fd, path = tempfile.mkstemp(dir=tmpdir, suffix=ext, prefix='part-')
        # mkstemp() returns an open descriptor as well as the name; only
        # the name is needed here, so close it rather than leak it.
        os.close(fd)
        return path

    return os.path.join(tmpdir, partfile)
def save_tmp_file(part):
    '''Saves this part's decoded payload to a tmp file, returning the new
    filename.

    The file name is chosen by tmp_file_name().
    '''
    partfile = tmp_file_name(part)
    payload = part.get_payload(decode=True)

    # The with block closes the file even if the write fails.
    with open(partfile, "wb") as tmpfile:
        # get_payload(decode=True) returns None for a multipart part;
        # write an empty file in that case instead of raising TypeError.
        tmpfile.write(payload or b"")

    return partfile
138 # Walk through the message a first, preliminary time
139 # to separate out any images that might be referred to by the HTML parts.
141 for part in msg.walk():
142 # walk() includes the top-level message
144 # print " Skipping the top-level message"
147 if part.get_content_type() != "multipart/related":
150 # It's multipart. Walk the subtree looking for image children.
151 for child in part.walk():
152 # print " ", child.get_content_type()
154 # At least for now, only save images as parts of multipart.
155 if child.get_content_maintype() != "image":
158 filename = save_tmp_file(child)
159 # print " Saved to", filename
161 # Rewrite image and other inline URLs in terms of content-id.
162 # Mailers may use Content-Id or Content-ID (or, presumably,
163 # other capitalizations). So we can't just look it up simply.
165 for k in list(child.keys()):
166 if k.lower() == 'content-id':
167 # Remove angle brackets, if present.
168 # child['Content-Id'] is immutable: attempts to change it
169 # are just ignored. Copy it to a local mutable string.
170 content_id = child[k]
171 if content_id.startswith('<') and \
172 content_id.endswith('>'):
173 content_id = content_id[1:-1]
175 subfiles.append({ 'filename': filename,
176 'Content-Id': content_id })
177 subparts.append(child)
179 fp = open(filename, 'wb')
180 fp.write(child.get_payload(decode=True))
182 break # no need to look at other keys
185 # print filename, "doesn't have a Content-Id, not saving"
186 # # print "keys:", child.keys()
189 # for sf in subfiles:
192 # Call up the browser window right away,
193 # so the user can see something is happening.
194 # Firefox, alas, has no way from the commandline of calling up
195 # a new private window with content, then replacing that content.
196 # So we'll create a file that refreshes, so that when content is ready,
197 # it can redirect to the first content page.
def write_to_index(outfile, msg, timeout_secs, redirect_url):
    '''Write a small HTML page to outfile that shows msg and then redirects.

    outfile: path of the HTML file to create or overwrite.
    msg: text displayed in large type in the page body.
    timeout_secs: whole seconds to wait before the meta refresh fires.
    redirect_url: URL to load after the delay; if empty or None, the page
        refreshes to itself.
    '''
    # Imported locally so the name cannot clash with anything called
    # "html" in an enclosing scope.
    import html

    if not redirect_url:
        redirect_url = "file://" + outfile

    # Callers build both values from attachment file names, so escape them
    # before embedding them in markup.
    page = '''<html><head>
<meta content="utf-8" http-equiv="encoding">
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<meta http-equiv="refresh" content="%d;URL=%s">
</head><body>
<br><br><br><br><br><br><big><big>%s</big></big>
</body></html>
''' % (timeout_secs, html.escape(redirect_url, quote=True), html.escape(msg))

    # The page declares UTF-8, so write it in that encoding.
    with open(outfile, "w", encoding="utf-8") as ofp:
        ofp.write(page)
213 pleasewait_file = tmpdir + "/index.html"
214 write_to_index(pleasewait_file, "Please wait ...", redirect_timeout, None)
217 if BROWSER_FIRST_ARGS:
218 cmd += BROWSER_FIRST_ARGS
220 cmd.append("file://" + pleasewait_file)
221 print("Calling: %s" % ' '.join(cmd))
222 if BROWSER_BACKGROUND:
223 mysubprocess.call_bg(cmd)
225 mysubprocess.call(cmd)
227 # "data:text/html,<br><br><br><br><br><h1>Translating documents, please wait ..."
228 # Use JS if we can figure out how to close or replace
229 # the "please wait" tab once we have content to show.
230 # But for now, setTimeout() doesn't work at all
231 # in newly popped up private windows.
232 # "javascript:document.writeln('<br><br><br><br><br><h1>Translating documents, please wait ...');setTimeout(function(){alert('hi');}, 500);"
235 # Now walk through looking for the real parts:
236 # HTML, doc and docx.
237 for part in msg.walk():
239 # part has, for example:
240 # items: [('Content-Type', 'image/jpeg'),
241 # ('Content-Transfer-Encoding', 'base64'),
242 # ('Content-ID', '<14.3631871432@web82503.mail.mud.yahoo.com>'),
243 # ('Content-Disposition',
244 # 'attachment; filename="ATT0001414.jpg"')]
245 # keys: ['Content-Type', 'Content-Transfer-Encoding',
246 # 'Content-ID', 'Content-Disposition']
247 # values: ['image/jpeg', 'base64',
248 # '<14.3631871432@web82503.mail.mud.yahoo.com>',
249 # 'attachment; filename="ATT0001414.jpg"']
251 # multipart/* are just containers
252 #if part.get_content_maintype() == 'multipart':
253 if part.is_multipart() or part.get_content_type == 'message/rfc822':
256 if part.get_content_maintype() == "application":
257 partfile = save_tmp_file(part)
258 fileparts = os.path.splitext(partfile)
259 htmlfilename = fileparts[0] + ".html"
261 if part.get_content_subtype() == "msword" and USE_WVHTML_FOR_DOC:
262 mysubprocess.call(["wvHtml", partfile, htmlfilename])
263 htmlfiles.append(htmlfilename)
265 elif part.get_content_subtype() == \
266 "vnd.openxmlformats-officedocument.wordprocessingml.document" \
267 or part.get_content_subtype() == "msword":
268 mysubprocess.call(["unoconv", "-f", "html",
269 "-T", UNOCONV_STARTUP_TIME,
270 "-o", htmlfilename, partfile])
272 htmlfilename = os.path.join(fileparts[0] + ".html")
273 htmlfiles.append(htmlfilename)
275 # unoconv conversions from powerpoint to HTML drop all images.
276 # Try converting to PDF instead:
277 elif part.get_content_subtype() == "vnd.ms-powerpoint" \
278 or part.get_content_subtype() == \
279 "vnd.openxmlformats-officedocument.presentationml.presentation" :
280 pdffile = fileparts[0] + ".pdf"
281 mysubprocess.call(["unoconv", "-f", "pdf",
282 "-o", pdffile, partfile])
283 htmlfiles.append(pdffile)
285 elif part.get_content_subtype() == "pdf":
286 if CONVERT_PDF_TO_HTML:
287 mysubprocess.call(["pdftohtml", "-s", partfile])
289 # But pdftohtml is idiotic about output filename
290 # and won't let you override it:
291 htmlfiles.append(fileparts[0] + "-html.html")
293 htmlfiles.append(partfile)
295 elif part.get_content_maintype() == "text" and \
296 part.get_content_subtype() == 'html':
298 htmlfile = tmp_file_name(part)
300 fp = open(htmlfile, 'wb')
301 htmlsrc = part.get_payload(decode=True)
303 soup = BeautifulSoup(htmlsrc, "lxml")
305 # Substitute filenames for CIDs:
306 for tag in soup.body.find_all("img", src=True):
307 if tag['src'].lower().startswith("cid:"):
309 if tag['src'][4:] == sf['Content-Id']:
310 tag['src'] = "file://" + sf['filename']
311 # for sf in subfiles:
312 # htmlsrc = re.sub('cid: ?' + sf['Content-Id'],
313 # 'file://' + sf['filename'],
314 # htmlsrc, flags=re.IGNORECASE)
316 # If it's HTML, we may need to add a meta charset tag. Sigh.
317 # If it's text/plain, there's nothing we can do to fix charset.
318 charset = part.get_charset()
321 head = soup.find("head")
323 head = soup.new_tag("head")
324 html = soup.find("html")
330 if not head.findAll("meta", attrs={"http-equiv": "encoding"}) and \
331 not head.findAll("meta", attrs={"http-equiv": "content-type"}):
332 meta = soup.new_tag("meta")
333 meta["content"] = charset
334 meta["http-equiv"] = "encoding"
336 meta = soup.new_tag("meta")
337 meta["http-equiv"] = "content-type"
338 meta["content"] = "text/html; charset=%s" % charset
341 fp.write(soup.prettify().encode("utf-8", "xmlcharrefreplace"))
344 htmlfiles.append(htmlfile)
345 elif part.get_content_maintype() == "image" and part not in subparts:
346 partfile = save_tmp_file(part)
347 htmlfiles.append(partfile)
349 # Done processing attachments. Call the browser for everything.
351 # For the first URL, just put a redirect in
352 write_to_index(pleasewait_file,
353 "Redirecting to file://" + htmlfiles[0],
354 0, "file://" + htmlfiles[0])
356 for f in htmlfiles[1:]:
357 # If we don't wait for the new window to pop up before
358 # calling new-tab, bad things will happen: the document
359 # may load in a new tab in the old window and THEN pop up
360 # an unwanted third window. Go firefox.
361 # Not clear whether this is true for all browsers.
364 mysubprocess.call([BROWSER] + BROWSER_ARGS + ["file://" + f])
366 mysubprocess.call([BROWSER, "file://" + f])
368 # Wait a while to make sure the browser has loaded the images, then clean up.
370 shutil.rmtree(tmpdir)
if __name__ == '__main__':
    # view_message_attachments() removes the directory it is given once it
    # is done, so each message gets a scratch directory of its own instead
    # of sharing one that is gone after the first message.
    if len(sys.argv) > 1:
        # Each command-line argument is a file holding one message.
        for f in sys.argv[1:]:
            tmpdir = tempfile.mkdtemp(dir=TMPDIR)
            with open(f) as fp:
                view_message_attachments(fp, tmpdir)
    else:
        # No arguments: read the message from standard input.
        tmpdir = tempfile.mkdtemp(dir=TMPDIR)
        view_message_attachments(sys.stdin, tmpdir)