#!/usr/bin/env python
# encoding: utf-8
"""
emlx.py

Read an Apple Mail ``.emlx`` file and rewrite its HTML part so that images
referenced over HTTP/HTTPS are downloaded and embedded in the message as
inline MIME parts.

This module targets Python 2: it depends on ``urllib2`` and BeautifulSoup 3.

Created by Rui Carmo on 2008-03-03.
Released under the MIT license
"""

import codecs
import email
import os
import re
import sys
import urllib2
from email.MIMEImage import MIMEImage
from email.MIMEMultipart import MIMEMultipart

from BeautifulSoup import BeautifulSoup

# Message headers used by Mail.app that we want to preserve
preserved_headers = [
    "X-Uniform-Type-Identifier",
    "X-Mail-Rss-Source-Url",
    "X-Mail-Rss-Article-Identifier",
    "X-Mail-Rss-Article-Url",
    "Received",
    "Subject",
    "X-Mail-Rss-Author",
    "Message-Id",
    "X-Mail-Rss-Source-Name",
    "Reply-To",
    "Mime-Version",
    "Date",
]

# Matches <img src> values that point at a remote HTTP/HTTPS resource.
_REMOTE_SRC = re.compile(r'^https?://', re.IGNORECASE)


def _image_subtype(mtype):
    """Return the image subtype of a Content-Type value, or None.

    Parameters such as ``; charset=binary`` are ignored.  None is returned
    when the header is missing or its main type is not ``image``.
    """
    if not mtype:
        return None
    essence = mtype.split(';', 1)[0].strip().lower()
    maintype, _, subtype = essence.partition('/')
    if maintype != 'image' or not subtype:
        return None
    return subtype


class emlx:
    """emlx parser

    After load() the instance carries:
      bytes   -- payload length read from the first line of the file
      message -- the MIME payload parsed with the email package
      plist   -- everything that follows the payload, kept verbatim
    """

    def __init__(self, filename):
        """Remember the file name, set up the HTTP opener and load the file."""
        self.filename = filename
        self.opener = urllib2.build_opener()
        # Mimic Mail.app User-agent
        self.opener.addheaders = [('User-agent', 'Apple-PubSub/59')]
        self.load()

    def load(self):
        """Read the length line, the MIME payload and the trailing plist."""
        # open the .emlx file as binary (and not using codecs) to ensure
        # byte offsets work
        self.fh = open(self.filename, 'rb')
        try:
            # get the payload length
            self.bytes = int(self.fh.readline().strip())
            # get the MIME payload
            self.message = email.message_from_string(self.fh.read(self.bytes))
            # the remaining bytes are the .plist
            self.plist = self.fh.read()
        finally:
            self.fh.close()

    def save(self, filename):
        """Write the length line, the current message and the plist."""
        # Render once so the length written always describes the exact text
        # that follows it.  as_string() is used instead of str() because on
        # Python 2 str() prepends a "From nobody ..." mbox envelope line
        # that is not part of the payload read by load().
        payload = self.message.as_string()
        fh = open(filename, 'wb')
        try:
            fh.write("%d\n%s%s" % (len(payload), payload, self.plist))
        finally:
            fh.close()

    def grab(self, url):
        """Fetch url and return a (Content-Type header value, body) tuple.

        The first element is None when the response has no Content-Type.
        Errors raised by the opener are not handled here and propagate.
        """
        h = self.opener.open(url)
        try:
            mtype = h.info().getheader('Content-Type')
            data = h.read()
        finally:
            h.close()
        return (mtype, data)

    def parse(self):
        """Rebuild the message around its first text/html part, if any."""
        for part in self.message.walk():
            if part.get_content_type() == 'text/html':
                self.rebuild(part)
                # rebuild() replaced self.message, so stop walking the old one
                return

    def rebuild(self, part):
        """Replace self.message with a multipart/related message.

        The new message holds the rewritten HTML part first, followed by one
        inline image part per remote image that was downloaded.  Images
        whose response is not served with an image/* Content-Type keep
        their original remote reference.
        """
        # parse the HTML
        # NOTE(review): get_payload() returns the payload as stored, without
        # undoing any Content-Transfer-Encoding; presumably the HTML part is
        # stored unencoded -- confirm.
        soup = BeautifulSoup(part.get_payload())
        # strain out all images referenced by HTTP/HTTPS
        images = soup('img', {'src': _REMOTE_SRC})

        # prepare new MIME message
        newmessage = MIMEMultipart('related')
        for name in preserved_headers:
            values = self.message.get_all(name)
            if not values:
                # header absent from the source: do not emit an empty one
                continue
            # MIMEMultipart already sets MIME-Version; drop any default so
            # the copied value does not end up duplicated
            del newmessage[name]
            # copy every occurrence (e.g. multiple Received lines)
            for value in values:
                newmessage.add_header(name, value)

        attachments = []
        for img in images:
            # Grab the image
            (mtype, data) = self.grab(img['src'])
            subtype = _image_subtype(mtype)
            if subtype is None:
                # not an image: leave the remote reference untouched
                continue
            # Build a cid for it
            cid = '%d.%s' % (len(attachments), subtype)
            # Create and attach new MIME part
            # we use all reference methods to ensure cross-MUA compatibility
            image = MIMEImage(data, subtype, name=cid)
            image.add_header('Content-ID', '<%s>' % cid)
            image.add_header('Content-Location', cid)
            image.add_header('Content-Disposition', 'inline', filename=cid)
            attachments.append(image)
            # update references to images
            img['src'] = cid

        # inject rewritten HTML first
        part.set_payload(str(soup))
        newmessage.attach(part)
        # now add inline images as extra MIME parts
        for image in attachments:
            newmessage.attach(image)
        # replace the message
        self.message = newmessage


def main(argv=None):
    """Convert one .emlx file.

    Usage: emlx.py [source [destination]]
    The file names default to '320611.emlx' and 'injected.emlx'.
    """
    if argv is None:
        argv = sys.argv[1:]
    source = '320611.emlx'
    destination = 'injected.emlx'
    if len(argv) > 0:
        source = argv[0]
    if len(argv) > 1:
        destination = argv[1]
    a = emlx(source)
    a.parse()
    a.save(destination)


if __name__ == "__main__":
    main()