Package nsi :: Package granulate :: Module GranulateOffice'
[hide private]
[frames | no frames]

Source Code for Module nsi.granulate.GranulateOffice'

  1  # -*- coding: utf-8 -*- 
  2  ############################################################################## 
  3  # 
  4  # Copyright (c) 2007 ISrg (NSI, CEFETCAMPOS, BRAZIL) and Contributors.  
  5  #                                                         All Rights Reserved. 
  6  #                              Ronaldo Amaral Santos <ronaldinho.as@gmail.com>  
  7  # 
  8  # WARNING: This program as such is intended to be used by professional 
  9  # programmers who take the whole responsability of assessing all potential 
 10  # consequences resulting from its eventual inadequacies and bugs 
 11  # End users who are looking for a ready-to-use solution with commercial 
 12  # garantees and support are strongly adviced to contract a Free Software 
 13  # Service Company 
 14  # 
 15  # This program is Free Software; you can redistribute it and/or 
 16  # modify it under the terms of the GNU General Public License 
 17  # as published by the Free Software Foundation; either version 2 
 18  # of the License, or (at your option) any later version. 
 19  # 
 20  # This program is distributed in the hope that it will be useful, 
 21  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 22  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 23  # GNU General Public License for more details. 
 24  # 
 25  # You should have received a copy of the GNU General Public License 
 26  # along with this program; if not, write to the Free Software 
 27  # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. 
 28  # 
 29  ############################################################################## 
 30   
 31  __author__ = """Ronaldo Amaral Santos <ronaldinho.as@gmail.com>""" 
 32  __docformat__ = 'plaintext' 
 33   
 34  from xmlrpclib import *  
 35  from xml.dom.minidom import parseString 
 36  import copy 
 37  from StringIO import StringIO 
 38   
 39  from GranularUtils import Grain 
 40   
 41  import zipfile, base64, re 
 42  import os, sys, time, random 
 43  import shutil 
 44   
 45  import PIL.Image 
 46  import commands 
 47  #import config 
 48  import mimetypes 
 49   
class ConectionServerError(Exception):
    """Raised when the oood conversion server is not configured or a
    connection proxy to it cannot be created."""


class GranulateOffice(object):
    """
     - Provide the grain extraction functionality for ms-office and odf documents
     - Retrieve tables, images, thumbnails and summary

    An instance wraps one document.  When the document's content type is
    listed in ``supportedConvertionMimeTypes`` it is first converted to ODF
    through the oood server; when it is listed in
    ``supportedGranulateMimeTypes`` it is parsed directly.  For any other
    content type nothing is parsed and every public getter returns None.
    """
    Document = None
    __parseContent = None   # minidom document built from the archive's content.xml
    __zipFile = None        # opened ODF archive; None until a parse succeeded
    __ooodServer = None     # address handed to ServerProxy for conversions

    supportedMimeType = ('application/vnd.oasis.opendocument.text',
                         'application/vnd.sun.xml.writer',
                         'application/msword',
                         'application/rtf',
                         'application/vnd.stardivision.writer',
                         'application/x-starwriter',
                         'text/plain',
                         'application/vnd.oasis.opendocument.spreadsheet',
                         'application/vnd.sun.xml.calc',
                         'application/vnd.ms-excel',
                         'application/vnd.stardivision.calc',
                         'application/x-starcalc',
                         'application/vnd.oasis.opendocument.presentation',
                         'application/vnd.sun.xml.impress',
                         'application/vnd.ms-powerpoint',
                         'application/vnd.stardivision.draw',
                         'application/vnd.stardivision.impress',
                         'application/x-starimpress',)

    supportedGranulateMimeTypes = ('application/vnd.oasis.opendocument.text',
                                   'application/vnd.oasis.opendocument.presentation',)

    supportedConvertionMimeTypes = ('application/msword',
                                    'application/rtf',
                                    'application/vnd.ms-powerpoint',)

    # Construct
    def __init__(self, Document=None, ooodServer=None):
        """
         - The parameter "Document" is a instance of the class "File" what it is in the FileUtils module
         - The ooodServer MUST be specified if the file is a ms-office file, or else the convertion server
         will be not found, and the grains will not be extracted

        Without a Document nothing is parsed and the public getters return
        None.  Raises ConectionServerError when a conversion is required but
        no server proxy can be created.
        """
        self.Document = Document
        self.__ooodServer = ooodServer
        if Document is None:
            # The signature allows omitting the document; do not dereference None.
            return

        contentType = Document.getContentType()
        if contentType in self.supportedConvertionMimeTypes:
            # converts the file and stores in the Document object
            converted = self.__convertDocumentToOdf()
            if converted is not None:
                # Only replace the data when the server really returned a
                # document; a failed conversion must not wipe the original.
                self.Document.setData(converted)
                self.__parseXmlZipFile()
        elif contentType in self.supportedGranulateMimeTypes:
            self.__parseXmlZipFile()

    def __call__(self):
        """Shortcut for granulateDocument()."""
        return self.granulateDocument()

    ### Private Methods ###

    def __mkServer(self):
        """
        Create a connection to the OpenOffice(oood-ERP5) Server

        Raises ConectionServerError when no server address was given or when
        the proxy object cannot be built.
        """
        # Checked outside the try block so that this specific message is not
        # swallowed and replaced by the generic one below.
        if self.__ooodServer is None:
            raise ConectionServerError("It was not possible to connect. oood Server not found ")
        try:
            return ServerProxy(self.__ooodServer)
        except Exception:
            raise ConectionServerError("It was not possible to connect to the Convertion Server.")

    def __createNewOOoDocument(self):
        """
        Creates a new odt document based in a blank template

        Returns the raw bytes of template/template.odt located next to this
        module.
        """
        templatePath = os.path.join(os.path.dirname(__file__), 'template', 'template.odt')
        # An .odt file is a zip archive: read it in binary mode and always
        # release the file handle.
        template = open(templatePath, 'rb')
        try:
            return template.read()
        finally:
            template.close()

    def __getNodeText(self, node):
        """
        Get text value in a xml node

        Only the direct text children of the node are concatenated.
        """
        return ''.join([child.data for child in node.childNodes
                        if child.nodeType == child.TEXT_NODE])

    def __getTextChildNodesImage(self, node, text=None):
        """
        Get the subtitle text of image in odf document

        Walks every sibling that follows `node`, appending the data of text
        nodes and the direct text of other nodes to `text` (a new list when
        omitted).  Returns that list.
        """
        if text is None:
            # A fresh list per call; a shared default would accumulate
            # captions across unrelated calls.
            text = []
        sibling = node.nextSibling
        while sibling is not None:
            if sibling.nodeType == sibling.TEXT_NODE:
                text.append(sibling.data)
            else:
                text.append(self.__getNodeText(sibling))
            sibling = sibling.nextSibling
        return text

    def __getTextChildNodesTable(self, node, text=None):
        """
        Get the subtitle text of a table in odf document

        Collects, depth first, the data of every text node below `node` into
        `text` (a new list when omitted) and returns that list.
        """
        if text is None:
            text = []
        for child in node.childNodes:
            if child.nodeType == child.TEXT_NODE:
                text.append(child.data)
            if child.hasChildNodes():
                self.__getTextChildNodesTable(child, text)
        return text

    def __getAttrStyles(self, Node):
        """
        Get the associated Styles of given node

        Returns the value of the first non-empty attribute whose name ends in
        ":style-name", or None when the node has no such attribute.
        """
        if Node.attributes is None:
            return None
        for attrName in Node.attributes.keys():
            if re.search(r"^.+:style-name$", attrName):
                value = Node.getAttribute(attrName)
                if value:
                    return value
        return None

    def __getAttributesR(self, Node, styles=None):
        """
        Collect recursively the style names used by a node and its descendants

        Returns the list of names, or None when no style was found.
        """
        if styles is None:
            # New list for every top-level call so that styles of one table
            # never leak into the next one.
            styles = []
        style = self.__getAttrStyles(Node)
        if style:
            styles.append(style)
        for child in Node.childNodes:
            self.__getAttributesR(child, styles)
        if styles:
            return styles
        return None

    def __isRemoteHref(self, path):
        """
        Tell whether an xlink:href carries a URI scheme (http://, https://, ...)

        Such a reference is not a member of the document archive and cannot
        be read from it.
        """
        return re.match(r"^[A-Za-z][A-Za-z0-9+.\-]*://", path) is not None

    def __convertDocumentToOdf(self):
        """
        Convert a ms-office document to Open Document Format (odf)

        Returns a StringIO holding the converted file, or None when the
        server does not answer with status 200.
        """
        sp = self.__mkServer()
        filename = self.Document.getFilename()
        payload = base64.encodestring(self.Document.getData().getvalue())
        res = sp.convert(filename, payload)
        if res[0] == 200:
            return StringIO(base64.decodestring(res[1]['data']))
        return None

    def __getSummaryDocument(self):
        """
        Get the Summary of an odf document

        Returns a list of {'level': int, 'value': text} dictionaries, one per
        text:h element, or None when the document has no heading.
        """
        titles = []
        for heading in self.__parseContent.getElementsByTagName('text:h'):
            if heading.hasAttribute('text:outline-level'):
                level = int(heading.getAttribute('text:outline-level'))
            else:
                # A heading without an explicit level is treated as level 1
                # instead of aborting the whole summary.
                level = 1
            titles.append({'level': level, 'value': self.__getNodeText(heading)})
        if titles:
            return titles
        return None

    def __getThumbnailsDocument(self):
        """
        Get the Thumbnails of an odf document

        Returns a StringIO with the bytes of Thumbnails/thumbnail.png, or
        None when the archive has no such member.
        """
        if 'Thumbnails/thumbnail.png' in self.__zipFile.namelist():
            return StringIO(self.__zipFile.read('Thumbnails/thumbnail.png'))
        return None

    def __parseXmlZipFile(self):
        """
        Uncompress an odf file and parse the "content.xml" file.

        When the data is not a zip archive nothing is stored, so the public
        getters keep returning None.
        """
        try:
            self.__zipFile = zipfile.PyZipFile(self.Document.getData(), 'r')
        except zipfile.BadZipfile:
            #Log.error("File is not a zip file")
            # Return value kept as in the original; callers ignore it.
            return None, None

        contents = self.__zipFile.read('content.xml')
        self.__parseContent = parseString(contents)

    def __getTableImageHrefs(self, table):
        """
        List the archive paths of the images embedded in a table node

        Remote references are skipped and each path is listed once.
        """
        hrefs = []
        for img in table.getElementsByTagName("draw:image"):
            if not img.hasAttribute("xlink:href"):
                continue
            path = img.getAttribute('xlink:href')
            # happens when it has an image from a website
            if self.__isRemoteHref(path):
                continue
            if "ObjectReplacements" in path:
                # remove the "./" of the path that could be "./ObjectReplacements/Object 2"
                path = path.replace("./", "")
            if path not in hrefs:
                hrefs.append(path)
        return hrefs

    def __getSiblingCaption(self, node):
        """
        Get the text of a node placed next to a table, used as its legend
        """
        if node.hasChildNodes():
            return ''.join(self.__getTextChildNodesTable(node, []))
        return self.__getNodeText(node)

    def __getTableDocumentList(self):
        """
        Extract the tables from a document and return a list of Grain instances

        Every table is copied, with the styles and images it uses, into a new
        document built from the blank template.  Returns None when the
        document has no table.
        """
        table_list = []
        # create an empty template
        template_str = self.__createNewOOoDocument()
        tables = self.__parseContent.getElementsByTagName('table:table')
        stylesDoc = self.__parseContent.getElementsByTagName('style:style')
        for t in tables:
            # None means "no style found"; normalise it for the `in` test below.
            styles = self.__getAttributesR(t) or []
            table_name = t.getAttribute('table:name')
            imgHrefs = self.__getTableImageHrefs(t)

            # extract legend
            objGran = Grain()
            leg = []
            for sibling in (t.previousSibling, t.nextSibling):
                if sibling is not None:
                    leg.append(self.__getSiblingCaption(sibling))

            # join the strings to make a single legend
            objGran.setCaption(' '.join(leg))

            # Creating an empty File
            new_table = StringIO()
            new_table.write(template_str)
            template_odt = zipfile.PyZipFile(new_table, 'a')
            doc = parseString(template_odt.read('content.xml'))
            office_text = doc.getElementsByTagName('office:text')[0]

            # copy the table node from a document to a new table grain
            office_text.appendChild(doc.importNode(t, True))

            automatic_styles = None
            for sty in stylesDoc:
                if sty.getAttribute('style:name') in styles:
                    if automatic_styles is None:
                        # Looked up only when a style must really be copied.
                        automatic_styles = doc.getElementsByTagName('office:automatic-styles')[0]
                    automatic_styles.appendChild(doc.importNode(sty, True))

            for image in imgHrefs:
                template_odt.writestr(str(image), self.__zipFile.read(image))

            # NOTE(review): the archive is opened in append mode, so this adds
            # a second "content.xml" member next to the template's one instead
            # of replacing it.
            template_odt.writestr('content.xml', doc.toxml().encode('utf-8'))
            template_odt.close()

            if table_name:
                #objGran.setId(plone_utils.normalizeString(table_name))
                objGran.setId(table_name)
            objGran.setContent(new_table)
            table_list.append(objGran)

        if table_list:
            return table_list
        return None

    def __getImageDocumentList(self):
        """
        Extract the images from a document and return a list of Grain instances

        Only .png, .gif and .jpg members are extracted, each one once.
        Returns None when no image was extracted.
        """
        image_list = []
        # get the elements in the tags draw:image, where the image references are kept
        for item in self.__parseContent.getElementsByTagName('draw:image'):
            if not item.hasAttribute("xlink:href"):
                continue
            path = item.getAttribute('xlink:href')

            # happens when it has an image from a website
            if self.__isRemoteHref(path):
                continue

            name = None
            if "Pictures" in path:
                # strip the directory, keeping the file name
                name = path.replace("Pictures/", "")
            elif "ObjectReplacements" in path:
                name = path.replace("./ObjectReplacements/", "")
                # removes the "./" of the path that could be "./ObjectReplacements/Object 2"
                path = path.replace("./", "")
            if name is None:
                continue

            # checks the image extension
            extension = os.path.splitext(name)[1]
            if extension.lower() not in ('.png', '.gif', '.jpg'):
                continue

            # verifies if the image is already in the list
            if name in [image.getId() for image in image_list]:
                continue

            objGran = Grain()
            nChild = item.parentNode.nextSibling
            if nChild is not None:
                text = []
                if nChild.nodeType == nChild.TEXT_NODE:
                    text.append(nChild.data)
                pieces = self.__getTextChildNodesImage(nChild, text)
                objGran.setCaption(''.join([piece for piece in pieces if piece is not None]))

            objGran.setId(name)
            objGran.setContent(StringIO(self.__zipFile.read(path)))
            image_list.append(objGran)

        if image_list:
            return image_list
        return None

    ### Public Methods ###

    def getThumbnailsDocument(self):
        """
        Get document's thumbnails

        Returns None when no document archive was opened.
        """
        if self.__zipFile is None:
            return None
        return self.__getThumbnailsDocument()

    def getSummaryDocument(self):
        """
        Get document's summary

        Returns None when no document archive was opened.
        """
        if self.__zipFile is None:
            return None
        return self.__getSummaryDocument()

    def getImageDocumentList(self):
        """
        Invoke the private method __getImageDocumentList in order to retrieve the document's images

        Returns None when no document archive was opened.
        """
        if self.__zipFile is None:
            return None
        return self.__getImageDocumentList()

    def getTableDocumentList(self):
        """
        Invoke the private method __getTableDocumentList in order to retrieve the document's tables

        Returns None when no document archive was opened.
        """
        if self.__zipFile is None:
            return None
        return self.__getTableDocumentList()

    def granulateDocument(self):
        """
        Extract the grains from a document, returning a dictionary with a list of tables and a list of images

        The dictionary keys are 'image_list' and 'table_list'.  Returns None
        when no document archive was opened.
        """
        if self.__zipFile is None:
            return None
        returnfiles = {}
        returnfiles['image_list'] = self.__getImageDocumentList()
        returnfiles['table_list'] = self.__getTableDocumentList()
        return returnfiles