Package nsi :: Package granulate :: Module GranulatePDF'
[hide private]
[frames] | no frames]

Source Code for Module nsi.granulate.GranulatePDF'

  1  # -*- coding: utf-8 -*- 
  2  ############################################################################## 
  3  # 
  4  # Copyright (c) 2007 ISrg (NSI, CEFETCAMPOS, BRAZIL) and Contributors.  
  5  #                                                         All Rights Reserved. 
  6  #                              Ronaldo Amaral Santos <ronaldinho.as@gmail.com>  
  7  # 
  8  # WARNING: This program as such is intended to be used by professional 
  9  # programmers who take the whole responsibility of assessing all potential 
 10  # consequences resulting from its eventual inadequacies and bugs. 
 11  # End users who are looking for a ready-to-use solution with commercial 
 12  # guarantees and support are strongly advised to contract a Free Software 
 13  # Service Company. 
 14  # 
 15  # This program is Free Software; you can redistribute it and/or 
 16  # modify it under the terms of the GNU General Public License 
 17  # as published by the Free Software Foundation; either version 2 
 18  # of the License, or (at your option) any later version. 
 19  # 
 20  # This program is distributed in the hope that it will be useful, 
 21  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 22  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 23  # GNU General Public License for more details. 
 24  # 
 25  # You should have received a copy of the GNU General Public License 
 26  # along with this program; if not, write to the Free Software 
 27  # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. 
 28  # 
 29  ############################################################################## 
 30   
 31  __author__ = """Ronaldo Amaral Santos <ronaldinho.as@gmail.com>""" 
 32  __docformat__ = 'plaintext' 
 33   
 34  from xml.dom import minidom 
 35  from StringIO import StringIO 
 36  from GranularUtils import Grain 
 37   
 38  import os, sys, time, random, shutil 
 39  import PIL.Image 
 40  import mimetypes 
 41   
 42  from pypdf2table import ExecuteConverter 
 43   
class ErrorXmlto(Exception):
    """
    Error type reserved for XML conversion failures.

    NOTE(review): nothing in this module raises it; presumably it is meant
    for failures of the external ``xmlto`` tool -- confirm against callers.
    """
class PyPdf2TableError(Exception):
    """
    Raised when the pypdf2table converter fails while extracting tables.

    The exception that caused the failure is passed as the only argument.
    """
46
class GranulatePDF(object):
    """
    Provide the grain extraction functionality for PDF documents.

    Retrieves the tables and the images of a PDF document.  The document is
    copied into a private temporary folder when the object is created and
    that folder is removed again when the object is destroyed.
    """
    # Wrapped document.  It must provide getContentType(), getFilename() and
    # be callable, returning an object that has a getvalue() method.
    Document = None
    # Temporary working folder; stays None until __init__ has created it.
    __pathFolder = None

    def __init__(self, Document=None):
        """
        Check that Document is a PDF file, then create a temporary folder
        and save the PDF file in it.

        Document -- object providing getContentType(), getFilename() and a
                    call interface returning an object with getvalue().

        Raises TypeError when the content type is not "application/pdf".
        """
        # Function-scope import: tempfile is not among the module imports.
        import tempfile

        self.Document = Document

        if self.Document.getContentType() != "application/pdf":
            # A plain string used to be raised here.  Strings are not valid
            # exceptions (the interpreter answers with a TypeError), so a
            # TypeError carrying the intended message is raised instead.
            raise TypeError("The file is not a PDF Document")

        # mkdtemp() creates a uniquely named folder that only the current
        # user can access, instead of a guessable /tmp/<time><random> name.
        self.__pathFolder = tempfile.mkdtemp()

        # A PDF is binary data, so the copy is written in binary mode.
        filePDF = open(self.__getPdfPath(), 'wb')
        try:
            filePDF.write(self.Document().getvalue())
        finally:
            filePDF.close()

    def __call__(self):
        """
        Calling the object is a shortcut for granulateDocument().
        """
        return self.granulateDocument()

    def __del__(self):
        """
        When the object is destroyed, the temporary folder is removed with
        everything inside of it.
        """
        folder = self.__pathFolder
        # The folder does not exist when __init__ failed before creating it.
        if folder is not None:
            # Reset first so that a second call does nothing.
            self.__pathFolder = None
            # Best-effort cleanup: a destructor must not raise.
            shutil.rmtree(folder, ignore_errors=True)

    ### Private Methods ###

    def __getPdfPath(self):
        """
        Return the path of the PDF copy inside the temporary folder.
        """
        return os.path.join(self.__pathFolder, self.Document.getFilename())

    def __readFile(self, path):
        """
        Return the raw (binary) content of the file found at path.
        """
        fileObj = open(path, "rb")
        try:
            return fileObj.read()
        finally:
            fileObj.close()

    def __getImageDocumentList(self):
        """
        Retrieve the images from the PDF document.

        Runs the external "pdfimages -j" tool and returns a list with one
        Grain per extracted image.  Images in .ppm or .pbm format are
        converted to PNG; when the conversion fails the raw file is
        returned under its original name.  Returns an empty list when
        pdfimages cannot be started or exits with code 1.
        """
        # Function-scope import: subprocess is not among the module imports.
        import subprocess

        # An argument list is used (no shell), so quotes or shell
        # metacharacters in the document file name cannot alter the command.
        command = ['pdfimages', '-j', self.__getPdfPath(),
                   os.path.join(self.__pathFolder, 'imagegrain')]
        try:
            exitCode = subprocess.call(command)
        except OSError:
            # pdfimages could not be started, so nothing was extracted.
            return []
        # Exit code 1 is what os.system() used to report as status 256.
        if exitCode == 1:
            return []

        pdfName = self.Document.getFilename()
        image_list = []
        # Lists the content of the temporary folder where the files are in.
        for image in sorted(os.listdir(self.__pathFolder)):
            imagePath = os.path.join(self.__pathFolder, image)
            # Skip the PDF itself and anything that is not a regular file,
            # such as the output folder created by the table extraction.
            if image == pdfName or not os.path.isfile(imagePath):
                continue

            baseName, extension = os.path.splitext(image)
            content = None
            # Convert the .ppm or .pbm images to .png.
            if extension.lower() in ('.ppm', '.pbm'):
                try:
                    converted = StringIO()
                    PIL.Image.open(imagePath).save(converted, "PNG")
                except Exception:
                    # Conversion failed: fall back to the raw file below.
                    content = None
                else:
                    # Rewind so the buffer starts at position 0, like the
                    # buffers built from raw files.
                    converted.seek(0)
                    content = converted
                    image = baseName + ".png"
            if content is None:
                content = StringIO(self.__readFile(imagePath))

            image_list.append(Grain(id=image, content=content))

        return image_list

    def __getTableDocumentList(self):
        """
        Extract the tables from the PDF file using pyPdf2Table.

        Returns a list with one Grain per table, named "Table1.html",
        "Table2.html" and so on.

        Raises PyPdf2TableError when the converter fails.
        """
        pdfFile = self.__getPdfPath()
        outputXMLFolder = os.path.join(self.__pathFolder, "outputXMLFolder")
        try:
            converterObj = ExecuteConverter.ExecuteConverter()
            converterObj.extractTables(pdfFile, outputXMLFolder)
            tableListStr = converterObj.getTableList()
        except Exception:
            # sys.exc_info() keeps this valid on every Python version.
            raise PyPdf2TableError(sys.exc_info()[1])

        tableList = []
        for index, table in enumerate(tableListStr):
            # Table names are numbered starting from 1.
            grainObj = Grain()
            grainObj.setId("Table" + str(index + 1) + ".html")
            grainObj.setContent(StringIO(table))
            tableList.append(grainObj)

        return tableList

    ### Public Methods ###

    # TODO: getThumbnailsDocument() and getSummaryDocument() were sketched
    # here but have never been implemented.

    def getImageDocumentList(self):
        """
        Return the list of image grains of the document, or None when no
        temporary folder is available.
        """
        if self.__pathFolder is None:
            return None
        return self.__getImageDocumentList()

    def getTableDocumentList(self):
        """
        Return the list of table grains of the document, or None when no
        temporary folder is available.
        """
        if self.__pathFolder is None:
            return None
        return self.__getTableDocumentList()

    def granulateDocument(self):
        """
        Extract the grains from the document.

        Returns a dictionary with the keys 'image_list' and 'table_list',
        or None when no temporary folder is available.
        """
        if self.__pathFolder is None:
            return None

        returnfiles = {}
        returnfiles['image_list'] = self.__getImageDocumentList()
        returnfiles['table_list'] = self.__getTableDocumentList()
        return returnfiles
189