Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# vim: sw=4:expandtab:foldmethod=marker 

2# 

3# Copyright (c) 2006, Mathieu Fenniak 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30from .generic import * 

31from .utils import isString, str_ 

32from .pdf import PdfFileReader, PdfFileWriter 

33from .pagerange import PageRange 

34from sys import version_info 

35if version_info < ( 3, 0 ): 

36 from cStringIO import StringIO 

37 StreamIO = StringIO 

38else: 

39 from io import BytesIO 

40 from io import FileIO as file 

41 StreamIO = BytesIO 

42 

43 

class _MergedPage(object):
    """
    Internal record kept by PdfFileMerger for every page queued for output.

    Attributes set by the constructor:

    * ``pagedata`` -- the page object taken from the source document.
    * ``src`` -- the reader object the page was obtained from.
    * ``id`` -- the identifier the merger assigned to this page.
    * ``out_pagedata`` -- starts as ``None``; filled in by the merger once
      the page has been added to the output document.
    """

    def __init__(self, pagedata, src, id):
        self.pagedata, self.src, self.id = pagedata, src, id
        # No output-side reference exists until the merger writes the page.
        self.out_pagedata = None

54 

55 

class PdfFileMerger(object):
    """
    Initializes a PdfFileMerger object. PdfFileMerger merges multiple PDFs
    into a single PDF. It can concatenate, slice, insert, or any combination
    of the above.

    See the functions :meth:`merge()<merge>` (or :meth:`append()<append>`)
    and :meth:`write()<write>` for usage information.

    :param bool strict: Determines whether user should be warned of all
            problems and also causes some correctable problems to be fatal.
            Defaults to ``True``.
    """

    # For each group of destination fit types, the bookmark entries that are
    # moved (in this order) into the destination array built by
    # _write_bookmarks().  Fit types not listed take no extra arguments.
    _DEST_ARG_KEYS = (
        (('/FitH', '/FitBH'), ('/Top',)),
        (('/FitV', '/FitBV'), ('/Left',)),
        (('/XYZ',), ('/Left', '/Top', '/Zoom')),
        (('/FitR',), ('/Left', '/Bottom', '/Right', '/Top')),
    )

    def __init__(self, strict=True):
        # One (stream, PdfFileReader, opened_by_merger) tuple per merged file.
        self.inputs = []
        # _MergedPage records, in output order.
        self.pages = []
        self.output = PdfFileWriter()
        # Nested list of bookmark entries; a list element holds the children
        # of the entry that precedes it.
        self.bookmarks = []
        self.named_dests = []
        # Next identifier handed to a merged page.
        self.id_count = 0
        self.strict = strict

    def _open_input(self, fileobj):
        """
        Turn the ``fileobj`` argument of :meth:`merge` into a stream.

        :return: a ``(stream, decryption_key, my_file)`` tuple. ``my_file``
            is ``True`` when the stream was created here (and therefore has
            to be closed by the merger); ``decryption_key`` is the
            ``_decryption_key`` attribute of a PdfFileReader input, or
            ``None`` when there is none.
        """
        # A string is taken to be a path to a PDF file.
        if isString(fileobj):
            return file(fileobj, 'rb'), None, True

        # A file object is copied into an in-memory stream.
        if isinstance(fileobj, file):
            fileobj.seek(0)
            return StreamIO(fileobj.read()), None, True

        # A PdfFileReader has its underlying stream copied.  The key must be
        # read from the reader itself, before it is replaced by the copy.
        if isinstance(fileobj, PdfFileReader):
            decryption_key = getattr(fileobj, '_decryption_key', None)
            stream = fileobj.stream
            orig_tell = stream.tell()
            stream.seek(0)
            try:
                content = StreamIO(stream.read())
            finally:
                # reset the stream to its original location
                stream.seek(orig_tell)
            return content, decryption_key, True

        # Anything else is used unmodified.
        return fileobj, None, False

    def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=True):
        """
        Merges the pages from the given file into the output file at the
        specified page number.

        :param int position: The *page number* to insert this file. File will
            be inserted after the given number.

        :param fileobj: A File Object or an object that supports the standard read
            and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.

        :param str bookmark: Optionally, you may specify a bookmark to be applied at
            the beginning of the included file by supplying the text of the bookmark.

        :param pages: can be a :ref:`Page Range <page-range>` or a ``(start, stop[, step])`` tuple
            to merge only the specified range of pages from the source
            document into the output document.

        :param bool import_bookmarks: You may prevent the source document's bookmarks
            from being imported by specifying this as ``False``.

        :raises TypeError: if ``pages`` is neither ``None``, a PageRange nor
            a tuple.
        """
        stream, decryption_key, my_file = self._open_input(fileobj)

        # Until the stream is recorded in self.inputs, close() cannot reach
        # it, so a stream created here must be closed if anything fails.
        registered = False
        try:
            pdfr = PdfFileReader(stream, strict=self.strict)
            if decryption_key is not None:
                pdfr._decryption_key = decryption_key

            # Find the range of pages to merge.
            if pages is None:
                pages = (0, pdfr.getNumPages())
            elif isinstance(pages, PageRange):
                pages = pages.indices(pdfr.getNumPages())
            elif not isinstance(pages, tuple):
                raise TypeError('"pages" must be a tuple of (start, stop[, step])')

            # The optional bookmark points at the first page merged below,
            # which will receive the current id_count as its id.
            bookmark_entry = None
            if bookmark:
                bookmark_entry = Bookmark(TextStringObject(bookmark), NumberObject(self.id_count), NameObject('/Fit'))

            outline = []
            if import_bookmarks:
                outline = self._trim_outline(pdfr, pdfr.getOutlines(), pages)

            if bookmark_entry is not None:
                # The imported outline becomes the children of the new entry.
                self.bookmarks += [bookmark_entry, outline]
            else:
                self.bookmarks += outline

            self.named_dests += self._trim_dests(pdfr, pdfr.namedDestinations, pages)

            # Gather all the pages that are going to be merged
            srcpages = []
            for i in range(*pages):
                srcpages.append(_MergedPage(pdfr.getPage(i), pdfr, self.id_count))
                self.id_count += 1

            self._associate_dests_to_pages(srcpages)
            self._associate_bookmarks_to_pages(srcpages)

            # Slice to insert the pages at the specified position
            self.pages[position:position] = srcpages

            # Keep track of our input files so we can close them later
            self.inputs.append((stream, pdfr, my_file))
            registered = True
        finally:
            if my_file and not registered:
                stream.close()

    def append(self, fileobj, bookmark=None, pages=None, import_bookmarks=True):
        """
        Identical to the :meth:`merge()<merge>` method, but assumes you want to concatenate
        all pages onto the end of the file instead of specifying a position.

        :param fileobj: A File Object or an object that supports the standard read
            and seek methods similar to a File Object. Could also be a
            string representing a path to a PDF file.

        :param str bookmark: Optionally, you may specify a bookmark to be applied at
            the beginning of the included file by supplying the text of the bookmark.

        :param pages: can be a :ref:`Page Range <page-range>` or a ``(start, stop[, step])`` tuple
            to merge only the specified range of pages from the source
            document into the output document.

        :param bool import_bookmarks: You may prevent the source document's bookmarks
            from being imported by specifying this as ``False``.
        """
        self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks)

    def write(self, fileobj):
        """
        Writes all data that has been merged to the given output file.

        :param fileobj: Output file. Can be a filename or any kind of
            file-like object.
        """
        my_file = False
        if isString(fileobj):
            fileobj = file(fileobj, 'wb')
            my_file = True

        try:
            # Add pages to the PdfFileWriter and remember, for each one, the
            # writer's reference to the page that was just added (the last
            # entry of the writer's /Kids array).
            for page in self.pages:
                self.output.addPage(page.pagedata)
                page.out_pagedata = self.output.getReference(self.output._pages.getObject()["/Kids"][-1].getObject())

            # Once all pages are added, create bookmarks to point at those pages
            self._write_dests()
            self._write_bookmarks()

            # Write the output to the file
            self.output.write(fileobj)
        finally:
            # A file opened here is closed even when writing fails.
            if my_file:
                fileobj.close()

    def close(self):
        """
        Shuts all file descriptors (input and output) and clears all memory
        usage.
        """
        self.pages = []
        for fo, pdfr, mine in self.inputs:
            # Only streams created by merge() are closed; streams supplied
            # by the caller are left open.
            if mine:
                fo.close()

        self.inputs = []
        self.output = None

    def addMetadata(self, infos):
        """
        Add custom metadata to the output.

        :param dict infos: a Python dictionary where each key is a field
            and each value is your new metadata.
            Example: ``{u'/Title': u'My title'}``
        """
        self.output.addMetadata(infos)

    def setPageLayout(self, layout):
        """
        Set the page layout

        :param str layout: The page layout to be used

        Valid layouts are:
             /NoLayout        Layout explicitly not specified
             /SinglePage      Show one page at a time
             /OneColumn       Show one column at a time
             /TwoColumnLeft   Show pages in two columns, odd-numbered pages on the left
             /TwoColumnRight  Show pages in two columns, odd-numbered pages on the right
             /TwoPageLeft     Show two pages at a time, odd-numbered pages on the left
             /TwoPageRight    Show two pages at a time, odd-numbered pages on the right
        """
        self.output.setPageLayout(layout)

    def setPageMode(self, mode):
        """
        Set the page mode.

        :param str mode: The page mode to use.

        Valid modes are:
            /UseNone         Do not show outlines or thumbnails panels
            /UseOutlines     Show outlines (aka bookmarks) panel
            /UseThumbs       Show page thumbnails panel
            /FullScreen      Fullscreen view
            /UseOC           Show Optional Content Group (OCG) panel
            /UseAttachments  Show attachments panel
        """
        self.output.setPageMode(mode)

    @staticmethod
    def _points_into_range(pdf, entry, pages):
        """
        Return ``True`` when the ``/Page`` of ``entry`` compares equal to one
        of the pages of ``pdf`` selected by the ``pages`` range tuple.
        """
        for j in range(*pages):
            if pdf.getPage(j).getObject() == entry['/Page'].getObject():
                return True
        return False

    def _trim_dests(self, pdf, dests, pages):
        """
        Removes any named destinations that are not a part of the specified
        page set.

        Each destination that is kept has its ``/Page`` entry replaced by the
        resolved page object.

        :return: list of the destinations that were kept.
        """
        new_dests = []
        for k, o in list(dests.items()):
            if self._points_into_range(pdf, o, pages):
                o[NameObject('/Page')] = o['/Page'].getObject()
                assert str_(k) == str_(o['/Title'])
                new_dests.append(o)
        return new_dests

    def _trim_outline(self, pdf, outline, pages):
        """
        Removes any outline/bookmark entries that are not a part of the
        specified page set.

        Nested lists are trimmed recursively. When a nested list keeps at
        least one entry but the entry directly before it was dropped, that
        preceding entry is kept after all so the children still have a
        header.

        :return: the trimmed outline as a new list.
        """
        new_outline = []
        prev_header_added = True
        for i, o in enumerate(outline):
            if isinstance(o, list):
                sub = self._trim_outline(pdf, o, pages)
                if sub:
                    if not prev_header_added:
                        new_outline.append(outline[i-1])
                    new_outline.append(sub)
            else:
                prev_header_added = self._points_into_range(pdf, o, pages)
                if prev_header_added:
                    o[NameObject('/Page')] = o['/Page'].getObject()
                    new_outline.append(o)
        return new_outline

    def _write_dests(self):
        """
        Adds every named destination whose ``/Page`` id matches a merged page
        to the output, after pointing it at that page's output reference.
        Destinations without a matching page are skipped.
        """
        for v in self.named_dests:
            if '/Page' not in v:
                continue
            for p in self.pages:
                if p.id == v['/Page']:
                    v[NameObject('/Page')] = p.out_pagedata
                    self.output.addNamedDestinationObject(v)
                    break

    def _pop_dest_coordinates(self, b):
        """
        Removes from bookmark ``b`` the coordinate entries that belong to its
        ``/Type`` (see ``_DEST_ARG_KEYS``) and returns them as a list of
        FloatObject, in destination-array order.

        A coordinate that is absent or a NullObject is returned as
        ``FloatObject(0)``.
        """
        fit_type = b['/Type']
        for fit_types, keys in self._DEST_ARG_KEYS:
            if fit_type not in fit_types:
                continue
            coords = []
            for key in keys:
                value = None
                if key in b:
                    # Read through item access (not pop()) so the lookup
                    # behaves exactly like the other reads of the bookmark.
                    value = b[key]
                    del b[key]
                if value is None or isinstance(value, NullObject):
                    coords.append(FloatObject(0))
                else:
                    coords.append(FloatObject(value))
            return coords
        # e.g. /Fit and /FitB: nothing more to add
        return []

    def _write_bookmarks(self, bookmarks=None, parent=None):
        """
        Adds the bookmark tree to the output.

        :param bookmarks: nested bookmark list to write; defaults to
            ``self.bookmarks``.
        :param parent: value passed to the writer's ``addBookmarkDict`` as
            the parent of every entry written at this level.

        Entries without ``/Page``, or whose ``/Page`` id matches no merged
        page, are skipped. A nested list is written with the most recently
        added entry of the current level as its parent.
        """
        if bookmarks is None:
            bookmarks = self.bookmarks

        last_added = None
        for b in bookmarks:
            if isinstance(b, list):
                self._write_bookmarks(b, last_added)
                continue

            if '/Page' not in b:
                continue

            for p in self.pages:
                if p.id == b['/Page']:
                    break
            else:
                # No merged page carries this id.
                continue

            # Replace the page/type/coordinate entries by a /GoTo action.
            args = [NumberObject(p.id), NameObject(b['/Type'])]
            args.extend(self._pop_dest_coordinates(b))
            b[NameObject('/A')] = DictionaryObject({NameObject('/S'): NameObject('/GoTo'), NameObject('/D'): ArrayObject(args)})

            del b['/Page'], b['/Type']
            last_added = self.output.addBookmarkDict(b, parent)

    def _associate_dests_to_pages(self, pages):
        """
        Replaces the ``/Page`` of every named destination that is not yet a
        NumberObject by the id of the matching page in ``pages``.

        :raises ValueError: if a destination matches none of ``pages``.
        """
        for nd in self.named_dests:
            target = nd['/Page']

            # Already associated by an earlier merge.
            if isinstance(target, NumberObject):
                continue

            pageno = None
            for p in pages:
                if target.getObject() == p.pagedata.getObject():
                    pageno = p.id

            if pageno is None:
                raise ValueError("Unresolved named destination '%s'" % (nd['/Title'],))
            nd[NameObject('/Page')] = NumberObject(pageno)

    def _associate_bookmarks_to_pages(self, pages, bookmarks=None):
        """
        Replaces the ``/Page`` of every bookmark that is not yet a
        NumberObject by the id of the matching page in ``pages``, recursing
        into nested lists.

        :param bookmarks: nested bookmark list to process; defaults to
            ``self.bookmarks``.
        :raises ValueError: if a bookmark matches none of ``pages``.
        """
        if bookmarks is None:
            bookmarks = self.bookmarks

        for b in bookmarks:
            if isinstance(b, list):
                self._associate_bookmarks_to_pages(pages, b)
                continue

            target = b['/Page']

            # Already associated by an earlier merge.
            if isinstance(target, NumberObject):
                continue

            pageno = None
            for p in pages:
                if target.getObject() == p.pagedata.getObject():
                    pageno = p.id

            if pageno is None:
                raise ValueError("Unresolved bookmark '%s'" % (b['/Title'],))
            b[NameObject('/Page')] = NumberObject(pageno)

    def findBookmark(self, bookmark, root=None):
        """
        Locates a bookmark in the bookmark tree.

        :param bookmark: the bookmark entry itself, or its ``/Title``.
        :param root: nested bookmark list to search; defaults to
            ``self.bookmarks``.
        :return: list of indices leading from ``root`` to the first match,
            or ``None`` when nothing matches.
        """
        if root is None:
            root = self.bookmarks

        for i, b in enumerate(root):
            if isinstance(b, list):
                res = self.findBookmark(bookmark, b)
                if res:
                    return [i] + res
            elif b == bookmark or b['/Title'] == bookmark:
                return [i]

        return None

    def addBookmark(self, title, pagenum, parent=None):
        """
        Add a bookmark to this PDF file.

        :param str title: Title to use for this bookmark.
        :param int pagenum: Page number this bookmark will point to.
        :param parent: A reference to a parent bookmark to create nested
            bookmarks. Either a list of indices as returned by
            :meth:`findBookmark`, or a value that :meth:`findBookmark`
            can locate.
        :return: the bookmark that was created.
        :raises ValueError: if ``parent`` is given but cannot be located.
        """
        dest = Bookmark(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))

        if parent is None:
            self.bookmarks.append(dest)
            return dest

        if isinstance(parent, list):
            iloc = parent
        else:
            iloc = self.findBookmark(parent)
        if not iloc:
            raise ValueError("Parent bookmark not found: %r" % (parent,))

        # Walk down to the list that contains the parent entry.
        bmparent = self.bookmarks
        for i in iloc[:-1]:
            bmparent = bmparent[i]

        # Children live in a list placed directly after their parent entry;
        # reuse that list when it already exists.
        npos = iloc[-1] + 1
        if npos < len(bmparent) and isinstance(bmparent[npos], list):
            bmparent[npos].append(dest)
        else:
            bmparent.insert(npos, [dest])
        return dest

    def addNamedDestination(self, title, pagenum):
        """
        Add a destination to the output.

        :param str title: Title to use
        :param int pagenum: Page number this destination points at.
        """
        dest = Destination(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))
        self.named_dests.append(dest)

517 

518 

class OutlinesObject(list):
    """
    A list paired with an outline tree object.

    :param pdf: object used by :meth:`add` to look up pages and to register
        new objects (through ``getObject``, ``_pages`` and ``_addObject``).
    :param tree: object providing ``addChild``, ``removeChild`` and
        ``children``.
    :param parent: stored on the instance; not used by this class itself.
    """

    def __init__(self, pdf, tree, parent=None):
        list.__init__(self)
        self.tree = tree
        self.pdf = pdf
        self.parent = parent

    def remove(self, index):
        """
        Removes the entry at position ``index`` from this list and detaches
        it from the tree.

        Note that, unlike ``list.remove``, the argument is an index and not
        a value.
        """
        obj = self[index]
        del self[index]
        self.tree.removeChild(obj)

    def add(self, title, pagenum):
        """
        Creates a bookmark titled ``title`` whose ``/GoTo`` action targets
        the page at index ``pagenum`` of the pdf's ``/Kids`` array, and adds
        it to the tree as a child.

        The list itself is not modified by this method.
        """
        pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][pagenum]
        action = DictionaryObject()
        action.update({
            NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]),
            NameObject('/S') : NameObject('/GoTo')
        })
        actionRef = self.pdf._addObject(action)
        bookmark = TreeObject()

        bookmark.update({
            NameObject('/A'): actionRef,
            NameObject('/Title'): createStringObject(title),
        })

        self.pdf._addObject(bookmark)

        self.tree.addChild(bookmark)

    def removeAll(self):
        """
        Detaches every child from the tree and empties this list.
        """
        # Snapshot the children first: the tree is modified while we walk it.
        for child in list(self.tree.children()):
            self.tree.removeChild(child)
        # Clear in one step rather than pop() per child: add() puts entries
        # in the tree without appending to the list, so the two can differ in
        # length and a per-child pop() could hit an empty list.
        del self[:]

552 self.tree.removeChild(child) 

553 self.pop()