Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/PyPDF2/pdf.py : 12%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2#
3# vim: sw=4:expandtab:foldmethod=marker
4#
5# Copyright (c) 2006, Mathieu Fenniak
6# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
7#
8# All rights reserved.
9#
10# Redistribution and use in source and binary forms, with or without
11# modification, are permitted provided that the following conditions are
12# met:
13#
14# * Redistributions of source code must retain the above copyright notice,
15# this list of conditions and the following disclaimer.
16# * Redistributions in binary form must reproduce the above copyright notice,
17# this list of conditions and the following disclaimer in the documentation
18# and/or other materials provided with the distribution.
19# * The name of the author may not be used to endorse or promote products
20# derived from this software without specific prior written permission.
21#
22# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
26# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32# POSSIBILITY OF SUCH DAMAGE.
34"""
35A pure-Python PDF library with an increasing number of capabilities.
36See README for links to FAQ, documentation, homepage, etc.
37"""
# Package metadata.
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

__maintainer__ = "Phaseit, Inc."
__maintainer_email__ = "PyPDF2@phaseit.net"
# The name was historically spelled without the trailing underscores; keep
# that spelling as an alias so existing references continue to resolve.
__maintainer_email = __maintainer_email__
45import string
46import math
47import struct
48import sys
49import uuid
50from sys import version_info
51if version_info < ( 3, 0 ):
52 from cStringIO import StringIO
53else:
54 from io import StringIO
56if version_info < ( 3, 0 ):
57 BytesIO = StringIO
58else:
59 from io import BytesIO
61from . import filters
62from . import utils
63import warnings
64import codecs
65from .generic import *
66from .utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
67from .utils import isString, b_, u_, ord_, chr_, str_, formatWarning
69if version_info < ( 2, 4 ):
70 from sets import ImmutableSet as frozenset
72if version_info < ( 2, 5 ):
73 from md5 import md5
74else:
75 from hashlib import md5
76import uuid
79class PdfFileWriter(object):
80 """
81 This class supports writing PDF files out, given pages produced by another
82 class (typically :class:`PdfFileReader<PdfFileReader>`).
83 """
def __init__(self):
    """
    Create an empty writer: header, object table, page tree root, info
    dictionary and a catalog that is not yet registered as an object.
    """
    self._header = b_("%PDF-1.3")
    # Indirect objects in creation order; object number N is at index N - 1.
    self._objects = []

    # Root node of the page tree, starting with no pages.
    page_tree = DictionaryObject()
    page_tree[NameObject("/Type")] = NameObject("/Pages")
    page_tree[NameObject("/Count")] = NumberObject(0)
    page_tree[NameObject("/Kids")] = ArrayObject()
    self._pages = self._addObject(page_tree)

    # Document information dictionary; the producer is stored as UTF-16BE
    # text preceded by a byte order mark.
    producer = codecs.BOM_UTF16_BE + u_("PyPDF2").encode('utf-16be')
    info = DictionaryObject()
    info[NameObject("/Producer")] = createStringObject(producer)
    self._info = self._addObject(info)

    # Catalog. It is only kept as a plain object here; write() registers
    # it and stores the resulting reference in self._root.
    catalog = DictionaryObject()
    catalog[NameObject("/Type")] = NameObject("/Catalog")
    catalog[NameObject("/Pages")] = self._pages
    self._root = None
    self._root_object = catalog
def _addObject(self, obj):
    """
    Append *obj* to the object table and return an indirect reference
    (generation 0) that points at it.
    """
    registry = self._objects
    registry.append(obj)
    # Object numbers are 1-based, so the new number equals the new length.
    return IndirectObject(len(registry), 0, self)
def getObject(self, ido):
    """
    Resolve an indirect reference owned by this writer.

    :param ido: reference exposing ``pdf`` and ``idnum`` attributes.
    :return: the object stored under that object number.
    :raises ValueError: if the reference belongs to another document.
    """
    if ido.pdf != self:
        raise ValueError("pdf must be self")
    # Object numbers are 1-based; the table is 0-based.
    slot = ido.idnum - 1
    return self._objects[slot]
def _addPage(self, page, action):
    """
    Register *page* and attach it to the page tree.

    :param page: dictionary whose ``/Type`` is ``/Page``.
    :param action: callable ``action(kids, page_ref)`` that places the new
        page reference into the ``/Kids`` array (append, insert, ...).
    """
    assert page["/Type"] == "/Page"
    page[NameObject("/Parent")] = self._pages
    page_ref = self._addObject(page)
    tree = self.getObject(self._pages)
    action(tree["/Kids"], page_ref)
    # Keep the page tree's /Count in step with /Kids.
    new_count = tree["/Count"] + 1
    tree[NameObject("/Count")] = NumberObject(new_count)
def addPage(self, page):
    """
    Append a page to the end of this PDF file. The page is usually taken
    from a :class:`PdfFileReader<PdfFileReader>` instance.

    :param PageObject page: The page to add to the document. Should be
        an instance of :class:`PageObject<PyPDF2.pdf.PageObject>`
    """
    # Appending to /Kids puts the page last.
    place_last = list.append
    self._addPage(page, place_last)
def insertPage(self, page, index=0):
    """
    Insert a page into this PDF file at a given position. The page is
    usually taken from a :class:`PdfFileReader<PdfFileReader>` instance.

    :param PageObject page: The page to add to the document. This
        argument should be an instance of :class:`PageObject<pdf.PageObject>`.
    :param int index: Position at which the page will be inserted.
    """
    def place_at_index(kids, page_ref):
        kids.insert(index, page_ref)

    self._addPage(page, place_at_index)
def getPage(self, pageNumber):
    """
    Retrieve a page of this PDF file by its position.

    :param int pageNumber: The page number to retrieve
        (pages begin at zero)
    :return: the page at the index given by *pageNumber*
    :rtype: :class:`PageObject<pdf.PageObject>`
    """
    # XXX: crude hack - indexes the root /Kids array directly.
    kids = self.getObject(self._pages)["/Kids"]
    page_ref = kids[pageNumber]
    return page_ref.getObject()
def getNumPages(self):
    """
    :return: the number of pages, read from the page tree's ``/Count``.
    :rtype: int
    """
    page_tree = self.getObject(self._pages)
    count = page_tree[NameObject("/Count")]
    return int(count)
def addBlankPage(self, width=None, height=None):
    """
    Append a blank page to this PDF file and return it. The page is built
    by ``PageObject.createBlankPage``, which receives this writer together
    with *width* and *height*; when no size is given it is expected to
    fall back to the size of the last page.

    :param float width: The width of the new page expressed in default user
        space units.
    :param float height: The height of the new page expressed in default
        user space units.
    :return: the newly appended page
    :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>`
    :raises PageSizeNotDefinedError: if width and height are not defined
        and previous page does not exist.
    """
    blank = PageObject.createBlankPage(self, width, height)
    self.addPage(blank)
    return blank
def insertBlankPage(self, width=None, height=None, index=0):
    """
    Insert a blank page into this PDF file and return it. If a dimension
    is missing and a page already exists at *index*, the size of that
    page is used for both dimensions.

    :param float width: The width of the new page expressed in default user
        space units.
    :param float height: The height of the new page expressed in default
        user space units.
    :param int index: Position to add the page.
    :return: the newly inserted page
    :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>`
    :raises PageSizeNotDefinedError: if width and height are not defined
        and previous page does not exist.
    """
    # Both conditions must hold: a dimension is missing AND there is a page
    # at `index` to copy from. Without the explicit grouping, `and` binds
    # tighter than `or`, so a missing width alone triggered getPage(index)
    # even when no such page existed.
    size_missing = width is None or height is None
    if size_missing and (self.getNumPages() - 1) >= index:
        oldpage = self.getPage(index)
        width = oldpage.mediaBox.getWidth()
        height = oldpage.mediaBox.getHeight()
    page = PageObject.createBlankPage(self, width, height)
    self.insertPage(page, index)
    return page
def addJS(self, javascript):
    """
    Add Javascript which will launch upon opening this PDF.

    :param str javascript: Your Javascript.

    >>> output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
    # Example: This will launch the print window when the PDF is opened.
    """
    # The action object; the script is wrapped in parentheses and written
    # through NameObject.
    action = DictionaryObject()
    action[NameObject("/Type")] = NameObject("/Action")
    action[NameObject("/S")] = NameObject("/JavaScript")
    action[NameObject("/JS")] = NameObject("(%s)" % javascript)
    action_ref = self._addObject(action)

    # The name tree needs some key for the script; any unique string works.
    entry_name = str(uuid.uuid4())
    names_array = ArrayObject([createStringObject(entry_name), action_ref])

    name_tree = DictionaryObject()
    name_tree[NameObject("/JavaScript")] = DictionaryObject({
        NameObject("/Names"): names_array
    })
    self._addObject(name_tree)

    # Note: this replaces any /OpenAction and /Names already on the catalog.
    self._root_object.update({
        NameObject("/OpenAction"): action_ref,
        NameObject("/Names"): name_tree
    })
def addAttachment(self, fname, fdata):
    """
    Embed a file inside the PDF.

    :param str fname: The filename to display.
    :param str fdata: The data in the file.

    Reference:
    https://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
    Section 7.11.3
    """
    # Three entries are needed:
    #   * the stream holding the file's data
    #   * the /Filespec entry
    #   * the file's name, which goes in the Catalog

    # 1. The embedded file stream. Sample:
    #     8 0 obj
    #     <<
    #     /Length 12
    #     /Type /EmbeddedFile
    #     >>
    #     stream
    #     Hello world!
    #     endstream
    #     endobj
    embedded = DecodedStreamObject()
    embedded.setData(fdata)
    embedded[NameObject("/Type")] = NameObject("/EmbeddedFile")

    # 2. The Filespec entry. Sample:
    #     7 0 obj
    #     <<
    #     /Type /Filespec
    #     /F (hello.txt)
    #     /EF << /F 8 0 R >>
    #     >>
    ef_entry = DictionaryObject()
    ef_entry[NameObject("/F")] = embedded

    filespec = DictionaryObject()
    filespec[NameObject("/Type")] = NameObject("/Filespec")
    filespec[NameObject("/F")] = createStringObject(fname)  # Perhaps also try TextStringObject
    filespec[NameObject("/EF")] = ef_entry

    # 3. The catalog entry, which references the Filespec. Sample:
    #     1 0 obj
    #     <<
    #     /Type /Catalog
    #     /Outlines 2 0 R
    #     /Pages 3 0 R
    #     /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >>
    #     >>
    #     endobj
    names_entry = DictionaryObject()
    names_entry[NameObject("/Names")] = ArrayObject([createStringObject(fname), filespec])

    embedded_files = DictionaryObject()
    embedded_files[NameObject("/EmbeddedFiles")] = names_entry

    # Note: this replaces any /Names entry already present on the catalog.
    self._root_object.update({
        NameObject("/Names"): embedded_files
    })
def appendPagesFromReader(self, reader, after_page_append=None):
    """
    Append every page of *reader* to this writer, in order. An optional
    callback is invoked after each page has been appended.

    :param reader: a PdfFileReader object whose pages are copied to this
        writer object.
    :callback after_page_append (function): Callback function that is invoked after
        each page is appended to the writer. Callback signature:

        :param writer_pageref (PDF page reference): Reference to the page
            appended to the writer.
    """
    reader_page_count = reader.getNumPages()
    # Index in the writer where the first copied page will land.
    first_new_index = self.getNumPages()

    for offset in range(reader_page_count):
        self.addPage(reader.getPage(offset))
        appended = self.getPage(first_new_index + offset)
        # Anything that is not callable (including None) is ignored.
        if callable(after_page_append):
            after_page_append(appended)
def updatePageFormFieldValues(self, page, fields):
    '''
    Update the form field values for a given page from a fields dictionary.
    For every annotation on the page whose name (/T) equals a key of
    *fields*, the annotation's value (/V) is set to the matching text.

    :param page: Page reference from PDF writer where the annotations
        and field data will be updated.
    :param fields: a Python dictionary of field names (/T) and text
        values (/V)
    '''
    # A page without annotations has no form fields to update.
    if '/Annots' not in page:
        return
    for annot_ref in page['/Annots']:
        writer_annot = annot_ref.getObject()
        field_name = writer_annot.get('/T')
        for field in fields:
            if field_name == field:
                writer_annot.update({
                    NameObject("/V"): TextStringObject(fields[field])
                })
def cloneReaderDocumentRoot(self, reader):
    '''
    Use the reader's document root as this writer's root object. The
    object found under ``/Root`` in the reader's trailer is adopted as-is
    (no copy is made here).

    :param reader: PdfFileReader from which the document root is taken.
    '''
    reader_root = reader.trailer['/Root']
    self._root_object = reader_root
def cloneDocumentFromReader(self, reader, after_page_append=None):
    '''
    Create a copy (clone) of a document from a PDF file reader: first the
    document root is taken over, then all pages are appended.

    :param reader: PDF file reader instance from which the clone
        should be created.
    :callback after_page_append (function): Callback function that is invoked after
        each page is appended to the writer. Signature includes a reference to the
        appended page (delegates to appendPagesFromReader). Callback signature:

        :param writer_pageref (PDF page reference): Reference to the page just
            appended to the document.
    '''
    for step in (self.cloneReaderDocumentRoot,):
        step(reader)
    self.appendPagesFromReader(reader, after_page_append)
def encrypt(self, user_pwd, owner_pwd=None, use_128bit=True):
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    The encryption dictionary and key are prepared here and stored on the
    writer (``_encrypt``, ``_encrypt_key``, ``_ID``); they are applied when
    :meth:`write` serialises the objects.

    :param str user_pwd: The "user password", which allows for opening
        and reading the PDF file with the restrictions provided.
    :param str owner_pwd: The "owner password", which allows for
        opening the PDF files without any restrictions. By default,
        the owner password is the same as the user password.
    :param bool use_128bit: flag as to whether to use 128bit
        encryption. When false, 40bit encryption will be used. By default,
        this flag is on.
    """
    import time, random
    if owner_pwd is None:
        owner_pwd = user_pwd
    # /V, /R and the key length in bytes for the chosen strength.
    if use_128bit:
        V = 2
        rev = 3
        keylen = 128 // 8
    else:
        V = 1
        rev = 2
        keylen = 40 // 8
    # permit everything:
    P = -1
    O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
    # Two file identifiers, derived from the clock and a random number.
    ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest())
    ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest())
    self._ID = ArrayObject((ID_1, ID_2))
    if rev == 2:
        U, key = _alg34(user_pwd, O, P, ID_1)
    else:
        assert rev == 3
        U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)
    encrypt = DictionaryObject()
    encrypt[NameObject("/Filter")] = NameObject("/Standard")
    encrypt[NameObject("/V")] = NumberObject(V)
    if V == 2:
        # /Length is expressed in bits.
        encrypt[NameObject("/Length")] = NumberObject(keylen * 8)
    encrypt[NameObject("/R")] = NumberObject(rev)
    encrypt[NameObject("/O")] = ByteStringObject(O)
    encrypt[NameObject("/U")] = ByteStringObject(U)
    encrypt[NameObject("/P")] = NumberObject(P)
    self._encrypt = self._addObject(encrypt)
    self._encrypt_key = key
def write(self, stream):
    """
    Writes the collection of pages added to this object out as a PDF file.

    Steps: register the catalog if needed, pull every externally
    referenced object into this writer, then emit header, objects,
    cross-reference table and trailer.

    :param stream: An object to write the file to. The object must support
        the write method and the tell method, similar to a file object.
    """
    if hasattr(stream, 'mode') and 'b' not in stream.mode:
        # Not every object exposing `mode` also exposes `name`; fall back to
        # the stream itself so that the warning can always be formatted.
        warnings.warn("File <%s> to write to is not in binary mode. It may not be written to correctly." % getattr(stream, 'name', stream))

    if not self._root:
        self._root = self._addObject(self._root_object)

    externalReferenceMap = {}

    # PDF objects sometimes have circular references to their /Page objects
    # inside their object tree (for example, annotations). Those will be
    # indirect references to objects that we've recreated in this PDF. To
    # address this problem, PageObject's store their original object
    # reference number, and we add it to the external reference map before
    # we sweep for indirect references. This forces self-page-referencing
    # trees to reference the correct new object location, rather than
    # copying in a new copy of the page object.
    for objIndex, obj in enumerate(self._objects):
        if isinstance(obj, PageObject) and obj.indirectRef is not None:
            data = obj.indirectRef
            # Map layout: {source pdf: {generation: {idnum: new reference}}}
            by_generation = externalReferenceMap.setdefault(data.pdf, {})
            by_idnum = by_generation.setdefault(data.generation, {})
            by_idnum[data.idnum] = IndirectObject(objIndex + 1, 0, self)

    # self.stack records the internal object numbers already visited by
    # the sweep; it only exists for the duration of the sweep.
    self.stack = []
    try:
        self._sweepIndirectReferences(externalReferenceMap, self._root)
    finally:
        del self.stack

    # Begin writing:
    encrypted = hasattr(self, "_encrypt")
    object_positions = []
    stream.write(self._header + b_("\n"))
    for i in range(len(self._objects)):
        idnum = (i + 1)
        obj = self._objects[i]
        object_positions.append(stream.tell())
        stream.write(b_(str(idnum) + " 0 obj\n"))
        key = None
        # The encryption dictionary itself is written unencrypted.
        if encrypted and idnum != self._encrypt.idnum:
            # Per-object key: MD5 over the document key followed by the
            # low three bytes of the object number and two bytes of the
            # generation number (always 0 here), truncated to at most 16
            # bytes.
            pack1 = struct.pack("<i", i + 1)[:3]
            pack2 = struct.pack("<i", 0)[:2]
            key = self._encrypt_key + pack1 + pack2
            assert len(key) == (len(self._encrypt_key) + 5)
            md5_hash = md5(key).digest()
            key = md5_hash[:min(16, len(self._encrypt_key) + 5)]
        obj.writeToStream(stream, key)
        stream.write(b_("\nendobj\n"))

    # xref table
    xref_location = stream.tell()
    stream.write(b_("xref\n"))
    stream.write(b_("0 %s\n" % (len(self._objects) + 1)))
    # Entry for object 0, the head of the free list.
    stream.write(b_("%010d %05d f \n" % (0, 65535)))
    for offset in object_positions:
        stream.write(b_("%010d %05d n \n" % (offset, 0)))

    # trailer
    stream.write(b_("trailer\n"))
    trailer = DictionaryObject()
    trailer.update({
        NameObject("/Size"): NumberObject(len(self._objects) + 1),
        NameObject("/Root"): self._root,
        NameObject("/Info"): self._info,
        })
    # /ID and /Encrypt only exist after encrypt() has been called.
    if hasattr(self, "_ID"):
        trailer[NameObject("/ID")] = self._ID
    if encrypted:
        trailer[NameObject("/Encrypt")] = self._encrypt
    trailer.writeToStream(stream, None)

    # eof
    stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)))
def addMetadata(self, infos):
    """
    Add custom metadata to the output. The entries are merged into the
    document information dictionary; existing keys are overwritten.

    :param dict infos: a Python dictionary where each key is a field
        and each value is your new metadata.
    """
    entries = {}
    for field, text in infos.items():
        entries[NameObject(field)] = createStringObject(text)
    info_dict = self.getObject(self._info)
    info_dict.update(entries)
def _sweepIndirectReferences(self, externMap, data):
    """
    Recursively walk *data* and make every indirect reference point at an
    object owned by this writer.

    Objects referenced from other documents are copied into this writer's
    object table; stream objects found directly inside dictionaries or
    arrays are registered and replaced by a reference. Expects
    ``self.stack`` (a list of already visited internal object numbers) to
    exist, as set up by :meth:`write`.

    :param dict externMap: ``{source pdf: {generation: {idnum: reference}}}``
        mapping of foreign references to their replacement in this
        writer. It is extended while sweeping.
    :param data: the object to sweep.
    :return: *data* itself, or the replacement reference for a foreign
        indirect reference (``NullObject`` if it cannot be resolved).
    """
    if isinstance(data, DictionaryObject):
        # Iterate over a snapshot because entries are reassigned below.
        for key, value in list(data.items()):
            value = self._sweepIndirectReferences(externMap, value)
            if isinstance(value, StreamObject):
                # a dictionary value is a stream.  streams must be indirect
                # objects, so we need to change this value.
                value = self._addObject(value)
            data[key] = value
        return data

    if isinstance(data, ArrayObject):
        for i in range(len(data)):
            value = self._sweepIndirectReferences(externMap, data[i])
            if isinstance(value, StreamObject):
                # an array value is a stream.  streams must be indirect
                # objects, so we need to change this value
                value = self._addObject(value)
            data[i] = value
        return data

    if not isinstance(data, IndirectObject):
        return data

    if data.pdf == self:
        # internal indirect references are fine; just make sure the target
        # is swept once (the stack check breaks reference cycles).
        if data.idnum not in self.stack:
            self.stack.append(data.idnum)
            realdata = self.getObject(data)
            self._sweepIndirectReferences(externMap, realdata)
        return data

    # Foreign reference: reuse the replacement if one was already made.
    newobj = externMap.get(data.pdf, {}).get(data.generation, {}).get(data.idnum, None)
    if newobj is not None:
        return newobj
    try:
        newobj = data.pdf.getObject(data)
        # Reserve the object number and publish the mapping *before*
        # sweeping the copy, so that cycles back to this object resolve
        # to the new reference instead of recursing forever.
        self._objects.append(None)  # placeholder
        idnum = len(self._objects)
        newobj_ido = IndirectObject(idnum, 0, self)
        if data.pdf not in externMap:
            externMap[data.pdf] = {}
        if data.generation not in externMap[data.pdf]:
            externMap[data.pdf][data.generation] = {}
        externMap[data.pdf][data.generation][data.idnum] = newobj_ido
        newobj = self._sweepIndirectReferences(externMap, newobj)
        self._objects[idnum-1] = newobj
        return newobj_ido
    except ValueError:
        # Unable to resolve the Object, returning NullObject instead.
        return NullObject()
def getReference(self, obj):
    """
    Return an indirect reference to *obj*, which must already be in this
    writer's object table.

    :raises ValueError: if *obj* is not in the object table.
    """
    position = self._objects.index(obj)
    ref = IndirectObject(position + 1, 0, self)
    assert ref.getObject() == obj
    return ref
def getOutlineRoot(self):
    """
    Return the outline (bookmark) root object of the document, creating
    and registering an empty one on the catalog if none exists yet.

    :return: the outline root object itself (not a reference to it).
    """
    if '/Outlines' not in self._root_object:
        outline = TreeObject()
        outline.update({ })
        self._root_object[NameObject('/Outlines')] = self._addObject(outline)
        return outline

    outline = self._root_object['/Outlines']
    # Sanity check: the existing outline must be one of our own objects.
    idnum = self._objects.index(outline) + 1
    outlineRef = IndirectObject(idnum, 0, self)
    assert outlineRef.getObject() == outline
    return outline
def getNamedDestRoot(self):
    """
    Return the ``/Names`` array of the catalog's ``/Names`` -> ``/Dests``
    dictionary, creating any missing level on the way.

    Newly created ``/Names`` and ``/Dests`` dictionaries are registered as
    indirect objects. Existing dictionaries are used as they are, whether
    they are stored directly on the catalog or behind a reference.

    :return: the array holding the named destinations.
    """
    root = self._root_object

    if '/Names' in root and isinstance(root['/Names'], DictionaryObject):
        names = root['/Names']
    else:
        names = DictionaryObject()
        root[NameObject('/Names')] = self._addObject(names)

    if '/Dests' in names and isinstance(names['/Dests'], DictionaryObject):
        dests = names['/Dests']
    else:
        dests = DictionaryObject()
        names[NameObject('/Dests')] = self._addObject(dests)

    if '/Names' in dests:
        nd = dests['/Names']
    else:
        nd = ArrayObject()
        dests[NameObject('/Names')] = nd

    return nd
def addBookmarkDestination(self, dest, parent=None):
    """
    Register *dest* and add it to the outline as a child of *parent*.

    :param dest: the destination object to add as a bookmark.
    :param parent: parent bookmark (object or reference); the outline
        root is used when omitted.
    :return: the indirect reference to *dest*.
    """
    destRef = self._addObject(dest)

    # Always fetched, so the outline root exists even with an explicit parent.
    outlineRef = self.getOutlineRoot()

    if parent is None:
        parent = outlineRef

    parent = parent.getObject()
    parent.addChild(destRef, self)

    return destRef
def addBookmarkDict(self, bookmark, parent=None):
    """
    Build a bookmark from a dictionary and add it to the outline.

    :param bookmark: mapping describing the bookmark. An ``/A`` entry, if
        present, is copied into its own registered action object.
    :param parent: parent bookmark (object or reference); the outline
        root is used when omitted.
    :return: the indirect reference to the new bookmark.
    """
    bookmarkObj = TreeObject()
    for k, v in list(bookmark.items()):
        bookmarkObj[NameObject(str(k))] = v
    bookmarkObj.update(bookmark)

    if '/A' in bookmark:
        action = DictionaryObject()
        for k, v in list(bookmark['/A'].items()):
            action[NameObject(str(k))] = v
        actionRef = self._addObject(action)
        # Replace the inline action by a reference to the registered copy.
        bookmarkObj[NameObject('/A')] = actionRef

    bookmarkRef = self._addObject(bookmarkObj)

    outlineRef = self.getOutlineRoot()

    if parent is None:
        parent = outlineRef

    parent = parent.getObject()
    parent.addChild(bookmarkRef, self)

    return bookmarkRef
def addBookmark(self, title, pagenum, parent=None, color=None, bold=False, italic=False, fit='/Fit', *args):
    """
    Add a bookmark to this PDF file.

    :param str title: Title to use for this bookmark.
    :param int pagenum: Page number this bookmark will point to.
    :param parent: A reference to a parent bookmark to create nested
        bookmarks.
    :param tuple color: Color of the bookmark as a red, green, blue tuple
        from 0.0 to 1.0
    :param bool bold: Bookmark is bold
    :param bool italic: Bookmark is italic
    :param str fit: The fit of the destination page. See
        :meth:`addLink()<addLink>` for details.
    :param args: zoom arguments for *fit*; ``None`` is written as a null
        value.
    :return: the indirect reference to the new bookmark.
    """
    pageRef = self.getObject(self._pages)['/Kids'][pagenum]

    zoomArgs = [
        NumberObject(a) if a is not None else NullObject()
        for a in args
    ]
    dest = Destination(NameObject("/"+title + " bookmark"), pageRef, NameObject(fit), *zoomArgs)

    # "Go to" action that jumps to the destination.
    action = DictionaryObject()
    action.update({
        NameObject('/D') : dest.getDestArray(),
        NameObject('/S') : NameObject('/GoTo')
    })
    actionRef = self._addObject(action)

    outlineRef = self.getOutlineRoot()

    if parent is None:
        parent = outlineRef

    bookmark = TreeObject()
    bookmark.update({
        NameObject('/A'): actionRef,
        NameObject('/Title'): createStringObject(title),
    })

    if color is not None:
        bookmark.update({NameObject('/C'): ArrayObject([FloatObject(c) for c in color])})

    # /F style flags: 1 = italic, 2 = bold. Only written when non-zero.
    style_flags = 0
    if italic:
        style_flags += 1
    if bold:
        style_flags += 2
    if style_flags:
        bookmark.update({NameObject('/F'): NumberObject(style_flags)})

    bookmarkRef = self._addObject(bookmark)

    parent = parent.getObject()
    parent.addChild(bookmarkRef, self)

    return bookmarkRef
def addNamedDestinationObject(self, dest):
    """
    Register *dest* and list it among the named destinations under its
    own ``/Title``.

    :param dest: destination object providing a ``/Title`` entry.
    :return: the indirect reference to *dest*.
    """
    destRef = self._addObject(dest)

    # Named destinations are stored as consecutive (name, reference) pairs.
    name_and_ref = [dest['/Title'], destRef]
    self.getNamedDestRoot().extend(name_and_ref)

    return destRef
def addNamedDestination(self, title, pagenum):
    """
    Add a named destination called *title* that jumps to page *pagenum*.

    The destination is a "go to" action using ``/FitH`` with a fixed top
    coordinate of 826.

    :param title: name under which the destination is listed.
    :param int pagenum: index of the target page in the root /Kids array.
    :return: the indirect reference to the new destination.
    """
    pageRef = self.getObject(self._pages)['/Kids'][pagenum]
    target = ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)])

    dest = DictionaryObject()
    dest[NameObject('/D')] = target
    dest[NameObject('/S')] = NameObject('/GoTo')

    destRef = self._addObject(dest)
    # Named destinations are stored as consecutive (name, reference) pairs.
    self.getNamedDestRoot().extend([title, destRef])

    return destRef
def removeLinks(self):
    """
    Removes links and annotations from this output by deleting the
    ``/Annots`` entry of every page in the root /Kids array.
    """
    kids = self.getObject(self._pages)['/Kids']
    for kid in kids:
        page = self.getObject(kid)
        if "/Annots" not in page:
            continue
        del page['/Annots']
def removeImages(self, ignoreByteStringObject=False):
    """
    Removes images from this output.

    Every page's content stream is rewritten: ``re`` operations are always
    dropped, and a fixed set of graphics operators is dropped while inside
    a ``q`` ... ``Q`` section.

    :param bool ignoreByteStringObject: when true, operands of the text
        showing operators that are not ``TextStringObject`` are replaced
        by an empty ``TextStringObject``.
    """
    # Operators that are discarded while inside a q ... Q section.
    dropped_in_q = [b_('cm'), b_('w'), b_('J'), b_('j'), b_('M'), b_('d'), b_('ri'), b_('i'),
                    b_('gs'), b_('W'), b_('b'), b_('s'), b_('S'), b_('f'), b_('F'), b_('n'), b_('m'), b_('l'),
                    b_('c'), b_('v'), b_('y'), b_('h'), b_('B'), b_('Do'), b_('sh')]

    for kid in self.getObject(self._pages)['/Kids']:
        page = self.getObject(kid)
        content = page['/Contents'].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, page)

        kept = []
        inside_q = False
        for operands, operator in content.operations:
            # Text showing operators: Tj and ' carry the string in operand
            # 0, " in operand 2, TJ carries an array in operand 0.
            if operator == b_('Tj') or operator == b_("'"):
                text = operands[0]
                if ignoreByteStringObject and not isinstance(text, TextStringObject):
                    operands[0] = TextStringObject()
            elif operator == b_('"'):
                text = operands[2]
                if ignoreByteStringObject and not isinstance(text, TextStringObject):
                    operands[2] = TextStringObject()
            elif operator == b_("TJ"):
                for i in range(len(operands[0])):
                    if ignoreByteStringObject and not isinstance(operands[0][i], TextStringObject):
                        operands[0][i] = TextStringObject()

            if operator == b_('q'):
                inside_q = True
            if operator == b_('Q'):
                inside_q = False
            if inside_q and operator in dropped_in_q:
                continue
            if operator == b_('re'):
                continue
            kept.append((operands, operator))

        content.operations = kept
        page[NameObject('/Contents')] = content
def removeText(self, ignoreByteStringObject=False):
    """
    Removes text from this output by blanking the string operands of the
    text showing operators (``Tj``, ``'``, ``"`` and ``TJ``) on every page.

    :param bool ignoreByteStringObject: when false, only
        ``TextStringObject`` operands are blanked; when true,
        ``ByteStringObject`` operands are blanked as well.
    """
    def is_removable(value):
        # TextStringObject is always blanked; ByteStringObject only on request.
        if isinstance(value, TextStringObject):
            return True
        return bool(ignoreByteStringObject) and isinstance(value, ByteStringObject)

    for kid in self.getObject(self._pages)['/Kids']:
        page = self.getObject(kid)
        content = page['/Contents'].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, page)

        for operands, operator in content.operations:
            if operator == b_('Tj') or operator == b_("'"):
                # The string to show is operand 0.
                if is_removable(operands[0]):
                    operands[0] = TextStringObject()
            elif operator == b_('"'):
                # Operands 0 and 1 are spacing values; the string is operand 2.
                if is_removable(operands[2]):
                    operands[2] = TextStringObject()
            elif operator == b_("TJ"):
                # Operand 0 is an array mixing strings and position offsets.
                pieces = operands[0]
                for i in range(len(pieces)):
                    if is_removable(pieces[i]):
                        pieces[i] = TextStringObject()

        page[NameObject('/Contents')] = content
def addLink(self, pagenum, pagedest, rect, border=None, fit='/Fit', *args):
    """
    Add an internal link from a rectangular area to the specified page.

    :param int pagenum: index of the page on which to place the link.
    :param int pagedest: index of the page to which the link should go.
    :param rect: :class:`RectangleObject<PyPDF2.generic.RectangleObject>` or array of four
        integers specifying the clickable rectangular area
        ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``.
    :param border: if provided, an array describing border-drawing
        properties. See the PDF spec for details. No border will be
        drawn if this argument is omitted.
    :param str fit: Page fit or 'zoom' option (see below). Additional arguments may need
        to be supplied. Passing ``None`` will be read as a null value for that coordinate.

    Valid zoom arguments (see Table 8.2 of the PDF 1.7 reference for details):
         /Fit       No additional arguments
         /XYZ       [left] [top] [zoomFactor]
         /FitH      [top]
         /FitV      [left]
         /FitR      [left] [bottom] [right] [top]
         /FitB      No additional arguments
         /FitBH     [top]
         /FitBV     [left]
    """
    kids = self.getObject(self._pages)['/Kids']
    pageLink = kids[pagenum]
    pageDest = kids[pagedest]  # TODO: switch for external link
    pageRef = self.getObject(pageLink)

    if border is None:
        # No border: three zeros.
        borderArr = [NumberObject(0)] * 3
    else:
        borderArr = [NameObject(n) for n in border[:3]]
        if len(border) == 4:
            # Optional fourth element is the dash pattern.
            borderArr.append(ArrayObject([NameObject(n) for n in border[3]]))

    # Strings are written verbatim; anything that is not already a
    # RectangleObject is converted to one.
    if isString(rect):
        rect = NameObject(rect)
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    zoomArgs = [
        NumberObject(a) if a is not None else NullObject()
        for a in args
    ]
    dest = Destination(NameObject("/LinkName"), pageDest, NameObject(fit), *zoomArgs)  # TODO: create a better name for the link

    lnk = DictionaryObject()
    lnk.update({
        NameObject('/Type'): NameObject('/Annot'),
        NameObject('/Subtype'): NameObject('/Link'),
        NameObject('/P'): pageLink,
        NameObject('/Rect'): rect,
        NameObject('/Border'): ArrayObject(borderArr),
        NameObject('/Dest'): dest.getDestArray()
    })
    lnkRef = self._addObject(lnk)

    # Attach the annotation to the page, creating /Annots if necessary.
    if "/Annots" not in pageRef:
        pageRef[NameObject('/Annots')] = ArrayObject([lnkRef])
    else:
        pageRef['/Annots'].append(lnkRef)
963 _valid_layouts = ['/NoLayout', '/SinglePage', '/OneColumn', '/TwoColumnLeft', '/TwoColumnRight', '/TwoPageLeft', '/TwoPageRight']
def getPageLayout(self):
    """
    Return the page layout stored in the document root.

    The value is looked up under ``/PageLayout`` in ``self._root_object``.
    See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` for a description of valid layouts.

    :return: Page layout currently being used.
    :rtype: str, None if not specified
    """
    root = self._root_object
    try:
        layout = root['/PageLayout']
    except KeyError:
        # No layout has been recorded on the root object.
        layout = None
    return layout
def setPageLayout(self, layout):
    """
    Set the page layout.

    A value that is not already a ``NameObject`` is checked against the
    known layouts and converted; an unknown name only triggers a warning
    and is still stored.

    :param str layout: The page layout to be used

    Valid layouts are:
         /NoLayout        Layout explicitly not specified
         /SinglePage      Show one page at a time
         /OneColumn       Show one column at a time
         /TwoColumnLeft   Show pages in two columns, odd-numbered pages on the left
         /TwoColumnRight  Show pages in two columns, odd-numbered pages on the right
         /TwoPageLeft     Show two pages at a time, odd-numbered pages on the left
         /TwoPageRight    Show two pages at a time, odd-numbered pages on the right
    """
    if not isinstance(layout, NameObject):
        known = self._valid_layouts
        if layout not in known:
            warnings.warn("Layout should be one of: {}".format(', '.join(known)))
        layout = NameObject(layout)
    entry = {NameObject('/PageLayout'): layout}
    self._root_object.update(entry)
pageLayout = property(fget=getPageLayout, fset=setPageLayout)
"""Read and write property accessing the :meth:`getPageLayout()<PdfFileWriter.getPageLayout>`
and :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` methods."""
1003 _valid_modes = ['/UseNone', '/UseOutlines', '/UseThumbs', '/FullScreen', '/UseOC', '/UseAttachments']
def getPageMode(self):
    """
    Return the page mode stored in the document root.

    The value is looked up under ``/PageMode`` in ``self._root_object``.
    See :meth:`setPageMode()<PdfFileWriter.setPageMode>` for a description
    of valid modes.

    :return: Page mode currently being used.
    :rtype: str, None if not specified
    """
    root = self._root_object
    try:
        mode = root['/PageMode']
    except KeyError:
        # No page mode has been recorded on the root object.
        mode = None
    return mode
def setPageMode(self, mode):
    """
    Set the page mode.

    A value that is not already a ``NameObject`` is checked against the
    known modes and converted; an unknown name only triggers a warning and
    is still stored.

    :param str mode: The page mode to use.

    Valid modes are:
        /UseNone         Do not show outlines or thumbnails panels
        /UseOutlines     Show outlines (aka bookmarks) panel
        /UseThumbs       Show page thumbnails panel
        /FullScreen      Fullscreen view
        /UseOC           Show Optional Content Group (OCG) panel
        /UseAttachments  Show attachments panel
    """
    if not isinstance(mode, NameObject):
        known = self._valid_modes
        if mode not in known:
            warnings.warn("Mode should be one of: {}".format(', '.join(known)))
        mode = NameObject(mode)
    entry = {NameObject('/PageMode'): mode}
    self._root_object.update(entry)
pageMode = property(fget=getPageMode, fset=setPageMode)
"""Read and write property accessing the :meth:`getPageMode()<PdfFileWriter.getPageMode>`
and :meth:`setPageMode()<PdfFileWriter.setPageMode>` methods."""
1044class PdfFileReader(object):
1045 """
1046 Initializes a PdfFileReader object. This operation can take some time, as
1047 the PDF stream's cross-reference tables are read into memory.
1049 :param stream: A File object or an object that supports the standard read
1050 and seek methods similar to a File object. Could also be a
1051 string representing a path to a PDF file.
1052 :param bool strict: Determines whether user should be warned of all
1053 problems and also causes some correctable problems to be fatal.
1054 Defaults to ``True``.
1055 :param warndest: Destination for logging warnings (defaults to
1056 ``sys.stderr``).
1057 :param bool overwriteWarnings: Determines whether to override Python's
1058 ``warnings.py`` module with a custom implementation (defaults to
1059 ``True``).
1060 """
def __init__(self, stream, strict=True, warndest = None, overwriteWarnings = True):
    """
    Set up reader state and parse ``stream``.

    When ``overwriteWarnings`` is true this replaces the process-wide
    ``warnings.showwarning`` hook, so warnings raised anywhere afterwards
    are written to ``warndest`` (``sys.stderr`` when ``warndest`` is None).

    :param stream: file-like object, or a path string; a path is read
        fully into an in-memory ``BytesIO`` before parsing.
    :param bool strict: stored as ``self.strict``.
    :param warndest: file-like destination for warnings.
    :param bool overwriteWarnings: whether to install the custom
        ``warnings.showwarning`` hook.
    """
    if overwriteWarnings:
        # have to dynamically override the default showwarning since there are no
        # public methods that specify the 'file' parameter
        def _showwarning(message, category, filename, lineno, file=warndest, line=None):
            if file is None:
                file = sys.stderr
            try:
                file.write(formatWarning(message, category, filename, lineno, line))
            except IOError:
                # Best effort: a broken warning destination must not abort parsing.
                pass
        warnings.showwarning = _showwarning
    self.strict = strict
    self.flattenedPages = None
    self.resolvedObjects = {}
    self.xrefIndex = 0
    self._pageId2Num = None # map page IndirectRef number to Page Number
    if hasattr(stream, 'mode') and 'b' not in stream.mode:
        warnings.warn("PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", utils.PdfReadWarning)
    if isString(stream):
        # The context manager closes the file even if reading it fails.
        with open(stream, 'rb') as fileobj:
            stream = BytesIO(b_(fileobj.read()))
    self.read(stream)
    self.stream = stream

    self._override_encryption = False
def getDocumentInfo(self):
    """
    Retrieves the PDF file's document information dictionary, if it exists.
    Note that some PDF files use metadata streams instead of docinfo
    dictionaries, and these metadata streams will not be accessed by this
    function.

    The ``/Info`` entry of the trailer is copied into a fresh
    ``DocumentInformation`` object.

    :return: the document information of this PDF file
    :rtype: :class:`DocumentInformation<pdf.DocumentInformation>` or ``None`` if none exists.
    """
    trailer = self.trailer
    if "/Info" in trailer:
        source = trailer['/Info']
        info = DocumentInformation()
        info.update(source)
        return info
    return None
# The lambda defers the attribute lookup, so a subclass override of
# getDocumentInfo() is honoured by the property.
documentInfo = property(fget=lambda self: self.getDocumentInfo())
"""Read-only property that accesses the :meth:`getDocumentInfo()<PdfFileReader.getDocumentInfo>` function."""
def getXmpMetadata(self):
    """
    Retrieves XMP (Extensible Metadata Platform) data from the PDF document
    root.

    ``self._override_encryption`` is switched on for the duration of the
    lookup and always switched off again, even if the lookup raises.

    :return: a :class:`XmpInformation<xmp.XmpInformation>`
        instance that can be used to access XMP metadata from the document.
    :rtype: :class:`XmpInformation<xmp.XmpInformation>` or
        ``None`` if no metadata was found on the document root.
    """
    self._override_encryption = True
    try:
        root = self.trailer["/Root"]
        return root.getXmpMetadata()
    finally:
        self._override_encryption = False
# The lambda defers the attribute lookup, so a subclass override of
# getXmpMetadata() is honoured by the property.
xmpMetadata = property(fget=lambda self: self.getXmpMetadata())
"""
Read-only property that accesses the
:meth:`getXmpMetadata()<PdfFileReader.getXmpMetadata>` function.
"""
def getNumPages(self):
    """
    Calculates the number of pages in this PDF file.

    For an encrypted file the method attempts ``self.decrypt('')`` and
    returns the ``/Count`` entry of the page tree root; any ordinary
    failure on that path is reported as ``PdfReadError``. Otherwise the
    page tree is flattened on first use and the flattened pages counted.

    :return: number of pages
    :rtype: int
    :raises PdfReadError: if file is encrypted and restrictions prevent
        this action.
    """

    # Flattened pages will not work on an Encrypted PDF;
    # the PDF file's page count is used in this case. Otherwise,
    # the original method (flattened page count) is used.
    if self.isEncrypted:
        try:
            self._override_encryption = True
            self.decrypt('')
            return self.trailer["/Root"]["/Pages"]["/Count"]
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not turned into a
            # misleading PdfReadError.
            raise utils.PdfReadError("File has not been decrypted")
        finally:
            self._override_encryption = False
    if self.flattenedPages is None:
        self._flatten()
    return len(self.flattenedPages)
# The lambda defers the attribute lookup, so a subclass override of
# getNumPages() is honoured by the property.
numPages = property(fget=lambda self: self.getNumPages())
"""
Read-only property that accesses the
:meth:`getNumPages()<PdfFileReader.getNumPages>` function.
"""
def getPage(self, pageNumber):
    """
    Retrieves a page by number from this PDF file.

    The flattened page list is built by ``_flatten()`` on first use and
    reused afterwards; ``pageNumber`` is used directly as an index into it.

    :param int pageNumber: The page number to retrieve
        (pages begin at zero)
    :return: a :class:`PageObject<pdf.PageObject>` instance.
    :rtype: :class:`PageObject<pdf.PageObject>`
    """
    if self.flattenedPages is None:
        self._flatten()
    return self.flattenedPages[pageNumber]
# The lambda defers the attribute lookup, so a subclass override of
# getNamedDestinations() is honoured by the property.
namedDestinations = property(fget=lambda self: self.getNamedDestinations())
"""
Read-only property that accesses the
:meth:`getNamedDestinations()<PdfFileReader.getNamedDestinations>` function.
"""
1186 # A select group of relevant field attributes. For the complete list,
1187 # see section 8.6.2 of the PDF 1.7 reference.
def getFields(self, tree = None, retval = None, fileobj = None):
    """
    Extracts field data if this PDF contains interactive form fields.
    The *tree* and *retval* parameters are for recursive use: when
    *retval* is omitted, *tree* is ignored and the walk starts from the
    catalog's ``/AcroForm`` entry.

    :param tree: node to process during recursion.
    :param dict retval: accumulator that receives the fields found.
    :param fileobj: A file object (usually a text file) to write
        a report to on all interactive form fields found.
    :return: A dictionary where each key is a field name, and each
        value is a :class:`Field<PyPDF2.generic.Field>` object. By
        default, the mapping name is used for keys.
    :rtype: dict, or ``None`` if form data could not be located.
    """
    fieldAttributes = {"/FT" : "Field Type", "/Parent" : "Parent",
                       "/T" : "Field Name", "/TU" : "Alternate Field Name",
                       "/TM" : "Mapping Name", "/Ff" : "Field Flags",
                       "/V" : "Value", "/DV" : "Default Value"}
    if retval is None:
        retval = {}
        catalog = self.trailer["/Root"]
        # get the AcroForm tree
        if "/AcroForm" in catalog:
            tree = catalog["/AcroForm"]
        else:
            return None
    if tree is None:
        return retval

    if any(attr in tree for attr in fieldAttributes):
        # Tree is a field. _buildField() descends into /Kids itself, so
        # calling _checkKids() here as well would visit every descendant
        # twice (and write each of them twice to the report).
        self._buildField(tree, retval, fileobj, fieldAttributes)
    else:
        self._checkKids(tree, retval, fileobj)

    if "/Fields" in tree:
        for f in tree["/Fields"]:
            self._buildField(f.getObject(), retval, fileobj, fieldAttributes)

    return retval
1231 def _buildField(self, field, retval, fileobj, fieldAttributes):
1232 self._checkKids(field, retval, fileobj)
1233 try:
1234 key = field["/TM"]
1235 except KeyError:
1236 try:
1237 key = field["/T"]
1238 except KeyError:
1239 # Ignore no-name field for now
1240 return
1241 if fileobj:
1242 self._writeField(fileobj, field, fieldAttributes)
1243 fileobj.write("\n")
1244 retval[key] = Field(field)
1246 def _checkKids(self, tree, retval, fileobj):
1247 if "/Kids" in tree:
1248 # recurse down the tree
1249 for kid in tree["/Kids"]:
1250 self.getFields(kid.getObject(), retval, fileobj)
1252 def _writeField(self, fileobj, field, fieldAttributes):
1253 order = ["/TM", "/T", "/FT", "/Parent", "/TU", "/Ff", "/V", "/DV"]
1254 for attr in order:
1255 attrName = fieldAttributes[attr]
1256 try:
1257 if attr == "/FT":
1258 # Make the field type value more clear
1259 types = {"/Btn":"Button", "/Tx":"Text", "/Ch": "Choice",
1260 "/Sig":"Signature"}
1261 if field[attr] in types:
1262 fileobj.write(attrName + ": " + types[field[attr]] + "\n")
1263 elif attr == "/Parent":
1264 # Let's just write the name of the parent
1265 try:
1266 name = field["/Parent"]["/TM"]
1267 except KeyError:
1268 name = field["/Parent"]["/T"]
1269 fileobj.write(attrName + ": " + name + "\n")
1270 else:
1271 fileobj.write(attrName + ": " + str(field[attr]) + "\n")
1272 except KeyError:
1273 # Field attribute is N/A or unknown, so don't write anything
1274 pass
def getFormTextFields(self):
    """
    Retrieves the text form fields of the document.

    Only fields whose ``/FT`` entry equals ``/Tx`` are included.

    :return: mapping of each text field's ``/T`` name to its ``/V`` value
        (``None`` when the field has no value); empty when the document
        has no form data.
    :rtype: dict
    """
    # Retrieve document form fields
    formfields = self.getFields()
    if formfields is None:
        # getFields() returns None when there is no /AcroForm entry;
        # iterating over that would raise TypeError.
        return {}
    return dict(
        (field['/T'], field.get('/V')) for field in formfields.values()
        if field.get('/FT') == '/Tx'
    )
def getNamedDestinations(self, tree=None, retval=None):
    """
    Retrieves the named destinations present in the document.

    The *tree* and *retval* parameters are for recursive use: when
    *retval* is omitted, the starting tree is taken from the catalog's
    ``/Dests`` entry, or failing that from ``/Names`` -> ``/Dests``.

    :param tree: name-tree node to process during recursion.
    :param dict retval: accumulator that receives the destinations found.
    :return: a dictionary which maps names to
        :class:`Destinations<PyPDF2.generic.Destination>`.
    :rtype: dict
    """
    if retval is None:
        retval = {}
        catalog = self.trailer["/Root"]

        # get the name tree
        if "/Dests" in catalog:
            tree = catalog["/Dests"]
        elif "/Names" in catalog:
            names = catalog['/Names']
            if "/Dests" in names:
                tree = names['/Dests']

    if tree is None:
        return retval

    if "/Kids" in tree:
        # recurse down the tree
        for kid in tree["/Kids"]:
            self.getNamedDestinations(kid.getObject(), retval)

    if "/Names" in tree:
        names = tree["/Names"]
        # /Names is read as a flat sequence of alternating keys and values.
        for i in range(0, len(names), 2):
            key = names[i].getObject()
            val = names[i+1].getObject()
            if isinstance(val, DictionaryObject) and '/D' in val:
                val = val['/D']
            dest = self._buildDestination(key, val)
            if dest is not None:
                retval[key] = dest

    return retval
# The lambda defers the attribute lookup, so a subclass override of
# getOutlines() is honoured by the property.
outlines = property(fget=lambda self: self.getOutlines())
"""
Read-only property that accesses the
:meth:`getOutlines()<PdfFileReader.getOutlines>` function.
"""
def getOutlines(self, node=None, outlines=None):
    """
    Retrieves the document outline present in the document.

    The *node* and *outlines* parameters are for recursive use. On the
    top-level call (``outlines`` omitted) the starting node is taken from
    the catalog's ``/Outlines`` -> ``/First`` entry and the named
    destinations are cached on ``self._namedDests``. Siblings are followed
    through ``/Next``; the children of an item (``/First``) are appended
    as a nested list right after that item.

    :param node: outline item to start from during recursion.
    :param list outlines: accumulator that receives the entries found.
    :return: a nested list of :class:`Destinations<PyPDF2.generic.Destination>`.
    """
    if outlines is None:
        outlines = []
        catalog = self.trailer["/Root"]

        # get the outline dictionary and named destinations
        if "/Outlines" in catalog:
            try:
                lines = catalog["/Outlines"]
            except utils.PdfReadError:
                # this occurs if the /Outlines object reference is incorrect
                # for an example of such a file, see https://unglueit-files.s3.amazonaws.com/ebf/7552c42e9280b4476e59e77acc0bc812.pdf
                # so continue to load the file without the Bookmarks
                return outlines

            if "/First" in lines:
                node = lines["/First"]
        self._namedDests = self.getNamedDestinations()

    if node is None:
        return outlines

    # see if there are any more outlines
    while True:
        outline = self._buildOutline(node)
        if outline:
            outlines.append(outline)

        # check for sub-outlines
        if "/First" in node:
            subOutlines = []
            self.getOutlines(node["/First"], subOutlines)
            if subOutlines:
                outlines.append(subOutlines)

        if "/Next" not in node:
            break
        node = node["/Next"]

    return outlines
1379 def _getPageNumberByIndirect(self, indirectRef):
1380 """Generate _pageId2Num"""
1381 if self._pageId2Num is None:
1382 id2num = {}
1383 for i, x in enumerate(self.pages):
1384 id2num[x.indirectRef.idnum] = i
1385 self._pageId2Num = id2num
1387 if isinstance(indirectRef, int):
1388 idnum = indirectRef
1389 else:
1390 idnum = indirectRef.idnum
1392 ret = self._pageId2Num.get(idnum, -1)
1393 return ret
def getPageNumber(self, page):
    """
    Retrieve page number of a given PageObject

    The lookup is done through the page's ``indirectRef`` attribute.

    :param PageObject page: The page to get page number. Should be
        an instance of :class:`PageObject<PyPDF2.pdf.PageObject>`
    :return: the page number or -1 if page not found
    :rtype: int
    """
    return self._getPageNumberByIndirect(page.indirectRef)
def getDestinationPageNumber(self, destination):
    """
    Retrieve page number of a given Destination object

    The lookup is done through the destination's ``page`` attribute.

    :param Destination destination: The destination to get page number.
         Should be an instance of
         :class:`Destination<PyPDF2.pdf.Destination>`
    :return: the page number or -1 if page not found
    :rtype: int
    """
    return self._getPageNumberByIndirect(destination.page)
1422 def _buildDestination(self, title, array):
1423 page, typ = array[0:2]
1424 array = array[2:]
1425 return Destination(title, page, typ, *array)
1427 def _buildOutline(self, node):
1428 dest, title, outline = None, None, None
1430 if "/A" in node and "/Title" in node:
1431 # Action, section 8.5 (only type GoTo supported)
1432 title = node["/Title"]
1433 action = node["/A"]
1434 if action["/S"] == "/GoTo":
1435 dest = action["/D"]
1436 elif "/Dest" in node and "/Title" in node:
1437 # Destination, section 8.2.1
1438 title = node["/Title"]
1439 dest = node["/Dest"]
1441 # if destination found, then create outline
1442 if dest:
1443 if isinstance(dest, ArrayObject):
1444 outline = self._buildDestination(title, dest)
1445 elif isString(dest) and dest in self._namedDests:
1446 outline = self._namedDests[dest]
1447 outline[NameObject("/Title")] = title
1448 else:
1449 raise utils.PdfReadError("Unexpected destination %r" % dest)
1450 return outline
# The bound methods are looked up on each access, so subclass overrides of
# getNumPages() / getPage() are honoured by the property.
pages = property(
    fget=lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage)
)
"""
Read-only property that emulates a list based upon the
:meth:`getNumPages()<PdfFileReader.getNumPages>` and
:meth:`getPage()<PdfFileReader.getPage>` methods.
"""
def getPageLayout(self):
    """
    Return the page layout recorded in the document catalog.

    The value is looked up under ``/Root`` -> ``/PageLayout`` in the trailer.
    See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>`
    for a description of valid layouts.

    :return: Page layout currently being used.
    :rtype: ``str``, ``None`` if not specified
    """
    try:
        root = self.trailer['/Root']
        return root['/PageLayout']
    except KeyError:
        # Either lookup may be missing; both mean "not specified".
        return None
pageLayout = property(fget=getPageLayout)
"""Read-only property accessing the
:meth:`getPageLayout()<PdfFileReader.getPageLayout>` method."""
def getPageMode(self):
    """
    Return the page mode recorded in the document catalog.

    The value is looked up under ``/Root`` -> ``/PageMode`` in the trailer.
    See :meth:`setPageMode()<PdfFileWriter.setPageMode>`
    for a description of valid modes.

    :return: Page mode currently being used.
    :rtype: ``str``, ``None`` if not specified
    """
    try:
        root = self.trailer['/Root']
        return root['/PageMode']
    except KeyError:
        # Either lookup may be missing; both mean "not specified".
        return None
pageMode = property(fget=getPageMode)
"""Read-only property accessing the
:meth:`getPageMode()<PdfFileReader.getPageMode>` method."""
def _flatten(self, pages=None, inherit=None, indirectRef=None):
    """
    Walk the page tree and fill ``self.flattenedPages`` with page objects.

    Called without arguments, it resets ``self.flattenedPages`` and starts
    at the catalog's ``/Pages`` node. A ``/Pages`` node passes its
    inheritable attributes down to its kids; a ``/Page`` node receives any
    inherited attribute it does not define itself and is appended to the
    list as a ``PageObject``. A node without ``/Type`` is treated as
    ``/Pages``.

    :param pages: tree node to process during recursion.
    :param dict inherit: inheritable attributes collected from ancestors.
    :param indirectRef: indirect reference of the node, handed to the
        ``PageObject`` created for a leaf.
    """
    inheritablePageAttributes = (
        NameObject("/Resources"), NameObject("/MediaBox"),
        NameObject("/CropBox"), NameObject("/Rotate")
    )
    if inherit is None:
        inherit = dict()
    if pages is None:
        self.flattenedPages = []
        catalog = self.trailer["/Root"].getObject()
        pages = catalog["/Pages"].getObject()

    t = "/Pages"
    if "/Type" in pages:
        t = pages["/Type"]

    if t == "/Pages":
        # Work on a copy: attributes defined on this node must reach only
        # its own descendants. Mutating the caller's dict would leak them
        # into sibling subtrees processed later.
        inherit = dict(inherit)
        for attr in inheritablePageAttributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        for page in pages["/Kids"]:
            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirectRef"] = page
            self._flatten(page.getObject(), inherit, **addt)
    elif t == "/Page":
        for attr, value in list(inherit.items()):
            # if the page has its own value, it does not inherit the
            # parent's value:
            if attr not in pages:
                pages[attr] = value
        pageObj = PageObject(self, indirectRef)
        pageObj.update(pages)
        self.flattenedPages.append(pageObj)
def _getObjectFromStream(self, indirectReference):
    """
    Read an object that is stored inside an object stream.

    The owning stream and the expected index are taken from
    ``self.xref_objStm``. The stream's header of (object number, offset)
    pairs is scanned for ``indirectReference.idnum`` and the object is
    parsed at ``/First`` plus its offset.

    :return: the parsed object; in non-strict mode a ``NullObject`` when
        the object cannot be parsed or is not listed in the stream.
    :raises PdfReadError: in strict mode, when the object sits at a
        different index than the xref claims, cannot be parsed, or is
        not listed in the stream.
    """
    # indirect reference to object in object stream
    # read the entire object stream into memory
    stmnum, idx = self.xref_objStm[indirectReference.idnum]
    objStm = IndirectObject(stmnum, 0, self).getObject()
    # This is an xref to a stream, so its type better be a stream
    assert objStm['/Type'] == '/ObjStm'
    # /N is the number of indirect objects in the stream
    assert idx < objStm['/N']
    streamData = BytesIO(b_(objStm.getData()))

    def skip_to_token():
        # Move to the next non-whitespace byte without consuming it.
        readNonWhitespace(streamData)
        streamData.seek(-1, 1)

    for position in range(objStm['/N']):
        skip_to_token()
        objnum = NumberObject.readFromStream(streamData)
        skip_to_token()
        offset = NumberObject.readFromStream(streamData)
        skip_to_token()
        if objnum != indirectReference.idnum:
            # We're only interested in one object
            continue
        if self.strict and idx != position:
            raise utils.PdfReadError("Object is in wrong index.")
        streamData.seek(objStm['/First']+offset, 0)
        try:
            return readObject(streamData, self)
        except utils.PdfStreamError as e:
            # Stream object cannot be read. Normally, a critical error, but
            # Adobe Reader doesn't complain, so continue (in strict mode?)
            warnings.warn("Invalid stream (index %d) within object %d %d: %s" % \
                (position, indirectReference.idnum, indirectReference.generation, e), utils.PdfReadWarning)

            if self.strict:
                raise utils.PdfReadError("Can't read object stream: %s"%e)
            # Replace with null. Hopefully it's nothing important.
            return NullObject()

    if self.strict: raise utils.PdfReadError("This is a fatal error in strict mode.")
    return NullObject()
def getObject(self, indirectReference):
    """
    Resolve an indirect reference to the object it points to.

    The cache is consulted first. Otherwise the object is read either from
    an object stream (generation 0 and listed in ``self.xref_objStm``) or
    from its byte offset in ``self.xref``; in the latter case it is
    decrypted when the file is encrypted and encryption is not being
    overridden. The result is stored in the cache before returning.

    :param indirectReference: object exposing ``idnum`` and ``generation``.
    :return: the resolved object.
    :raises PdfReadError: if the object is not in any xref table, if the
        header found at the recorded offset names a different object, or
        if the file is encrypted and has not been decrypted.
    """
    wanted_id = indirectReference.idnum
    wanted_gen = indirectReference.generation

    retval = self.cacheGetIndirectObject(wanted_gen, wanted_id)
    if retval is not None:
        return retval

    if wanted_gen == 0 and wanted_id in self.xref_objStm:
        retval = self._getObjectFromStream(indirectReference)
    elif wanted_gen in self.xref and wanted_id in self.xref[wanted_gen]:
        start = self.xref[wanted_gen][wanted_id]
        self.stream.seek(start, 0)
        idnum, generation = self.readObjectHeader(self.stream)
        if idnum != wanted_id:
            if not self.xrefIndex:
                # some other problem
                raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d)." \
                                 % (wanted_id, wanted_gen, idnum, generation))
            # Xref table probably had bad indexes due to not being zero-indexed
            if self.strict:
                raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." \
                                 % (wanted_id, wanted_gen, idnum, generation))
            # xref table is corrected in non-strict mode
        assert generation == wanted_gen
        retval = readObject(self.stream, self)

        # override encryption is used for the /Encrypt dictionary
        if not self._override_encryption and self.isEncrypted:
            # if we don't have the encryption key:
            if not hasattr(self, '_decryption_key'):
                raise utils.PdfReadError("file has not been decrypted")
            # otherwise, decrypt here...
            # The per-object key is the MD5 of the file key followed by the
            # low 3 bytes of the object number and the low 2 bytes of the
            # generation, truncated to at most 16 bytes.
            # NOTE(review): this looks like the per-object key derivation of
            # the PDF standard security handler - confirm against the spec.
            import struct
            pack1 = struct.pack("<i", wanted_id)[:3]
            pack2 = struct.pack("<i", wanted_gen)[:2]
            key = self._decryption_key + pack1 + pack2
            assert len(key) == (len(self._decryption_key) + 5)
            md5_hash = md5(key).digest()
            key = md5_hash[:min(16, len(self._decryption_key) + 5)]
            retval = self._decryptObject(retval, key)
    else:
        warnings.warn("Object %d %d not defined."%(wanted_id,
                    wanted_gen), utils.PdfReadWarning)
        raise utils.PdfReadError("Could not find object.")
    self.cacheIndirectObject(wanted_gen, wanted_id, retval)
    return retval
def _decryptObject(self, obj, key):
    """
    Apply ``utils.RC4_encrypt`` with ``key`` to ``obj``.

    String objects are replaced by a new string object built from the
    processed bytes. A stream has only its ``_data`` replaced.
    Dictionaries and arrays are processed recursively, entry by entry,
    in place. Any other object is returned unchanged.

    :return: the processed object (the same instance unless it is a string).
    """
    if isinstance(obj, (ByteStringObject, TextStringObject)):
        return createStringObject(utils.RC4_encrypt(key, obj.original_bytes))
    # The stream test comes before the dictionary test, as in the original
    # ordering; NOTE(review): presumably because streams are dictionaries
    # too - confirm against the class hierarchy.
    if isinstance(obj, StreamObject):
        obj._data = utils.RC4_encrypt(key, obj._data)
    elif isinstance(obj, DictionaryObject):
        for name, member in list(obj.items()):
            obj[name] = self._decryptObject(member, key)
    elif isinstance(obj, ArrayObject):
        for index, member in enumerate(obj):
            obj[index] = self._decryptObject(member, key)
    return obj
def readObjectHeader(self, stream):
    """
    Read an object header from ``stream`` and return its two numbers.

    A leading comment and whitespace are skipped, the id and generation
    tokens are read, three further bytes are consumed, and the stream is
    left on the next non-whitespace byte. In strict mode a warning is
    issued when whitespace had to be skipped.

    :return: ``(idnum, generation)`` as integers.
    :rtype: tuple
    """
    # Should never be necessary to read out whitespace, since the
    # cross-reference table should put us in the right spot to read the
    # object header.  In reality... some files have stupid cross reference
    # tables that are off by whitespace bytes.
    def step_back():
        # Un-read the single byte consumed by the preceding helper.
        stream.seek(-1, 1)

    extra = False
    utils.skipOverComment(stream)
    extra |= utils.skipOverWhitespace(stream)
    step_back()
    idnum = readUntilWhitespace(stream)
    extra |= utils.skipOverWhitespace(stream)
    step_back()
    generation = readUntilWhitespace(stream)
    # presumably the "obj" keyword; its value is not checked
    stream.read(3)
    readNonWhitespace(stream)
    step_back()
    if extra and self.strict:
        #not a fatal error
        warnings.warn("Superfluous whitespace found in object header %s %s" % \
                      (idnum, generation), utils.PdfReadWarning)
    return int(idnum), int(generation)
def cacheGetIndirectObject(self, generation, idnum):
    """
    Look up a previously resolved object in ``self.resolvedObjects``.

    :return: the cached object, or ``None`` when ``(generation, idnum)``
        has not been cached.
    """
    cache_key = (generation, idnum)
    return self.resolvedObjects.get(cache_key)
def cacheIndirectObject(self, generation, idnum, obj):
    """
    Store ``obj`` in ``self.resolvedObjects`` under ``(generation, idnum)``.

    Replacing an existing entry raises ``PdfReadError`` in strict mode and
    only warns otherwise.

    :return: ``obj``, unchanged.
    """
    cache_key = (generation, idnum)
    if cache_key in self.resolvedObjects:
        msg = "Overwriting cache for %s %s"%(generation, idnum)
        if self.strict:
            raise utils.PdfReadError(msg)
        warnings.warn(msg)
    self.resolvedObjects[cache_key] = obj
    return obj
1685 def read(self, stream):
1686 debug = False
1687 if debug: print(">>read", stream)
1688 # start at the end:
1689 stream.seek(-1, 2)
1690 if not stream.tell():
1691 raise utils.PdfReadError('Cannot read an empty file')
1692 last1K = stream.tell() - 1024 + 1 # offset of last 1024 bytes of stream
1693 line = b_('')
1694 while line[:5] != b_("%%EOF"):
1695 if stream.tell() < last1K:
1696 raise utils.PdfReadError("EOF marker not found")
1697 line = self.readNextEndLine(stream)
1698 if debug: print(" line:",line)
1700 # find startxref entry - the location of the xref table
1701 line = self.readNextEndLine(stream)
1702 try:
1703 startxref = int(line)
1704 except ValueError:
1705 # 'startxref' may be on the same line as the location
1706 if not line.startswith(b_("startxref")):
1707 raise utils.PdfReadError("startxref not found")
1708 startxref = int(line[9:].strip())
1709 warnings.warn("startxref on same line as offset")
1710 else:
1711 line = self.readNextEndLine(stream)
1712 if line[:9] != b_("startxref"):
1713 raise utils.PdfReadError("startxref not found")
1715 # read all cross reference tables and their trailers
1716 self.xref = {}
1717 self.xref_objStm = {}
1718 self.trailer = DictionaryObject()
1719 while True:
1720 # load the xref table
1721 stream.seek(startxref, 0)
1722 x = stream.read(1)
1723 if x == b_("x"):
1724 # standard cross-reference table
1725 ref = stream.read(4)
1726 if ref[:3] != b_("ref"):
1727 raise utils.PdfReadError("xref table read error")
1728 readNonWhitespace(stream)
1729 stream.seek(-1, 1)
1730 firsttime = True; # check if the first time looking at the xref table
1731 while True:
1732 num = readObject(stream, self)
1733 if firsttime and num != 0:
1734 self.xrefIndex = num
1735 if self.strict:
1736 warnings.warn("Xref table not zero-indexed. ID numbers for objects will be corrected.", utils.PdfReadWarning)
1737 #if table not zero indexed, could be due to error from when PDF was created
1738 #which will lead to mismatched indices later on, only warned and corrected if self.strict=True
1739 firsttime = False
1740 readNonWhitespace(stream)
1741 stream.seek(-1, 1)
1742 size = readObject(stream, self)
1743 readNonWhitespace(stream)
1744 stream.seek(-1, 1)
1745 cnt = 0
1746 while cnt < size:
1747 line = stream.read(20)
1749 # It's very clear in section 3.4.3 of the PDF spec
1750 # that all cross-reference table lines are a fixed
1751 # 20 bytes (as of PDF 1.7). However, some files have
1752 # 21-byte entries (or more) due to the use of \r\n
1753 # (CRLF) EOL's. Detect that case, and adjust the line
1754 # until it does not begin with a \r (CR) or \n (LF).
1755 while line[0] in b_("\x0D\x0A"):
1756 stream.seek(-20 + 1, 1)
1757 line = stream.read(20)
1759 # On the other hand, some malformed PDF files
1760 # use a single character EOL without a preceeding
1761 # space. Detect that case, and seek the stream
1762 # back one character. (0-9 means we've bled into
1763 # the next xref entry, t means we've bled into the
1764 # text "trailer"):
1765 if line[-1] in b_("0123456789t"):
1766 stream.seek(-1, 1)
1768 offset, generation = line[:16].split(b_(" "))
1769 offset, generation = int(offset), int(generation)
1770 if generation not in self.xref:
1771 self.xref[generation] = {}
1772 if num in self.xref[generation]:
1773 # It really seems like we should allow the last
1774 # xref table in the file to override previous
1775 # ones. Since we read the file backwards, assume
1776 # any existing key is already set correctly.
1777 pass
1778 else:
1779 self.xref[generation][num] = offset
1780 cnt += 1
1781 num += 1
1782 readNonWhitespace(stream)
1783 stream.seek(-1, 1)
1784 trailertag = stream.read(7)
1785 if trailertag != b_("trailer"):
1786 # more xrefs!
1787 stream.seek(-7, 1)
1788 else:
1789 break
1790 readNonWhitespace(stream)
1791 stream.seek(-1, 1)
1792 newTrailer = readObject(stream, self)
1793 for key, value in list(newTrailer.items()):
1794 if key not in self.trailer:
1795 self.trailer[key] = value
1796 if "/Prev" in newTrailer:
1797 startxref = newTrailer["/Prev"]
1798 else:
1799 break
1800 elif x.isdigit():
1801 # PDF 1.5+ Cross-Reference Stream
1802 stream.seek(-1, 1)
1803 idnum, generation = self.readObjectHeader(stream)
1804 xrefstream = readObject(stream, self)
1805 assert xrefstream["/Type"] == "/XRef"
1806 self.cacheIndirectObject(generation, idnum, xrefstream)
1807 streamData = BytesIO(b_(xrefstream.getData()))
1808 # Index pairs specify the subsections in the dictionary. If
1809 # none create one subsection that spans everything.
1810 idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
1811 if debug: print(("read idx_pairs=%s"%list(self._pairs(idx_pairs))))
1812 entrySizes = xrefstream.get("/W")
1813 assert len(entrySizes) >= 3
1814 if self.strict and len(entrySizes) > 3:
1815 raise utils.PdfReadError("Too many entry sizes: %s" %entrySizes)
1817 def getEntry(i):
1818 # Reads the correct number of bytes for each entry. See the
1819 # discussion of the W parameter in PDF spec table 17.
1820 if entrySizes[i] > 0:
1821 d = streamData.read(entrySizes[i])
1822 return convertToInt(d, entrySizes[i])
1824 # PDF Spec Table 17: A value of zero for an element in the
1825 # W array indicates...the default value shall be used
1826 if i == 0: return 1 # First value defaults to 1
1827 else: return 0
1829 def used_before(num, generation):
1830 # We move backwards through the xrefs, don't replace any.
1831 return num in self.xref.get(generation, []) or \
1832 num in self.xref_objStm
1834 # Iterate through each subsection
1835 last_end = 0
1836 for start, size in self._pairs(idx_pairs):
1837 # The subsections must increase
1838 assert start >= last_end
1839 last_end = start + size
1840 for num in range(start, start+size):
1841 # The first entry is the type
1842 xref_type = getEntry(0)
1843 # The rest of the elements depend on the xref_type
1844 if xref_type == 0:
1845 # linked list of free objects
1846 next_free_object = getEntry(1)
1847 next_generation = getEntry(2)
1848 elif xref_type == 1:
1849 # objects that are in use but are not compressed
1850 byte_offset = getEntry(1)
1851 generation = getEntry(2)
1852 if generation not in self.xref:
1853 self.xref[generation] = {}
1854 if not used_before(num, generation):
1855 self.xref[generation][num] = byte_offset
1856 if debug: print(("XREF Uncompressed: %s %s"%(
1857 num, generation)))
1858 elif xref_type == 2:
1859 # compressed objects
1860 objstr_num = getEntry(1)
1861 obstr_idx = getEntry(2)
1862 generation = 0 # PDF spec table 18, generation is 0
1863 if not used_before(num, generation):
1864 if debug: print(("XREF Compressed: %s %s %s"%(
1865 num, objstr_num, obstr_idx)))
1866 self.xref_objStm[num] = (objstr_num, obstr_idx)
1867 elif self.strict:
1868 raise utils.PdfReadError("Unknown xref type: %s"%
1869 xref_type)
1871 trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
1872 for key in trailerKeys:
1873 if key in xrefstream and key not in self.trailer:
1874 self.trailer[NameObject(key)] = xrefstream.raw_get(key)
1875 if "/Prev" in xrefstream:
1876 startxref = xrefstream["/Prev"]
1877 else:
1878 break
1879 else:
1880 # bad xref character at startxref. Let's see if we can find
1881 # the xref table nearby, as we've observed this error with an
1882 # off-by-one before.
1883 stream.seek(-11, 1)
1884 tmp = stream.read(20)
1885 xref_loc = tmp.find(b_("xref"))
1886 if xref_loc != -1:
1887 startxref -= (10 - xref_loc)
1888 continue
1889 # No explicit xref table, try finding a cross-reference stream.
1890 stream.seek(startxref, 0)
1891 found = False
1892 for look in range(5):
1893 if stream.read(1).isdigit():
1894 # This is not a standard PDF, consider adding a warning
1895 startxref += look
1896 found = True
1897 break
1898 if found:
1899 continue
1900 # no xref table found at specified location
1901 raise utils.PdfReadError("Could not find xref table at specified location")
1902 #if not zero-indexed, verify that the table is correct; change it if necessary
1903 if self.xrefIndex and not self.strict:
1904 loc = stream.tell()
1905 for gen in self.xref:
1906 if gen == 65535: continue
1907 for id in self.xref[gen]:
1908 stream.seek(self.xref[gen][id], 0)
1909 try:
1910 pid, pgen = self.readObjectHeader(stream)
1911 except ValueError:
1912 break
1913 if pid == id - self.xrefIndex:
1914 self._zeroXref(gen)
1915 break
1916 #if not, then either it's just plain wrong, or the non-zero-index is actually correct
1917 stream.seek(loc, 0) #return to where it was
1919 def _zeroXref(self, generation):
1920 self.xref[generation] = dict( (k-self.xrefIndex, v) for (k, v) in list(self.xref[generation].items()) )
1922 def _pairs(self, array):
1923 i = 0
1924 while True:
1925 yield array[i], array[i+1]
1926 i += 2
1927 if (i+1) >= len(array):
1928 break
def readNextEndLine(self, stream):
    """
    Read one line from *stream*, walking backwards from the current position.

    Bytes are read one at a time and prepended to the result until a CR or
    LF byte is met; the run of CR/LF bytes is then consumed as well.

    :param stream: a seekable binary stream supporting ``read``, ``seek``
        and ``tell``.
    :return: the bytes collected before the CR/LF was met, in forward order.
    :raises utils.PdfReadError: if the start of the stream is reached
        before an end-of-line byte is found.
    """
    debug = False
    if debug: print(">>readNextEndLine")
    line = b_("")
    while True:
        # Prevent infinite loops in malformed PDFs
        if stream.tell() == 0:
            raise utils.PdfReadError("Could not read malformed PDF file")
        x = stream.read(1)
        if debug: print((" x:", x, "%x"%ord(x)))
        if stream.tell() < 2:
            raise utils.PdfReadError("EOL marker not found")
        # read(1) advanced by one byte; seeking back two leaves the position
        # one byte before the byte just read.
        stream.seek(-2, 1)
        if x == b_('\n') or x == b_('\r'): ## \n = LF; \r = CR
            crlf = False
            # Keep stepping backwards over the run of CR/LF bytes.
            while x == b_('\n') or x == b_('\r'):
                if debug:
                    if ord(x) == 0x0D: print(" x is CR 0D")
                    elif ord(x) == 0x0A: print(" x is LF 0A")
                x = stream.read(1)
                if x == b_('\n') or x == b_('\r'): # account for CR+LF
                    stream.seek(-1, 1)
                    crlf = True
                if stream.tell() < 2:
                    raise utils.PdfReadError("EOL marker not found")
                stream.seek(-2, 1)
            # Relative seek with a positive offset: step forward again by
            # 2 bytes if a second EOL byte was seen (crlf), else by 1.
            stream.seek(2 if crlf else 1, 1)
            break
        else:
            if debug: print(" x is neither")
            # Prepend, because the stream is being read back to front.
            line = x + line
            if debug: print((" RNEL line:", line))
    if debug: print("leaving RNEL")
    return line
def decrypt(self, password):
    """
    When using an encrypted / secured PDF file with the PDF Standard
    encryption handler, this function will allow the file to be decrypted.
    It checks the given password against the document's user password and
    owner password, and then stores the resulting decryption key if either
    password is correct.

    It does not matter which password was matched.  Both passwords provide
    the correct decryption key that will allow the document to be used with
    this library.

    :param str password: The password to match.
    :return: ``0`` if the password failed, ``1`` if the password matched the user
        password, and ``2`` if the password matched the owner password.
    :rtype: int
    :raises NotImplementedError: if document uses an unsupported encryption
        method.
    """
    # The override flag is raised only for the duration of the password
    # check and is always lowered again, even if the check raises.
    self._override_encryption = True
    try:
        outcome = self._decrypt(password)
    finally:
        self._override_encryption = False
    return outcome
def _decrypt(self, password):
    """
    Check *password* against the user password, then the owner password.

    On success the resulting key is stored in ``self._decryption_key``.

    :param password: the password to try.
    :return: ``1`` if it matched as user password, ``2`` if it matched as
        owner password, ``0`` otherwise.
    :raises NotImplementedError: if ``/Filter`` is not ``/Standard`` or
        ``/V`` is not 1 or 2.
    """
    encrypt = self.trailer['/Encrypt'].getObject()
    if encrypt['/Filter'] != '/Standard':
        raise NotImplementedError("only Standard PDF encryption handler is available")
    if encrypt['/V'] not in (1, 2):
        raise NotImplementedError("only algorithm code 1 and 2 are supported")

    user_password, key = self._authenticateUserPassword(password)
    if user_password:
        self._decryption_key = key
        return 1

    # Not the user password: derive the user password from /O using the
    # candidate as owner password, then authenticate that instead.
    rev = encrypt['/R'].getObject()
    if rev == 2:
        keylen = 5
    elif '/Length' in encrypt:
        keylen = encrypt['/Length'].getObject() // 8
    else:
        # /Length is optional in the encryption dictionary; the PDF
        # specification gives 40 bits as its default.
        keylen = 40 // 8
    key = _alg33_1(password, rev, keylen)
    real_O = encrypt["/O"].getObject()
    if rev == 2:
        userpass = utils.RC4_encrypt(key, real_O)
    else:
        val = real_O
        # 20 RC4 passes, counting down from 19 to 0; each pass uses the
        # key with every byte XOR-ed with the pass counter.
        for i in range(19, -1, -1):
            new_key = b_('').join(b_(chr(utils.ord_(c) ^ i)) for c in key)
            val = utils.RC4_encrypt(new_key, val)
        userpass = val
    owner_password, key = self._authenticateUserPassword(userpass)
    if owner_password:
        self._decryption_key = key
        return 2
    return 0
def _authenticateUserPassword(self, password):
    """
    Compute the ``/U`` value for *password* and compare it with the file's.

    :param password: the candidate user password.
    :return: a ``(matched, key)`` tuple: whether the computed value equals
        the stored ``/U`` entry, and the key returned by the algorithm.
    :raises NotImplementedError: if ``/R`` is below 2 (previously this
        surfaced as an ``UnboundLocalError``).
    """
    encrypt = self.trailer['/Encrypt'].getObject()
    rev = encrypt['/R'].getObject()
    owner_entry = encrypt['/O'].getObject()
    p_entry = encrypt['/P'].getObject()
    id_entry = self.trailer['/ID'].getObject()
    id1_entry = id_entry[0].getObject()
    real_U = encrypt['/U'].getObject().original_bytes
    if rev == 2:
        U, key = _alg34(password, owner_entry, p_entry, id1_entry)
    elif rev >= 3:
        if "/Length" in encrypt:
            keylen = encrypt["/Length"].getObject() // 8
        else:
            # /Length is optional in the encryption dictionary; the PDF
            # specification gives 40 bits as its default.
            keylen = 40 // 8
        U, key = _alg35(password, rev,
                keylen, owner_entry,
                p_entry, id1_entry,
                encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject())
        # Only the first 16 bytes are compared for these revisions.
        # NOTE(review): presumably the remainder is arbitrary padding -- confirm.
        U, real_U = U[:16], real_U[:16]
    else:
        raise NotImplementedError(
            "unsupported security handler revision: %s" % rev)
    return U == real_U, key
def getIsEncrypted(self):
    """Return ``True`` if the trailer contains an ``/Encrypt`` entry."""
    return "/Encrypt" in self.trailer

# The lambda looks up getIsEncrypted on the instance at access time.
isEncrypted = property(lambda self: self.getIsEncrypted(), None, None)
"""
Read-only boolean property showing whether this PDF file is encrypted.
Note that this property, if true, will remain true even after the
:meth:`decrypt()<PdfFileReader.decrypt>` method is called.
"""
def getRectangle(self, name, defaults):
    """
    Return the rectangle stored under *name*, falling back to *defaults*.

    If the entry is already a ``RectangleObject`` it is returned as is.
    Otherwise the first entry of *defaults* present in *self* is used when
    *name* is missing, an indirect reference is resolved through
    ``self.pdf``, and the value is wrapped in a ``RectangleObject`` which is
    stored back under *name* before being returned.

    :param self: the dictionary-like object holding the entry.
    :param name: key of the rectangle to fetch.
    :param defaults: iterable of fallback keys, tried in order.
    :return: a ``RectangleObject``.
    """
    retval = self.get(name)
    if isinstance(retval, RectangleObject):
        return retval
    if retval is None:
        for d in defaults:
            retval = self.get(d)
            if retval is not None:
                break
    if isinstance(retval, IndirectObject):
        retval = self.pdf.getObject(retval)
    # NOTE(review): if nothing was found, retval is still None here and is
    # handed to RectangleObject unchanged -- confirm how that is reported.
    retval = RectangleObject(retval)
    setRectangle(self, name, retval)
    return retval
def setRectangle(self, name, value):
    """
    Store *value* in *self* under *name*.

    *name* is wrapped in a ``NameObject`` first unless it already is one.
    """
    key = name if isinstance(name, NameObject) else NameObject(name)
    self[key] = value
def deleteRectangle(self, name):
    """Remove the entry stored under *name* from *self*."""
    del self[name]
def createRectangleAccessor(name, fallback):
    """
    Build a property that reads, writes and deletes the rectangle *name*.

    :param name: key of the rectangle entry.
    :param fallback: keys passed to ``getRectangle`` as its defaults.
    :return: a ``property`` delegating to ``getRectangle``,
        ``setRectangle`` and ``deleteRectangle``.
    """
    # The closures deliberately carry no docstring, so the resulting
    # property's __doc__ stays None.
    def _read(self):
        return getRectangle(self, name, fallback)

    def _write(self, value):
        return setRectangle(self, name, value)

    def _remove(self):
        return deleteRectangle(self, name)

    return property(_read, _write, _remove)
class PageObject(DictionaryObject):
    """
    This class represents a single page within a PDF file.  Typically this
    object will be created by accessing the
    :meth:`getPage()<PyPDF2.PdfFileReader.getPage>` method of the
    :class:`PdfFileReader<PyPDF2.PdfFileReader>` class, but it is
    also possible to create an empty page with the
    :meth:`createBlankPage()<PageObject.createBlankPage>` static method.

    :param pdf: PDF file the page belongs to.
    :param indirectRef: Stores the original indirect reference to
        this object in its source PDF
    """
    def __init__(self, pdf=None, indirectRef=None):
        DictionaryObject.__init__(self)
        self.pdf = pdf
        self.indirectRef = indirectRef

    @staticmethod
    def createBlankPage(pdf=None, width=None, height=None):
        """
        Returns a new blank page.
        If ``width`` or ``height`` is ``None``, try to get the page size
        from the last page of *pdf*.

        :param pdf: PDF file the page belongs to
        :param float width: The width of the new page expressed in default user
            space units.
        :param float height: The height of the new page expressed in default user
            space units.
        :return: the new blank page:
        :rtype: :class:`PageObject<PageObject>`
        :raises PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains
            no page
        """
        page = PageObject(pdf)

        # Creates a new page (cf PDF Reference  7.7.3.3)
        page.__setitem__(NameObject('/Type'), NameObject('/Page'))
        page.__setitem__(NameObject('/Parent'), NullObject())
        page.__setitem__(NameObject('/Resources'), DictionaryObject())
        if width is None or height is None:
            if pdf is not None and pdf.getNumPages() > 0:
                lastpage = pdf.getPage(pdf.getNumPages() - 1)
                width = lastpage.mediaBox.getWidth()
                height = lastpage.mediaBox.getHeight()
            else:
                raise utils.PageSizeNotDefinedError()
        page.__setitem__(NameObject('/MediaBox'),
            RectangleObject([0, 0, width, height]))

        return page

    def rotateClockwise(self, angle):
        """
        Rotates a page clockwise by increments of 90 degrees.

        :param int angle: Angle to rotate the page.  Must be an increment
            of 90 deg.
        :return: this page, to allow call chaining.
        """
        assert angle % 90 == 0
        self._rotate(angle)
        return self

    def rotateCounterClockwise(self, angle):
        """
        Rotates a page counter-clockwise by increments of 90 degrees.

        :param int angle: Angle to rotate the page.  Must be an increment
            of 90 deg.
        :return: this page, to allow call chaining.
        """
        assert angle % 90 == 0
        self._rotate(-angle)
        return self

    def _rotate(self, angle):
        """Add *angle* to the page's ``/Rotate`` entry (0 if absent)."""
        rotateObj = self.get("/Rotate", 0)
        # dict.get() does not resolve indirect references, so /Rotate may
        # come back as a reference that has to be dereferenced first.
        if isinstance(rotateObj, int):
            currentAngle = rotateObj
        else:
            currentAngle = rotateObj.getObject()
        self[NameObject("/Rotate")] = NumberObject(currentAngle + angle)

    @staticmethod
    def _mergeResources(res1, res2, resource):
        """
        Merge the *resource* sub-dictionary of two resource dictionaries.

        Entries of *res2* whose key is already used in *res1* for a
        different raw value are stored under a new, uuid-suffixed name.

        :return: a ``(merged, renames)`` tuple, where *renames* maps each
            clashing original key to its replacement name.
        """
        newRes = DictionaryObject()
        newRes.update(res1.get(resource, DictionaryObject()).getObject())
        page2Res = res2.get(resource, DictionaryObject()).getObject()
        renameRes = {}
        for key in list(page2Res.keys()):
            if key in newRes and newRes.raw_get(key) != page2Res.raw_get(key):
                newname = NameObject(key + str(uuid.uuid4()))
                renameRes[key] = newname
                newRes[newname] = page2Res[key]
            elif key not in newRes:
                newRes[key] = page2Res.raw_get(key)
        return newRes, renameRes

    @staticmethod
    def _contentStreamRename(stream, rename, pdf):
        """
        Apply the *rename* mapping to every name operand of *stream*.

        :return: *stream* itself when *rename* is empty, otherwise a
            ``ContentStream`` with the renamed operands.
        """
        if not rename:
            return stream
        stream = ContentStream(stream, pdf)
        for operands, operator in stream.operations:
            # Inline-image operations carry a dict instead of an operand
            # list; indexing it by position would raise KeyError.
            if not isinstance(operands, list):
                continue
            for i, op in enumerate(operands):
                if isinstance(op, NameObject):
                    operands[i] = rename.get(op, op)
        return stream

    @staticmethod
    def _pushPopGS(contents, pdf):
        # adds a graphics state "push" and "pop" to the beginning and end
        # of a content stream.  This isolates it from changes such as
        # transformation matrices.
        stream = ContentStream(contents, pdf)
        stream.operations.insert(0, [[], "q"])
        stream.operations.append([[], "Q"])
        return stream

    @staticmethod
    def _addTransformationMatrix(contents, pdf, ctm):
        # adds transformation matrix at the beginning of the given
        # contents stream.
        a, b, c, d, e, f = ctm
        contents = ContentStream(contents, pdf)
        contents.operations.insert(0, [[FloatObject(a), FloatObject(b),
            FloatObject(c), FloatObject(d), FloatObject(e),
            FloatObject(f)], " cm"])
        return contents

    def getContents(self):
        """
        Accesses the page contents.

        :return: the ``/Contents`` object, or ``None`` if it doesn't exist.
            ``/Contents`` is optional, as described in PDF Reference  7.7.3.3
        """
        if "/Contents" in self:
            return self["/Contents"].getObject()
        else:
            return None

    def mergePage(self, page2):
        """
        Merges the content streams of two pages into one.  Resource references
        (i.e. fonts) are maintained from both pages.  The mediabox/cropbox/etc
        of this page are not altered.  The parameter page's content stream will
        be added to the end of this page's content stream, meaning that it will
        be drawn after, or "on top" of this page.

        :param PageObject page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        """
        self._mergePage(page2)

    def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False):
        """
        Merge *page2* into this page.

        :param page2: the page whose content, resources and annotations are
            merged in.
        :param page2transformation: optional callable applied to *page2*'s
            content before it is merged.
        :param ctm: optional 6-element transformation matrix, used only to
            compute the enlarged media box when *expand* is true.
        :param bool expand: whether this page's media box is grown to also
            cover *page2*'s (transformed) media box.
        """
        # First we work on merging the resource dictionaries.  This allows us
        # to find out what symbols in the content streams we might need to
        # rename.

        newResources = DictionaryObject()
        rename = {}
        originalResources = self["/Resources"].getObject()
        page2Resources = page2["/Resources"].getObject()
        newAnnots = ArrayObject()

        for page in (self, page2):
            if "/Annots" in page:
                annots = page["/Annots"]
                if isinstance(annots, ArrayObject):
                    for ref in annots:
                        newAnnots.append(ref)

        for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading", "/Properties":
            new, newrename = PageObject._mergeResources(originalResources, page2Resources, res)
            if new:
                newResources[NameObject(res)] = new
                rename.update(newrename)

        # Combine /ProcSet sets.  Duplicates are dropped while keeping
        # first-seen order, so the output is the same on every run.
        procSet = ArrayObject()
        for resources in (originalResources, page2Resources):
            for entry in resources.get("/ProcSet", ArrayObject()).getObject():
                if entry not in procSet:
                    procSet.append(entry)
        newResources[NameObject("/ProcSet")] = procSet

        newContentArray = ArrayObject()

        originalContent = self.getContents()
        if originalContent is not None:
            newContentArray.append(PageObject._pushPopGS(
                  originalContent, self.pdf))

        page2Content = page2.getContents()
        if page2Content is not None:
            if page2transformation is not None:
                page2Content = page2transformation(page2Content)
            page2Content = PageObject._contentStreamRename(
                page2Content, rename, self.pdf)
            page2Content = PageObject._pushPopGS(page2Content, self.pdf)
            newContentArray.append(page2Content)

        # if expanding the page to fit a new page, calculate the new media box size
        if expand:
            corners1 = [self.mediaBox.getLowerLeft_x().as_numeric(), self.mediaBox.getLowerLeft_y().as_numeric(),
                        self.mediaBox.getUpperRight_x().as_numeric(), self.mediaBox.getUpperRight_y().as_numeric()]
            corners2 = [page2.mediaBox.getLowerLeft_x().as_numeric(), page2.mediaBox.getLowerLeft_y().as_numeric(),
                        page2.mediaBox.getUpperLeft_x().as_numeric(), page2.mediaBox.getUpperLeft_y().as_numeric(),
                        page2.mediaBox.getUpperRight_x().as_numeric(), page2.mediaBox.getUpperRight_y().as_numeric(),
                        page2.mediaBox.getLowerRight_x().as_numeric(), page2.mediaBox.getLowerRight_y().as_numeric()]
            if ctm is not None:
                # Transform all four corners of page2's media box.
                ctm = [float(x) for x in ctm]
                new_x = [ctm[0]*corners2[i] + ctm[2]*corners2[i+1] + ctm[4] for i in range(0, 8, 2)]
                new_y = [ctm[1]*corners2[i] + ctm[3]*corners2[i+1] + ctm[5] for i in range(0, 8, 2)]
            else:
                new_x = corners2[0:8:2]
                new_y = corners2[1:8:2]
            lowerleft = [min(new_x), min(new_y)]
            upperright = [max(new_x), max(new_y)]
            lowerleft = [min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])]
            upperright = [max(corners1[2], upperright[0]), max(corners1[3], upperright[1])]

            self.mediaBox.setLowerLeft(lowerleft)
            self.mediaBox.setUpperRight(upperright)

        self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf)
        self[NameObject('/Resources')] = newResources
        self[NameObject('/Annots')] = newAnnots

    def mergeTransformedPage(self, page2, ctm, expand=False):
        """
        This is similar to mergePage, but a transformation matrix is
        applied to the merged stream.

        :param PageObject page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        :param tuple ctm: a 6-element tuple containing the operands of the
            transformation matrix
        :param bool expand: Whether the page should be expanded to fit the dimensions
            of the page to be merged.
        """
        self._mergePage(page2, lambda page2Content:
            PageObject._addTransformationMatrix(page2Content, page2.pdf, ctm), ctm, expand)

    def mergeScaledPage(self, page2, scale, expand=False):
        """
        This is similar to mergePage, but the stream to be merged is scaled
        by applying a transformation matrix.

        :param PageObject page2: The page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        :param float scale: The scaling factor
        :param bool expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.
        """
        # CTM to scale : [ sx 0 0 sy 0 0 ]
        return self.mergeTransformedPage(page2, [scale, 0,
                                                 0,      scale,
                                                 0,      0], expand)

    def mergeRotatedPage(self, page2, rotation, expand=False):
        """
        This is similar to mergePage, but the stream to be merged is rotated
        by applying a transformation matrix.

        :param PageObject page2: the page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        :param float rotation: The angle of the rotation, in degrees
        :param bool expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.
        """
        rotation = math.radians(rotation)
        return self.mergeTransformedPage(page2,
            [math.cos(rotation),  math.sin(rotation),
             -math.sin(rotation), math.cos(rotation),
             0,                   0], expand)

    def mergeTranslatedPage(self, page2, tx, ty, expand=False):
        """
        This is similar to mergePage, but the stream to be merged is translated
        by applying a transformation matrix.

        :param PageObject page2: the page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        :param float tx: The translation on X axis
        :param float ty: The translation on Y axis
        :param bool expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.
        """
        return self.mergeTransformedPage(page2, [1,  0,
                                                 0,  1,
                                                 tx, ty], expand)

    def mergeRotatedTranslatedPage(self, page2, rotation, tx, ty, expand=False):
        """
        This is similar to mergePage, but the stream to be merged is rotated
        and translated by applying a transformation matrix.  The matrix is
        the product of a translation by (-tx, -ty), the rotation, and a
        translation by (tx, ty).

        :param PageObject page2: the page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        :param float tx: The translation on X axis
        :param float ty: The translation on Y axis
        :param float rotation: The angle of the rotation, in degrees
        :param bool expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.
        """

        translation = [[1, 0, 0],
                       [0, 1, 0],
                       [-tx, -ty, 1]]
        rotation = math.radians(rotation)
        rotating = [[math.cos(rotation), math.sin(rotation), 0],
                    [-math.sin(rotation), math.cos(rotation), 0],
                    [0,                  0,                  1]]
        rtranslation = [[1, 0, 0],
                        [0, 1, 0],
                        [tx, ty, 1]]
        ctm = utils.matrixMultiply(translation, rotating)
        ctm = utils.matrixMultiply(ctm, rtranslation)

        return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
                                                 ctm[1][0], ctm[1][1],
                                                 ctm[2][0], ctm[2][1]], expand)

    def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False):
        """
        This is similar to mergePage, but the stream to be merged is rotated
        and scaled by applying a transformation matrix.

        :param PageObject page2: the page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        :param float rotation: The angle of the rotation, in degrees
        :param float scale: The scaling factor
        :param bool expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.
        """
        rotation = math.radians(rotation)
        rotating = [[math.cos(rotation), math.sin(rotation), 0],
                    [-math.sin(rotation), math.cos(rotation), 0],
                    [0,                  0,                  1]]
        scaling = [[scale, 0,    0],
                   [0,    scale, 0],
                   [0,    0,     1]]
        ctm = utils.matrixMultiply(rotating, scaling)

        return self.mergeTransformedPage(page2,
                                         [ctm[0][0], ctm[0][1],
                                          ctm[1][0], ctm[1][1],
                                          ctm[2][0], ctm[2][1]], expand)

    def mergeScaledTranslatedPage(self, page2, scale, tx, ty, expand=False):
        """
        This is similar to mergePage, but the stream to be merged is translated
        and scaled by applying a transformation matrix.

        :param PageObject page2: the page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        :param float scale: The scaling factor
        :param float tx: The translation on X axis
        :param float ty: The translation on Y axis
        :param bool expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.
        """

        translation = [[1, 0, 0],
                       [0, 1, 0],
                       [tx, ty, 1]]
        scaling = [[scale, 0,    0],
                   [0,    scale, 0],
                   [0,    0,     1]]
        ctm = utils.matrixMultiply(scaling, translation)

        return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
                                                 ctm[1][0], ctm[1][1],
                                                 ctm[2][0], ctm[2][1]], expand)

    def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty, expand=False):
        """
        This is similar to mergePage, but the stream to be merged is translated,
        rotated and scaled by applying a transformation matrix.

        :param PageObject page2: the page to be merged into this one. Should be
            an instance of :class:`PageObject<PageObject>`.
        :param float tx: The translation on X axis
        :param float ty: The translation on Y axis
        :param float rotation: The angle of the rotation, in degrees
        :param float scale: The scaling factor
        :param bool expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.
        """
        translation = [[1, 0, 0],
                       [0, 1, 0],
                       [tx, ty, 1]]
        rotation = math.radians(rotation)
        rotating = [[math.cos(rotation), math.sin(rotation), 0],
                    [-math.sin(rotation), math.cos(rotation), 0],
                    [0,                  0,                  1]]
        scaling = [[scale, 0,    0],
                   [0,    scale, 0],
                   [0,    0,     1]]
        ctm = utils.matrixMultiply(rotating, scaling)
        ctm = utils.matrixMultiply(ctm, translation)

        return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
                                                 ctm[1][0], ctm[1][1],
                                                 ctm[2][0], ctm[2][1]], expand)

    def addTransformation(self, ctm):
        """
        Applies a transformation matrix to the page.  A page without
        ``/Contents`` is left untouched.

        :param tuple ctm: A 6-element tuple containing the operands of the
            transformation matrix.
        """
        originalContent = self.getContents()
        if originalContent is not None:
            newContent = PageObject._addTransformationMatrix(
                originalContent, self.pdf, ctm)
            newContent = PageObject._pushPopGS(newContent, self.pdf)
            self[NameObject('/Contents')] = newContent

    def scale(self, sx, sy):
        """
        Scales a page by the given factors by applying a transformation
        matrix to its content and updating the page size.  When the page
        has a ``/VP`` entry, its ``/BBox`` (that of the first element if
        ``/VP`` is an array) is scaled as well.

        :param float sx: The scaling factor on horizontal axis.
        :param float sy: The scaling factor on vertical axis.
        """
        self.addTransformation([sx, 0,
                                0,  sy,
                                0,  0])
        self.mediaBox = RectangleObject([
            float(self.mediaBox.getLowerLeft_x()) * sx,
            float(self.mediaBox.getLowerLeft_y()) * sy,
            float(self.mediaBox.getUpperRight_x()) * sx,
            float(self.mediaBox.getUpperRight_y()) * sy])
        if "/VP" in self:
            viewport = self["/VP"]
            if isinstance(viewport, ArrayObject):
                bbox = viewport[0]["/BBox"]
            else:
                bbox = viewport["/BBox"]
            scaled_bbox = RectangleObject([
                float(bbox[0]) * sx,
                float(bbox[1]) * sy,
                float(bbox[2]) * sx,
                float(bbox[3]) * sy])
            if isinstance(viewport, ArrayObject):
                self[NameObject("/VP")][NumberObject(0)][NameObject("/BBox")] = scaled_bbox
            else:
                self[NameObject("/VP")][NameObject("/BBox")] = scaled_bbox

    def scaleBy(self, factor):
        """
        Scales a page by the given factor by applying a transformation
        matrix to its content and updating the page size.

        :param float factor: The scaling factor (for both X and Y axis).
        """
        self.scale(factor, factor)

    def scaleTo(self, width, height):
        """
        Scales a page to the specified dimensions by applying a
        transformation matrix to its content and updating the page size.

        :param float width: The new width.
        :param float height: The new height.
        """
        sx = width / float(self.mediaBox.getUpperRight_x() -
                      self.mediaBox.getLowerLeft_x())
        sy = height / float(self.mediaBox.getUpperRight_y() -
                       self.mediaBox.getLowerLeft_y())
        self.scale(sx, sy)

    def compressContentStreams(self):
        """
        Compresses the size of this page by joining all content streams and
        applying a FlateDecode filter.

        However, it is possible that this function will perform no action if
        content stream compression becomes "automatic" for some reason.
        """
        content = self.getContents()
        if content is not None:
            if not isinstance(content, ContentStream):
                content = ContentStream(content, self.pdf)
            self[NameObject("/Contents")] = content.flateEncode()

    def extractText(self):
        """
        Locate all text drawing commands, in the order they are provided in the
        content stream, and extract the text.  This works well for some PDF
        files, but poorly for others, depending on the generator used.  This will
        be refined in the future.  Do not rely on the order of text coming out of
        this function, as it will change if this function is made more
        sophisticated.

        :return: a unicode string object; empty when the page has no
            ``/Contents``.
        """
        content = self.getContents()
        if content is None:
            # /Contents is optional; such a page has no text to extract.
            return u_("")
        if not isinstance(content, ContentStream):
            content = ContentStream(content, self.pdf)
        parts = []
        # Note: we check all strings are TextStringObjects.  ByteStringObjects
        # are strings where the byte->string encoding was unknown, so adding
        # them to the text here would be gibberish.
        for operands, operator in content.operations:
            if operator == b_("Tj"):
                _text = operands[0]
                if isinstance(_text, TextStringObject):
                    parts.append(_text)
            elif operator == b_("T*"):
                parts.append("\n")
            elif operator == b_("'"):
                parts.append("\n")
                _text = operands[0]
                if isinstance(_text, TextStringObject):
                    parts.append(_text)
            elif operator == b_('"'):
                _text = operands[2]
                if isinstance(_text, TextStringObject):
                    parts.append("\n")
                    parts.append(_text)
            elif operator == b_("TJ"):
                for i in operands[0]:
                    if isinstance(i, TextStringObject):
                        parts.append(i)
                parts.append("\n")
        return u_("").join(parts)

    mediaBox = createRectangleAccessor("/MediaBox", ())
    """
    A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
    defining the boundaries of the physical medium on which the page is
    intended to be displayed or printed.
    """

    cropBox = createRectangleAccessor("/CropBox", ("/MediaBox",))
    """
    A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
    defining the visible region of default user space.  When the page is
    displayed or printed, its contents are to be clipped (cropped) to this
    rectangle and then imposed on the output medium in some
    implementation-defined manner.  Default value: same as :attr:`mediaBox<mediaBox>`.
    """

    bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox"))
    """
    A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
    defining the region to which the contents of the page should be clipped
    when output in a production environment.
    """

    trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox"))
    """
    A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
    defining the intended dimensions of the finished page after trimming.
    """

    artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox"))
    """
    A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
    defining the extent of the page's meaningful content as intended by the
    page's creator.
    """
2660class ContentStream(DecodedStreamObject):
2661 def __init__(self, stream, pdf):
2662 self.pdf = pdf
2663 self.operations = []
2664 # stream may be a StreamObject or an ArrayObject containing
2665 # multiple StreamObjects to be cat'd together.
2666 stream = stream.getObject()
2667 if isinstance(stream, ArrayObject):
2668 data = b_("")
2669 for s in stream:
2670 data += s.getObject().getData()
2671 stream = BytesIO(b_(data))
2672 else:
2673 stream = BytesIO(b_(stream.getData()))
2674 self.__parseContentStream(stream)
2676 def __parseContentStream(self, stream):
2677 # file("f:\\tmp.txt", "w").write(stream.read())
2678 stream.seek(0, 0)
2679 operands = []
2680 while True:
2681 peek = readNonWhitespace(stream)
2682 if peek == b_('') or ord_(peek) == 0:
2683 break
2684 stream.seek(-1, 1)
2685 if peek.isalpha() or peek == b_("'") or peek == b_('"'):
2686 operator = utils.readUntilRegex(stream,
2687 NameObject.delimiterPattern, True)
2688 if operator == b_("BI"):
2689 # begin inline image - a completely different parsing
2690 # mechanism is required, of course... thanks buddy...
2691 assert operands == []
2692 ii = self._readInlineImage(stream)
2693 self.operations.append((ii, b_("INLINE IMAGE")))
2694 else:
2695 self.operations.append((operands, operator))
2696 operands = []
2697 elif peek == b_('%'):
2698 # If we encounter a comment in the content stream, we have to
2699 # handle it here. Typically, readObject will handle
2700 # encountering a comment -- but readObject assumes that
2701 # following the comment must be the object we're trying to
2702 # read. In this case, it could be an operator instead.
2703 while peek not in (b_('\r'), b_('\n')):
2704 peek = stream.read(1)
2705 else:
2706 operands.append(readObject(stream, None))
def _readInlineImage(self, stream):
    """
    Read an inline image, starting just after the ``BI`` operator.

    First the key/value settings up to the ``ID`` operator are read, then
    the raw image bytes up to the terminating ``EI``.  Because image data
    may itself contain the bytes ``EI``, the end is only accepted when
    ``EI`` is followed by whitespace and then ``Q``.  On return the stream
    is positioned on that ``Q``.

    :param stream: seekable binary stream positioned just after ``BI``.
    :return: ``{"settings": DictionaryObject, "data": bytes}``.
    :raises utils.PdfReadError: if the stream ends before the image does.
    """
    # First read the dictionary of settings.
    settings = DictionaryObject()
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            raise utils.PdfReadError(
                "Unexpected end of stream while reading inline image")
        stream.seek(-1, 1)
        if tok == b_("I"):
            # "ID" - begin of image data
            break
        key = readObject(stream, self.pdf)
        # Skip the whitespace separating key and value.
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        value = readObject(stream, self.pdf)
        settings[key] = value
    # Left at the beginning of "ID"; consume it and the byte that follows.
    tmp = stream.read(3)
    assert tmp[:2] == b_("ID")
    chunks = []
    while True:
        # Read the inline image, while checking for EI (End Image) operator.
        tok = stream.read(1)
        if not tok:
            # Previously an empty read was appended forever (endless loop).
            raise utils.PdfReadError(
                "Unexpected end of stream while reading inline image")
        if tok != b_("E"):
            chunks.append(tok)
            continue
        # Check for End Image
        tok2 = stream.read(1)
        if tok2 != b_("I"):
            # Only un-read if a byte was actually consumed; at end of
            # stream a seek would rewind onto the "E" just handled.
            if tok2:
                stream.seek(-1, 1)
            chunks.append(tok)
            continue
        # Data can contain EI, so check for the Q operator.
        tok3 = stream.read(1)
        info = tok + tok2
        # We need to find whitespace between EI and Q.
        has_q_whitespace = False
        while tok3 and tok3 in utils.WHITESPACES:
            has_q_whitespace = True
            info += tok3
            tok3 = stream.read(1)
        if tok3:
            # Un-read the first non-whitespace byte (the Q, or image data).
            stream.seek(-1, 1)
        if tok3 == b_("Q") and has_q_whitespace:
            break
        # Not the real end of the image: keep what was read as data.
        chunks.append(info)
    return {"settings": settings, "data": b_("").join(chunks)}
def _getData(self):
    """
    Serialise ``self.operations`` back into content-stream bytes.

    Ordinary operations are written as their operands (each followed by a
    space) and then the operator.  Inline images are written as
    ``BI <settings> ID <data>EI``.  Every operation ends with a newline.
    """
    out = BytesIO()
    write = out.write
    for operands, operator in self.operations:
        if operator == b_("INLINE IMAGE"):
            write(b_("BI"))
            settings_buf = BytesIO()
            operands["settings"].writeToStream(settings_buf, None)
            # Drop the first two and last two bytes of the serialised
            # settings dictionary (its enclosing delimiters).
            serialised = settings_buf.getvalue()
            write(serialised[2:-2])
            write(b_("ID "))
            write(operands["data"])
            write(b_("EI"))
        else:
            for operand in operands:
                operand.writeToStream(out, None)
                write(b_(" "))
            write(b_(operator))
        write(b_("\n"))
    return out.getvalue()
def _setData(self, value):
    """Parse ``value`` as content-stream data via ``__parseContentStream``."""
    buffer = BytesIO(b_(value))
    self.__parseContentStream(buffer)
# ``_data`` reads through _getData and writes through _setData.
_data = property(fget=_getData, fset=_setData)
class DocumentInformation(DictionaryObject):
    """
    A class representing the basic document metadata provided in a PDF File.
    This class is accessible through
    :meth:`getDocumentInfo()<PyPDF2.PdfFileReader.getDocumentInfo()>`

    All text properties of the document metadata have
    *two* properties, eg. author and author_raw. The non-raw property will
    always return a ``TextStringObject``, making it ideal for a case where
    the metadata is being displayed. The raw property can sometimes return
    a ``ByteStringObject``, if PyPDF2 was unable to decode the string's
    text encoding; this requires additional safety in the caller and
    therefore is not as commonly accessed.
    """

    def __init__(self):
        DictionaryObject.__init__(self)

    def getText(self, key):
        """Return the entry for ``key`` if it is a ``TextStringObject``,
        otherwise ``None``."""
        value = self.get(key, None)
        return value if isinstance(value, TextStringObject) else None

    @property
    def title(self):
        """Read-only property accessing the document's **title**.
        Returns a unicode string (``TextStringObject``) or ``None``
        if the title is not specified."""
        return self.getText("/Title")

    @property
    def title_raw(self):
        """The "raw" version of title; can return a ``ByteStringObject``."""
        return self.get("/Title")

    @property
    def author(self):
        """Read-only property accessing the document's **author**.
        Returns a unicode string (``TextStringObject``) or ``None``
        if the author is not specified."""
        return self.getText("/Author")

    @property
    def author_raw(self):
        """The "raw" version of author; can return a ``ByteStringObject``."""
        return self.get("/Author")

    @property
    def subject(self):
        """Read-only property accessing the document's **subject**.
        Returns a unicode string (``TextStringObject``) or ``None``
        if the subject is not specified."""
        return self.getText("/Subject")

    @property
    def subject_raw(self):
        """The "raw" version of subject; can return a ``ByteStringObject``."""
        return self.get("/Subject")

    @property
    def creator(self):
        """Read-only property accessing the document's **creator**. If the
        document was converted to PDF from another format, this is the name
        of the application (e.g. OpenOffice) that created the original
        document from which it was converted. Returns a unicode string
        (``TextStringObject``) or ``None`` if the creator is not specified."""
        return self.getText("/Creator")

    @property
    def creator_raw(self):
        """The "raw" version of creator; can return a ``ByteStringObject``."""
        return self.get("/Creator")

    @property
    def producer(self):
        """Read-only property accessing the document's **producer**.
        If the document was converted to PDF from another format, this is
        the name of the application (for example, OSX Quartz) that converted
        it to PDF. Returns a unicode string (``TextStringObject``)
        or ``None`` if the producer is not specified."""
        return self.getText("/Producer")

    @property
    def producer_raw(self):
        """The "raw" version of producer; can return a ``ByteStringObject``."""
        return self.get("/Producer")
def convertToInt(d, size):
    """
    Interpret the bytes ``d`` as a big-endian integer.

    ``d`` is left-padded with zero bytes, and the last eight bytes are
    unpacked as a signed 64-bit big-endian value.

    :param d: the raw bytes to convert.
    :param size: declared width of ``d`` in bytes; only used to reject
        widths above 8.
    :raises utils.PdfReadError: if ``size`` is greater than 8.
    """
    if size > 8:
        raise utils.PdfReadError("invalid size in convertToInt")
    padded = b_("\x00\x00\x00\x00\x00\x00\x00\x00") + b_(d)
    (value,) = struct.unpack(">q", padded[-8:])
    return value
# The 32-byte password padding string used by step 1 of algorithm 3.2
# (PDF reference, section 3.5.2).
_encryption_padding = (
    b_('\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56')
    + b_('\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c')
    + b_('\xa9\xfe\x64\x53\x69\x7a')
)
# Implementation of algorithm 3.2 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True):
    """
    Compute the encryption key from the user password (algorithm 3.2).

    :param password: the user password string.
    :param rev: security handler revision.
    :param keylen: length of the key in bytes.
    :param owner_entry: the encryption dictionary's /O entry; its
        ``original_bytes`` are hashed.
    :param p_entry: the /P entry, packed as a little-endian 4-byte integer.
    :param id1_entry: first element of the file identifier array; its
        ``original_bytes`` are hashed.
    :param metadata_encrypt: whether document metadata is encrypted.
    :return: the first ``keylen`` bytes of the final MD5 digest.
    """
    import struct

    # Step 1: pad or truncate the password to exactly 32 bytes, filling up
    # from the start of the padding string.
    padded_password = b_((str_(password) + str_(_encryption_padding))[:32])
    # Steps 2-5: hash the padded password, the /O entry, the /P entry
    # (low-order byte first) and the first file identifier.
    hasher = md5(padded_password)
    hasher.update(owner_entry.original_bytes)
    hasher.update(struct.pack('<i', p_entry))
    hasher.update(id1_entry.original_bytes)
    # Step 6 (revision 3 or greater): if metadata is not being encrypted,
    # feed four 0xFF bytes into the hash.
    if rev >= 3 and not metadata_encrypt:
        hasher.update(b_("\xff\xff\xff\xff"))
    # Step 7: finish the hash.
    md5_hash = hasher.digest()
    # Step 8 (revision 3 or greater): re-hash the first keylen bytes of
    # the previous digest, 50 times.
    if rev >= 3:
        for _ in range(50):
            md5_hash = md5(md5_hash[:keylen]).digest()
    # Step 9: the key is the first keylen bytes of the final digest.
    return md5_hash[:keylen]
# Implementation of algorithm 3.3 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
def _alg33(owner_pwd, user_pwd, rev, keylen):
    """
    Compute the value of the encryption dictionary's /O entry
    (algorithm 3.3).

    :param owner_pwd: the owner password string.
    :param user_pwd: the user password string.
    :param rev: security handler revision.
    :param keylen: length of the RC4 key in bytes.
    :return: the RC4-encrypted, padded user password.
    """
    # Steps 1-4: derive the RC4 key from the owner password.
    key = _alg33_1(owner_pwd, rev, keylen)
    # Step 5: pad or truncate the user password string as described in
    # step 1 of algorithm 3.2.  The password goes through str_() exactly
    # as _alg32 does, so it is normalised the same way before being
    # joined with the padding.
    user_pwd = b_((str_(user_pwd) + str_(_encryption_padding))[:32])
    # Step 6: encrypt the result of step 5 with RC4 using the key from
    # step 4.
    val = utils.RC4_encrypt(key, user_pwd)
    # Step 7 (revision 3 or greater): do the following 19 times: take the
    # previous RC4 output and encrypt it again, using a key formed by
    # XOR-ing each byte of the step-4 key with the iteration counter
    # (from 1 to 19).
    if rev >= 3:
        for i in range(1, 20):
            # Build the derived key the same way _alg35 does.
            new_key = b_('')
            for l in range(len(key)):
                new_key += b_(chr(ord_(key[l]) ^ i))
            val = utils.RC4_encrypt(new_key, val)
    # Step 8: the final RC4 output is the value of the /O entry.
    return val
# Steps 1-4 of algorithm 3.3
def _alg33_1(password, rev, keylen):
    """
    Derive the RC4 key used for the /O entry (steps 1-4 of algorithm 3.3).

    :param password: the owner password string (or the user password if
        there is no owner password).
    :param rev: security handler revision.
    :param keylen: length of the RC4 key in bytes.
    :return: the first ``keylen`` bytes of the final MD5 digest.
    """
    # Step 1: pad or truncate the owner password string as described in
    # step 1 of algorithm 3.2.  The password goes through str_() exactly
    # as _alg32 does; without it, joining a bytes password with the
    # str_() padding raises TypeError on Python 3.
    password = b_((str_(password) + str_(_encryption_padding))[:32])
    # Step 2: initialize the MD5 hash function with the result of step 1.
    m = md5(password)
    # Step 3 (revision 3 or greater): do the following 50 times: take the
    # output of the previous MD5 hash and pass it into a new MD5 hash.
    md5_hash = m.digest()
    if rev >= 3:
        for _ in range(50):
            md5_hash = md5(md5_hash).digest()
    # Step 4: the RC4 key is the first keylen bytes of the final digest.
    return md5_hash[:keylen]
# Implementation of algorithm 3.4 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
def _alg34(password, owner_entry, p_entry, id1_entry):
    """
    Compute the /U entry for the encryption dictionary (algorithm 3.4).

    :param password: the user password string.
    :param owner_entry: the encryption dictionary's /O entry.
    :param p_entry: the encryption dictionary's /P entry.
    :param id1_entry: first element of the file identifier array.
    :return: tuple ``(U, key)`` - the /U value and the encryption key.
    """
    # Step 1: derive the encryption key as in algorithm 3.2, always with
    # revision 2 and a 5-byte key.
    key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry)
    # Steps 2-3: the /U entry is the 32-byte padding string encrypted
    # with RC4 under that key.
    user_entry = utils.RC4_encrypt(key, _encryption_padding)
    return user_entry, key
# Implementation of algorithm 3.5 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt):
    """
    Compute the /U entry for the encryption dictionary (algorithm 3.5).

    :param password: the user password string.
    :param rev: security handler revision.
    :param keylen: length of the key in bytes.
    :param owner_entry: the encryption dictionary's /O entry.
    :param p_entry: the encryption dictionary's /P entry.
    :param id1_entry: first element of the file identifier array; its
        ``original_bytes`` are hashed.
    :param metadata_encrypt: accepted but not used by this function.
    :return: tuple ``(U, key)`` - the 32-byte /U value and the key.
    """
    # Step 1: create the encryption key from the user password as
    # described in algorithm 3.2.
    # NOTE(review): metadata_encrypt is not forwarded here, so _alg32
    # always runs with its default (True) - confirm against the callers
    # before changing this.
    key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
    # Steps 2-3: MD5 over the 32-byte padding string followed by the
    # first element of the file identifier array.
    hasher = md5()
    hasher.update(_encryption_padding)
    hasher.update(id1_entry.original_bytes)
    md5_hash = hasher.digest()
    # Step 4: encrypt the 16-byte digest with RC4 under the step-1 key.
    val = utils.RC4_encrypt(key, md5_hash)
    # Step 5: do the following 19 times: re-encrypt the previous RC4
    # output with a key made by XOR-ing every byte of the original key
    # with the iteration counter (from 1 to 19).
    for counter in range(1, 20):
        xored_key = b_('')
        for position in range(len(key)):
            xored_key += b_(chr(ord_(key[position]) ^ counter))
        val = utils.RC4_encrypt(xored_key, val)
    # Step 6: append 16 bytes of arbitrary padding to form the 32-byte
    # /U value.  Null bytes are used as the padding, which seems to match
    # a few other implementations.
    padding = b_('\x00') * 16
    return val + padding, key