Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/PyPDF2/generic.py : 23%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# vim: sw=4:expandtab:foldmethod=marker
2#
3# Copyright (c) 2006, Mathieu Fenniak
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are
8# met:
9#
10# * Redistributions of source code must retain the above copyright notice,
11# this list of conditions and the following disclaimer.
12# * Redistributions in binary form must reproduce the above copyright notice,
13# this list of conditions and the following disclaimer in the documentation
14# and/or other materials provided with the distribution.
15# * The name of the author may not be used to endorse or promote products
16# derived from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28# POSSIBILITY OF SUCH DAMAGE.
31"""
32Implementation of generic PDF objects (dictionary, number, string, and so on)
33"""
34__author__ = "Mathieu Fenniak"
35__author_email__ = "biziqe@mathieu.fenniak.net"
37import re
38from .utils import readNonWhitespace, RC4_encrypt, skipOverComment
39from .utils import b_, u_, chr_, ord_
40from .utils import PdfStreamError
41import warnings
42from . import filters
43from . import utils
44import decimal
45import codecs
46import sys
47#import debugging
# First bytes that readObject() uses to pick a reader. The position of the
# byte in this string is the dispatch index: '/' name, '<' hex string or
# dictionary, '[' array, 't' / 'f' boolean, '(' string, 'n' null, '%' comment.
ObjectPrefix = b_('/<[tf(n%')
# A leading sign byte means the token is a number, never an indirect reference.
NumberSigns = b_('+-')
# "<digits> <digits> R" followed by a non-letter; readObject() matches this
# against the upcoming bytes to recognise an indirect reference.
IndirectPattern = re.compile(b_(r"(\d+)\s+(\d+)\s+R[^a-zA-Z]"))
def readObject(stream, pdf):
    """
    Read the next PDF object from ``stream`` and return it.

    The first byte is peeked (the stream is rewound afterwards) and looked up
    in ``ObjectPrefix`` to select the reader. A byte that is not listed there
    is treated as the start of a number or of an indirect reference.

    :param stream: seekable binary stream positioned at the start of an object.
    :param pdf: reader object passed on to the readers that take it (names,
        dictionaries, arrays, indirect references).
    :return: whatever the selected reader returns.
    :raises PdfStreamError: if the stream ends inside a comment.
    """
    tok = stream.read(1)
    stream.seek(-1, 1)  # reset to start
    idx = ObjectPrefix.find(tok)
    if idx == 0:
        # name object
        return NameObject.readFromStream(stream, pdf)
    elif idx == 1:
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1)  # reset to start
        if peek == b_('<<'):
            return DictionaryObject.readFromStream(stream, pdf)
        else:
            return readHexStringFromStream(stream)
    elif idx == 2:
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    elif idx == 3 or idx == 4:
        # boolean object
        return BooleanObject.readFromStream(stream)
    elif idx == 5:
        # string object
        return readStringFromStream(stream)
    elif idx == 6:
        # null object
        return NullObject.readFromStream(stream)
    elif idx == 7:
        # comment: discard everything up to the end of the line, then read
        # the object that follows it
        while tok not in (b_('\r'), b_('\n')):
            tok = stream.read(1)
            if not tok:
                # read() keeps returning an empty value at end of stream,
                # which would otherwise keep this loop running forever
                raise PdfStreamError("Stream has ended unexpectedly")
        tok = readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    else:
        # number object OR indirect reference
        if tok in NumberSigns:
            # number
            return NumberObject.readFromStream(stream)
        peek = stream.read(20)
        stream.seek(-len(peek), 1)  # reset to start
        if IndirectPattern.match(peek) is not None:
            return IndirectObject.readFromStream(stream, pdf)
        else:
            return NumberObject.readFromStream(stream)
class PdfObject(object):
    """Base class shared by the PDF object types defined in this module."""

    def getObject(self):
        """Resolves indirect references; a plain object is its own target."""
        return self
class NullObject(PdfObject):
    """The PDF ``null`` object."""

    def writeToStream(self, stream, encryption_key):
        """Write the ``null`` keyword; ``encryption_key`` is not used."""
        stream.write(b_("null"))

    @staticmethod
    def readFromStream(stream):
        """
        Consume four bytes from ``stream`` and return a NullObject.

        :raises PdfReadError: if those bytes are not ``null``.
        """
        if stream.read(4) == b_("null"):
            return NullObject()
        raise utils.PdfReadError("Could not read Null object")
class BooleanObject(PdfObject):
    """A PDF boolean; the Python value is kept in ``self.value``."""

    def __init__(self, value):
        self.value = value

    def writeToStream(self, stream, encryption_key):
        """Write ``true`` or ``false``; ``encryption_key`` is not used."""
        keyword = b_("true") if self.value else b_("false")
        stream.write(keyword)

    @staticmethod
    def readFromStream(stream):
        """
        Read ``true`` or ``false`` from ``stream``.

        Only the first four bytes are compared; for ``fals`` one more byte is
        consumed without being checked.

        :raises PdfReadError: if the first four bytes are neither keyword.
        """
        keyword = stream.read(4)
        if keyword == b_("true"):
            return BooleanObject(True)
        if keyword != b_("fals"):
            raise utils.PdfReadError('Could not read Boolean object')
        stream.read(1)  # the fifth byte of "false"
        return BooleanObject(False)
class ArrayObject(list, PdfObject):
    """A PDF array, stored as a Python list of PDF objects."""

    def writeToStream(self, stream, encryption_key):
        """Write ``[ item item ... ]``, delegating each item to its own writer."""
        stream.write(b_("["))
        for item in self:
            stream.write(b_(" "))
            item.writeToStream(stream, encryption_key)
        stream.write(b_(" ]"))

    @staticmethod
    def readFromStream(stream, pdf):
        """
        Read an array from ``stream``, which must be positioned on ``[``.

        :param pdf: passed through to readObject() for every element.
        :return: a new ArrayObject holding the elements in file order.
        :raises PdfReadError: if the first byte is not ``[``.
        """
        result = ArrayObject()
        if stream.read(1) != b_("["):
            raise utils.PdfReadError("Could not read array")
        while True:
            # skip leading whitespace, then step back onto the first
            # non-whitespace byte
            ch = stream.read(1)
            while ch.isspace():
                ch = stream.read(1)
            stream.seek(-1, 1)
            # a closing bracket ends the array
            if stream.read(1) == b_("]"):
                return result
            # otherwise rewind and let readObject() parse the element
            stream.seek(-1, 1)
            result.append(readObject(stream, pdf))
class IndirectObject(PdfObject):
    """
    A reference ("<idnum> <generation> R") to an object owned by ``pdf``.

    :param idnum: object number.
    :param generation: generation number.
    :param pdf: the object whose ``getObject()`` resolves this reference.
    """

    def __init__(self, idnum, generation, pdf):
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def getObject(self):
        """Resolve the reference through ``self.pdf`` and return the target."""
        return self.pdf.getObject(self).getObject()

    def __repr__(self):
        return "IndirectObject(%r, %r)" % (self.idnum, self.generation)

    def __eq__(self, other):
        # Equal only to another reference with the same numbers that belongs
        # to the very same pdf object.
        return (
            isinstance(other, IndirectObject) and
            self.idnum == other.idnum and
            self.generation == other.generation and
            self.pdf is other.pdf
        )

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Defining __eq__ alone makes instances unhashable on Python 3.
        # The hash is built from the same three things __eq__ compares
        # (pdf by identity).
        return hash((self.idnum, self.generation, id(self.pdf)))

    def writeToStream(self, stream, encryption_key):
        """Write ``<idnum> <generation> R``; ``encryption_key`` is not used."""
        stream.write(b_("%s %s R" % (self.idnum, self.generation)))

    @staticmethod
    def readFromStream(stream, pdf):
        """
        Read ``<idnum> <generation> R`` from ``stream``.

        :raises PdfStreamError: if the stream ends inside either number.
        :raises PdfReadError: if the byte after the numbers is not ``R``.
        """
        idnum = b_("")
        while True:
            tok = stream.read(1)
            if not tok:
                # stream has truncated prematurely
                raise PdfStreamError("Stream has ended unexpectedly")
            if tok.isspace():
                break
            idnum += tok
        generation = b_("")
        while True:
            tok = stream.read(1)
            if not tok:
                # stream has truncated prematurely
                raise PdfStreamError("Stream has ended unexpectedly")
            if tok.isspace():
                if not generation:
                    # still skipping the whitespace between the two numbers
                    continue
                break
            generation += tok
        r = readNonWhitespace(stream)
        if r != b_("R"):
            raise utils.PdfReadError("Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell()))
        return IndirectObject(int(idnum), int(generation), pdf)
class FloatObject(decimal.Decimal, PdfObject):
    """A PDF real number, stored as a ``decimal.Decimal``."""

    def __new__(cls, value="0", context=None):
        try:
            return decimal.Decimal.__new__(cls, utils.str_(value), context)
        except Exception:
            # Retry with the plain str() form. Catching Exception rather than
            # everything lets KeyboardInterrupt / SystemExit propagate.
            return decimal.Decimal.__new__(cls, str(value))

    def __repr__(self):
        """Return the number as text: integral values without a fraction,
        others with at most five decimals and no trailing zeros."""
        if self == self.to_integral():
            return str(self.quantize(decimal.Decimal(1)))
        # Standard formatting adds useless extraneous zeros; remove them.
        return ("%.5f" % self).rstrip('0')

    def as_numeric(self):
        """Return the value as a Python float."""
        return float(b_(repr(self)))

    def writeToStream(self, stream, encryption_key):
        """Write the repr() text; ``encryption_key`` is not used."""
        stream.write(b_(repr(self)))
class NumberObject(int, PdfObject):
    """A PDF integer."""

    # First byte that cannot belong to a number; reading stops there.
    # NOTE(review): inside the character class "+-." is a range ('+' through
    # '.'), so ',' is accepted as well -- confirm that this is intended.
    NumberPattern = re.compile(b_('[^+-.0-9]'))
    ByteDot = b_(".")

    def __new__(cls, value):
        as_int = int(value)
        try:
            return int.__new__(cls, as_int)
        except OverflowError:
            return int.__new__(cls, 0)

    def as_numeric(self):
        """Return the value as a plain Python int."""
        return int(b_(repr(self)))

    def writeToStream(self, stream, encryption_key):
        """Write the repr() text; ``encryption_key`` is not used."""
        stream.write(b_(repr(self)))

    @staticmethod
    def readFromStream(stream):
        """
        Read a numeric token from ``stream``.

        :return: a FloatObject if the token contains a dot, otherwise a
            NumberObject.
        """
        raw = utils.readUntilRegex(stream, NumberObject.NumberPattern)
        if NumberObject.ByteDot in raw:
            return FloatObject(raw)
        return NumberObject(raw)
277##
278# Given a string (either a "str" or "unicode"), create a ByteStringObject or a
279# TextStringObject to represent the string.
def createStringObject(string):
    """
    Wrap ``string`` in a TextStringObject or a ByteStringObject.

    Text input becomes a TextStringObject directly. Byte input is decoded as
    UTF-16 when it starts with the UTF-16BE BOM, otherwise with
    decode_pdfdocencoding(); the matching ``autodetect_*`` flag is set on the
    result. Bytes that cannot be decoded are returned as a ByteStringObject.

    :raises TypeError: if ``string`` is neither text nor bytes.
    """
    if isinstance(string, utils.string_type):
        return TextStringObject(string)
    if not isinstance(string, utils.bytes_type):
        raise TypeError("createStringObject should have str or unicode arg")

    has_bom = string.startswith(codecs.BOM_UTF16_BE)
    try:
        if has_bom:
            text = TextStringObject(string.decode("utf-16"))
            text.autodetect_utf16 = True
        else:
            # This is probably a big performance hit, but the only way to
            # find out whether the bytes are text is to try decoding them.
            text = TextStringObject(decode_pdfdocencoding(string))
            text.autodetect_pdfdocencoding = True
    except UnicodeDecodeError:
        # Some strings are strings, some are just byte arrays.
        return ByteStringObject(string)
    return text
def readHexStringFromStream(stream):
    """
    Read a hexadecimal string (``<...>``) from ``stream``.

    Whitespace between digits is skipped. A lone final digit is padded with
    a trailing ``0``. The decoded bytes are handed to createStringObject().

    :raises PdfStreamError: if the stream ends before the closing ``>``.
    """
    stream.read(1)  # discard the opening '<'
    chars = []
    pair = b_("")
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok == b_(">"):
            break
        pair += tok
        if len(pair) == 2:
            chars.append(chr(int(pair, base=16)))
            pair = b_("")
    if len(pair) == 1:
        pair += b_("0")
    if len(pair) == 2:
        chars.append(chr(int(pair, base=16)))
    return createStringObject(b_("".join(chars)))
def readStringFromStream(stream):
    """
    Read a literal string (``(...)``) from ``stream``.

    Balanced parentheses are kept as part of the string, backslash escapes
    are decoded, and a backslash followed by a line break contributes
    nothing. The resulting bytes are handed to createStringObject().

    :raises PdfStreamError: if the stream ends before the closing ``)``.
    :raises PdfReadError: on an escape sequence that is not recognised.
    """
    # Escapes that map to a different byte sequence.
    escapes = {
        b_("n"): b_("\n"),
        b_("r"): b_("\r"),
        b_("t"): b_("\t"),
        b_("b"): b_("\b"),
        b_("f"): b_("\f"),
        # Same two bytes the original "\c" literal produced, written without
        # relying on an invalid escape sequence.
        b_("c"): b_("\\c"),
    }
    # Escaped bytes that simply stand for themselves, including the
    # odd/unnecessary escape sequences we have encountered.
    literal = (b_("("), b_(")"), b_("/"), b_("\\"), b_(" "), b_("%"),
               b_("<"), b_(">"), b_("["), b_("]"), b_("#"), b_("_"),
               b_("&"), b_('$'))

    tok = stream.read(1)  # the opening '('
    parens = 1
    txt = b_("")
    while True:
        tok = stream.read(1)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok == b_("("):
            parens += 1
        elif tok == b_(")"):
            parens -= 1
            if parens == 0:
                break
        elif tok == b_("\\"):
            tok = stream.read(1)
            if tok in escapes:
                tok = escapes[tok]
            elif tok in literal:
                pass
            elif tok.isdigit():
                # "The number ddd may consist of one, two, or three
                # octal digits; high-order overflow shall be ignored.
                # Three octal digits shall be used, with leading zeros
                # as needed, if the next character of the string is also
                # a digit." (PDF reference 7.3.4.2, p 16)
                for _ in range(2):
                    ntok = stream.read(1)
                    if ntok.isdigit():
                        tok += ntok
                    else:
                        # This byte is not part of the escape: put it back
                        # so the main loop sees it (nothing to put back at
                        # end of stream).
                        if ntok:
                            stream.seek(-1, 1)
                        break
                # keep the low 8 bits only, as the quoted rule requires
                tok = b_(chr(int(tok, base=8) & 0xFF))
            elif tok in b_("\n\r"):
                # This case is hit when a backslash followed by a line
                # break occurs.  If it's a multi-char EOL, consume the
                # second character:
                tok = stream.read(1)
                if tok not in b_("\n\r"):
                    stream.seek(-1, 1)
                # Then don't add anything to the actual string, since this
                # line break was escaped:
                tok = b_('')
            else:
                raise utils.PdfReadError(r"Unexpected escaped string: %s" % tok)
        txt += tok
    return createStringObject(txt)
395##
396# Represents a string object where the text encoding could not be determined.
397# This occurs quite often, as the PDF spec doesn't provide an alternate way to
398# represent strings -- for example, the encryption data stored in files (like
399# /O) is clearly not text, but is still stored in a "String" object.
class ByteStringObject(utils.bytes_type, PdfObject):
    """A string object kept as raw bytes."""

    @property
    def original_bytes(self):
        # For compatibility with TextStringObject.original_bytes: a byte
        # string is its own original bytes.
        return self

    def writeToStream(self, stream, encryption_key):
        """Write the bytes as ``<hex>``, RC4-encrypting them first when
        ``encryption_key`` is truthy."""
        payload = self
        if encryption_key:
            payload = RC4_encrypt(encryption_key, payload)
        stream.write(b_("<"))
        stream.write(utils.hexencode(payload))
        stream.write(b_(">"))
416##
417# Represents a string object that has been decoded into a real unicode string.
418# If read from a PDF document, this string appeared to match the
419# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to
420# occur.
class TextStringObject(utils.string_type, PdfObject):
    """A string object held as decoded text."""

    # Set by createStringObject() to record which decoding produced the text.
    autodetect_pdfdocencoding = False
    autodetect_utf16 = False

    @property
    def original_bytes(self):
        # It is occasionally possible that a text string object gets created
        # where a byte string object was expected due to the autodetection
        # mechanism; this property back-calculates the encoded bytes.
        return self.get_original_bytes()

    def get_original_bytes(self):
        """
        Return the bytes this text was decoded from, based on the
        ``autodetect_*`` flag that is set.

        :raises Exception: if neither flag is set.
        """
        if self.autodetect_utf16:
            return codecs.BOM_UTF16_BE + self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")

    def writeToStream(self, stream, encryption_key):
        """
        Write the string to ``stream``.

        The text is encoded with encode_pdfdocencoding() when possible and
        as BOM-prefixed UTF-16BE otherwise. With a truthy ``encryption_key``
        the bytes are RC4-encrypted and written through ByteStringObject;
        without one they are written as a literal ``(...)`` string.
        """
        # Try PDFDocEncoding first: it is nicer to look at in the PDF file.
        try:
            encoded = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            encoded = codecs.BOM_UTF16_BE + self.encode("utf-16be")

        if encryption_key:
            encrypted = RC4_encrypt(encryption_key, encoded)
            ByteStringObject(encrypted).writeToStream(stream, None)
            return

        stream.write(b_("("))
        for c in encoded:
            if chr_(c).isalnum() or c == b_(' '):
                stream.write(b_(chr_(c)))
            else:
                # everything else goes out as a three-digit octal escape
                stream.write(b_("\\%03o" % ord_(c)))
        stream.write(b_(")"))
class NameObject(str, PdfObject):
    """A PDF name (``/Something``), stored as text including the slash."""

    # A name ends at whitespace or at one of these delimiter characters.
    delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]"))
    surfix = b_("/")

    def writeToStream(self, stream, encryption_key):
        """Write the name as bytes; ``encryption_key`` is not used."""
        stream.write(b_(self))

    @staticmethod
    def readFromStream(stream, pdf):
        """
        Read a name from ``stream``, which must be positioned on ``/``.

        :param pdf: reader object; its ``strict`` flag decides whether a name
            that is not valid UTF-8 is an error or only a warning.
        :raises PdfReadError: if the first byte is not ``/``, or if the name
            is not valid UTF-8 and ``pdf.strict`` is set.
        """
        name = stream.read(1)
        if name != NameObject.surfix:
            raise utils.PdfReadError("name read error")
        name += utils.readUntilRegex(stream, NameObject.delimiterPattern,
            ignore_eof=True)
        try:
            return NameObject(name.decode('utf-8'))
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            if pdf.strict:
                raise utils.PdfReadError("Illegal character in Name Object")
            warnings.warn("Illegal character in Name Object", utils.PdfReadWarning)
            if isinstance(name, str):
                # Python 2: the raw value is already a ``str``
                return NameObject(name)
            # Python 3: str(bytes) would yield the text "b'...'", so map each
            # byte to the code point of the same value instead.
            return NameObject(name.decode('latin-1'))
class DictionaryObject(dict, PdfObject):
    """
    A PDF dictionary.

    Keys and values must be PdfObject instances. Item access through ``[]``
    calls ``getObject()`` on the stored value; ``raw_get()`` returns the
    stored value itself.
    """

    def raw_get(self, key):
        """Return the stored value for ``key`` without calling getObject()."""
        return dict.__getitem__(self, key)

    def __setitem__(self, key, value):
        if not isinstance(key, PdfObject):
            raise ValueError("key must be PdfObject")
        if not isinstance(value, PdfObject):
            raise ValueError("value must be PdfObject")
        return dict.__setitem__(self, key, value)

    def setdefault(self, key, value=None):
        """Like dict.setdefault(), with the same type checks as item
        assignment (so the default ``value`` of None is rejected)."""
        if not isinstance(key, PdfObject):
            raise ValueError("key must be PdfObject")
        if not isinstance(value, PdfObject):
            raise ValueError("value must be PdfObject")
        return dict.setdefault(self, key, value)

    def __getitem__(self, key):
        return dict.__getitem__(self, key).getObject()

    def getXmpMetadata(self):
        """
        Retrieve XMP (Extensible Metadata Platform) data relevant to this
        object, if available.

        Stability: Added in v1.12, will exist for all future v1.x releases.

        :return: an ``xmp.XmpInformation`` instance built from the
            ``/Metadata`` entry (the entry is replaced by that instance), or
            None if there is no ``/Metadata`` entry.
        """
        metadata = self.get("/Metadata", None)
        if metadata is None:
            return None
        metadata = metadata.getObject()
        from . import xmp
        if not isinstance(metadata, xmp.XmpInformation):
            metadata = xmp.XmpInformation(metadata)
            self[NameObject("/Metadata")] = metadata
        return metadata

    # Read-only property that accesses the getXmpMetadata() function.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None)

    def writeToStream(self, stream, encryption_key):
        """Write ``<< key value ... >>``, one entry per line."""
        stream.write(b_("<<\n"))
        for key, value in list(self.items()):
            key.writeToStream(stream, encryption_key)
            stream.write(b_(" "))
            value.writeToStream(stream, encryption_key)
            stream.write(b_("\n"))
        stream.write(b_(">>"))

    @staticmethod
    def readFromStream(stream, pdf):
        """
        Read a dictionary from ``stream``, which must be positioned on ``<<``.

        If the dictionary is followed by the ``stream`` keyword, the stream
        data is read as well and the result is built by
        StreamObject.initializeFromDictionary().

        :param pdf: reader object; used to read nested objects, to resolve an
            indirect ``/Length`` and for its ``strict`` flag.
        :return: a DictionaryObject, or a stream object when stream data
            follows the dictionary.
        :raises PdfReadError: if the input does not start with ``<<``, if a
            key is defined twice while ``pdf.strict`` is set, or if the
            ``endstream`` marker cannot be found.
        :raises PdfStreamError: if the stream ends inside the dictionary.
        """
        tmp = stream.read(2)
        if tmp != b_("<<"):
            raise utils.PdfReadError("Dictionary read error at byte %s: stream must begin with '<<'" % utils.hexStr(stream.tell()))
        data = {}
        while True:
            tok = readNonWhitespace(stream)
            if tok == b_('\x00'):
                continue
            elif tok == b_('%'):
                stream.seek(-1, 1)
                skipOverComment(stream)
                continue
            if not tok:
                # stream has truncated prematurely
                raise PdfStreamError("Stream has ended unexpectedly")

            if tok == b_(">"):
                # first '>' of the closing '>>'; consume the second one
                stream.read(1)
                break
            stream.seek(-1, 1)
            key = readObject(stream, pdf)
            tok = readNonWhitespace(stream)
            stream.seek(-1, 1)
            value = readObject(stream, pdf)
            # NOTE(review): this is a truthiness test, so a key whose first
            # value is falsy (0, empty string, empty array) is overwritten
            # silently instead of being reported -- confirm intended.
            if not data.get(key):
                data[key] = value
            elif pdf.strict:
                # multiple definitions of key not permitted
                raise utils.PdfReadError("Multiple definitions in dictionary at byte %s for key %s" \
                                           % (utils.hexStr(stream.tell()), key))
            else:
                warnings.warn("Multiple definitions in dictionary at byte %s for key %s" \
                                           % (utils.hexStr(stream.tell()), key), utils.PdfReadWarning)

        pos = stream.tell()
        s = readNonWhitespace(stream)
        if s == b_('s') and stream.read(5) == b_('tream'):
            eol = stream.read(1)
            # odd PDF file output has spaces after 'stream' keyword but before EOL.
            # patch provided by Danial Sandler
            while eol == b_(' '):
                eol = stream.read(1)
            assert eol in (b_("\n"), b_("\r"))
            if eol == b_("\r"):
                # read \n after
                if stream.read(1) != b_('\n'):
                    stream.seek(-1, 1)
            # this is a stream object, not a dictionary
            assert "/Length" in data
            length = data["/Length"]
            if isinstance(length, IndirectObject):
                # resolving the reference may move the stream position
                t = stream.tell()
                length = pdf.getObject(length)
                stream.seek(t, 0)
            data["__streamdata__"] = stream.read(length)
            e = readNonWhitespace(stream)
            ndstream = stream.read(8)
            if (e + ndstream) != b_("endstream"):
                # (sigh) - the odd PDF file has a length that is too long, so
                # we need to read backwards to find the "endstream" ending.
                # ReportLab (unknown version) generates files with this bug,
                # and Python users into PDF files tend to be our audience.
                # we need to do this to correct the streamdata and chop off
                # an extra character.
                pos = stream.tell()
                stream.seek(-10, 1)
                end = stream.read(9)
                if end == b_("endstream"):
                    # we found it by looking back one character further.
                    data["__streamdata__"] = data["__streamdata__"][:-1]
                else:
                    stream.seek(pos, 0)
                    raise utils.PdfReadError("Unable to find 'endstream' marker after stream at byte %s." % utils.hexStr(stream.tell()))
        else:
            # no stream follows: go back to just after the dictionary
            stream.seek(pos, 0)
        if "__streamdata__" in data:
            return StreamObject.initializeFromDictionary(data)
        retval = DictionaryObject()
        retval.update(data)
        return retval
class TreeObject(DictionaryObject):
    """
    A dictionary whose children form a linked list.

    The parent holds ``/First``, ``/Last`` and ``/Count``; each child holds
    ``/Parent`` and, where applicable, ``/Prev`` and ``/Next``.
    """

    def __init__(self):
        DictionaryObject.__init__(self)

    def hasChildren(self):
        """Return True if a ``/First`` entry is present."""
        return '/First' in self

    def __iter__(self):
        # NOTE: iterating a TreeObject yields its children, not its keys.
        return self.children()

    def children(self):
        """Yield each child, following ``/Next`` from ``/First`` to ``/Last``."""
        if not self.hasChildren():
            # A generator ends with a plain return; raising StopIteration
            # inside one is turned into a RuntimeError (PEP 479).
            return

        child = self['/First']
        while True:
            yield child
            if child == self['/Last']:
                return
            child = child['/Next']

    def addChild(self, child, pdf):
        """
        Append ``child`` as the new last child.

        :param child: object whose ``getObject()`` gives the child dictionary.
        :param pdf: object whose ``getReference()`` supplies the indirect
            references stored in the links.
        """
        childObj = child.getObject()
        child = pdf.getReference(childObj)
        assert isinstance(child, IndirectObject)

        if '/First' not in self:
            self[NameObject('/First')] = child
            self[NameObject('/Count')] = NumberObject(0)
            prev = None
        else:
            prev = self['/Last']

        self[NameObject('/Last')] = child
        self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] + 1)

        if prev:
            prevRef = pdf.getReference(prev)
            assert isinstance(prevRef, IndirectObject)
            childObj[NameObject('/Prev')] = prevRef
            prev[NameObject('/Next')] = child

        parentRef = pdf.getReference(self)
        assert isinstance(parentRef, IndirectObject)
        childObj[NameObject('/Parent')] = parentRef

    def _decrementCount(self):
        # Arithmetic on a NumberObject gives a plain int, which __setitem__
        # rejects; wrap the result again, as addChild() does.
        self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] - 1)

    def removeChild(self, child):
        """
        Unlink ``child`` from this tree and strip its ``/Parent``, ``/Next``
        and ``/Prev`` entries.

        :raises ValueError: if ``child`` has no ``/Parent``, belongs to a
            different parent, or is not found among the children.
        """
        childObj = child.getObject()

        if NameObject('/Parent') not in childObj:
            raise ValueError("Removed child does not appear to be a tree item")
        elif childObj[NameObject('/Parent')] != self:
            raise ValueError("Removed child is not a member of this tree")

        found = False
        prevRef = None
        prev = None
        curRef = self[NameObject('/First')]
        cur = curRef.getObject()
        lastRef = self[NameObject('/Last')]
        last = lastRef.getObject()
        while cur is not None:
            if cur == childObj:
                if prev is None:
                    if NameObject('/Next') in cur:
                        # Removing first tree node
                        nextRef = cur[NameObject('/Next')]
                        nxt = nextRef.getObject()
                        del nxt[NameObject('/Prev')]
                        self[NameObject('/First')] = nextRef
                        self._decrementCount()
                    else:
                        # Removing only tree node
                        assert self[NameObject('/Count')] == 1
                        del self[NameObject('/Count')]
                        del self[NameObject('/First')]
                        if NameObject('/Last') in self:
                            del self[NameObject('/Last')]
                else:
                    if NameObject('/Next') in cur:
                        # Removing middle tree node
                        nextRef = cur[NameObject('/Next')]
                        nxt = nextRef.getObject()
                        nxt[NameObject('/Prev')] = prevRef
                        prev[NameObject('/Next')] = nextRef
                        self._decrementCount()
                    else:
                        # Removing last tree node
                        assert cur == last
                        del prev[NameObject('/Next')]
                        self[NameObject('/Last')] = prevRef
                        self._decrementCount()
                found = True
                break

            prevRef = curRef
            prev = cur
            if NameObject('/Next') in cur:
                curRef = cur[NameObject('/Next')]
                cur = curRef.getObject()
            else:
                curRef = None
                cur = None

        if not found:
            raise ValueError("Removal couldn't find item in tree")

        del childObj[NameObject('/Parent')]
        if NameObject('/Next') in childObj:
            del childObj[NameObject('/Next')]
        if NameObject('/Prev') in childObj:
            del childObj[NameObject('/Prev')]

    def emptyTree(self):
        """Detach every child and remove ``/Count``, ``/First`` and ``/Last``."""
        # Collect the children first: children() follows /Next, which is
        # deleted below.
        for child in list(self.children()):
            childObj = child.getObject()
            del childObj[NameObject('/Parent')]
            if NameObject('/Next') in childObj:
                del childObj[NameObject('/Next')]
            if NameObject('/Prev') in childObj:
                del childObj[NameObject('/Prev')]

        if NameObject('/Count') in self:
            del self[NameObject('/Count')]
        if NameObject('/First') in self:
            del self[NameObject('/First')]
        if NameObject('/Last') in self:
            del self[NameObject('/Last')]
class StreamObject(DictionaryObject):
    """A PDF stream: a dictionary plus the stream bytes held in ``_data``."""

    def __init__(self):
        self._data = None
        self.decodedSelf = None

    def writeToStream(self, stream, encryption_key):
        """
        Write the dictionary followed by ``stream`` ... ``endstream``.

        ``/Length`` is set from ``len(self._data)`` for the duration of the
        write and removed again afterwards. The data is RC4-encrypted when
        ``encryption_key`` is truthy.
        """
        self[NameObject("/Length")] = NumberObject(len(self._data))
        DictionaryObject.writeToStream(self, stream, encryption_key)
        del self["/Length"]
        stream.write(b_("\nstream\n"))
        data = self._data
        if encryption_key:
            data = RC4_encrypt(encryption_key, data)
        stream.write(data)
        stream.write(b_("\nendstream"))

    @staticmethod
    def initializeFromDictionary(data):
        """
        Build a stream object from the dictionary produced while parsing.

        :param data: dict containing ``__streamdata__`` and ``/Length``; both
            entries are deleted from it.
        :return: an EncodedStreamObject if ``/Filter`` is present, otherwise
            a DecodedStreamObject, holding the remaining entries.
        """
        if "/Filter" in data:
            retval = EncodedStreamObject()
        else:
            retval = DecodedStreamObject()
        retval._data = data["__streamdata__"]
        del data["__streamdata__"]
        del data["/Length"]
        retval.update(data)
        return retval

    def flateEncode(self):
        """
        Return a new EncodedStreamObject holding this stream's data
        compressed with ``filters.FlateDecode.encode``.

        ``/FlateDecode`` is placed in front of any filter this stream already
        declares. Only ``/Filter`` is set on the returned object.
        """
        if "/Filter" in self:
            f = self["/Filter"]
            # Always build a fresh array: inserting into the existing one
            # would also change this stream's own /Filter entry, although
            # its data is left as it is.
            newf = ArrayObject()
            newf.append(NameObject("/FlateDecode"))
            if isinstance(f, ArrayObject):
                newf.extend(f)
            else:
                newf.append(f)
            f = newf
        else:
            f = NameObject("/FlateDecode")
        retval = EncodedStreamObject()
        retval[NameObject("/Filter")] = f
        retval._data = filters.FlateDecode.encode(self._data)
        return retval
class DecodedStreamObject(StreamObject):
    """A stream whose ``_data`` is used as-is."""

    def getData(self):
        """Return the stream data."""
        return self._data

    def setData(self, data):
        """Replace the stream data with ``data``."""
        self._data = data
class EncodedStreamObject(StreamObject):
    """A stream whose data must go through ``filters.decodeStreamData``
    before use."""

    def __init__(self):
        self.decodedSelf = None

    def getData(self):
        """
        Return the decoded stream data.

        The first call builds a DecodedStreamObject (copying every entry
        except ``/Length``, ``/Filter`` and ``/DecodeParms``) and keeps it in
        ``self.decodedSelf``; later calls reuse it.
        """
        # Compare with None: the cached object is a dictionary and is falsy
        # when it has no entries, which would defeat the cache.
        if self.decodedSelf is not None:
            # cached version of decoded object
            return self.decodedSelf.getData()

        # create decoded object
        decoded = DecodedStreamObject()
        decoded._data = filters.decodeStreamData(self)
        for key, value in list(self.items()):
            if key not in ("/Length", "/Filter", "/DecodeParms"):
                decoded[key] = value
        self.decodedSelf = decoded
        return decoded._data

    def setData(self, data):
        """Not supported; always raises PdfReadError."""
        raise utils.PdfReadError("Creating EncodedStreamObject is not currently supported")
class RectangleObject(ArrayObject):
    """
    This class is used to represent *page boxes* in PyPDF2. These boxes include:

        * :attr:`artBox <PyPDF2.pdf.PageObject.artBox>`
        * :attr:`bleedBox <PyPDF2.pdf.PageObject.bleedBox>`
        * :attr:`cropBox <PyPDF2.pdf.PageObject.cropBox>`
        * :attr:`mediaBox <PyPDF2.pdf.PageObject.mediaBox>`
        * :attr:`trimBox <PyPDF2.pdf.PageObject.trimBox>`
    """
    def __init__(self, arr):
        # must have four points
        assert len(arr) == 4
        # every entry that is not already a number object is converted
        numbers = [self.ensureIsNumber(entry) for entry in arr]
        ArrayObject.__init__(self, numbers)

    def ensureIsNumber(self, value):
        """Return ``value`` unchanged if it is a NumberObject or FloatObject,
        otherwise wrapped in a FloatObject."""
        if isinstance(value, (NumberObject, FloatObject)):
            return value
        return FloatObject(value)

    def __repr__(self):
        return "RectangleObject(%s)" % repr(list(self))

    # Stored order is [lower-left x, lower-left y, upper-right x,
    # upper-right y]; the other two corners are derived from these.
    def getLowerLeft_x(self):
        return self[0]

    def getLowerLeft_y(self):
        return self[1]

    def getUpperRight_x(self):
        return self[2]

    def getUpperRight_y(self):
        return self[3]

    def getUpperLeft_x(self):
        return self.getLowerLeft_x()

    def getUpperLeft_y(self):
        return self.getUpperRight_y()

    def getLowerRight_x(self):
        return self.getUpperRight_x()

    def getLowerRight_y(self):
        return self.getLowerLeft_y()

    def getLowerLeft(self):
        return (self.getLowerLeft_x(), self.getLowerLeft_y())

    def getLowerRight(self):
        return (self.getLowerRight_x(), self.getLowerRight_y())

    def getUpperLeft(self):
        return (self.getUpperLeft_x(), self.getUpperLeft_y())

    def getUpperRight(self):
        return (self.getUpperRight_x(), self.getUpperRight_y())

    def setLowerLeft(self, value):
        x, y = [self.ensureIsNumber(coord) for coord in value]
        self[0] = x
        self[1] = y

    def setLowerRight(self, value):
        x, y = [self.ensureIsNumber(coord) for coord in value]
        self[2] = x
        self[1] = y

    def setUpperLeft(self, value):
        x, y = [self.ensureIsNumber(coord) for coord in value]
        self[0] = x
        self[3] = y

    def setUpperRight(self, value):
        x, y = [self.ensureIsNumber(coord) for coord in value]
        self[2] = x
        self[3] = y

    def getWidth(self):
        """Return upper-right x minus lower-left x."""
        right = self.getUpperRight_x()
        left = self.getLowerLeft_x()
        return right - left

    def getHeight(self):
        """Return upper-right y minus lower-left y."""
        top = self.getUpperRight_y()
        bottom = self.getLowerLeft_y()
        return top - bottom

    lowerLeft = property(getLowerLeft, setLowerLeft, None, None)
    """
    Property to read and modify the lower left coordinate of this box
    in (x,y) form.
    """
    lowerRight = property(getLowerRight, setLowerRight, None, None)
    """
    Property to read and modify the lower right coordinate of this box
    in (x,y) form.
    """
    upperLeft = property(getUpperLeft, setUpperLeft, None, None)
    """
    Property to read and modify the upper left coordinate of this box
    in (x,y) form.
    """
    upperRight = property(getUpperRight, setUpperRight, None, None)
    """
    Property to read and modify the upper right coordinate of this box
    in (x,y) form.
    """
class Field(TreeObject):
    """
    A class representing a field dictionary. This class is accessed through
    :meth:`getFields()<PyPDF2.PdfFileReader.getFields>`
    """
    def __init__(self, data):
        DictionaryObject.__init__(self)
        # Copy across whichever of these entries ``data`` has.
        wanted = ("/FT", "/Parent", "/Kids", "/T", "/TU", "/TM", "/Ff",
                  "/V", "/DV", "/AA")
        for attr in wanted:
            try:
                found = data[attr]
            except KeyError:
                continue
            self[NameObject(attr)] = found

    @property
    def fieldType(self):
        return self.get("/FT")
    """
    Read-only property accessing the type of this field.
    """

    @property
    def parent(self):
        return self.get("/Parent")
    """
    Read-only property accessing the parent of this field.
    """

    @property
    def kids(self):
        return self.get("/Kids")
    """
    Read-only property accessing the kids of this field.
    """

    @property
    def name(self):
        return self.get("/T")
    """
    Read-only property accessing the name of this field.
    """

    @property
    def altName(self):
        return self.get("/TU")
    """
    Read-only property accessing the alternate name of this field.
    """

    @property
    def mappingName(self):
        return self.get("/TM")
    """
    Read-only property accessing the mapping name of this field. This
    name is used by PyPDF2 as a key in the dictionary returned by
    :meth:`getFields()<PyPDF2.PdfFileReader.getFields>`
    """

    @property
    def flags(self):
        return self.get("/Ff")
    """
    Read-only property accessing the field flags, specifying various
    characteristics of the field (see Table 8.70 of the PDF 1.7 reference).
    """

    @property
    def value(self):
        return self.get("/V")
    """
    Read-only property accessing the value of this field. Format
    varies based on field type.
    """

    @property
    def defaultValue(self):
        return self.get("/DV")
    """
    Read-only property accessing the default value of this field.
    """

    @property
    def additionalActions(self):
        return self.get("/AA")
    """
    Read-only property accessing the additional actions dictionary.
    This dictionary defines the field's behavior in response to trigger events.
    See Section 8.5.2 of the PDF 1.7 reference.
    """
class Destination(TreeObject):
    """
    A class representing a destination within a PDF file.
    See section 8.2.1 of the PDF 1.6 reference.

    :param str title: Title of this destination.
    :param int page: Page number of this destination.
    :param str typ: How the destination is displayed.
    :param args: Additional arguments may be necessary depending on the type.
    :raises PdfReadError: If destination type is invalid.
    :raises ValueError: If ``typ`` takes additional arguments and the number
        supplied in ``args`` does not match (tuple unpacking fails).

    Valid ``typ`` arguments (see PDF spec for details):
             /Fit       No additional arguments
             /XYZ       [left] [top] [zoomFactor]
             /FitH      [top]
             /FitV      [left]
             /FitR      [left] [bottom] [right] [top]
             /FitB      No additional arguments
             /FitBH     [top]
             /FitBV     [left]
    """
    def __init__(self, title, page, typ, *args):
        DictionaryObject.__init__(self)
        self[NameObject("/Title")] = title
        self[NameObject("/Page")] = page
        self[NameObject("/Type")] = typ

        # from table 8.2 of the PDF 1.7 reference.
        if typ == "/XYZ":
            (self[NameObject("/Left")], self[NameObject("/Top")],
                self[NameObject("/Zoom")]) = args
        elif typ == "/FitR":
            (self[NameObject("/Left")], self[NameObject("/Bottom")],
                self[NameObject("/Right")], self[NameObject("/Top")]) = args
        elif typ in ["/FitH", "/FitBH"]:
            self[NameObject("/Top")], = args
        elif typ in ["/FitV", "/FitBV"]:
            self[NameObject("/Left")], = args
        elif typ in ["/Fit", "/FitB"]:
            # These types take no coordinates; any extra args are ignored.
            pass
        else:
            raise utils.PdfReadError("Unknown Destination Type: %r" % typ)

    def getDestArray(self):
        """
        Build the destination array for this object.

        :return: an ``ArrayObject`` holding the raw ``/Page`` entry, the
            ``/Type`` entry, and then whichever of ``/Left``, ``/Bottom``,
            ``/Right``, ``/Top`` and ``/Zoom`` are present, in that order.
        """
        coordinates = [
            self[x]
            for x in ['/Left', '/Bottom', '/Right', '/Top', '/Zoom']
            if x in self
        ]
        return ArrayObject(
            [self.raw_get('/Page'), self['/Type']] + coordinates)

    def writeToStream(self, stream, encryption_key):
        """
        Write this destination to ``stream`` as a dictionary containing a
        ``/D`` entry (the destination array) and an ``/S /GoTo`` entry.

        :param stream: object with a ``write`` method accepting the values
            produced by ``b_``.
        :param encryption_key: passed through unchanged to the
            ``writeToStream`` calls of the nested objects.
        """
        stream.write(b_("<<\n"))
        NameObject('/D').writeToStream(stream, encryption_key)
        stream.write(b_(" "))
        self.getDestArray().writeToStream(stream, encryption_key)

        NameObject("/S").writeToStream(stream, encryption_key)
        stream.write(b_(" "))
        NameObject("/GoTo").writeToStream(stream, encryption_key)

        stream.write(b_("\n"))
        stream.write(b_(">>"))

    @property
    def title(self):
        """
        Read-only property accessing the destination title.

        :rtype: str
        """
        return self.get("/Title")

    @property
    def page(self):
        """
        Read-only property accessing the destination page number.

        :rtype: int
        """
        return self.get("/Page")

    @property
    def typ(self):
        """
        Read-only property accessing the destination type.

        :rtype: str
        """
        return self.get("/Type")

    @property
    def zoom(self):
        """
        Read-only property accessing the zoom factor.

        :rtype: int, or ``None`` if not available.
        """
        return self.get("/Zoom", None)

    @property
    def left(self):
        """
        Read-only property accessing the left horizontal coordinate.

        :rtype: int, or ``None`` if not available.
        """
        return self.get("/Left", None)

    @property
    def right(self):
        """
        Read-only property accessing the right horizontal coordinate.

        :rtype: int, or ``None`` if not available.
        """
        return self.get("/Right", None)

    @property
    def top(self):
        """
        Read-only property accessing the top vertical coordinate.

        :rtype: int, or ``None`` if not available.
        """
        return self.get("/Top", None)

    @property
    def bottom(self):
        """
        Read-only property accessing the bottom vertical coordinate.

        :rtype: int, or ``None`` if not available.
        """
        return self.get("/Bottom", None)
class Bookmark(Destination):
    """
    A destination that serialises itself as an outline (bookmark) entry.
    """

    def writeToStream(self, stream, encryption_key):
        """
        Write this bookmark to ``stream`` as a dictionary.

        The entries ``/Title``, ``/Parent``, ``/First``, ``/Last``,
        ``/Next`` and ``/Prev`` are written (raw values, one per line) when
        present, followed by a ``/Dest`` entry holding the destination array.

        :param stream: object with a ``write`` method.
        :param encryption_key: passed through unchanged to the
            ``writeToStream`` calls of the nested objects.
        """
        outline_keys = ('/Title', '/Parent', '/First', '/Last', '/Next', '/Prev')
        present = [NameObject(k) for k in outline_keys if k in self]

        stream.write(b_("<<\n"))
        for entry_name in present:
            entry_name.writeToStream(stream, encryption_key)
            stream.write(b_(" "))
            self.raw_get(entry_name).writeToStream(stream, encryption_key)
            stream.write(b_("\n"))

        NameObject('/Dest').writeToStream(stream, encryption_key)
        stream.write(b_(" "))
        self.getDestArray().writeToStream(stream, encryption_key)
        stream.write(b_("\n"))
        stream.write(b_(">>"))
def encode_pdfdocencoding(unicode_string):
    """
    Encode text using the ``_pdfDocEncoding_rev`` translation table.

    :param unicode_string: text to encode; each character is looked up
        individually in ``_pdfDocEncoding_rev``.
    :return: the encoded byte string, one byte per input character
        (``b_('')`` for empty input).
    :raises UnicodeEncodeError: if a character is not in the table.
    """
    # Collect the pieces and join once; repeated ``+=`` on an immutable
    # byte string copies the whole buffer on every iteration.
    encoded = []
    for c in unicode_string:
        try:
            encoded.append(b_(chr(_pdfDocEncoding_rev[c])))
        except KeyError:
            raise UnicodeEncodeError("pdfdocencoding", c, -1, -1,
                                     "does not exist in translation table")
    return b_('').join(encoded)
def decode_pdfdocencoding(byte_array):
    """
    Decode a byte sequence using the ``_pdfDocEncoding`` translation table.

    :param byte_array: iterable of bytes; each item is converted to a table
        index with ``ord_``.
    :return: the decoded unicode string (``u_('')`` for empty input).
    :raises UnicodeDecodeError: if a byte maps to the ``\\u0000``
        placeholder, i.e. has no entry in the table.
    """
    # The placeholder marking "no mapping" is loop-invariant; build it once.
    unmapped = u_('\u0000')
    # Collect the characters and join once instead of growing the string
    # with ``+=`` on every iteration.
    decoded = []
    for b in byte_array:
        c = _pdfDocEncoding[ord_(b)]
        if c == unmapped:
            raise UnicodeDecodeError("pdfdocencoding", utils.barray(b), -1, -1,
                                     "does not exist in translation table")
        decoded.append(c)
    return u_('').join(decoded)
# Byte value -> unicode character table, indexed by byte (0..255).
# u_('\u0000') is the placeholder for "no mapping": the decoder raises on it
# and the reverse table skips it. Each row below covers eight consecutive
# byte values, starting at the offset shown in the trailing comment.
_pdfDocEncoding = (
    # 0x00 - 0x17: no mapping
    (u_('\u0000'),) * 24
    + (
        u_('\u02d8'), u_('\u02c7'), u_('\u02c6'), u_('\u02d9'), u_('\u02dd'), u_('\u02db'), u_('\u02da'), u_('\u02dc'),  # 0x18
        u_('\u0020'), u_('\u0021'), u_('\u0022'), u_('\u0023'), u_('\u0024'), u_('\u0025'), u_('\u0026'), u_('\u0027'),  # 0x20
        u_('\u0028'), u_('\u0029'), u_('\u002a'), u_('\u002b'), u_('\u002c'), u_('\u002d'), u_('\u002e'), u_('\u002f'),  # 0x28
        u_('\u0030'), u_('\u0031'), u_('\u0032'), u_('\u0033'), u_('\u0034'), u_('\u0035'), u_('\u0036'), u_('\u0037'),  # 0x30
        u_('\u0038'), u_('\u0039'), u_('\u003a'), u_('\u003b'), u_('\u003c'), u_('\u003d'), u_('\u003e'), u_('\u003f'),  # 0x38
        u_('\u0040'), u_('\u0041'), u_('\u0042'), u_('\u0043'), u_('\u0044'), u_('\u0045'), u_('\u0046'), u_('\u0047'),  # 0x40
        u_('\u0048'), u_('\u0049'), u_('\u004a'), u_('\u004b'), u_('\u004c'), u_('\u004d'), u_('\u004e'), u_('\u004f'),  # 0x48
        u_('\u0050'), u_('\u0051'), u_('\u0052'), u_('\u0053'), u_('\u0054'), u_('\u0055'), u_('\u0056'), u_('\u0057'),  # 0x50
        u_('\u0058'), u_('\u0059'), u_('\u005a'), u_('\u005b'), u_('\u005c'), u_('\u005d'), u_('\u005e'), u_('\u005f'),  # 0x58
        u_('\u0060'), u_('\u0061'), u_('\u0062'), u_('\u0063'), u_('\u0064'), u_('\u0065'), u_('\u0066'), u_('\u0067'),  # 0x60
        u_('\u0068'), u_('\u0069'), u_('\u006a'), u_('\u006b'), u_('\u006c'), u_('\u006d'), u_('\u006e'), u_('\u006f'),  # 0x68
        u_('\u0070'), u_('\u0071'), u_('\u0072'), u_('\u0073'), u_('\u0074'), u_('\u0075'), u_('\u0076'), u_('\u0077'),  # 0x70
        u_('\u0078'), u_('\u0079'), u_('\u007a'), u_('\u007b'), u_('\u007c'), u_('\u007d'), u_('\u007e'), u_('\u0000'),  # 0x78
        u_('\u2022'), u_('\u2020'), u_('\u2021'), u_('\u2026'), u_('\u2014'), u_('\u2013'), u_('\u0192'), u_('\u2044'),  # 0x80
        u_('\u2039'), u_('\u203a'), u_('\u2212'), u_('\u2030'), u_('\u201e'), u_('\u201c'), u_('\u201d'), u_('\u2018'),  # 0x88
        u_('\u2019'), u_('\u201a'), u_('\u2122'), u_('\ufb01'), u_('\ufb02'), u_('\u0141'), u_('\u0152'), u_('\u0160'),  # 0x90
        u_('\u0178'), u_('\u017d'), u_('\u0131'), u_('\u0142'), u_('\u0153'), u_('\u0161'), u_('\u017e'), u_('\u0000'),  # 0x98
        u_('\u20ac'), u_('\u00a1'), u_('\u00a2'), u_('\u00a3'), u_('\u00a4'), u_('\u00a5'), u_('\u00a6'), u_('\u00a7'),  # 0xa0
        u_('\u00a8'), u_('\u00a9'), u_('\u00aa'), u_('\u00ab'), u_('\u00ac'), u_('\u0000'), u_('\u00ae'), u_('\u00af'),  # 0xa8
        u_('\u00b0'), u_('\u00b1'), u_('\u00b2'), u_('\u00b3'), u_('\u00b4'), u_('\u00b5'), u_('\u00b6'), u_('\u00b7'),  # 0xb0
        u_('\u00b8'), u_('\u00b9'), u_('\u00ba'), u_('\u00bb'), u_('\u00bc'), u_('\u00bd'), u_('\u00be'), u_('\u00bf'),  # 0xb8
        u_('\u00c0'), u_('\u00c1'), u_('\u00c2'), u_('\u00c3'), u_('\u00c4'), u_('\u00c5'), u_('\u00c6'), u_('\u00c7'),  # 0xc0
        u_('\u00c8'), u_('\u00c9'), u_('\u00ca'), u_('\u00cb'), u_('\u00cc'), u_('\u00cd'), u_('\u00ce'), u_('\u00cf'),  # 0xc8
        u_('\u00d0'), u_('\u00d1'), u_('\u00d2'), u_('\u00d3'), u_('\u00d4'), u_('\u00d5'), u_('\u00d6'), u_('\u00d7'),  # 0xd0
        u_('\u00d8'), u_('\u00d9'), u_('\u00da'), u_('\u00db'), u_('\u00dc'), u_('\u00dd'), u_('\u00de'), u_('\u00df'),  # 0xd8
        u_('\u00e0'), u_('\u00e1'), u_('\u00e2'), u_('\u00e3'), u_('\u00e4'), u_('\u00e5'), u_('\u00e6'), u_('\u00e7'),  # 0xe0
        u_('\u00e8'), u_('\u00e9'), u_('\u00ea'), u_('\u00eb'), u_('\u00ec'), u_('\u00ed'), u_('\u00ee'), u_('\u00ef'),  # 0xe8
        u_('\u00f0'), u_('\u00f1'), u_('\u00f2'), u_('\u00f3'), u_('\u00f4'), u_('\u00f5'), u_('\u00f6'), u_('\u00f7'),  # 0xf0
        u_('\u00f8'), u_('\u00f9'), u_('\u00fa'), u_('\u00fb'), u_('\u00fc'), u_('\u00fd'), u_('\u00fe'), u_('\u00ff'),  # 0xf8
    )
)
# The decode table must provide an entry for every possible byte value.
assert len(_pdfDocEncoding) == 256

# Reverse table (unicode character -> byte value) used when encoding.
# Entries equal to the u_("\u0000") placeholder carry no mapping and are
# left out; every remaining character is expected to occur only once.
_pdfDocEncoding_rev = {}
for i, char in enumerate(_pdfDocEncoding):
    if char != u_("\u0000"):
        assert char not in _pdfDocEncoding_rev
        _pdfDocEncoding_rev[char] = i