1"""
2Multi-part parsing for file uploads.
4Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to
5file upload handlers for processing.
6"""
8import base64
9import binascii
10import collections
11import html
13from plain.exceptions import (
14 RequestDataTooBig,
15 SuspiciousMultipartForm,
16 TooManyFieldsSent,
17 TooManyFilesSent,
18)
19from plain.internal.files.uploadhandler import SkipFile, StopFutureHandlers, StopUpload
20from plain.runtime import settings
21from plain.utils.datastructures import MultiValueDict
22from plain.utils.encoding import force_str
23from plain.utils.http import parse_header_parameters
24from plain.utils.regex_helper import _lazy_re_compile
26__all__ = ("MultiPartParser", "MultiPartParserError", "InputStreamExhausted")


class MultiPartParserError(Exception):
    pass


class InputStreamExhausted(Exception):
    """
    No more reads are allowed from this device.
    """

    pass


RAW = "raw"
FILE = "file"
FIELD = "field"
FIELD_TYPES = frozenset([FIELD, RAW])


class MultiPartParser:
    """
    An RFC 7578 multipart/form-data parser.

    ``MultiPartParser.parse()`` reads the input stream in ``chunk_size`` chunks
    and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``.
    """

    boundary_re = _lazy_re_compile(r"[ -~]{0,200}[!-~]")

    def __init__(self, META, input_data, upload_handlers, encoding=None):
        """
        Initialize the MultiPartParser object.

        :META:
            The standard ``META`` dictionary in Plain request objects.
        :input_data:
            The raw post data, as a file-like object.
        :upload_handlers:
            A list of UploadHandler instances that perform operations on the
            uploaded data.
        :encoding:
            The encoding with which to treat the incoming data.
        """
        # Content-Type should contain multipart and the boundary information.
        content_type = META.get("CONTENT_TYPE", "")
        if not content_type.startswith("multipart/"):
            raise MultiPartParserError(f"Invalid Content-Type: {content_type}")

        try:
            content_type.encode("ascii")
        except UnicodeEncodeError:
            raise MultiPartParserError(
                f"Invalid non-ASCII Content-Type in multipart: {force_str(content_type)}"
            )

        # Parse the header to get the boundary to split the parts.
        _, opts = parse_header_parameters(content_type)
        boundary = opts.get("boundary")
        if not boundary or not self.boundary_re.fullmatch(boundary):
            raise MultiPartParserError(
                f"Invalid boundary in multipart: {force_str(boundary)}"
            )

        # Content-Length should contain the length of the body we are about
        # to receive.
        try:
            content_length = int(META.get("CONTENT_LENGTH", 0))
        except (ValueError, TypeError):
            content_length = 0

        if content_length < 0:
            # This means we shouldn't continue...raise an error.
            raise MultiPartParserError(f"Invalid content length: {content_length!r}")

        self._boundary = boundary.encode("ascii")
        self._input_data = input_data

        # For compatibility with low-level network APIs (with 32-bit integers),
        # the chunk size should be < 2^31, but still divisible by 4.
        possible_sizes = [x.chunk_size for x in upload_handlers if x.chunk_size]
        self._chunk_size = min([2**31 - 4] + possible_sizes)
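        # For illustration: with a single handler whose chunk_size is 64 KiB,
        # this evaluates to min([2**31 - 4, 65536]) == 65536; the cap
        # 2**31 - 4 == 2147483644 is itself divisible by 4.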

        self._meta = META
        self._encoding = encoding or settings.DEFAULT_CHARSET
        self._content_length = content_length
        self._upload_handlers = upload_handlers

    def parse(self):
        # Call the actual parse routine and close all open files in case of
        # errors. This is needed because if exceptions are thrown the
        # MultiPartParser will not be garbage collected immediately and
        # resources would be kept alive. This is only needed for errors because
        # the Request object closes all uploaded files at the end of the
        # request.
        try:
            return self._parse()
        except Exception:
            if hasattr(self, "_files"):
                for _, files in self._files.lists():
                    for fileobj in files:
                        fileobj.close()
            raise

    def _parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Return a tuple containing the POST and FILES dictionary, respectively.
        """
        from plain.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        # HTTP spec says that Content-Length >= 0 is valid
        # handling content-length == 0 before continuing
        if self._content_length == 0:
            return QueryDict(encoding=self._encoding), MultiValueDict()

        # See if any of the handlers take care of the parsing.
        # This allows overriding everything if need be.
        for handler in handlers:
            result = handler.handle_raw_input(
                self._input_data,
                self._meta,
                self._content_length,
                self._boundary,
                encoding,
            )
            # Check to see if it was handled
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict(mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(self._input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        # Number of bytes that have been read.
        num_bytes_read = 0
        # To count the number of keys in the request.
        num_post_keys = 0
        # To count the number of files in the request.
        num_files = 0
        # To limit the amount of data read from the request.
        read_size = None
        # Whether a file upload is finished.
        uploaded_file = True

        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None
                    uploaded_file = True

                if (
                    item_type in FIELD_TYPES
                    and settings.DATA_UPLOAD_MAX_NUMBER_FIELDS is not None
                ):
                    # Avoid storing more than DATA_UPLOAD_MAX_NUMBER_FIELDS.
                    num_post_keys += 1
                    # 2 accounts for empty raw fields before and after the
                    # last boundary.
                    if settings.DATA_UPLOAD_MAX_NUMBER_FIELDS + 2 < num_post_keys:
                        raise TooManyFieldsSent(
                            "The number of GET/POST parameters exceeded "
                            "settings.DATA_UPLOAD_MAX_NUMBER_FIELDS."
                        )

                try:
                    disposition = meta_data["content-disposition"][1]
                    field_name = disposition["name"].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get("content-transfer-encoding")
                if transfer_encoding is not None:
                    transfer_encoding = transfer_encoding[0].strip()
                field_name = force_str(field_name, encoding, errors="replace")

                if item_type == FIELD:
                    # Avoid reading more than DATA_UPLOAD_MAX_MEMORY_SIZE.
                    if settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None:
                        read_size = (
                            settings.DATA_UPLOAD_MAX_MEMORY_SIZE - num_bytes_read
                        )

                    # This is a post field, we can just set it in the post
                    if transfer_encoding == "base64":
                        raw_data = field_stream.read(size=read_size)
                        num_bytes_read += len(raw_data)
                        try:
                            data = base64.b64decode(raw_data)
                        except binascii.Error:
                            data = raw_data
                    else:
                        data = field_stream.read(size=read_size)
                        num_bytes_read += len(data)

                    # Add two here to make the check consistent with the
                    # x-www-form-urlencoded check that includes '&='.
                    num_bytes_read += len(field_name) + 2
                    if (
                        settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None
                        and num_bytes_read > settings.DATA_UPLOAD_MAX_MEMORY_SIZE
                    ):
                        raise RequestDataTooBig(
                            "Request body exceeded "
                            "settings.DATA_UPLOAD_MAX_MEMORY_SIZE."
                        )

                    self._post.appendlist(
                        field_name, force_str(data, encoding, errors="replace")
                    )
                elif item_type == FILE:
                    # Avoid storing more than DATA_UPLOAD_MAX_NUMBER_FILES.
                    num_files += 1
                    if (
                        settings.DATA_UPLOAD_MAX_NUMBER_FILES is not None
                        and num_files > settings.DATA_UPLOAD_MAX_NUMBER_FILES
                    ):
                        raise TooManyFilesSent(
                            "The number of files exceeded "
                            "settings.DATA_UPLOAD_MAX_NUMBER_FILES."
                        )
                    # This is a file, use the handler...
                    file_name = disposition.get("filename")
                    if file_name:
                        file_name = force_str(file_name, encoding, errors="replace")
                        file_name = self.sanitize_file_name(file_name)
                    if not file_name:
                        continue

                    content_type, content_type_extra = meta_data.get(
                        "content-type", ("", {})
                    )
                    content_type = content_type.strip()
                    charset = content_type_extra.get("charset")

                    try:
                        content_length = int(meta_data.get("content-length")[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    counters = [0] * len(handlers)
                    uploaded_file = False
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(
                                    field_name,
                                    file_name,
                                    content_type,
                                    content_length,
                                    charset,
                                    content_type_extra,
                                )
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == "base64":
                                # We only special-case base64 transfer encoding
                                # We should always decode base64 chunks by
                                # multiple of 4, ignoring whitespace.

                                stripped_chunk = b"".join(chunk.split())

                                remaining = len(stripped_chunk) % 4
                                while remaining != 0:
                                    over_chunk = field_stream.read(4 - remaining)
                                    if not over_chunk:
                                        break
                                    stripped_chunk += b"".join(over_chunk.split())
                                    remaining = len(stripped_chunk) % 4
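
                                # For illustration: if a stripped chunk is 10
                                # bytes, 10 % 4 == 2, so 4 - 2 == 2 more bytes
                                # are read from the stream until the length is
                                # a multiple of 4 that base64 can decode.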
                                try:
                                    chunk = base64.b64decode(stripped_chunk)
                                except Exception as exc:
                                    # Since this is only a chunk, any error is
                                    # an unfixable error.
                                    raise MultiPartParserError(
                                        "Could not decode base64 data."
                                    ) from exc

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk, counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # Don't continue if the chunk received by
                                    # the handler is None.
                                    break

                    except SkipFile:
                        self._close_files()
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD nor a FILE, exhaust the field
                    # stream. Note: There could be an error here at some point,
                    # but there will be at least two RAW types (before and
                    # after the other boundaries). This branch is usually not
                    # reached at all, because a missing content-disposition
                    # header will skip the whole boundary.
                    exhaust(field_stream)
        except StopUpload as e:
            self._close_files()
            if not e.connection_reset:
                exhaust(self._input_data)
        else:
            if not uploaded_file:
                for handler in handlers:
                    handler.upload_interrupted()
            # Make sure that the request data is all fed
            exhaust(self._input_data)

        # Signal that the upload has completed.
        # any() shortcircuits if a handler's upload_complete() returns a value.
        any(handler.upload_complete() for handler in handlers)
        self._post._mutable = False
        return self._post, self._files

    def handle_file_complete(self, old_field_name, counters):
        """
        Handle all the signaling that takes place when a file is complete.
        """
        for i, handler in enumerate(self._upload_handlers):
            file_obj = handler.file_complete(counters[i])
            if file_obj:
                # If it returns a file object, then set the files dict.
                self._files.appendlist(
                    force_str(old_field_name, self._encoding, errors="replace"),
                    file_obj,
                )
                break

    def sanitize_file_name(self, file_name):
        """
        Sanitize the filename of an upload.

        Remove all possible path separators, even though that might remove more
        than actually required by the target system. Filenames that could
        potentially cause problems (current/parent dir) are also discarded.

        It should be noted that this function could still return a "filepath"
        like "C:some_file.txt" which is handled later on by the storage layer.
        So while this function does sanitize filenames to some extent, the
        resulting filename should still be considered as untrusted user input.
        """
        file_name = html.unescape(file_name)
        file_name = file_name.rsplit("/")[-1]
        file_name = file_name.rsplit("\\")[-1]
        # Remove non-printable characters.
        file_name = "".join([char for char in file_name if char.isprintable()])

        if file_name in {"", ".", ".."}:
            return None
        return file_name

    IE_sanitize = sanitize_file_name
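
    # A few hand-checked examples of the sanitization above (illustrative):
    #
    #   sanitize_file_name("../../etc/passwd")   -> "passwd"
    #   sanitize_file_name("C:\\evil\\file.txt") -> "file.txt"
    #   sanitize_file_name("..")                 -> None (discarded)
    #   sanitize_file_name("C:some_file.txt")    -> "C:some_file.txt"
    #                                               (left to the storage layer)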

    def _close_files(self):
        # Free up all file handles.
        # FIXME: this currently assumes that upload handlers store the file as 'file'
        # We should document that...
        # (Maybe add handler.free_file to complement new_file)
        for handler in self._upload_handlers:
            if hasattr(handler, "file"):
                handler.file.close()


class LazyStream:
    """
    The LazyStream wrapper allows one to get and "unget" bytes from a stream.

    Given a producer object (an iterator that yields bytestrings), the
    LazyStream object will support iteration, reading, and keeping a "look-back"
    variable in case you need to "unget" some bytes.
    """

    def __init__(self, producer, length=None):
        """
        Every LazyStream must have a producer when instantiated.

        A producer is an iterable that returns a string each time it
        is called.
        """
        self._producer = producer
        self._empty = False
        self._leftover = b""
        self.length = length
        self.position = 0
        self._remaining = length
        self._unget_history = []

    def tell(self):
        return self.position

    def read(self, size=None):
        def parts():
            remaining = self._remaining if size is None else size
            # do the whole thing in one shot if no limit was provided.
            if remaining is None:
                yield b"".join(self)
                return

            # otherwise do some bookkeeping to return exactly enough
            # of the stream and stashing any extra content we get from
            # the producer
            while remaining != 0:
                assert remaining > 0, "remaining bytes to read should never go negative"

                try:
                    chunk = next(self)
                except StopIteration:
                    return
                else:
                    emitting = chunk[:remaining]
                    self.unget(chunk[remaining:])
                    remaining -= len(emitting)
                    yield emitting

        return b"".join(parts())

    def __next__(self):
        """
        Used when the exact number of bytes to read is unimportant.

        Return whatever chunk is conveniently returned from the iterator.
        Useful to avoid unnecessary bookkeeping if performance is an issue.
        """
        if self._leftover:
            output = self._leftover
            self._leftover = b""
        else:
            output = next(self._producer)
            self._unget_history = []
        self.position += len(output)
        return output

    def close(self):
        """
        Used to invalidate/disable this lazy stream.

        Replace the producer with an empty list. Any leftover bytes that have
        already been read will still be reported upon read() and/or next().
        """
        self._producer = []

    def __iter__(self):
        return self

    def unget(self, bytes):
        """
        Place bytes back onto the front of the lazy stream.

        Future calls to read() will return those bytes first. The
        stream position and thus tell() will be rewound.
        """
        if not bytes:
            return
        self._update_unget_history(len(bytes))
        self.position -= len(bytes)
        self._leftover = bytes + self._leftover

    def _update_unget_history(self, num_bytes):
        """
        Update the unget history as a sanity check to see if we've pushed
        back the same number of bytes in one chunk. If we keep ungetting the
        same number of bytes many times (here, 50), we're most likely in an
        infinite loop of some sort. This is usually caused by a
        maliciously-malformed MIME request.
        """
        self._unget_history = [num_bytes] + self._unget_history[:49]
        number_equal = len(
            [
                current_number
                for current_number in self._unget_history
                if current_number == num_bytes
            ]
        )

        if number_equal > 40:
            raise SuspiciousMultipartForm(
                "The multipart parser got stuck, which shouldn't happen with"
                " normal uploaded files. Check for malicious upload activity;"
                " if there is none, report this to the Plain developers."
            )


class ChunkIter:
    """
    An iterable that will yield chunks of data. Given a file-like object as the
    constructor, yield chunks of read operations from that object.
    """

    def __init__(self, flo, chunk_size=64 * 1024):
        self.flo = flo
        self.chunk_size = chunk_size

    def __next__(self):
        try:
            data = self.flo.read(self.chunk_size)
        except InputStreamExhausted:
            raise StopIteration()
        if data:
            return data
        else:
            raise StopIteration()

    def __iter__(self):
        return self


class InterBoundaryIter:
    """
    A Producer that will iterate over boundaries.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return LazyStream(BoundaryIter(self._stream, self._boundary))
        except InputStreamExhausted:
            raise StopIteration()


class BoundaryIter:
    """
    A Producer that is sensitive to boundaries.

    Will happily yield bytes until a boundary is found. Will yield the bytes
    before the boundary, throw away the boundary bytes themselves, and push the
    post-boundary bytes back on the stream.

    Future calls to next() after locating the boundary will raise a
    StopIteration exception.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary
        self._done = False
        # rollback an additional six bytes because the format is like
        # this: CRLF<boundary>[--CRLF]
        self._rollback = len(boundary) + 6

        # Make sure the stream isn't already exhausted before handing out
        # any data.
        unused_char = self._stream.read(1)
        if not unused_char:
            raise InputStreamExhausted()
        self._stream.unget(unused_char)

    def __iter__(self):
        return self

    def __next__(self):
        if self._done:
            raise StopIteration()

        stream = self._stream
        rollback = self._rollback

        bytes_read = 0
        chunks = []
        for bytes in stream:
            bytes_read += len(bytes)
            chunks.append(bytes)
            if bytes_read > rollback:
                break
            if not bytes:
                break
        else:
            self._done = True

        if not chunks:
            raise StopIteration()

        chunk = b"".join(chunks)
        boundary = self._find_boundary(chunk)

        if boundary:
            end, next = boundary
            stream.unget(chunk[next:])
            self._done = True
            return chunk[:end]
        else:
            # make sure we don't treat a partial boundary (and
            # its separators) as data
            if not chunk[:-rollback]:  # and len(chunk) >= (len(self._boundary) + 6):
                # There's nothing left, we should just return and mark as done.
                self._done = True
                return chunk
            else:
                stream.unget(chunk[-rollback:])
                return chunk[:-rollback]

    def _find_boundary(self, data):
        """
        Find a multipart boundary in data.

        Should no boundary exist in the data, return None. Otherwise, return
        a tuple containing the indices of the following:
        * the end of current encapsulation
        * the start of the next encapsulation
        """
        index = data.find(self._boundary)
        if index < 0:
            return None
        else:
            end = index
            next = index + len(self._boundary)
            # backup over CRLF
            last = max(0, end - 1)
            if data[last : last + 1] == b"\n":
                end -= 1
            last = max(0, end - 1)
            if data[last : last + 1] == b"\r":
                end -= 1
            return end, next


def exhaust(stream_or_iterable):
    """Exhaust an iterator or stream."""
    try:
        iterator = iter(stream_or_iterable)
    except TypeError:
        iterator = ChunkIter(stream_or_iterable, 16384)
    collections.deque(iterator, maxlen=0)  # consume iterator quickly.


def parse_boundary_stream(stream, max_header_size):
    """
    Parse one and exactly one stream that encapsulates a boundary.
    """
    # Stream at beginning of header, look for end of header
    # and parse it if found. The header must fit within one
    # chunk.
    chunk = stream.read(max_header_size)

    # find() returns the index of the first of these four bytes, so we'll
    # need to munch them later to prevent them from polluting the payload.
    header_end = chunk.find(b"\r\n\r\n")

    if header_end == -1:
        # we find no header, so we just mark this fact and pass on
        # the stream verbatim
        stream.unget(chunk)
        return (RAW, {}, stream)

    header = chunk[:header_end]

    # here we place any excess chunk back onto the stream, as
    # well as throwing away the CRLFCRLF bytes from above.
    stream.unget(chunk[header_end + 4 :])

    TYPE = RAW
    outdict = {}

    # Eliminate blank lines
    for line in header.split(b"\r\n"):
        # This terminology ("main value" and "dictionary of
        # parameters") is from the Python docs.
        try:
            main_value_pair, params = parse_header_parameters(line.decode())
            name, value = main_value_pair.split(":", 1)
            params = {k: v.encode() for k, v in params.items()}
        except ValueError:  # Invalid header.
            continue

        if name == "content-disposition":
            TYPE = FIELD
            if params.get("filename"):
                TYPE = FILE

        outdict[name] = value, params

    if TYPE == RAW:
        stream.unget(chunk)

    return (TYPE, outdict, stream)


class Parser:
    def __init__(self, stream, boundary):
        self._stream = stream
        self._separator = b"--" + boundary

    def __iter__(self):
        boundarystream = InterBoundaryIter(self._stream, self._separator)
        for sub_stream in boundarystream:
            # Iterate over each part
            yield parse_boundary_stream(sub_stream, 1024)
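

# For orientation, a sketch of the RFC 7578 wire format this module parses
# (the boundary token and field names below are illustrative):
#
#   Content-Type: multipart/form-data; boundary=BOUNDARY
#
#   --BOUNDARY\r\n
#   Content-Disposition: form-data; name="title"\r\n
#   \r\n
#   hello\r\n
#   --BOUNDARY\r\n
#   Content-Disposition: form-data; name="upload"; filename="a.txt"\r\n
#   Content-Type: text/plain\r\n
#   \r\n
#   ...file bytes...\r\n
#   --BOUNDARY--\r\n
#
# MultiPartParser.parse() would report "title" in the POST QueryDict and hand
# "a.txt" to the upload handlers, which produce the corresponding FILES entry.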