1""" 

2Multi-part parsing for file uploads. 

3 

4Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to 

5file upload handlers for processing. 

6""" 

7 

8import base64 

9import binascii 

10import collections 

11import html 

12 

13from plain.exceptions import ( 

14 RequestDataTooBig, 

15 SuspiciousMultipartForm, 

16 TooManyFieldsSent, 

17 TooManyFilesSent, 

18) 

19from plain.internal.files.uploadhandler import SkipFile, StopFutureHandlers, StopUpload 

20from plain.runtime import settings 

21from plain.utils.datastructures import MultiValueDict 

22from plain.utils.encoding import force_str 

23from plain.utils.http import parse_header_parameters 

24from plain.utils.regex_helper import _lazy_re_compile 

25 

26__all__ = ("MultiPartParser", "MultiPartParserError", "InputStreamExhausted") 

27 

28 

29class MultiPartParserError(Exception): 

30 pass 

31 

32 

33class InputStreamExhausted(Exception): 

34 """ 

35 No more reads are allowed from this device. 

36 """ 

37 

38 pass 

39 

40 

41RAW = "raw" 

42FILE = "file" 

43FIELD = "field" 

44FIELD_TYPES = frozenset([FIELD, RAW]) 

45 

46 

class MultiPartParser:
    """
    An RFC 7578 multipart/form-data parser.

    ``MultiPartParser.parse()`` reads the input stream in ``chunk_size`` chunks
    and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``.
    """

    boundary_re = _lazy_re_compile(r"[ -~]{0,200}[!-~]")
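
    # Typical usage (an illustrative sketch, not part of the module): ``meta``,
    # ``body_stream``, and ``handlers`` are stand-ins for whatever the caller
    # provides -- the request's META mapping, a file-like request body, and a
    # list of upload handler instances.
    #
    #     parser = MultiPartParser(meta, body_stream, handlers)
    #     post, files = parser.parse()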


    def __init__(self, META, input_data, upload_handlers, encoding=None):
        """
        Initialize the MultiPartParser object.

        :META:
            The standard ``META`` dictionary in Plain request objects.
        :input_data:
            The raw post data, as a file-like object.
        :upload_handlers:
            A list of UploadHandler instances that perform operations on the
            uploaded data.
        :encoding:
            The encoding with which to treat the incoming data.
        """
        # Content-Type should contain multipart and the boundary information.
        content_type = META.get("CONTENT_TYPE", "")
        if not content_type.startswith("multipart/"):
            raise MultiPartParserError(f"Invalid Content-Type: {content_type}")

        try:
            content_type.encode("ascii")
        except UnicodeEncodeError:
            raise MultiPartParserError(
                f"Invalid non-ASCII Content-Type in multipart: {force_str(content_type)}"
            )

        # Parse the header to get the boundary to split the parts.
        _, opts = parse_header_parameters(content_type)
        boundary = opts.get("boundary")
        if not boundary or not self.boundary_re.fullmatch(boundary):
            raise MultiPartParserError(
                f"Invalid boundary in multipart: {force_str(boundary)}"
            )

        # Content-Length should contain the length of the body we are about
        # to receive.
        try:
            content_length = int(META.get("CONTENT_LENGTH", 0))
        except (ValueError, TypeError):
            content_length = 0

        if content_length < 0:
            # This means we shouldn't continue...raise an error.
            raise MultiPartParserError(f"Invalid content length: {content_length!r}")

        self._boundary = boundary.encode("ascii")
        self._input_data = input_data

        # For compatibility with low-level network APIs (with 32-bit integers),
        # the chunk size should be < 2^31, but still divisible by 4.
        possible_sizes = [x.chunk_size for x in upload_handlers if x.chunk_size]
        self._chunk_size = min([2**31 - 4] + possible_sizes)

        self._meta = META
        self._encoding = encoding or settings.DEFAULT_CHARSET
        self._content_length = content_length
        self._upload_handlers = upload_handlers

    def parse(self):
        # Call the actual parse routine and close all open files in case of
        # errors. This is needed because if exceptions are thrown the
        # MultiPartParser will not be garbage collected immediately and
        # resources would be kept alive. This is only needed for errors because
        # the Request object closes all uploaded files at the end of the
        # request.
        try:
            return self._parse()
        except Exception:
            if hasattr(self, "_files"):
                for _, files in self._files.lists():
                    for fileobj in files:
                        fileobj.close()
            raise

    def _parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Return a tuple containing the POST and FILES dictionary, respectively.
        """
        from plain.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        # The HTTP spec says that Content-Length >= 0 is valid, so handle a
        # content-length of 0 before continuing.
        if self._content_length == 0:
            return QueryDict(encoding=self._encoding), MultiValueDict()

        # See if any of the handlers take care of the parsing.
        # This allows overriding everything if need be.
        for handler in handlers:
            result = handler.handle_raw_input(
                self._input_data,
                self._meta,
                self._content_length,
                self._boundary,
                encoding,
            )
            # Check to see if it was handled
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict(mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(self._input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        # Number of bytes that have been read.
        num_bytes_read = 0
        # To count the number of keys in the request.
        num_post_keys = 0
        # To count the number of files in the request.
        num_files = 0
        # To limit the amount of data read from the request.
        read_size = None
        # Whether a file upload is finished.
        uploaded_file = True

        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None
                    uploaded_file = True

                if (
                    item_type in FIELD_TYPES
                    and settings.DATA_UPLOAD_MAX_NUMBER_FIELDS is not None
                ):
                    # Avoid storing more than DATA_UPLOAD_MAX_NUMBER_FIELDS.
                    num_post_keys += 1
                    # 2 accounts for empty raw fields before and after the
                    # last boundary.
                    if settings.DATA_UPLOAD_MAX_NUMBER_FIELDS + 2 < num_post_keys:
                        raise TooManyFieldsSent(
                            "The number of GET/POST parameters exceeded "
                            "settings.DATA_UPLOAD_MAX_NUMBER_FIELDS."
                        )

                try:
                    disposition = meta_data["content-disposition"][1]
                    field_name = disposition["name"].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get("content-transfer-encoding")
                if transfer_encoding is not None:
                    transfer_encoding = transfer_encoding[0].strip()
                field_name = force_str(field_name, encoding, errors="replace")

                if item_type == FIELD:
                    # Avoid reading more than DATA_UPLOAD_MAX_MEMORY_SIZE.
                    if settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None:
                        read_size = (
                            settings.DATA_UPLOAD_MAX_MEMORY_SIZE - num_bytes_read
                        )

                    # This is a post field, we can just set it in the post
                    if transfer_encoding == "base64":
                        raw_data = field_stream.read(size=read_size)
                        num_bytes_read += len(raw_data)
                        try:
                            data = base64.b64decode(raw_data)
                        except binascii.Error:
                            data = raw_data
                    else:
                        data = field_stream.read(size=read_size)
                        num_bytes_read += len(data)

                    # Add two here to make the check consistent with the
                    # x-www-form-urlencoded check that includes '&='.
                    num_bytes_read += len(field_name) + 2
                    if (
                        settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None
                        and num_bytes_read > settings.DATA_UPLOAD_MAX_MEMORY_SIZE
                    ):
                        raise RequestDataTooBig(
                            "Request body exceeded "
                            "settings.DATA_UPLOAD_MAX_MEMORY_SIZE."
                        )

                    self._post.appendlist(
                        field_name, force_str(data, encoding, errors="replace")
                    )
                elif item_type == FILE:
                    # Avoid storing more than DATA_UPLOAD_MAX_NUMBER_FILES.
                    num_files += 1
                    if (
                        settings.DATA_UPLOAD_MAX_NUMBER_FILES is not None
                        and num_files > settings.DATA_UPLOAD_MAX_NUMBER_FILES
                    ):
                        raise TooManyFilesSent(
                            "The number of files exceeded "
                            "settings.DATA_UPLOAD_MAX_NUMBER_FILES."
                        )
                    # This is a file, use the handler...
                    file_name = disposition.get("filename")
                    if file_name:
                        file_name = force_str(file_name, encoding, errors="replace")
                        file_name = self.sanitize_file_name(file_name)
                    if not file_name:
                        continue

                    content_type, content_type_extra = meta_data.get(
                        "content-type", ("", {})
                    )
                    content_type = content_type.strip()
                    charset = content_type_extra.get("charset")

                    try:
                        content_length = int(meta_data.get("content-length")[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    counters = [0] * len(handlers)
                    uploaded_file = False
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(
                                    field_name,
                                    file_name,
                                    content_type,
                                    content_length,
                                    charset,
                                    content_type_extra,
                                )
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == "base64":
                                # We only special-case base64 transfer encoding.
                                # Base64 chunks must be decoded in multiples of
                                # 4 bytes, ignoring whitespace.

                                stripped_chunk = b"".join(chunk.split())

                                remaining = len(stripped_chunk) % 4
                                while remaining != 0:
                                    over_chunk = field_stream.read(4 - remaining)
                                    if not over_chunk:
                                        break
                                    stripped_chunk += b"".join(over_chunk.split())
                                    remaining = len(stripped_chunk) % 4

                                try:
                                    chunk = base64.b64decode(stripped_chunk)
                                except Exception as exc:
                                    # Since this is only a chunk, any error is
                                    # an unfixable error.
                                    raise MultiPartParserError(
                                        "Could not decode base64 data."
                                    ) from exc

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk, counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # Don't continue if the chunk received by
                                    # the handler is None.
                                    break

                    except SkipFile:
                        self._close_files()
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD nor a FILE, exhaust the field
                    # stream. Note: There could be an error here at some point,
                    # but there will be at least two RAW types (before and
                    # after the other boundaries). This branch is usually not
                    # reached at all, because a missing content-disposition
                    # header will skip the whole boundary.
                    exhaust(field_stream)
        except StopUpload as e:
            self._close_files()
            if not e.connection_reset:
                exhaust(self._input_data)
        else:
            if not uploaded_file:
                for handler in handlers:
                    handler.upload_interrupted()
            # Make sure that the request data is all fed
            exhaust(self._input_data)

        # Signal that the upload has completed.
        # any() shortcircuits if a handler's upload_complete() returns a value.
        any(handler.upload_complete() for handler in handlers)
        self._post._mutable = False
        return self._post, self._files

    def handle_file_complete(self, old_field_name, counters):
        """
        Handle all the signaling that takes place when a file is complete.
        """
        for i, handler in enumerate(self._upload_handlers):
            file_obj = handler.file_complete(counters[i])
            if file_obj:
                # If it returns a file object, then set the files dict.
                self._files.appendlist(
                    force_str(old_field_name, self._encoding, errors="replace"),
                    file_obj,
                )
                break

    def sanitize_file_name(self, file_name):
        """
        Sanitize the filename of an upload.

        Remove all possible path separators, even though that might remove more
        than actually required by the target system. Filenames that could
        potentially cause problems (current/parent dir) are also discarded.

        It should be noted that this function could still return a "filepath"
        like "C:some_file.txt" which is handled later on by the storage layer.
        So while this function does sanitize filenames to some extent, the
        resulting filename should still be considered as untrusted user input.
        """
        file_name = html.unescape(file_name)
        file_name = file_name.rsplit("/")[-1]
        file_name = file_name.rsplit("\\")[-1]
        # Remove non-printable characters.
        file_name = "".join([char for char in file_name if char.isprintable()])

        if file_name in {"", ".", ".."}:
            return None
        return file_name

    IE_sanitize = sanitize_file_name
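
    # Illustrative behavior of ``sanitize_file_name`` (a sketch derived from
    # the code above, not an exhaustive specification):
    #
    #     sanitize_file_name("../../etc/passwd")          -> "passwd"
    #     sanitize_file_name("C:\\Users\\me\\report.txt") -> "report.txt"
    #     sanitize_file_name("..")                        -> None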

    def _close_files(self):
        # Free up all file handles.
        # FIXME: this currently assumes that upload handlers store the file as 'file'
        # We should document that...
        # (Maybe add handler.free_file to complement new_file)
        for handler in self._upload_handlers:
            if hasattr(handler, "file"):
                handler.file.close()


class LazyStream:
    """
    The LazyStream wrapper allows one to get and "unget" bytes from a stream.

    Given a producer object (an iterator that yields bytestrings), the
    LazyStream object will support iteration, reading, and keeping a "look-back"
    variable in case you need to "unget" some bytes.
    """

    def __init__(self, producer, length=None):
        """
        Every LazyStream must have a producer when instantiated.

        A producer is an iterator that yields a bytestring each time it
        is advanced.
        """
        self._producer = producer
        self._empty = False
        self._leftover = b""
        self.length = length
        self.position = 0
        self._remaining = length
        self._unget_history = []

    def tell(self):
        return self.position

    def read(self, size=None):
        def parts():
            remaining = self._remaining if size is None else size
            # do the whole thing in one shot if no limit was provided.
            if remaining is None:
                yield b"".join(self)
                return

            # otherwise do some bookkeeping to return exactly enough
            # of the stream, stashing any extra content we get from
            # the producer
            while remaining != 0:
                assert remaining > 0, "remaining bytes to read should never go negative"

                try:
                    chunk = next(self)
                except StopIteration:
                    return
                else:
                    emitting = chunk[:remaining]
                    self.unget(chunk[remaining:])
                    remaining -= len(emitting)
                    yield emitting

        return b"".join(parts())

    def __next__(self):
        """
        Used when the exact number of bytes to read is unimportant.

        Return whatever chunk is conveniently returned from the iterator.
        Useful to avoid unnecessary bookkeeping if performance is an issue.
        """
        if self._leftover:
            output = self._leftover
            self._leftover = b""
        else:
            output = next(self._producer)
            self._unget_history = []
        self.position += len(output)
        return output

    def close(self):
        """
        Used to invalidate/disable this lazy stream.

        Replace the producer with an empty list. Any leftover bytes that have
        already been read will still be reported upon read() and/or next().
        """
        self._producer = []

    def __iter__(self):
        return self

    def unget(self, bytes):
        """
        Place bytes back onto the front of the lazy stream.

        Future calls to read() will return those bytes first. The
        stream position and thus tell() will be rewound.
        """
        if not bytes:
            return
        self._update_unget_history(len(bytes))
        self.position -= len(bytes)
        self._leftover = bytes + self._leftover

    def _update_unget_history(self, num_bytes):
        """
        Update the unget history as a sanity check to see if we've pushed
        back the same number of bytes in one chunk. If we keep ungetting the
        same number of bytes many times (here, 50), we're most likely in an
        infinite loop of some sort. This is usually caused by a
        maliciously-malformed MIME request.
        """
        self._unget_history = [num_bytes] + self._unget_history[:49]
        number_equal = len(
            [
                current_number
                for current_number in self._unget_history
                if current_number == num_bytes
            ]
        )

        if number_equal > 40:
            raise SuspiciousMultipartForm(
                "The multipart parser got stuck, which shouldn't happen with"
                " normal uploaded files. Check for malicious upload activity;"
                " if there is none, report this to the Plain developers."
            )
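
# Illustrative LazyStream round-trip (a sketch): reading a prefix "ungets" the
# surplus bytes, so a later read resumes exactly where the caller left off.
#
#     stream = LazyStream(iter([b"abcdef"]))
#     stream.read(3)   # -> b"abc" (b"def" is pushed back onto the stream)
#     stream.tell()    # -> 3
#     stream.read()    # -> b"def"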


class ChunkIter:
    """
    An iterable that will yield chunks of data. Given a file-like object as the
    constructor, yield chunks of read operations from that object.
    """

    def __init__(self, flo, chunk_size=64 * 1024):
        self.flo = flo
        self.chunk_size = chunk_size

    def __next__(self):
        try:
            data = self.flo.read(self.chunk_size)
        except InputStreamExhausted:
            raise StopIteration()
        if data:
            return data
        else:
            raise StopIteration()

    def __iter__(self):
        return self


class InterBoundaryIter:
    """
    A Producer that will iterate over boundaries.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return LazyStream(BoundaryIter(self._stream, self._boundary))
        except InputStreamExhausted:
            raise StopIteration()


class BoundaryIter:
    """
    A Producer that is sensitive to boundaries.

    Will happily yield bytes until a boundary is found. Will yield the bytes
    before the boundary, throw away the boundary bytes themselves, and push the
    post-boundary bytes back on the stream.

    Future calls to next() after the boundary has been located will raise
    StopIteration.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary
        self._done = False
        # rollback an additional six bytes because the format is like
        # this: CRLF<boundary>[--CRLF]
        self._rollback = len(boundary) + 6

        # Read one byte up front to make sure the stream isn't already
        # exhausted, then put it back.
        unused_char = self._stream.read(1)
        if not unused_char:
            raise InputStreamExhausted()
        self._stream.unget(unused_char)

    def __iter__(self):
        return self

    def __next__(self):
        if self._done:
            raise StopIteration()

        stream = self._stream
        rollback = self._rollback

        bytes_read = 0
        chunks = []
        for bytes in stream:
            bytes_read += len(bytes)
            chunks.append(bytes)
            if bytes_read > rollback:
                break
            if not bytes:
                break
        else:
            self._done = True

        if not chunks:
            raise StopIteration()

        chunk = b"".join(chunks)
        boundary = self._find_boundary(chunk)

        if boundary:
            end, next = boundary
            stream.unget(chunk[next:])
            self._done = True
            return chunk[:end]
        else:
            # make sure we don't treat a partial boundary (and
            # its separators) as data
            if not chunk[:-rollback]:  # and len(chunk) >= (len(self._boundary) + 6):
                # There's nothing left, we should just return and mark as done.
                self._done = True
                return chunk
            else:
                stream.unget(chunk[-rollback:])
                return chunk[:-rollback]

    def _find_boundary(self, data):
        """
        Find a multipart boundary in data.

        Should no boundary exist in the data, return None. Otherwise, return
        a tuple containing the indices of the following:
         * the end of current encapsulation
         * the start of the next encapsulation
        """
        index = data.find(self._boundary)
        if index < 0:
            return None
        else:
            end = index
            next = index + len(self._boundary)
            # backup over CRLF
            last = max(0, end - 1)
            if data[last : last + 1] == b"\n":
                end -= 1
            last = max(0, end - 1)
            if data[last : last + 1] == b"\r":
                end -= 1
            return end, next
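
    # Worked example (illustrative): with ``self._boundary == b"--abc123"`` and
    # ``data == b"value\r\n--abc123--\r\n"``, the boundary is found at index 7,
    # the trailing CRLF of the encapsulation is backed over, and the method
    # returns ``(5, 15)`` -- i.e. the part body is ``data[:5] == b"value"``.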


def exhaust(stream_or_iterable):
    """Exhaust an iterator or stream."""
    try:
        iterator = iter(stream_or_iterable)
    except TypeError:
        iterator = ChunkIter(stream_or_iterable, 16384)
    collections.deque(iterator, maxlen=0)  # consume iterator quickly.


def parse_boundary_stream(stream, max_header_size):
    """
    Parse one and exactly one stream that encapsulates a boundary.
    """
    # Stream at beginning of header, look for end of header
    # and parse it if found. The header must fit within one
    # chunk.
    chunk = stream.read(max_header_size)

    # 'find' returns the start of these four bytes, so we'll
    # need to munch them later to prevent them from polluting
    # the payload.
    header_end = chunk.find(b"\r\n\r\n")

    if header_end == -1:
        # we find no header, so we just mark this fact and pass on
        # the stream verbatim
        stream.unget(chunk)
        return (RAW, {}, stream)

    header = chunk[:header_end]

    # here we place any excess chunk back onto the stream, as
    # well as throwing away the CRLFCRLF bytes from above.
    stream.unget(chunk[header_end + 4 :])

    TYPE = RAW
    outdict = {}

    # Eliminate blank lines
    for line in header.split(b"\r\n"):
        # This terminology ("main value" and "dictionary of
        # parameters") is from the Python docs.
        try:
            main_value_pair, params = parse_header_parameters(line.decode())
            name, value = main_value_pair.split(":", 1)
            params = {k: v.encode() for k, v in params.items()}
        except ValueError:  # Invalid header.
            continue

        if name == "content-disposition":
            TYPE = FIELD
            if params.get("filename"):
                TYPE = FILE

        outdict[name] = value, params

    if TYPE == RAW:
        stream.unget(chunk)

    return (TYPE, outdict, stream)
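
# Illustrative result (a sketch): for a part whose header block contains
#   Content-Disposition: form-data; name="avatar"; filename="me.png"
# the function returns ``FILE`` for TYPE and an outdict shaped roughly like
#   {"content-disposition": ("form-data", {"name": b"avatar", "filename": b"me.png"})}
# (parameter values are encoded to bytes by the loop above).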


class Parser:
    def __init__(self, stream, boundary):
        self._stream = stream
        self._separator = b"--" + boundary

    def __iter__(self):
        boundarystream = InterBoundaryIter(self._stream, self._separator)
        for sub_stream in boundarystream:
            # Iterate over each part
            yield parse_boundary_stream(sub_stream, 1024)
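

# End-to-end sketch (illustrative, not a test): feeding a tiny body through the
# Parser yields (item_type, headers, stream) triples, including the RAW items
# before the first and after the last boundary. ``io`` is assumed to be
# imported by the caller.
#
#     body = (
#         b"--boundary\r\n"
#         b'Content-Disposition: form-data; name="title"\r\n'
#         b"\r\n"
#         b"hello\r\n"
#         b"--boundary--\r\n"
#     )
#     stream = LazyStream(ChunkIter(io.BytesIO(body), 1024))
#     for item_type, headers, part in Parser(stream, b"boundary"):
#         ...  # item_type is one of RAW, FIELD, or FILE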