Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1#!/usr/bin/env python 

2# cardinal_pythonlib/wsgi/reverse_proxied_mw.py 

3 

4""" 

5=============================================================================== 

6 

7 Original code copyright (C) 2009-2021 Rudolf Cardinal (rudolf@pobox.com). 

8 

9 This file is part of cardinal_pythonlib. 

10 

11 Licensed under the Apache License, Version 2.0 (the "License"); 

12 you may not use this file except in compliance with the License. 

13 You may obtain a copy of the License at 

14 

15 https://www.apache.org/licenses/LICENSE-2.0 

16 

17 Unless required by applicable law or agreed to in writing, software 

18 distributed under the License is distributed on an "AS IS" BASIS, 

19 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

20 See the License for the specific language governing permissions and 

21 limitations under the License. 

22 

23=============================================================================== 

24 

25**Middleware to set SCRIPT_NAME environment variable etc. when behind a 

26reverse proxy.** 

27 

28""" 

29 

30from pprint import pformat 

31from typing import List 

32 

33from cardinal_pythonlib.dicts import dict_diff, delete_keys 

34from cardinal_pythonlib.logs import get_brace_style_log_with_null_handler 

35from cardinal_pythonlib.wsgi.constants import ( 

36 TYPE_WSGI_APP, 

37 TYPE_WSGI_APP_RESULT, 

38 TYPE_WSGI_ENVIRON, 

39 TYPE_WSGI_START_RESPONSE, 

40 WsgiEnvVar, 

41) 

42 

43log = get_brace_style_log_with_null_handler(__name__) 

44 

45 

46# ============================================================================= 

47# Helper functions for handling HTTP headers 

48# ============================================================================= 

49 

def ip_addresses_from_xff(value: str) -> List[str]:
    """
    Returns a list of IP addresses (as strings), given the value of an HTTP
    ``X-Forwarded-For`` (or ``WSGI HTTP_X_FORWARDED_FOR``) header.

    Args:
        value:
            the value of an HTTP ``X-Forwarded-For`` (or ``WSGI
            HTTP_X_FORWARDED_FOR``) header

    Returns:
        a list of IP addresses as strings, in header order (leftmost first).
        Empty elements (e.g. from a doubled or trailing comma, or a header
        consisting only of whitespace) are omitted, so the result never
        contains ``''``.

    See:
        - https://en.wikipedia.org/wiki/X-Forwarded-For
        - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For  # noqa
        - NOT THIS: https://tools.ietf.org/html/rfc7239
    """
    if not value:
        return []
    # The separator is nominally comma-space, but let's be liberal: split on
    # bare commas and strip surrounding whitespace from each element.
    stripped = (x.strip() for x in value.split(","))
    # Drop blanks: an empty string is not an address, and would otherwise be
    # reported as "the first address" for input such as ", 1.2.3.4".
    return [x for x in stripped if x]

72 

73 

def first_from_xff(value: str) -> str:
    """
    Returns the first (leftmost) IP address from an ``X-Forwarded-For``
    header, as parsed by :func:`ip_addresses_from_xff`.

    Args:
        value:
            the value of an HTTP ``X-Forwarded-For`` (or ``WSGI
            HTTP_X_FORWARDED_FOR``) header

    Returns:
        an IP address as a string, or ``''`` if none is found

    """
    addresses = ip_addresses_from_xff(value)
    # An empty list means the header yielded nothing; otherwise take the
    # leftmost entry.
    return addresses[0] if addresses else ''

92 

93 

94# ============================================================================= 

95# Middleware to set SCRIPT_NAME environment variable etc. when behind a 

96# reverse proxy. 

97# ============================================================================= 

98 

99EXAMPLE_APACHE_REVERSE_PROXY_CONFIG = """ 

100 

101 # ========================================================================= 

102 # Mount a WSGI application, using CamCOPS as an example 

103 # ========================================================================= 

104 # This WSGI application is served by a SEPARATE web server (e.g. CherryPy); 

105 # Apache just needs to pass information to and fro, and handle the HTTPS  

106 # aspects. 

107 

108 # --------------------------------------------------------------------- 

109 # 1. Proxy requests to the external server and back, and allow access. 

110 # --------------------------------------------------------------------- 

111 # ... either via port 8000 

112 # ... or, better, socket /tmp/.camcops.sock 

113 # NOTES 

114 # - Don't specify trailing slashes. 

115 # If you do, http://host/camcops will fail, though 

116 # http://host/camcops/ will succeed. 

117 # - Using a socket 

118 # - this requires Apache 2.4.9, and passes after the '|' character a 

119 # URL that determines the Host: value of the request; see 

120 # https://httpd.apache.org/docs/trunk/mod/mod_proxy.html#proxypass 

121 # - The Django debug toolbar will then require the bizarre entry in 

122 # the Django settings: INTERNAL_IPS = ("b''", ) -- i.e. the string 

123 # value of "b''", not an empty bytestring. 

124 # - Ensure that you put the CORRECT PROTOCOL (e.g. https) in the rules 

125 # below. 

126  

127 # (a) Proxy  

128  

129 # ... via a port 

130 # Note the use of "http" (reflecting the backend), not https (like the 

131 # front end). 

132 ProxyPass /camcops http://127.0.0.1:8000 retry=0 

133 ProxyPassReverse /camcops http://127.0.0.1:8000 retry=0 

134 

135 # ... or via a socket (Apache 2.4.9 and higher) 

136 # ProxyPass /camcops unix:/tmp/.camcops.sock|https://localhost retry=0 

137 # ProxyPassReverse /camcops unix:/tmp/.camcops.sock|https://localhost retry=0 

138 

139 # (b) Allow proxy over SSL. 

140 # Without this, you will get errors like: 

141 # ... SSL Proxy requested for wombat:443 but not enabled [Hint: SSLProxyEngine] 

142 # ... failed to enable ssl support for 0.0.0.0:0 (httpd-UDS) 

143 SSLProxyEngine on 

144  

145 <Location /camcops> 

146 # (c) Allow access 

147 Require all granted 

148 

149 # (d) Tell the proxied application that we are using HTTPS: 

150 # ... https://stackoverflow.com/questions/16042647 

151 # Enable mod_headers (e.g. "sudo a2enmod headers") and: 

152 RequestHeader set X-Forwarded-Proto https 

153 RequestHeader set X-Script-Name /camcops 

154 </Location> 

155 

156 # --------------------------------------------------------------------- 

157 # 2. Serve static files 

158 # --------------------------------------------------------------------- 

159 # a) offer them at the appropriate URL 

160 # b) provide permission 

161 

162 Alias /camcops/static/ /usr/share/camcops/server/static/ 

163 

164 # Change this: aim the alias at your own institutional logo. 

165 # Alias /camcops/static/logo_local.png /usr/share/camcops/server/static/logo_local.png 

166 

167 <Directory /usr/share/camcops/server/static> 

168 Require all granted 

169 </Directory> 

170 

171""" # noqa 

172 

173 

class ReverseProxiedConfig(object):
    """
    Class to hold information about a reverse proxy configuration.
    """
    def __init__(self,
                 trusted_proxy_headers: List[str] = None,
                 http_host: str = None,
                 remote_addr: str = None,
                 script_name: str = None,
                 server_name: str = None,
                 server_port: int = None,
                 url_scheme: str = None,
                 rewrite_path_info: bool = False) -> None:
        """
        Args:
            trusted_proxy_headers:
                list of headers, from
                :const:`ReverseProxiedMiddleware.ALL_CANDIDATES`, that the
                middleware will treat as trusted and obey. All others from this
                list will be stripped. Names are compared after conversion to
                upper case; names that are not in ``ALL_CANDIDATES`` are
                ignored.

            http_host:
                Value to write to the ``HTTP_HOST`` WSGI variable. If not
                specified, an appropriate trusted header will be used (if there
                is one).

            remote_addr:
                ... similarly for ``REMOTE_ADDR``

            script_name:
                ... similarly for ``SCRIPT_NAME``. Trailing slashes are
                removed.

            server_name:
                ... similarly for ``SERVER_NAME``

            server_port:
                ... similarly for ``SERVER_PORT``. Stored as a string.

            url_scheme:
                ... similarly for ``URL_SCHEME`` (e.g. ``"https"``). Stored
                in lower case.

            rewrite_path_info:
                If ``True``, then if the middleware sets ``SCRIPT_NAME`` and
                ``PATH_INFO`` starts with ``SCRIPT_NAME``, the ``SCRIPT_NAME``
                will be stripped off the front of ``PATH_INFO``.

                This is appropriate for front-end web servers that fail to
                rewrite the incoming URL properly. (Do not use for Apache with
                ``ProxyPass``; ``ProxyPass`` rewrites the URLs properly for
                you.)

                ... as per e.g. http://flask.pocoo.org/snippets/35/
        """
        # Keep only recognised header names, normalised to upper case.
        uppercased = (x.upper() for x in trusted_proxy_headers or [])
        self.trusted_proxy_headers = [
            h for h in uppercased
            if h in ReverseProxiedMiddleware.ALL_CANDIDATES
        ]  # type: List[str]

        # Values passed through unchanged:
        self.http_host = http_host
        self.remote_addr = remote_addr
        self.server_name = server_name
        self.rewrite_path_info = rewrite_path_info

        # Values normalised to a canonical string form ("" meaning "unset"):
        self.script_name = (script_name or "").rstrip("/")
        self.server_port = "" if server_port is None else str(server_port)
        self.url_scheme = (url_scheme or "").lower()

    def necessary(self) -> bool:
        """
        Is any special handling (e.g. the addition of
        :class:`ReverseProxiedMiddleware`) necessary for this config?

        Returns:
            ``True`` if at least one setting held by this object is truthy
        """
        settings = (
            self.trusted_proxy_headers,
            self.http_host,
            self.remote_addr,
            self.script_name,
            self.server_name,
            self.server_port,
            self.url_scheme,
            self.rewrite_path_info,
        )
        return any(settings)

256 

257 

class ReverseProxiedMiddleware(object):
    """
    WSGI middleware to set the ``SCRIPT_NAME`` and ``PATH_INFO`` WSGI
    environment variables (etc.) correctly when behind a reverse proxy.

    Note that the WSGI environment variables ``HTTP_*`` are clones of HTTP
    headers; for example, ``X-Forwarded-For`` in HTTP becomes
    ``HTTP_X_FORWARDED_FOR`` in WSGI.

    For each target variable, a value given explicitly in the
    :class:`ReverseProxiedConfig` takes priority; otherwise the first trusted
    proxy header that has a value in the incoming request is used. After
    processing, every candidate proxy header that was not actually used is
    removed from the WSGI environment.

    See also:

    - http://flask.pocoo.org/snippets/35/
    - http://blog.macuyiko.com/post/2016/fixing-flask-url_for-when-behind-mod_proxy.html
    - http://alex.eftimie.ro/2013/03/21/how-to-run-flask-application-in-a-subpath-using-apache-mod_proxy/
    - http://modwsgi.readthedocs.io/en/develop/release-notes/version-4.4.9.html
    - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers
    """  # noqa

    CANDIDATES_HTTP_HOST = [
        # These are variables that may contain a value for HTTP_HOST.
        WsgiEnvVar.HTTP_X_HOST,
        WsgiEnvVar.HTTP_X_FORWARDED_HOST
    ]
    CANDIDATES_SERVER_PORT = [
        # These are variables that may contain a value for SERVER_PORT.
        WsgiEnvVar.HTTP_X_FORWARDED_PORT
    ]
    CANDIDATES_REMOTE_ADDR = [
        # These are variables that may contain a value for REMOTE_ADDR.
        # However, they differ:
        WsgiEnvVar.HTTP_X_FORWARDED_FOR,  # may contain many values; first is taken  # noqa
        WsgiEnvVar.HTTP_X_REAL_IP  # may contain only one
    ]
    _CANDIDATES_URL_SCHEME_GIVING_PROTOCOL = [
        # These are variables whose values might be "http" or "https",
        # relevant to wsgi.url_scheme.
        WsgiEnvVar.HTTP_X_FORWARDED_PROTO,
        WsgiEnvVar.HTTP_X_FORWARDED_PROTOCOL,
        WsgiEnvVar.HTTP_X_FORWARDED_SCHEME,
        WsgiEnvVar.HTTP_X_SCHEME
    ]
    _CANDIDATES_URL_SCHEME_INDICATING_HTTPS = [
        # These are variables whose values might be "On", "True", or "1",
        # to indicate the use of HTTPS/SSL, relevant to wsgi.url_scheme.
        WsgiEnvVar.HTTP_X_FORWARDED_HTTPS,
        WsgiEnvVar.HTTP_X_FORWARDED_SSL,
        WsgiEnvVar.HTTP_X_HTTPS,
    ]
    CANDIDATES_URL_SCHEME = (
        # All variables that may inform wsgi.url_scheme
        _CANDIDATES_URL_SCHEME_GIVING_PROTOCOL +
        _CANDIDATES_URL_SCHEME_INDICATING_HTTPS
    )
    CANDIDATES_SCRIPT_NAME = [
        # These are variables that may contain a value for SCRIPT_NAME.
        WsgiEnvVar.HTTP_X_SCRIPT_NAME,
        WsgiEnvVar.HTTP_X_FORWARDED_SCRIPT_NAME
    ]
    CANDIDATES_SERVER_NAME = [
        # These are variables that may contain a value for SERVER_NAME.
        WsgiEnvVar.HTTP_X_FORWARDED_SERVER
    ]
    ALL_CANDIDATES = (
        # All variables of interest.
        CANDIDATES_HTTP_HOST +
        CANDIDATES_SERVER_PORT +
        CANDIDATES_REMOTE_ADDR +
        _CANDIDATES_URL_SCHEME_GIVING_PROTOCOL +
        _CANDIDATES_URL_SCHEME_INDICATING_HTTPS +
        CANDIDATES_SCRIPT_NAME +
        CANDIDATES_SERVER_NAME
    )
    SCHEME_HTTPS = 'https'
    TRUE_VALUES_LOWER_CASE = ["on", "true", "1"]

    def __init__(self,
                 app: TYPE_WSGI_APP,
                 config: ReverseProxiedConfig,
                 debug: bool = False) -> None:
        """
        Args:
            app:
                the WSGI application to wrap
            config:
                a :class:`ReverseProxiedConfig` describing which values to
                force and which proxy headers to trust
            debug:
                if ``True``, log (at debug level) what the middleware will do,
                and the changes made to each request's WSGI environment
        """
        self.app = app
        self.config = config
        self.debug = debug

        trusted = config.trusted_proxy_headers

        def only_trusted(candidates: List[str]) -> List[str]:
            # Candidate variables that the config trusts, in candidate
            # (priority) order.
            return [x for x in candidates if x in trusted]

        self.vars_host = only_trusted(self.CANDIDATES_HTTP_HOST)
        self.vars_addr = only_trusted(self.CANDIDATES_REMOTE_ADDR)
        self.vars_script = only_trusted(self.CANDIDATES_SCRIPT_NAME)
        self.vars_server = only_trusted(self.CANDIDATES_SERVER_NAME)
        self.vars_port = only_trusted(self.CANDIDATES_SERVER_PORT)
        self.vars_scheme_a = only_trusted(
            self._CANDIDATES_URL_SCHEME_GIVING_PROTOCOL)
        self.vars_scheme_b = only_trusted(
            self._CANDIDATES_URL_SCHEME_INDICATING_HTTPS)

        if self.debug:
            log.debug("ReverseProxiedMiddleware installed")
            self._report(WsgiEnvVar.HTTP_HOST, config.http_host,
                         self.vars_host)
            self._report(WsgiEnvVar.REMOTE_ADDR, config.remote_addr,
                         self.vars_addr)
            self._report(WsgiEnvVar.SCRIPT_NAME, config.script_name,
                         self.vars_script)
            if config.script_name or self.vars_script:
                log.debug("... which will also affect WSGI environment "
                          "variable {}", WsgiEnvVar.PATH_INFO)
            self._report(WsgiEnvVar.SERVER_NAME, config.server_name,
                         self.vars_server)
            self._report(WsgiEnvVar.SERVER_PORT, config.server_port,
                         self.vars_port)
            self._report(WsgiEnvVar.WSGI_URL_SCHEME, config.url_scheme,
                         self.vars_scheme_a + self.vars_scheme_b)

    @staticmethod
    def _report(option: str, value: str, envvars: List[str]) -> None:
        """
        Logs (at debug level) how the WSGI variable ``option`` will be
        determined: from the fixed ``value`` if there is one; otherwise from
        the trusted request variables ``envvars`` if there are any; otherwise
        it is left alone.
        """
        if value:
            log.debug("... WSGI environment variable {} will be set to {}",
                      option, value)
        elif envvars:
            log.debug("... WSGI environment variable {} will be set according "
                      "to reflect environment variables {!r} in "
                      "incoming requests",
                      option, envvars)
        else:
            log.debug("... WSGI environment variable {} will not be changed",
                      option)

    @classmethod
    def _get_first(cls,
                   environ: TYPE_WSGI_ENVIRON,
                   envvars: List[str],
                   keys_to_keep: List[str],
                   as_remote_addr: bool = False) -> str:
        """
        Returns the value of the first variable in ``envvars`` that has a
        usable (non-empty) value in ``environ``, or ``''`` if none has.

        Args:
            environ: the WSGI environment
            envvars: names of variables to try, in priority order
            keys_to_keep:
                list that is MODIFIED: the name of the variable whose value
                is returned is appended, so that it survives the later
                deletion of unused proxy variables
            as_remote_addr:
                if ``True``, ``HTTP_X_FORWARDED_FOR`` is treated as a list of
                addresses and only its first (leftmost) address is used
        """
        for k in envvars:
            value = environ.get(k, '')
            # Oddity for REMOTE_ADDR and X-Forwarded-For: it may hold several
            # addresses, of which we want the leftmost. Parse BEFORE testing
            # for emptiness, so that a header yielding no address (e.g. only
            # whitespace) falls through to the next candidate rather than
            # being accepted as a blank answer.
            # NOTE(review): the leftmost X-Forwarded-For entry is whatever the
            # furthest-upstream sender claimed; only trust this header if the
            # front-end proxy overwrites rather than appends to it.
            if as_remote_addr and k == WsgiEnvVar.HTTP_X_FORWARDED_FOR:
                value = first_from_xff(value)
            if value:
                keys_to_keep.append(k)
                return value
        return ''

    @classmethod
    def _proto_if_one_true(cls,
                           environ: TYPE_WSGI_ENVIRON,
                           envvars: List[str],
                           keys_to_keep: List[str]) -> str:
        """
        Returns ``'https'`` if any of the variables ``envvars`` has, in
        ``environ``, a value that (case-insensitively) is one of
        :attr:`TRUE_VALUES_LOWER_CASE`; otherwise ``''``. The name of the
        variable that matched is appended to ``keys_to_keep``.
        """
        for k in envvars:
            value = environ.get(k, '')
            if value.lower() in cls.TRUE_VALUES_LOWER_CASE:
                keys_to_keep.append(k)
                return cls.SCHEME_HTTPS
        return ''

    @staticmethod
    def _rewritten_path_info(path_info: str, script_name: str) -> str:
        """
        Returns ``path_info`` with a leading ``script_name`` removed, if and
        only if ``script_name`` matches whole leading path segment(s) of
        ``path_info``; otherwise returns ``path_info`` unchanged.

        For example, with ``script_name == "/camcops"``:

        - ``/camcops`` becomes ``/`` (trailing slash omitted from the
          incoming path);
        - ``/camcops/x`` becomes ``/x``;
        - ``/camcopsfoo`` is unchanged (it merely shares a string prefix;
          stripping would give ``foo``, which is not a valid path).
        """
        prefix = script_name.rstrip("/")
        if not prefix:
            # Script name is "/" (or only slashes): nothing to strip.
            return path_info
        if path_info != prefix and not path_info.startswith(prefix + "/"):
            return path_info
        return path_info[len(prefix):] or "/"

    def __call__(self,
                 environ: TYPE_WSGI_ENVIRON,
                 start_response: TYPE_WSGI_START_RESPONSE) \
            -> TYPE_WSGI_APP_RESULT:
        """
        -----------------------------------------------------------------------
        REWRITING THE HOST (setting HTTP_HOST):
        -----------------------------------------------------------------------

        If you don't rewrite the host, the Pyramid debug toolbar will get
        things a bit wrong. An example:
            http://127.0.0.1:80/camcops
        is proxied by Apache to
            http://127.0.0.1:8000/camcops

        In that situation, HTTP_HOST will be '127.0.0.1:8000', and so the
        Pyramid debug toolbar will start asking the web browser to go to
            http://127.0.0.1:8000/camcops/_debug_toolbar/...
        ... which is wrong (it's a reference to the "internal" site).

        If you allow the host to be rewritten, then you get a sensible
        reference e.g. to
            http://wombat/camcops/_debug_toolbar/...

        Should we be looking at HTTP_X_FORWARDED_HOST or
        HTTP_X_FORWARDED_SERVER?
        See https://github.com/omnigroup/Apache/blob/master/httpd/modules/proxy/mod_proxy_http.c  # noqa
        ... and let's follow mod_wsgi.

        -----------------------------------------------------------------------
        HTTP_HOST versus SERVER_NAME
        -----------------------------------------------------------------------
        https://stackoverflow.com/questions/2297403/what-is-the-difference-between-http-host-and-server-name-in-php  # noqa

        -----------------------------------------------------------------------
        REWRITING THE PROTOCOL
        -----------------------------------------------------------------------
        Consider how we get here. For example, we may have this sequence:

        .. code-block:: none

            user's web browser
            -> Apache front-end web server via HTTPS on port 443
                -> ProxyPass/ProxyPassReverse
                    -> CherryPy server via HTTP on port 8000 or via a Unix socket
                        -> ...
                            -> cherrypy/wsgiserver/__init__.py,
                               WSGIGateway_10.get_environ()
                               ... which creates a WSGI environment from an HTTP request.

        So if you want to see what's coming by way of raw headers, put this
        in at the end of that get_environ() function:

        .. code-block:: python

            from pprint import pformat; import logging; log = logging.getLogger(__name__); log.critical("Request headers:\n" + pformat(req.inheaders))

        """  # noqa
        # Read the flag once, so that the "before" snapshot and the final
        # report are always taken (or skipped) together.
        debug = self.debug
        oldenv = None
        if debug:
            log.debug("Starting WSGI environment: \n{}", pformat(environ))
            oldenv = environ.copy()
        keys_to_keep = []  # type: List[str]
        config = self.config

        # ---------------------------------------------------------------------

        # HTTP_HOST
        http_host = (
            config.http_host or  # manually specified: top priority. Otherwise:
            self._get_first(environ, self.vars_host, keys_to_keep)
        )
        if http_host:
            environ[WsgiEnvVar.HTTP_HOST] = http_host

        # REMOTE_ADDR
        remote_addr = (
            config.remote_addr or
            self._get_first(environ, self.vars_addr, keys_to_keep,
                            as_remote_addr=True)
        )
        if remote_addr:
            environ[WsgiEnvVar.REMOTE_ADDR] = remote_addr

        # SCRIPT_NAME, PATH_INFO
        script_name = (
            config.script_name or
            self._get_first(environ, self.vars_script, keys_to_keep)
        )
        if script_name:
            environ[WsgiEnvVar.SCRIPT_NAME] = script_name
            if config.rewrite_path_info:
                # PEP 3333 allows a server to omit PATH_INFO when it is
                # empty, so don't assume that the key is present.
                path_info = environ.get(WsgiEnvVar.PATH_INFO, '')
                newpath = self._rewritten_path_info(path_info, script_name)
                if newpath != path_info:
                    environ[WsgiEnvVar.PATH_INFO] = newpath

        # SERVER_NAME
        server_name = (
            config.server_name or
            self._get_first(environ, self.vars_server, keys_to_keep)
        )
        if server_name:
            environ[WsgiEnvVar.SERVER_NAME] = server_name

        # SERVER_PORT
        server_port = (
            config.server_port or
            self._get_first(environ, self.vars_port, keys_to_keep)
        )
        if server_port:
            environ[WsgiEnvVar.SERVER_PORT] = server_port

        # wsgi.url_scheme
        url_scheme = (
            config.url_scheme or  # manually specified: top priority. Otherwise:
            self._get_first(environ, self.vars_scheme_a, keys_to_keep) or
            self._proto_if_one_true(environ, self.vars_scheme_b, keys_to_keep)
        )
        if url_scheme:
            url_scheme = url_scheme.lower()
            environ[WsgiEnvVar.WSGI_URL_SCHEME] = url_scheme

        # ---------------------------------------------------------------------

        # As per mod_wsgi, we delete unused and untrusted keys.
        delete_keys(environ,
                    keys_to_delete=self.ALL_CANDIDATES,
                    keys_to_keep=keys_to_keep)
        if debug:
            changes = dict_diff(oldenv, environ)
            log.debug("Changes to WSGI environment: \n{}", pformat(changes))
        return self.app(environ, start_response)