Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/patsy/mgcv_cubic_splines.py : 12%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of Patsy
2# Copyright (C) 2014 GDF Suez, http://www.gdfsuez.com/
3# See file LICENSE.txt for license information.
5# R package 'mgcv' compatible cubic spline basis functions
7# These are made available in the patsy.* namespace
8__all__ = ["cr", "cc", "te"]
10import numpy as np
12from patsy.util import (have_pandas, atleast_2d_column_default,
13 no_pickling, assert_no_pickling, safe_string_eq)
14from patsy.state import stateful_transform
16if have_pandas:
17 import pandas
20def _get_natural_f(knots):
21 """Returns mapping of natural cubic spline values to 2nd derivatives.
23 .. note:: See 'Generalized Additive Models', Simon N. Wood, 2006, pp 145-146
25 :param knots: The 1-d array knots used for cubic spline parametrization,
26 must be sorted in ascending order.
27 :return: A 2-d array mapping natural cubic spline values at
28 knots to second derivatives.
30 :raise ImportError: if scipy is not found, required for
31 ``linalg.solve_banded()``
32 """
33 try:
34 from scipy import linalg
35 except ImportError: # pragma: no cover
36 raise ImportError("Cubic spline functionality requires scipy.")
38 h = knots[1:] - knots[:-1]
39 diag = (h[:-1] + h[1:]) / 3.
40 ul_diag = h[1:-1] / 6.
41 banded_b = np.array([np.r_[0., ul_diag], diag, np.r_[ul_diag, 0.]])
42 d = np.zeros((knots.size - 2, knots.size))
43 for i in range(knots.size - 2):
44 d[i, i] = 1. / h[i]
45 d[i, i + 2] = 1. / h[i + 1]
46 d[i, i + 1] = - d[i, i] - d[i, i + 2]
48 fm = linalg.solve_banded((1, 1), banded_b, d)
50 return np.vstack([np.zeros(knots.size), fm, np.zeros(knots.size)])
53# Cyclic Cubic Regression Splines
56def _map_cyclic(x, lbound, ubound):
57 """Maps values into the interval [lbound, ubound] in a cyclic fashion.
59 :param x: The 1-d array values to be mapped.
60 :param lbound: The lower bound of the interval.
61 :param ubound: The upper bound of the interval.
62 :return: A new 1-d array containing mapped x values.
64 :raise ValueError: if lbound >= ubound.
65 """
66 if lbound >= ubound:
67 raise ValueError("Invalid argument: lbound (%r) should be "
68 "less than ubound (%r)."
69 % (lbound, ubound))
71 x = np.copy(x)
72 x[x > ubound] = lbound + (x[x > ubound] - ubound) % (ubound - lbound)
73 x[x < lbound] = ubound - (lbound - x[x < lbound]) % (ubound - lbound)
75 return x
def test__map_cyclic():
    """Check the cyclic mapping of a few values and that input is untouched."""
    values = np.array([1.5, 2.6, 0.1, 4.4, 10.7])
    snapshot = np.copy(values)
    wrapped = _map_cyclic(values, 2.1, 3.6)
    # The caller's array must not be modified in place.
    assert np.allclose(values, snapshot)
    assert np.allclose(wrapped, np.array([3.0, 2.6, 3.1, 2.9, 3.2]))
def test__map_cyclic_errors():
    """Bounds that are reversed or equal must be rejected."""
    from nose.tools import assert_raises
    values = np.linspace(0.2, 5.7, 10)
    for lbound, ubound in [(4.5, 3.6), (4.5, 4.5)]:
        assert_raises(ValueError, _map_cyclic, values, lbound, ubound)
94def _get_cyclic_f(knots):
95 """Returns mapping of cyclic cubic spline values to 2nd derivatives.
97 .. note:: See 'Generalized Additive Models', Simon N. Wood, 2006, pp 146-147
99 :param knots: The 1-d array knots used for cubic spline parametrization,
100 must be sorted in ascending order.
101 :return: A 2-d array mapping cyclic cubic spline values at
102 knots to second derivatives.
103 """
104 h = knots[1:] - knots[:-1]
105 n = knots.size - 1
106 b = np.zeros((n, n))
107 d = np.zeros((n, n))
109 b[0, 0] = (h[n - 1] + h[0]) / 3.
110 b[0, n - 1] = h[n - 1] / 6.
111 b[n - 1, 0] = h[n - 1] / 6.
113 d[0, 0] = -1. / h[0] - 1. / h[n - 1]
114 d[0, n - 1] = 1. / h[n - 1]
115 d[n - 1, 0] = 1. / h[n - 1]
117 for i in range(1, n):
118 b[i, i] = (h[i - 1] + h[i]) / 3.
119 b[i, i - 1] = h[i - 1] / 6.
120 b[i - 1, i] = h[i - 1] / 6.
122 d[i, i] = -1. / h[i - 1] - 1. / h[i]
123 d[i, i - 1] = 1. / h[i - 1]
124 d[i - 1, i] = 1. / h[i - 1]
126 return np.linalg.solve(b, d)
129# Tensor Product
132def _row_tensor_product(dms):
133 """Computes row-wise tensor product of given arguments.
135 .. note:: Custom algorithm to precisely match what is done in 'mgcv',
136 in particular look out for order of result columns!
137 For reference implementation see 'mgcv' source code,
138 file 'mat.c', mgcv_tensor_mm(), l.62
140 :param dms: A sequence of 2-d arrays (marginal design matrices).
141 :return: The 2-d array row-wise tensor product of given arguments.
143 :raise ValueError: if argument sequence is empty, does not contain only
144 2-d arrays or if the arrays number of rows does not match.
145 """
146 if len(dms) == 0:
147 raise ValueError("Tensor product arrays sequence should not be empty.")
148 for dm in dms:
149 if dm.ndim != 2:
150 raise ValueError("Tensor product arguments should be 2-d arrays.")
152 tp_nrows = dms[0].shape[0]
153 tp_ncols = 1
154 for dm in dms:
155 if dm.shape[0] != tp_nrows:
156 raise ValueError("Tensor product arguments should have "
157 "same number of rows.")
158 tp_ncols *= dm.shape[1]
159 tp = np.zeros((tp_nrows, tp_ncols))
160 tp[:, -dms[-1].shape[1]:] = dms[-1]
161 filled_tp_ncols = dms[-1].shape[1]
162 for dm in dms[-2::-1]:
163 p = - filled_tp_ncols * dm.shape[1]
164 for j in range(dm.shape[1]):
165 xj = dm[:, j]
166 for t in range(-filled_tp_ncols, 0):
167 tp[:, p] = tp[:, t] * xj
168 p += 1
169 filled_tp_ncols *= dm.shape[1]
171 return tp
def test__row_tensor_product_errors():
    """Empty, non 2-d and row-mismatched inputs must all be rejected."""
    from nose.tools import assert_raises
    invalid_inputs = [
        [],
        [np.arange(1, 5)],
        [np.arange(1, 5), np.arange(1, 5)],
        [np.arange(1, 13).reshape((3, 4)),
         np.arange(1, 13).reshape((4, 3))],
    ]
    for dms in invalid_inputs:
        assert_raises(ValueError, _row_tensor_product, dms)
def test__row_tensor_product():
    """Check identity, scaling and column-ordering cases of the product."""
    base = np.arange(1, 17).reshape((4, 4))
    ones = np.ones(4).reshape((4, 1))
    twos = 2 * ones

    # Cases where the main input array should come back unchanged
    assert np.array_equal(_row_tensor_product([base]), base)
    assert np.array_equal(_row_tensor_product([ones, base]), base)
    assert np.array_equal(_row_tensor_product([base, ones]), base)

    # Cases where the main input array should be scaled
    assert np.array_equal(_row_tensor_product([twos, base]), 2 * base)
    assert np.array_equal(_row_tensor_product([base, twos]), 2 * base)

    # Main cases: the order of result columns depends on argument order
    two_cols = np.array([[1, 2], [1, 2]])
    three_cols = np.arange(1, 7).reshape((2, 3))
    assert np.array_equal(
        _row_tensor_product([two_cols, three_cols]),
        np.array([[1, 2, 3, 2, 4, 6],
                  [4, 5, 6, 8, 10, 12]]))
    assert np.array_equal(
        _row_tensor_product([three_cols, two_cols]),
        np.array([[1, 2, 2, 4, 3, 6],
                  [4, 8, 5, 10, 6, 12]]))
215# Common code
218def _find_knots_lower_bounds(x, knots):
219 """Finds knots lower bounds for given values.
221 Returns an array of indices ``I`` such that
222 ``0 <= I[i] <= knots.size - 2`` for all ``i``
223 and
224 ``knots[I[i]] < x[i] <= knots[I[i] + 1]`` if
225 ``np.min(knots) < x[i] <= np.max(knots)``,
226 ``I[i] = 0`` if ``x[i] <= np.min(knots)``
227 ``I[i] = knots.size - 2`` if ``np.max(knots) < x[i]``
229 :param x: The 1-d array values whose knots lower bounds are to be found.
230 :param knots: The 1-d array knots used for cubic spline parametrization,
231 must be sorted in ascending order.
232 :return: An array of knots lower bounds indices.
233 """
234 lb = np.searchsorted(knots, x) - 1
236 # I[i] = 0 for x[i] <= np.min(knots)
237 lb[lb == -1] = 0
239 # I[i] = knots.size - 2 for x[i] > np.max(knots)
240 lb[lb == knots.size - 1] = knots.size - 2
242 return lb
def _compute_base_functions(x, knots):
    """Computes base functions used for building cubic splines basis.

    .. note:: See 'Generalized Additive Models', Simon N. Wood, 2006, p. 146
      and for the special treatment of ``x`` values outside ``knots`` range
      see 'mgcv' source code, file 'mgcv.c', function 'crspl()', l.249

    :param x: The 1-d array values for which base functions should be computed.
    :param knots: The 1-d array knots used for cubic spline parametrization,
     must be sorted in ascending order.
    :return: 4 arrays corresponding to the 4 base functions ajm, ajp, cjm, cjp
     + the 1-d array of knots lower bounds indices corresponding to
     the given ``x`` values.
    """
    j = _find_knots_lower_bounds(x, knots)

    widths = (knots[1:] - knots[:-1])[j]
    to_upper = knots[j + 1] - x
    from_lower = x - knots[j]

    ajm = to_upper / widths
    ajp = from_lower / widths

    # The cubic term is dropped for values beyond the last knot.
    cjm = to_upper * to_upper * to_upper / (6. * widths)
    cjm[x > np.max(knots)] = 0.
    cjm = cjm - widths * to_upper / 6.

    # The cubic term is dropped for values before the first knot.
    cjp = from_lower * from_lower * from_lower / (6. * widths)
    cjp[x < np.min(knots)] = 0.
    cjp = cjp - widths * from_lower / 6.

    return ajm, ajp, cjm, cjp, j
282def _absorb_constraints(design_matrix, constraints):
283 """Absorb model parameters constraints into the design matrix.
285 :param design_matrix: The (2-d array) initial design matrix.
286 :param constraints: The 2-d array defining initial model parameters
287 (``betas``) constraints (``np.dot(constraints, betas) = 0``).
288 :return: The new design matrix with absorbed parameters constraints.
290 :raise ImportError: if scipy is not found, used for ``scipy.linalg.qr()``
291 which is cleaner than numpy's version requiring a call like
292 ``qr(..., mode='complete')`` to get a full QR decomposition.
293 """
294 try:
295 from scipy import linalg
296 except ImportError: # pragma: no cover
297 raise ImportError("Cubic spline functionality requires scipy.")
299 m = constraints.shape[0]
300 q, r = linalg.qr(np.transpose(constraints))
302 return np.dot(design_matrix, q[:, m:])
def _get_free_crs_dmatrix(x, knots, cyclic=False):
    """Builds an unconstrained cubic regression spline design matrix.

    Returns design matrix with dimensions ``len(x) x n``
    for a cubic regression spline smoother
    where
     - ``n = len(knots)`` for natural CRS
     - ``n = len(knots) - 1`` for cyclic CRS

    .. note:: See 'Generalized Additive Models', Simon N. Wood, 2006, p. 145

    :param x: The 1-d array values.
    :param knots: The 1-d array knots used for cubic spline parametrization,
     must be sorted in ascending order.
    :param cyclic: Indicates whether used cubic regression splines should
     be cyclic or not. Default is ``False``.
    :return: The (2-d array) design matrix.
    """
    n_basis = knots.size
    if cyclic:
        # Cyclic splines: wrap the data into the knots range; the last knot
        # shares its basis function with the first one.
        x = _map_cyclic(x, min(knots), max(knots))
        n_basis -= 1

    ajm, ajp, cjm, cjp, lower = _compute_base_functions(x, knots)

    upper = lower + 1
    if cyclic:
        upper[upper == n_basis] = 0

    eye = np.identity(n_basis)
    f = _get_cyclic_f(knots) if cyclic else _get_natural_f(knots)

    # Built transposed so the 1-d coefficient arrays broadcast along the
    # observations axis.
    transposed = (ajm * eye[lower, :].T + ajp * eye[upper, :].T +
                  cjm * f[lower, :].T + cjp * f[upper, :].T)

    return transposed.T
def _get_crs_dmatrix(x, knots, constraints=None, cyclic=False):
    """Builds a cubic regression spline design matrix.

    Returns design matrix with dimensions len(x) x n
    where:
     - ``n = len(knots) - nrows(constraints)`` for natural CRS
     - ``n = len(knots) - nrows(constraints) - 1`` for cyclic CRS
    for a cubic regression spline smoother

    :param x: The 1-d array values.
    :param knots: The 1-d array knots used for cubic spline parametrization,
     must be sorted in ascending order.
    :param constraints: The 2-d array defining model parameters (``betas``)
     constraints (``np.dot(constraints, betas) = 0``).
    :param cyclic: Indicates whether used cubic regression splines should
     be cyclic or not. Default is ``False``.
    :return: The (2-d array) design matrix.
    """
    free_dm = _get_free_crs_dmatrix(x, knots, cyclic)
    if constraints is None:
        return free_dm

    return _absorb_constraints(free_dm, constraints)
def _get_te_dmatrix(design_matrices, constraints=None):
    """Builds tensor product design matrix, given the marginal design matrices.

    :param design_matrices: A sequence of 2-d arrays (marginal design matrices).
    :param constraints: The 2-d array defining model parameters (``betas``)
     constraints (``np.dot(constraints, betas) = 0``).
    :return: The (2-d array) design matrix.
    """
    free_dm = _row_tensor_product(design_matrices)
    if constraints is None:
        return free_dm

    return _absorb_constraints(free_dm, constraints)
387# Stateful Transforms
390def _get_all_sorted_knots(x, n_inner_knots=None, inner_knots=None,
391 lower_bound=None, upper_bound=None):
392 """Gets all knots locations with lower and upper exterior knots included.
394 If needed, inner knots are computed as equally spaced quantiles of the
395 input data falling between given lower and upper bounds.
397 :param x: The 1-d array data values.
398 :param n_inner_knots: Number of inner knots to compute.
399 :param inner_knots: Provided inner knots if any.
400 :param lower_bound: The lower exterior knot location. If unspecified, the
401 minimum of ``x`` values is used.
402 :param upper_bound: The upper exterior knot location. If unspecified, the
403 maximum of ``x`` values is used.
404 :return: The array of ``n_inner_knots + 2`` distinct knots.
406 :raise ValueError: for various invalid parameters sets or if unable to
407 compute ``n_inner_knots + 2`` distinct knots.
408 """
409 if lower_bound is None and x.size == 0:
410 raise ValueError("Cannot set lower exterior knot location: empty "
411 "input data and lower_bound not specified.")
412 elif lower_bound is None and x.size != 0:
413 lower_bound = np.min(x)
415 if upper_bound is None and x.size == 0:
416 raise ValueError("Cannot set upper exterior knot location: empty "
417 "input data and upper_bound not specified.")
418 elif upper_bound is None and x.size != 0:
419 upper_bound = np.max(x)
421 if upper_bound < lower_bound:
422 raise ValueError("lower_bound > upper_bound (%r > %r)"
423 % (lower_bound, upper_bound))
425 if inner_knots is None and n_inner_knots is not None:
426 if n_inner_knots < 0:
427 raise ValueError("Invalid requested number of inner knots: %r"
428 % (n_inner_knots,))
430 x = x[(lower_bound <= x) & (x <= upper_bound)]
431 x = np.unique(x)
433 if x.size != 0:
434 inner_knots_q = np.linspace(0, 100, n_inner_knots + 2)[1:-1]
435 # .tolist() is necessary to work around a bug in numpy 1.8
436 inner_knots = np.asarray(np.percentile(x, inner_knots_q.tolist()))
437 elif n_inner_knots == 0:
438 inner_knots = np.array([])
439 else:
440 raise ValueError("No data values between lower_bound(=%r) and "
441 "upper_bound(=%r): cannot compute requested "
442 "%r inner knot(s)."
443 % (lower_bound, upper_bound, n_inner_knots))
444 elif inner_knots is not None:
445 inner_knots = np.unique(inner_knots)
446 if n_inner_knots is not None and n_inner_knots != inner_knots.size:
447 raise ValueError("Needed number of inner knots=%r does not match "
448 "provided number of inner knots=%r."
449 % (n_inner_knots, inner_knots.size))
450 n_inner_knots = inner_knots.size
451 if np.any(inner_knots < lower_bound):
452 raise ValueError("Some knot values (%s) fall below lower bound "
453 "(%r)."
454 % (inner_knots[inner_knots < lower_bound],
455 lower_bound))
456 if np.any(inner_knots > upper_bound):
457 raise ValueError("Some knot values (%s) fall above upper bound "
458 "(%r)."
459 % (inner_knots[inner_knots > upper_bound],
460 upper_bound))
461 else:
462 raise ValueError("Must specify either 'n_inner_knots' or 'inner_knots'.")
464 all_knots = np.concatenate(([lower_bound, upper_bound], inner_knots))
465 all_knots = np.unique(all_knots)
466 if all_knots.size != n_inner_knots + 2:
467 raise ValueError("Unable to compute n_inner_knots(=%r) + 2 distinct "
468 "knots: %r data value(s) found between "
469 "lower_bound(=%r) and upper_bound(=%r)."
470 % (n_inner_knots, x.size, lower_bound, upper_bound))
472 return all_knots
def test__get_all_sorted_knots():
    """Exercise knots computation for valid and invalid parameter sets."""
    from nose.tools import assert_raises

    def check_raises(*args, **kwargs):
        assert_raises(ValueError, _get_all_sorted_knots, *args, **kwargs)

    def check_knots(expected, *args, **kwargs):
        assert np.array_equal(_get_all_sorted_knots(*args, **kwargs),
                              expected)

    # Empty input data
    empty = np.array([])
    check_raises(empty, -1)
    check_raises(empty, 0)
    check_raises(empty, 0, lower_bound=1)
    check_raises(empty, 0, upper_bound=5)
    check_raises(empty, 0, lower_bound=3, upper_bound=1)
    check_knots([1, 5], empty, 0, lower_bound=1, upper_bound=5)
    check_raises(empty, 0, lower_bound=1, upper_bound=1)

    # Inner knots computed from the data
    x = np.arange(6) * 2
    check_raises(x, -2)
    check_knots([0, 10], x, 0)
    check_knots([3, 8], x, 0, lower_bound=3, upper_bound=8)
    check_knots([1, 4, 6, 9], x, 2, lower_bound=1, upper_bound=9)
    check_raises(x, 2, lower_bound=1, upper_bound=3)
    check_raises(x, 1, lower_bound=1.3, upper_bound=1.4)
    check_knots([1, 2, 3], x, 1, lower_bound=1, upper_bound=3)
    check_raises(x, 1, lower_bound=2, upper_bound=3)

    # Inner knots provided by the caller
    check_raises(x, 1, inner_knots=[2, 3])
    check_raises(x, lower_bound=2, upper_bound=3)
    check_knots([0, 3, 7, 10], x, inner_knots=[3, 7])
    check_knots([2, 3, 7, 10], x, inner_knots=[3, 7], lower_bound=2)
    check_raises(x, inner_knots=[3, 7], lower_bound=4)
    check_raises(x, inner_knots=[3, 7], upper_bound=6)
529def _get_centering_constraint_from_dmatrix(design_matrix):
530 """ Computes the centering constraint from the given design matrix.
532 We want to ensure that if ``b`` is the array of parameters, our
533 model is centered, ie ``np.mean(np.dot(design_matrix, b))`` is zero.
534 We can rewrite this as ``np.dot(c, b)`` being zero with ``c`` a 1-row
535 constraint matrix containing the mean of each column of ``design_matrix``.
537 :param design_matrix: The 2-d array design matrix.
538 :return: A 2-d array (1 x ncols(design_matrix)) defining the
539 centering constraint.
540 """
541 return design_matrix.mean(axis=0).reshape((1, design_matrix.shape[1]))
class CubicRegressionSpline(object):
    """Base class for cubic regression spline stateful transforms

    This class contains all the functionality for the following stateful
    transforms:
     - ``cr(x, df=None, knots=None, lower_bound=None, upper_bound=None, constraints=None)``
       for natural cubic regression spline
     - ``cc(x, df=None, knots=None, lower_bound=None, upper_bound=None, constraints=None)``
       for cyclic cubic regression spline
    """
    # Shared documentation, appended to the docstrings of the subclasses.
    common_doc = """
    :arg df: The number of degrees of freedom to use for this spline. The
      return value will have this many columns. You must specify at least one
      of ``df`` and ``knots``.
    :arg knots: The interior knots to use for the spline. If unspecified, then
      equally spaced quantiles of the input data are used. You must specify at
      least one of ``df`` and ``knots``.
    :arg lower_bound: The lower exterior knot location.
    :arg upper_bound: The upper exterior knot location.
    :arg constraints: Either a 2-d array defining general linear constraints
     (that is ``np.dot(constraints, betas)`` is zero, where ``betas`` denotes
     the array of *initial* parameters, corresponding to the *initial*
     unconstrained design matrix), or the string
     ``'center'`` indicating that we should apply a centering constraint
     (this constraint will be computed from the input data, remembered and
     re-used for prediction from the fitted model).
     The constraints are absorbed in the resulting design matrix which means
     that the model is actually rewritten in terms of
     *unconstrained* parameters. For more details see :ref:`spline-regression`.

    This is a stateful transforms (for details see
    :ref:`stateful-transforms`). If ``knots``, ``lower_bound``, or
    ``upper_bound`` are not specified, they will be calculated from the data
    and then the chosen values will be remembered and re-used for prediction
    from the fitted model.

    Using this function requires scipy be installed.

    .. versionadded:: 0.3.0
    """

    def __init__(self, name, cyclic):
        self._name = name
        self._cyclic = cyclic
        # Scratch space used while memorizing; removed by memorize_finish().
        self._tmp = {}
        self._all_knots = None
        self._constraints = None

    def _as_1d(self, x):
        """Returns ``x`` as a 1-d array, accepting 2-d column vectors too."""
        x = np.atleast_1d(x)
        if x.ndim == 2 and x.shape[1] == 1:
            x = x[:, 0]
        if x.ndim > 1:
            raise ValueError("Input to %r must be 1-d, "
                             "or a 2-d column vector."
                             % (self._name,))
        return x

    def memorize_chunk(self, x, df=None, knots=None,
                       lower_bound=None, upper_bound=None,
                       constraints=None):
        """Records the call arguments and stores one chunk of input data."""
        self._tmp["args"] = {
            "df": df,
            "knots": knots,
            "lower_bound": lower_bound,
            "upper_bound": upper_bound,
            "constraints": constraints,
        }
        self._tmp.setdefault("xs", []).append(self._as_1d(x))

    def memorize_finish(self):
        """Computes knots and constraints from all the memorized chunks."""
        args = self._tmp["args"]
        xs = self._tmp["xs"]
        # Guards against invalid subsequent memorize_chunk() calls.
        del self._tmp

        x = np.concatenate(xs)
        df = args["df"]
        if df is None and args["knots"] is None:
            raise ValueError("Must specify either 'df' or 'knots'.")

        constraints = args["constraints"]
        centering = (constraints is not None
                     and safe_string_eq(constraints, "center"))
        n_constraints = 0
        if centering:
            # Here we collect only number of constraints,
            # actual centering constraint will be computed after all_knots
            n_constraints = 1
        elif constraints is not None:
            constraints = np.atleast_2d(constraints)
            if constraints.ndim != 2:
                raise ValueError("Constraints must be 2-d array or "
                                 "1-d vector.")
            n_constraints = constraints.shape[0]

        n_inner_knots = None
        if df is not None:
            if not self._cyclic and n_constraints == 0:
                min_df = 2
            else:
                min_df = 1
            if df < min_df:
                raise ValueError("'df'=%r must be greater than or equal to %r."
                                 % (df, min_df))
            # Each absorbed constraint (and the cyclic condition) removes one
            # column, so one more knot is needed to reach the requested df.
            n_inner_knots = df - 2 + n_constraints
            if self._cyclic:
                n_inner_knots += 1

        self._all_knots = _get_all_sorted_knots(
            x,
            n_inner_knots=n_inner_knots,
            inner_knots=args["knots"],
            lower_bound=args["lower_bound"],
            upper_bound=args["upper_bound"])

        if constraints is None:
            return

        if centering:
            # Now we can compute centering constraints
            constraints = _get_centering_constraint_from_dmatrix(
                _get_free_crs_dmatrix(x, self._all_knots, cyclic=self._cyclic)
            )

        df_before_constraints = self._all_knots.size
        if self._cyclic:
            df_before_constraints -= 1
        if constraints.shape[1] != df_before_constraints:
            raise ValueError("Constraints array should have %r columns but"
                             " %r found."
                             % (df_before_constraints, constraints.shape[1]))
        self._constraints = constraints

    def transform(self, x, df=None, knots=None,
                  lower_bound=None, upper_bound=None,
                  constraints=None):
        """Builds the design matrix for ``x`` from the memorized state.

        Only ``x`` is used here: knots and constraints are the ones stored
        by ``memorize_finish()``.
        """
        dm = _get_crs_dmatrix(self._as_1d(x), self._all_knots,
                              self._constraints, cyclic=self._cyclic)
        # Keep the index of pandas inputs on the result.
        if have_pandas and isinstance(x, (pandas.Series, pandas.DataFrame)):
            dm = pandas.DataFrame(dm)
            dm.index = x.index
        return dm

    __getstate__ = no_pickling
class CR(CubicRegressionSpline):
    """cr(x, df=None, knots=None, lower_bound=None, upper_bound=None, constraints=None)

    Generates a natural cubic spline basis for ``x``
    (with the option of absorbing centering or more general parameters
    constraints), allowing non-linear fits. The usual usage is something like::

      y ~ 1 + cr(x, df=5, constraints='center')

    to fit ``y`` as a smooth function of ``x``, with 5 degrees of freedom
    given to the smooth, and centering constraint absorbed in
    the resulting design matrix. Note that in this example, due to the centering
    constraint, 6 knots will get computed from the input data ``x``
    to achieve 5 degrees of freedom.


    .. note:: This function reproduces the cubic regression splines 'cr' and 'cs'
      as implemented in the R package 'mgcv' (GAM modelling).

    """

    # Under python -OO, __doc__ will be defined but set to None
    if __doc__:
        __doc__ += CubicRegressionSpline.common_doc

    def __init__(self):
        super(CR, self).__init__(name="cr", cyclic=False)


cr = stateful_transform(CR)
class CC(CubicRegressionSpline):
    """cc(x, df=None, knots=None, lower_bound=None, upper_bound=None, constraints=None)

    Generates a cyclic cubic spline basis for ``x``
    (with the option of absorbing centering or more general parameters
    constraints), allowing non-linear fits. The usual usage is something like::

      y ~ 1 + cc(x, df=7, constraints='center')

    to fit ``y`` as a smooth function of ``x``, with 7 degrees of freedom
    given to the smooth, and centering constraint absorbed in
    the resulting design matrix. Note that in this example, due to the centering
    and cyclic constraints, 9 knots will get computed from the input data ``x``
    to achieve 7 degrees of freedom.

    .. note:: This function reproduces the cubic regression splines 'cc'
      as implemented in the R package 'mgcv' (GAM modelling).

    """

    # Under python -OO, __doc__ will be defined but set to None
    if __doc__:
        __doc__ += CubicRegressionSpline.common_doc

    def __init__(self):
        super(CC, self).__init__(name="cc", cyclic=True)


cc = stateful_transform(CC)
def test_crs_errors():
    """Invalid arguments to ``cr``/``cc`` must raise ``ValueError``."""
    from nose.tools import assert_raises
    square = np.arange(16).reshape((4, 4))
    data = np.arange(50)

    # Invalid 'x' shape
    assert_raises(ValueError, cr, square, df=4)
    assert_raises(ValueError, CR().transform, square, df=4)
    # Should provide at least 'df' or 'knots'
    assert_raises(ValueError, cr, data)
    # Invalid constraints shape
    assert_raises(ValueError, cr, data, df=4,
                  constraints=np.arange(27).reshape((3, 3, 3)))
    # Invalid nb of columns in constraints
    # (should have df + 1 = 5, but 6 provided)
    assert_raises(ValueError, cr, data, df=4,
                  constraints=np.arange(6))
    # Too small 'df' for natural cubic spline
    assert_raises(ValueError, cr, data, df=1)
    # Too small 'df' for cyclic cubic spline
    assert_raises(ValueError, cc, data, df=0)
def test_crs_compat():
    """Compare ``cr``/``cc`` outputs against reference values generated by R."""
    from patsy.test_state import check_stateful
    from patsy.test_splines_crs_data import (R_crs_test_x,
                                             R_crs_test_data,
                                             R_crs_num_tests)
    begin_marker = "--BEGIN TEST CASE--"
    end_marker = "--END TEST CASE--"
    lines = R_crs_test_data.split("\n")
    tests_ran = 0
    start_idx = lines.index(begin_marker)
    while lines[start_idx] == begin_marker:
        start_idx += 1
        stop_idx = lines.index(end_marker, start_idx)
        # Each line of a test case block reads ``key=value``.
        test_data = dict(line.split("=", 1)
                         for line in lines[start_idx:stop_idx])

        # Translate the R output into Python calling conventions
        adjust_df = 0
        r_spline_type = test_data["spline_type"]
        if r_spline_type in ("cr", "cs"):
            spline_type = CR
        elif r_spline_type == "cc":
            spline_type = CC
            adjust_df += 1
        else:
            raise ValueError("Unrecognized spline type %r"
                             % (test_data["spline_type"],))
        kwargs = {}
        if test_data["absorb_cons"] == "TRUE":
            kwargs["constraints"] = "center"
            adjust_df += 1
        # NOTE: eval() is applied to the reference data imported above from
        # patsy.test_splines_crs_data; do not reuse this on external input.
        if test_data["knots"] != "None":
            all_knots = np.asarray(eval(test_data["knots"]))
            all_knots.sort()
            kwargs["knots"] = all_knots[1:-1]
            kwargs["lower_bound"] = all_knots[0]
            kwargs["upper_bound"] = all_knots[-1]
        else:
            kwargs["df"] = eval(test_data["nb_knots"]) - adjust_df
        output = np.asarray(eval(test_data["output"]))

        # Do the actual test
        check_stateful(spline_type, False, R_crs_test_x, output, **kwargs)
        tests_ran += 1

        # Set up for the next one
        start_idx = stop_idx + 1
    assert tests_ran == R_crs_num_tests


test_crs_compat.slow = True
def test_crs_with_specific_constraint():
    """An explicit constraint from R must match the computed centering one.

    The local names ``new_x``, ``knots_R`` and ``centering_constraint_R``
    appear inside the formula string below, so they must not be renamed.
    """
    from patsy.highlevel import incr_dbuilder, build_design_matrices, dmatrix
    x = (-1.5)**np.arange(20)
    # values for which we want a prediction
    new_x = np.array([-3000., -200., 300., 2000.])
    # Hard coded R values for smooth: s(x, bs="cr", k=5)
    # R> knots <- smooth$xp
    knots_R = np.array([-2216.837820053100585937,
                        -50.456909179687500000,
                        -0.250000000000000000,
                        33.637939453125000000,
                        1477.891880035400390625])
    # R> centering.constraint <- t(qr.X(attr(smooth, "qrc")))
    centering_constraint_R = np.array([[0.064910676323168478574,
                                        1.4519875239407085132,
                                        -2.1947446912471946234,
                                        1.6129783104357671153,
                                        0.064868180547550072235]])

    # Design matrix built from the knots and constraint given explicitly
    result1 = dmatrix("cr(new_x, knots=knots_R[1:-1], "
                      "lower_bound=knots_R[0], upper_bound=knots_R[-1], "
                      "constraints=centering_constraint_R)")

    # Design matrix built from state memorized incrementally, in two chunks
    data_chunked = [{"x": x[:10]}, {"x": x[10:]}]
    builder = incr_dbuilder("cr(x, df=4, constraints='center')",
                            lambda: iter(data_chunked))
    new_data = {"x": new_x}
    result2 = build_design_matrices([builder], new_data)[0]

    assert np.allclose(result1, result2, rtol=1e-12, atol=0.)
854class TE(object):
855 """te(s1, .., sn, constraints=None)
857 Generates smooth of several covariates as a tensor product of the bases
858 of marginal univariate smooths ``s1, .., sn``. The marginal smooths are
859 required to transform input univariate data into some kind of smooth
860 functions basis producing a 2-d array output with the ``(i, j)`` element
861 corresponding to the value of the ``j`` th basis function at the ``i`` th
862 data point.
863 The resulting basis dimension is the product of the basis dimensions of
864 the marginal smooths. The usual usage is something like::
866 y ~ 1 + te(cr(x1, df=5), cc(x2, df=6), constraints='center')
868 to fit ``y`` as a smooth function of both ``x1`` and ``x2``, with a natural
869 cubic spline for ``x1`` marginal smooth and a cyclic cubic spline for
870 ``x2`` (and centering constraint absorbed in the resulting design matrix).
872 :arg constraints: Either a 2-d array defining general linear constraints
873 (that is ``np.dot(constraints, betas)`` is zero, where ``betas`` denotes
874 the array of *initial* parameters, corresponding to the *initial*
875 unconstrained design matrix), or the string
876 ``'center'`` indicating that we should apply a centering constraint
877 (this constraint will be computed from the input data, remembered and
878 re-used for prediction from the fitted model).
879 The constraints are absorbed in the resulting design matrix which means
880 that the model is actually rewritten in terms of
881 *unconstrained* parameters. For more details see :ref:`spline-regression`.
883 Using this function requires scipy be installed.
885 .. note:: This function reproduce the tensor product smooth 'te' as
886 implemented in the R package 'mgcv' (GAM modelling).
887 See also 'Generalized Additive Models', Simon N. Wood, 2006, pp 158-163
889 .. versionadded:: 0.3.0
890 """
891 def __init__(self):
892 self._tmp = {}
893 self._constraints = None
def memorize_chunk(self, *args, **kwargs):
    """Accumulates what a ``'center'`` constraint needs from one data chunk.

    The ``constraints`` keyword seen on the first call is stored in
    ``self._tmp`` and re-used on later calls. Nothing else is recorded
    unless that value is the string ``'center'``; in that case the running
    row count and column sums of ``_row_tensor_product(args)`` are updated.

    :param args: The marginal smooths outputs for this chunk, each a 2-d
     array or a 1-d vector.
    :param kwargs: Only the ``constraints`` entry is read here.

    :raise ValueError: if an argument is not 2-d after conversion by
     ``atleast_2d_column_default()``.
    """
    state = self._tmp
    constraints = state.setdefault("constraints",
                                   kwargs.get("constraints"))
    if not safe_string_eq(constraints, "center"):
        return

    marginals = []
    for raw in args:
        marginal = atleast_2d_column_default(raw)
        if marginal.ndim != 2:
            raise ValueError("Each tensor product argument must be "
                             "a 2-d array or 1-d vector.")
        marginals.append(marginal)

    product = _row_tensor_product(marginals)
    if "count" not in state:
        state["count"] = 0
    state["count"] += product.shape[0]

    column_totals = np.atleast_2d(product.sum(axis=0))
    if "sum" not in state:
        state["sum"] = np.zeros(column_totals.shape)
    # In-place add so the accumulator keeps the dtype it was created with.
    state["sum"] += column_totals
def memorize_finish(self):
    """Turns the memorized state into the final ``self._constraints``.

    * ``None`` constraints are kept as ``None``.
    * ``'center'`` becomes the 2-d array ``sum / count`` accumulated by
      ``memorize_chunk()``.
    * Anything else is passed through ``np.atleast_2d()``.

    :raise ValueError: if explicitly given constraints are not 2-d after
     ``np.atleast_2d()``.
    """
    state = self._tmp
    requested = state["constraints"]
    # Guards against invalid subsequent memorize_chunk() calls.
    del self._tmp

    if requested is None:
        resolved = None
    elif safe_string_eq(requested, "center"):
        resolved = np.atleast_2d(state["sum"] / state["count"])
    else:
        resolved = np.atleast_2d(requested)
        if resolved.ndim != 2:
            raise ValueError("Constraints must be 2-d array or "
                             "1-d vector.")

    self._constraints = resolved
def transform(self, *args, **kwargs):
    """Builds the tensor product design matrix for the given smooths.

    :param args: The marginal smooths outputs, each a 2-d array or a
     1-d vector.
    :param kwargs: Accepted but not read by this method.
    :return: The result of ``_get_te_dmatrix()`` applied to the 2-d
     arguments and the memorized ``self._constraints``.

    :raise ValueError: if an argument is not 2-d after conversion by
     ``atleast_2d_column_default()``.
    """
    def as_2d(raw):
        # Convert and validate one argument at a time, so that the first
        # offending argument is the one reported.
        converted = atleast_2d_column_default(raw)
        if converted.ndim != 2:
            raise ValueError("Each tensor product argument must be "
                             "a 2-d array or 1-d vector.")
        return converted

    marginals = [as_2d(raw) for raw in args]
    return _get_te_dmatrix(marginals, self._constraints)
# Pickle hook bound to patsy.util.no_pickling.
# NOTE(review): judging by the helper's name, pickling of instances is meant
# to be refused -- confirm against patsy.util.
__getstate__ = no_pickling
# Public ``te`` name (listed in ``__all__``): the TE class wrapped by
# patsy.state.stateful_transform.
te = stateful_transform(TE)
def test_te_errors():
    """Checks that ``te`` rejects 3-d inputs and 3-d constraints."""
    from nose.tools import assert_raises
    values = np.arange(27)
    cube = values.reshape((3, 3, 3))
    # Invalid input shape
    assert_raises(ValueError, te, cube)
    assert_raises(ValueError, te, cube, constraints='center')
    # Invalid constraints shape
    cube_constraints = np.arange(8).reshape((2, 2, 2))
    assert_raises(ValueError, te, values, constraints=cube_constraints)
def test_te_1smooth():
    """Checks ``te`` of a single smooth against the smooth on its own."""
    from patsy.splines import bs
    x = (-1.5)**np.arange(20)
    # Tensor product of 1 smooth covariate should be the same
    # as the smooth alone
    smooth_factories = (
        lambda: cr(x, df=6),
        lambda: cc(x, df=5),
        lambda: bs(x, df=4),
    )
    for make_smooth in smooth_factories:
        assert np.allclose(make_smooth(), te(make_smooth()))
    # Adding centering constraint to tensor product
    centered_alone = cr(x, df=3, constraints='center')
    centered_te = te(cr(x, df=4), constraints='center')
    assert np.allclose(centered_alone, centered_te)
    # Adding specific constraint
    center_constraint = np.arange(1, 5)
    constrained_alone = cr(x, df=3, constraints=center_constraint)
    constrained_te = te(cr(x, df=4), constraints=center_constraint)
    assert np.allclose(constrained_alone, constrained_te)
def test_te_2smooths():
    """Compares a two-smooth ``te`` design matrix with hard coded R output.

    The builders are fitted incrementally on two data chunks and then
    evaluated on new data, with and without the centering constraint.
    """
    from patsy.highlevel import incr_dbuilder, build_design_matrices
    x1 = (-1.5)**np.arange(20)
    x2 = (1.6)**np.arange(20)
    # Hard coded R results for smooth: te(x1, x2, bs=c("cs", "cc"), k=c(5,7))
    # Without centering constraint:
    expected_nocons = np.array([
        [-4.4303024184609255207e-06, 7.9884438387230142235e-06,
         9.7987758194797719025e-06, -7.2894213245475212959e-08,
         1.5907686862964493897e-09, -3.2565884983072595159e-11,
         0.0170749607855874667439, -3.0788499835965849050e-02,
         -3.7765754357352458725e-02, 2.8094376299826799787e-04,
         -6.1310290747349201414e-06, 1.2551314933193442915e-07,
         -0.26012671685838206770, 4.6904420337437874311e-01,
         0.5753384627946153129230, -4.2800085814700449330e-03,
         9.3402525733484874533e-05, -1.9121170389937518131e-06,
         -0.0904312240489447832781, 1.6305991924427923334e-01,
         2.0001237112941641638e-01, -1.4879148887003382663e-03,
         3.2470731316462736135e-05, -6.6473404365914134499e-07,
         2.0447857920168824846e-05, -3.6870296695050991799e-05,
         -4.5225801045409022233e-05, 3.3643990293641665710e-07,
         -7.3421200200015877329e-09, 1.5030635073660743297e-10],
        [-9.4006130602653794302e-04, 7.8681398069163730347e-04,
         2.4573006857381437217e-04, -1.4524712230452725106e-04,
         7.8216741353106329551e-05, -3.1304283003914264551e-04,
         3.6231183382798337611064, -3.0324832476174168328e+00,
         -9.4707559178211142559e-01, 5.5980126937492580286e-01,
         -3.0145747744342332730e-01, 1.2065077148806895302e+00,
         -35.17561267504181188315, 2.9441339255948005160e+01,
         9.1948319320782125885216, -5.4349184288245195873e+00,
         2.9267472035096449012e+00, -1.1713569391233907169e+01,
         34.0275626863976370373166, -2.8480442582712722555e+01,
         -8.8947340548151565542e+00, 5.2575353623762932642e+00,
         -2.8312249982592527786e+00, 1.1331265795534763541e+01,
         7.9462158845078978420e-01, -6.6508361863670617531e-01,
         -2.0771242914526857892e-01, 1.2277550230353953542e-01,
         -6.6115593588420035198e-02, 2.6461103043402139923e-01],
    ])
    # With centering constraint:
    expected_cons = np.array([
        [0.00329998606323867252343, 1.6537431155796576600e-04,
         -1.2392262709790753433e-04, 6.5405304166706783407e-05,
         -6.6764045799537624095e-05, -0.1386431081763726258504,
         0.124297283800864313830, -3.5487293655619825405e-02,
         -3.0527115315785902268e-03, 5.2009247643311604277e-04,
         -0.00384203992301702674378, -0.058901915802819435064,
         0.266422358491648914036, 0.5739281693874087597607,
         -1.3171008503525844392e-03, 8.2573456631878912413e-04,
         6.6730833453016958831e-03, -0.1467677784718444955470,
         0.220757650934837484913, 0.1983127687880171796664,
         -1.6269930328365173316e-03, -1.7785892412241208812e-03,
         -3.2702835436351201243e-03, -4.3252183044300757109e-02,
         4.3403766976235179376e-02, 3.5973406402893762387e-05,
         -5.4035858568225075046e-04, 2.9565209382794241247e-04,
         -2.2769990750264097637e-04],
        [0.41547954838956052681098, 1.9843570584107707994e-02,
         -1.5746590234791378593e-02, 8.3171184312221431434e-03,
         -8.7233014052017516377e-03, -15.9926770785086258541696,
         16.503663226274017716833, -6.6005803955894726265e-01,
         1.3986092022708346283e-01, -2.3516913533670955050e-01,
         0.72251037497207359905360, -9.827337059999853963177,
         3.917078117294827688255, 9.0171773596973618936090,
         -5.0616811270787671617e+00, 3.0189990249009683865e+00,
         -1.0872720629943064097e+01, 26.9308504460453121964747,
         -21.212262927009287949431, -9.1088328555582247503253,
         5.2400156972500298025e+00, -3.0593641098325474736e+00,
         1.0919392118399086300e+01, -4.6564290223265718538e+00,
         4.8071307441606982991e+00, -1.9748377005689798924e-01,
         5.4664183716965096538e-02, -2.8871392916916285148e-02,
         2.3592766838010845176e-01],
    ])
    new_data = {
        "x1": np.array([11.390625, 656.84083557128906250]),
        "x2": np.array([16.777216000000006346, 1844.6744073709567147]),
    }
    data_chunked = [{"x1": x1[:10], "x2": x2[:10]},
                    {"x1": x1[10:], "x2": x2[10:]}]

    def chunk_iterator():
        # incr_dbuilder is handed a callable returning a fresh iterator.
        return iter(data_chunked)

    cases = (
        ("te(cr(x1, df=5), cc(x2, df=6)) - 1", expected_nocons),
        ("te(cr(x1, df=5), cc(x2, df=6), constraints='center') - 1",
         expected_cons),
    )
    for formula, expected in cases:
        builder = incr_dbuilder(formula, chunk_iterator)
        dmatrix = build_design_matrices([builder], new_data)[0]
        assert np.allclose(dmatrix, expected, rtol=1e-12, atol=0.)
def test_te_3smooths():
    """Compares a three-smooth ``te`` design matrix with hard coded R output.

    The builder is fitted incrementally on two data chunks and then
    evaluated on a single new data point.
    """
    from patsy.highlevel import incr_dbuilder, build_design_matrices
    x1 = (-1.5)**np.arange(20)
    x2 = (1.6)**np.arange(20)
    x3 = (-1.2)**np.arange(20)
    # Hard coded R results for smooth: te(x1, x2, x3, bs=c("cr", "cs", "cc"), k=c(3,3,4))
    expected = np.array([
        [7.2077663709837084334e-05, 2.0648333344343273131e-03,
         -4.7934014082310591768e-04, 2.3923430783992746568e-04,
         6.8534265421922660466e-03, -1.5909867344112936776e-03,
         -6.8057712777151204314e-09, -1.9496724335203412851e-07,
         4.5260614658693259131e-08, 0.0101479754187435277507,
         0.290712501531622591333, -0.067487370093906928759,
         0.03368233306025386619709, 0.9649092451763204847381,
         -0.2239985793289433757547, -9.5819975394704535133e-07,
         -2.7449874082511405643e-05, 6.3723431275833230217e-06,
         -1.5205851762850489204e-04, -0.00435607204539782688624,
         0.00101123909269346416370, -5.0470024059694933508e-04,
         -1.4458319360584082416e-02, 3.3564223914790921634e-03,
         1.4357783514933466209e-08, 4.1131230514870551983e-07,
         -9.5483976834512651038e-08],
    ])
    new_data = {"x1": -38.443359375000000000,
                "x2": 68.719476736000032702,
                "x3": -5.1597803519999985156}
    # First ten observations in one chunk, the remaining ones in another.
    data_chunked = [
        {"x1": x1[part], "x2": x2[part], "x3": x3[part]}
        for part in (slice(None, 10), slice(10, None))
    ]

    def chunk_iterator():
        # incr_dbuilder is handed a callable returning a fresh iterator.
        return iter(data_chunked)

    builder = incr_dbuilder("te(cr(x1, df=3), cr(x2, df=3), cc(x3, df=3)) - 1",
                            chunk_iterator)
    design_matrix = build_design_matrices([builder], new_data)[0]
    assert np.allclose(design_matrix, expected, rtol=1e-12, atol=0.)