Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/patsy/highlevel.py : 17%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of Patsy
2# Copyright (C) 2011-2013 Nathaniel Smith <njs@pobox.com>
3# See file LICENSE.txt for license information.
5# These are made available in the patsy.* namespace:
6__all__ = ["dmatrix", "dmatrices",
7 "incr_dbuilder", "incr_dbuilders"]
9# problems:
10# statsmodels reluctant to pass around separate eval environment, suggesting
11# that design_and_matrices-equivalent should return a formula_like
12# is ModelDesc really the high-level thing?
13# ModelDesign doesn't work -- need to work with the builder set
14# want to be able to return either a matrix or a pandas dataframe
16import six
17import numpy as np
18from patsy import PatsyError
19from patsy.design_info import DesignMatrix, DesignInfo
20from patsy.eval import EvalEnvironment
21from patsy.desc import ModelDesc
22from patsy.build import (design_matrix_builders,
23 build_design_matrices)
24from patsy.util import (have_pandas, asarray_or_pandas,
25 atleast_2d_column_default)
27if have_pandas:
28 import pandas
# Tries to build a (lhs, rhs) design given a formula_like and an incremental
# data source. If formula_like is not capable of doing this, then returns
# None.
def _try_incr_builders(formula_like, data_iter_maker, eval_env,
                       NA_action):
    """Try to turn `formula_like` into a (lhs, rhs) pair of design infos.

    :arg formula_like: One of: a :class:`DesignInfo`; a 2-tuple of
      :class:`DesignInfo` objects; a formula string; a :class:`ModelDesc`;
      or an object with a ``__patsy_get_model_desc__`` method. Anything else
      (e.g. an explicit matrix) is not handled here.
    :arg data_iter_maker: Passed through to :func:`design_matrix_builders`.
    :arg eval_env: Passed to ``__patsy_get_model_desc__`` and to
      :func:`design_matrix_builders`. Asserted to be an
      :class:`EvalEnvironment` when a :class:`ModelDesc` is being built.
    :arg NA_action: Passed through to :func:`design_matrix_builders`.
    :returns: ``(lhs, rhs)`` for the supported inputs, or ``None`` if
      `formula_like` is not something a builder can be made from.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      other than a :class:`ModelDesc`, or (Python 2 only) if a ``unicode``
      formula contains non-ascii characters.
    """
    if isinstance(formula_like, DesignInfo):
        # A lone DesignInfo is the right-hand side; pair it with whatever
        # design_matrix_builders produces for an empty term list.
        empty_lhs = design_matrix_builders([[]], data_iter_maker,
                                           eval_env, NA_action)[0]
        return (empty_lhs, formula_like)
    if (isinstance(formula_like, tuple)
            and len(formula_like) == 2
            and isinstance(formula_like[0], DesignInfo)
            and isinstance(formula_like[1], DesignInfo)):
        # Already a finished (lhs, rhs) pair: hand it back untouched.
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Keep a handle on the provider so the error message can name the
        # object whose method misbehaved rather than the bad return value.
        provider = formula_like
        formula_like = provider.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                             % (provider,))
        # fallthrough
    if not six.PY3 and isinstance(formula_like, unicode):  # noqa: F821
        # Included for the convenience of people who are using py2 with
        # __future__.unicode_literals.
        try:
            formula_like = formula_like.encode("ascii")
        except UnicodeEncodeError:
            raise PatsyError(
                "On Python 2, formula strings must be either 'str' objects, "
                "or else 'unicode' objects containing only ascii "
                "characters. You passed a unicode string with non-ascii "
                "characters. I'm afraid you'll have to either switch to "
                "ascii-only, or else upgrade to Python 3.")
    if isinstance(formula_like, str):
        formula_like = ModelDesc.from_formula(formula_like)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        assert isinstance(eval_env, EvalEnvironment)
        return design_matrix_builders([formula_like.lhs_termlist,
                                       formula_like.rhs_termlist],
                                      data_iter_maker,
                                      eval_env,
                                      NA_action)
    else:
        return None
def incr_dbuilder(formula_like, data_iter_maker, eval_env=0, NA_action="drop"):
    """Incrementally construct a single design matrix builder from a data
    set that is delivered in chunks.

    :arg formula_like: As for :func:`dmatrix`, but explicit matrices are not
      accepted. Allowed values are a formula string, a :class:`ModelDesc`,
      a :class:`DesignInfo`, or any object providing a
      ``__patsy_get_model_desc__`` method.
    :arg data_iter_maker: A callable taking no arguments that returns an
      iterator over dict-like data objects. It has to be a callable and not
      a plain iterator, since sufficiently complex formulas may need several
      passes over the data (e.g. with nested stateful transforms).
    :arg eval_env: Either an :class:`EvalEnvironment` used to look up
      variables referenced in `formula_like` that are not found in `data`,
      or an integer depth handed to :meth:`EvalEnvironment.capture`.
      ``eval_env=0`` uses the context of the function calling
      :func:`incr_dbuilder`. Library code will usually want ``eval_env=1``,
      so that variables are resolved in the namespace of *its* caller.
    :arg NA_action: An :class:`NAAction` object or string, used to decide
      which values count as 'missing' when determining the levels of
      categorical factors.
    :returns: A :class:`DesignInfo`
    :raises PatsyError: If `formula_like` is not a supported kind of object,
      or if it describes outcome (left-hand side) variables.

    Tip: for `data_iter_maker`, write a generator like::

      def iter_maker():
          for data_chunk in my_data_store:
              yield data_chunk

    and pass `iter_maker` (*not* `iter_maker()`).

    .. versionadded:: 0.2.0
       The ``NA_action`` argument.
    """
    # The capture call stays directly in this function body: it is made with
    # reference=1, and moving it into a helper would add a stack frame.
    # NOTE(review): exact depth semantics live in EvalEnvironment.capture.
    env = EvalEnvironment.capture(eval_env, reference=1)
    builders = _try_incr_builders(formula_like, data_iter_maker, env,
                                  NA_action)
    if builders is None:
        raise PatsyError("bad formula-like object")
    outcome_info = builders[0]
    # A single-matrix model must have an empty left-hand side.
    if len(outcome_info.column_names) > 0:
        raise PatsyError("encountered outcome variables for a model "
                         "that does not expect them")
    return builders[1]
def incr_dbuilders(formula_like, data_iter_maker, eval_env=0,
                   NA_action="drop"):
    """Incrementally construct a pair of design matrix builders from a data
    set that is delivered in chunks.

    :func:`incr_dbuilders` relates to :func:`incr_dbuilder` the way
    :func:`dmatrices` relates to :func:`dmatrix`; the arguments are described
    under :func:`incr_dbuilder`.

    :returns: The (outcome, predictor) pair produced for `formula_like`.
    :raises PatsyError: If `formula_like` is not a supported kind of object,
      or if it has no outcome (left-hand side) variables.
    """
    # Kept inline (not in a helper) so that reference=1 still refers to the
    # same stack frame.
    env = EvalEnvironment.capture(eval_env, reference=1)
    builders = _try_incr_builders(formula_like, data_iter_maker, env,
                                  NA_action)
    if builders is None:
        raise PatsyError("bad formula-like object")
    outcome_info = builders[0]
    # A two-matrix model needs at least one outcome column.
    if len(outcome_info.column_names) == 0:
        raise PatsyError("model is missing required outcome variables")
    return builders
# This always returns a length-two tuple,
#   response, predictors
# where
#   response is a DesignMatrix (possibly with 0 columns)
#   predictors is a DesignMatrix
# The input 'formula_like' could be like:
#   (np.ndarray, np.ndarray)
#   (DesignMatrix, DesignMatrix)
#   (None, DesignMatrix)
#   np.ndarray  # for predictor-only models
#   DesignMatrix
#   (None, np.ndarray)
#   "y ~ x"
#   ModelDesc(...)
#   DesignInfo
#   (DesignInfo, DesignInfo)
#   any object with a special method __patsy_get_model_desc__
def _do_highlevel_design(formula_like, data, eval_env,
                         NA_action, return_type):
    """Build the (outcome, predictor) matrices for `formula_like`.

    Formula-style inputs (anything :func:`_try_incr_builders` accepts) are
    built from `data` via :func:`build_design_matrices`. Otherwise
    `formula_like` is treated as an explicit matrix, or a 2-tuple
    ``(lhs, rhs)`` of explicit matrices, and normalized to the requested
    output type.

    :arg formula_like: Formula-like object, array-like, or 2-tuple of
      array-likes (the first element may be ``None``).
    :arg data: Data object; used only on the formula path.
    :arg eval_env: Passed through to :func:`_try_incr_builders`.
    :arg NA_action: Passed through to the builders.
    :arg return_type: ``"matrix"`` or ``"dataframe"``.
    :returns: A pair ``(lhs, rhs)``; on the explicit-matrix path a missing
      lhs becomes a matrix with zero columns.
    :raises PatsyError: For an unknown `return_type`, a dataframe request
      without pandas, a tuple whose length is not 2, mismatched row counts,
      or mismatched pandas indexes.
    """
    if return_type == "dataframe" and not have_pandas:
        raise PatsyError("pandas.DataFrame was requested, but pandas "
                         "is not installed")
    if return_type not in ("matrix", "dataframe"):
        raise PatsyError("unrecognized output type %r, should be "
                         "'matrix' or 'dataframe'" % (return_type,))
    want_frame = return_type == "dataframe"

    # The whole data set is presented to the builders as a single chunk.
    design_infos = _try_incr_builders(formula_like,
                                      lambda: iter([data]),
                                      eval_env,
                                      NA_action)
    if design_infos is not None:
        return build_design_matrices(design_infos, data,
                                     NA_action=NA_action,
                                     return_type=return_type)

    # No builders, but maybe we can still get matrices.
    if not isinstance(formula_like, tuple):
        # subok=True is necessary here to allow DesignMatrixes to pass
        # through
        lhs, rhs = None, asarray_or_pandas(formula_like, subok=True)
    elif len(formula_like) == 2:
        lhs, rhs = formula_like
    else:
        raise PatsyError("don't know what to do with a length %s "
                         "matrices tuple"
                         % (len(formula_like),))

    # Some sort of explicit matrix or matrices were given. Each is currently
    # one of:
    #   -- an ndarray or subclass
    #   -- a DesignMatrix
    #   -- a pandas.Series
    #   -- a pandas.DataFrame
    # and has to be converted to the standard output format.
    def _regularize_matrix(m, default_column_prefix):
        # Returns (normalized matrix, original pandas index or None).
        di = DesignInfo.from_array(m, default_column_prefix)
        came_from_pandas = (have_pandas
                            and isinstance(m, (pandas.Series,
                                               pandas.DataFrame)))
        orig_index = m.index if came_from_pandas else None
        if not want_frame:
            return (DesignMatrix(m, di), orig_index)
        frame = pandas.DataFrame(
            atleast_2d_column_default(m, preserve_pandas=True))
        frame.columns = di.column_names
        frame.design_info = di
        return (frame, orig_index)

    # rhs goes first: its row count sizes the placeholder lhs.
    rhs, rhs_index = _regularize_matrix(rhs, "x")
    if lhs is None:
        lhs = np.zeros((rhs.shape[0], 0), dtype=float)
    lhs, lhs_index = _regularize_matrix(lhs, "y")

    assert isinstance(getattr(lhs, "design_info", None), DesignInfo)
    assert isinstance(getattr(rhs, "design_info", None), DesignInfo)
    if lhs.shape[0] != rhs.shape[0]:
        raise PatsyError("shape mismatch: outcome matrix has %s rows, "
                         "predictor matrix has %s rows"
                         % (lhs.shape[0], rhs.shape[0]))
    if (rhs_index is not None and lhs_index is not None
            and not rhs_index.equals(lhs_index)):
        raise PatsyError("index mismatch: outcome and "
                         "predictor have incompatible indexes")
    if want_frame:
        # If only one side carried a pandas index, share it with the other.
        if lhs_index is None and rhs_index is not None:
            lhs.index = rhs.index
        elif rhs_index is None and lhs_index is not None:
            rhs.index = lhs.index
    return (lhs, rhs)
def dmatrix(formula_like, data={}, eval_env=0,
            NA_action="drop", return_type="matrix"):
    """Build one design matrix from a formula_like and some data.

    :arg formula_like: Anything a design matrix can be built from; the
      accepted forms are listed below.
    :arg data: A dict-like object in which variables referenced by
      `formula_like` are looked up.
    :arg eval_env: Either an :class:`EvalEnvironment` used to look up
      variables referenced in `formula_like` that are not found in `data`,
      or an integer depth handed to :meth:`EvalEnvironment.capture`.
      ``eval_env=0`` uses the context of the function calling
      :func:`dmatrix`. Library code will usually want ``eval_env=1``, so
      that variables are resolved in the namespace of *its* caller.
    :arg NA_action: How rows containing missing values are treated. Use
      ``"drop"`` to discard them, ``"raise"`` to get an error, or pass an
      :class:`NAAction` object for full control. :class:`NAAction` also
      documents which values count as 'missing' (and how to change that).
    :arg return_type: ``"matrix"`` or ``"dataframe"``; see below.
    :raises PatsyError: If the model turns out to have outcome (left-hand
      side) variables.

    `formula_like` may be any of the following:

    * (Most commonly) a formula string such as ``"x1 + x2"`` (for
      :func:`dmatrix`) or ``"y ~ x1 + x2"`` (for :func:`dmatrices`). See
      :ref:`formulas` for details.
    * A :class:`ModelDesc`, the Python object representation of a formula.
      See :ref:`formulas` and :ref:`expert-model-specification`.
    * A :class:`DesignInfo`.
    * An object with a method named :meth:`__patsy_get_model_desc__`. See
      :ref:`expert-model-specification`.
    * A numpy array_like (for :func:`dmatrix`) or a tuple
      (array_like, array_like) (for :func:`dmatrices`). These get metadata
      attached and their representation normalized, and are then returned
      directly; `data` and `eval_env` are ignored in this case. Two kinds of
      input are handled specially:

      * :class:`DesignMatrix` objects keep their :class:`DesignInfo`, so
        custom column names and term information can be set up without
        using the rest of the patsy machinery.
      * :class:`pandas.DataFrame` and :class:`pandas.Series` objects have
        their (row) indexes checked. When two are given, the indexes must
        be aligned. With ``return_type="dataframe"`` the indexes are
        preserved on the output.

    Whatever the input, the result is always one of:

    * A :class:`DesignMatrix`, if ``return_type="matrix"`` (the default)
    * A :class:`pandas.DataFrame`, if ``return_type="dataframe"``.

    The contents of the design matrix are the same either way, and a
    :class:`DesignInfo` is available as the ``.design_info`` attribute of the
    returned object in both cases. With ``return_type="dataframe"``, however,
    any pandas indexes on the input (in `data`, or passed directly through
    `formula_like`) are preserved, which may be useful for e.g. time-series
    models.

    .. versionadded:: 0.2.0
       The ``NA_action`` argument.
    """
    # NOTE(review): the shared `{}` default for `data` is only passed along
    # in this function; confirm downstream code never mutates it.
    # The capture call stays inline so that reference=1 still refers to the
    # same stack frame.
    env = EvalEnvironment.capture(eval_env, reference=1)
    outcome, predictors = _do_highlevel_design(formula_like, data, env,
                                               NA_action, return_type)
    # A single-matrix model must have a zero-column outcome.
    if outcome.shape[1] != 0:
        raise PatsyError("encountered outcome variables for a model "
                         "that does not expect them")
    return predictors
def dmatrices(formula_like, data={}, eval_env=0,
              NA_action="drop", return_type="matrix"):
    """Build a pair of design matrices from a formula_like and some data.

    Works exactly like :func:`dmatrix`, but requires (and returns) two
    matrices rather than one. By convention the first is the "outcome" or
    "y" data and the second is the "predictor" or "x" data.

    See :func:`dmatrix` for a description of the arguments.

    :returns: A tuple ``(outcome, predictors)``.
    :raises PatsyError: If the model has no outcome (left-hand side)
      variables.
    """
    # The capture call stays inline so that reference=1 still refers to the
    # same stack frame.
    env = EvalEnvironment.capture(eval_env, reference=1)
    outcome, predictors = _do_highlevel_design(formula_like, data, env,
                                               NA_action, return_type)
    # A two-matrix model needs at least one outcome column.
    if outcome.shape[1] == 0:
        raise PatsyError("model is missing required outcome variables")
    return (outcome, predictors)