Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py : 14%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from statsmodels.compat.pandas import is_numeric_dtype
3import numbers
5import warnings
6import numpy as np
7from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
8 PeriodIndex, RangeIndex, Timestamp, Series, Index,
9 Float64Index, date_range, period_range)
10from pandas.tseries.frequencies import to_offset
12from statsmodels.base import data
13import statsmodels.base.model as base
14import statsmodels.base.wrapper as wrap
15from statsmodels.tools.sm_exceptions import ValueWarning
# Shared docstring template for time-series models. It is filled in with
# %-style mapping keys (`model`, `params`, `extra_params`, `extra_sections`);
# `TimeSeriesModel.__doc__` below is built from it.
_tsa_doc = """
    %(model)s

    Parameters
    ----------
    %(params)s
    dates : array_like, optional
        An array-like object of datetime objects. If a pandas object is given
        for endog or exog, it is assumed to have a DateIndex.
    freq : str, optional
        The frequency of the time-series. A Pandas offset or 'B', 'D', 'W',
        'M', 'A', or 'Q'. This is optional if dates are given.
    %(extra_params)s
    %(extra_sections)s"""

# One-line summary substituted for `%(model)s` in the template above.
_model_doc = "Timeseries model base class"

# Parameter documentation fragments re-used from `statsmodels.base.model`.
_generic_params = base._model_params_doc
_missing_param_doc = base._missing_param_doc
class TimeSeriesModel(base.LikelihoodModel):
    # The class docstring is assembled from the shared `_tsa_doc` template so
    # that the generic parameter documentation is written only once.
    __doc__ = _tsa_doc % {"model": _model_doc, "params": _generic_params,
                          "extra_params": _missing_param_doc,
                          "extra_sections": ""}

    def __init__(self, endog, exog=None, dates=None, freq=None,
                 missing='none', **kwargs):
        """
        Set up the model data and then the date / index handling.

        See the class docstring for a description of the parameters.
        """
        super(TimeSeriesModel, self).__init__(endog, exog, missing=missing,
                                              **kwargs)

        # Date handling in indexes
        self._init_dates(dates, freq)

    def _init_dates(self, dates=None, freq=None):
        """
        Initialize dates

        Parameters
        ----------
        dates : array_like, optional
            An array like object containing dates.
        freq : str, tuple, datetime.timedelta, DateOffset or None, optional
            A frequency specification for either `dates` or the row labels from
            the endog / exog data.

        Raises
        ------
        ValueError
            If `freq` is given without any index, if `dates` is given but
            cannot be coerced to a date index, if `dates` is given but no
            frequency is available or can be inferred, or if the given `freq`
            is incompatible with the index.

        Notes
        -----
        Creates `self._index` and related attributes. `self._index` is always
        a Pandas index, and it is always Int64Index, DatetimeIndex, or
        PeriodIndex.

        If Pandas objects, endog / exog may have any type of index. If it is
        an Int64Index with values 0, 1, ..., nobs-1 or if it is (coerceable to)
        a DatetimeIndex or PeriodIndex *with an associated frequency*, then it
        is called a "supported" index. Otherwise it is called an "unsupported"
        index.

        Supported indexes are standardized (i.e. a list of date strings is
        converted to a DatetimeIndex) and the result is put in `self._index`.

        Unsupported indexes are ignored, and a supported Int64Index is
        generated and put in `self._index`. Warnings are issued in this case
        to alert the user if the returned index from some operation (e.g.
        forecasting) is different from the original data's index. However,
        whenever possible (e.g. purely in-sample prediction), the original
        index is returned.

        The benefit of supported indexes is that they allow *forecasting*, i.e.
        it is possible to extend them in a reasonable way. Thus every model
        must have an underlying supported index, even if it is just a generated
        Int64Index.
        """

        # Get our index from `dates` if available, otherwise from whatever
        # Pandas index we might have retrieved from endog, exog
        if dates is not None:
            index = dates
        else:
            index = self.data.row_labels

        # Sanity check that we do not have a `freq` without an index
        if index is None and freq is not None:
            raise ValueError('Frequency provided without associated index.')

        # If an index is available, see if it is a date-based index or if it
        # can be coerced to one. (If it cannot we'll fall back, below, to an
        # internal, 0, 1, ... nobs-1 integer index for modeling purposes)
        inferred_freq = False
        if index is not None:
            # Try to coerce to date-based index
            if not isinstance(index, (DatetimeIndex, PeriodIndex)):
                try:
                    # Only try to coerce non-numeric index types (string,
                    # list of date-times, etc.)
                    # Note that np.asarray(Float64Index([...])) yields an
                    # object dtype array in earlier versions of Pandas (and so
                    # will not have is_numeric_dtype == True), so explicitly
                    # check for it here. But note also that in very early
                    # Pandas (~0.12), Float64Index does not exist (and so the
                    # statsmodels compat makes it an empty tuple, so in that
                    # case also check if the first element is a float.
                    _index = np.asarray(index)
                    if (is_numeric_dtype(_index) or
                            isinstance(index, Float64Index) or
                            (Float64Index == tuple() and
                             isinstance(_index[0], float))):
                        raise ValueError('Numeric index given')
                    # If a non-index Pandas series was given, only keep its
                    # values (because we must have a pd.Index type, below, and
                    # pd.to_datetime will return a Series when passed
                    # non-list-like objects)
                    if isinstance(index, Series):
                        index = index.values
                    # All coercion is done via pd.to_datetime
                    # Note: date coercion via pd.to_datetime does not handle
                    # string versions of PeriodIndex objects most of the time.
                    _index = to_datetime(index)
                    # Older versions of Pandas can sometimes fail here and
                    # return a numpy array - check to make sure it's an index
                    if not isinstance(_index, Index):
                        raise ValueError('Could not coerce to date index')
                    index = _index
                # Any ordinary failure means "not coercible"; catching
                # `Exception` (rather than a bare `except`) lets
                # KeyboardInterrupt / SystemExit propagate untouched.
                except Exception:
                    # Only want to actually raise an exception if `dates` was
                    # provided but cannot be coerced. If we got the index from
                    # the row_labels, we'll just ignore it and use the integer
                    # index below
                    if dates is not None:
                        raise ValueError('Non-date index index provided to'
                                         ' `dates` argument.')
            # Now, if we were given, or coerced, a date-based index, make sure
            # it has an associated frequency
            if isinstance(index, (DatetimeIndex, PeriodIndex)):
                # If no frequency, try to get an inferred frequency
                if freq is None and index.freq is None:
                    freq = index.inferred_freq
                    # If we got an inferred frequency, alert the user
                    if freq is not None:
                        inferred_freq = True
                        warnings.warn('No frequency information was'
                                      ' provided, so inferred frequency %s'
                                      ' will be used.'
                                      % freq, ValueWarning)

                # Convert the passed freq to a pandas offset object
                if freq is not None:
                    freq = to_offset(freq)

                # Now, if no frequency information is available from the index
                # itself or from the `freq` argument, raise an exception
                if freq is None and index.freq is None:
                    # But again, only want to raise the exception if `dates`
                    # was provided.
                    if dates is not None:
                        raise ValueError('No frequency information was'
                                         ' provided with date index and no'
                                         ' frequency could be inferred.')
                # However, if the index itself has no frequency information but
                # the `freq` argument is available (or was inferred), construct
                # a new index with an associated frequency
                elif freq is not None and index.freq is None:
                    resampled_index = date_range(
                        start=index[0], end=index[-1], freq=freq)
                    if not inferred_freq and not resampled_index.equals(index):
                        raise ValueError('The given frequency argument could'
                                         ' not be matched to the given index.')
                    index = resampled_index
                # Finally, if the index itself has a frequency and there was
                # also a given frequency, raise an exception if they are not
                # equal
                elif (freq is not None and not inferred_freq and
                        not (index.freq == freq)):
                    raise ValueError('The given frequency argument is'
                                     ' incompatible with the given index.')
            # Finally, raise an exception if we could not coerce to date-based
            # but we were given a frequency argument
            elif freq is not None:
                raise ValueError('Given index could not be coerced to dates'
                                 ' but `freq` argument was provided.')

        # Get attributes of the index
        has_index = index is not None
        date_index = isinstance(index, (DatetimeIndex, PeriodIndex))
        int_index = isinstance(index, Int64Index)
        range_index = isinstance(index, RangeIndex)
        has_freq = index.freq is not None if date_index else None
        increment = Index(range(self.endog.shape[0]))
        is_increment = index.equals(increment) if int_index else None
        is_monotonic = index.is_monotonic if date_index else None

        # Issue warnings for unsupported indexes
        if has_index and not (date_index or range_index or is_increment):
            warnings.warn('An unsupported index was provided and will be'
                          ' ignored when e.g. forecasting.', ValueWarning)
        if date_index and not has_freq:
            warnings.warn('A date index has been provided, but it has no'
                          ' associated frequency information and so will be'
                          ' ignored when e.g. forecasting.', ValueWarning)
        if date_index and not is_monotonic:
            warnings.warn('A date index has been provided, but it is not'
                          ' monotonic and so will be ignored when e.g.'
                          ' forecasting.', ValueWarning)

        # Construct the internal index
        index_generated = False
        valid_index = ((date_index and has_freq and is_monotonic) or
                       (int_index and is_increment) or range_index)

        if valid_index:
            _index = index
        else:
            _index = increment
            index_generated = True
        self._index = _index
        self._index_generated = index_generated
        self._index_none = index is None
        self._index_int64 = int_index and not range_index and not date_index
        self._index_dates = date_index and not index_generated
        self._index_freq = self._index.freq if self._index_dates else None
        self._index_inferred_freq = inferred_freq

        # For backwards compatibility, set data.dates, data.freq
        self.data.dates = self._index if self._index_dates else None
        self.data.freq = self._index.freqstr if self._index_dates else None

    def _get_index_loc(self, key, base_index=None):
        """
        Get the location of a specific key in an index

        Parameters
        ----------
        key : label
            The key for which to find the location if the underlying index is
            a DateIndex or a location if the underlying index is a RangeIndex
            or an Int64Index.
        base_index : pd.Index, optional
            Optionally the base index to search. If None, the model's index is
            searched.

        Returns
        -------
        loc : int
            The location of the key
        index : pd.Index
            The index including the key; this is a copy of the original index
            unless the index had to be expanded to accommodate `key`.
        index_was_expanded : bool
            Whether or not the index was expanded to accommodate `key`.

        Raises
        ------
        KeyError
            If `key` cannot be located. For Int64Index / RangeIndex the
            IndexError / ValueError raised by positional indexing is converted
            to a KeyError.

        Notes
        -----
        If `key` is past the end of the given index, and the index is either
        an Int64Index or a date index, this function extends the index up to
        and including key, and then returns the location in the new index.
        """
        if base_index is None:
            base_index = self._index

        index = base_index
        date_index = isinstance(base_index, (PeriodIndex, DatetimeIndex))
        int_index = isinstance(base_index, Int64Index)
        range_index = isinstance(base_index, RangeIndex)
        index_class = type(base_index)
        nobs = len(index)

        # Special handling for RangeIndex
        if range_index and isinstance(key, (int, np.integer)):
            # Negative indices (that lie in the Index)
            if key < 0 and -key <= nobs:
                key = nobs + key
            # Out-of-sample (note that we include key itself in the new index)
            elif key > nobs - 1:
                # See gh5835. Remove the except after pandas 0.25 required.
                try:
                    base_index_start = base_index.start
                    base_index_step = base_index.step
                except AttributeError:
                    base_index_start = base_index._start
                    base_index_step = base_index._step
                stop = base_index_start + (key + 1) * base_index_step
                index = RangeIndex(start=base_index_start,
                                   stop=stop,
                                   step=base_index_step)

        # Special handling for Int64Index
        if (not range_index and int_index and not date_index and
                isinstance(key, (int, np.integer))):
            # Negative indices (that lie in the Index)
            if key < 0 and -key <= nobs:
                key = nobs + key
            # Out-of-sample (note that we include key itself in the new index)
            elif key > base_index[-1]:
                index = Int64Index(np.arange(base_index[0], int(key + 1)))

        # Special handling for date indexes
        if date_index:
            # Use index type to choose creation function
            if index_class is DatetimeIndex:
                index_fn = date_range
            else:
                index_fn = period_range
            # Integer key (i.e. already given a location)
            if isinstance(key, (int, np.integer)):
                # Negative indices (that lie in the Index)
                if key < 0 and -key < nobs:
                    key = index[nobs + key]
                # Out-of-sample (note that we include key itself in the new
                # index)
                elif key > len(base_index) - 1:
                    index = index_fn(start=base_index[0],
                                     periods=int(key + 1),
                                     freq=base_index.freq)
                    key = index[-1]
                else:
                    key = index[key]
            # Other key types (i.e. string date or some datetime-like object)
            else:
                # Convert the key to the appropriate date-like object
                if index_class is PeriodIndex:
                    date_key = Period(key, freq=base_index.freq)
                else:
                    date_key = Timestamp(key, freq=base_index.freq)

                # Out-of-sample
                if date_key > base_index[-1]:
                    # First create an index that may not always include `key`
                    index = index_fn(start=base_index[0], end=date_key,
                                     freq=base_index.freq)

                    # Now make sure we include `key`
                    if not index[-1] == date_key:
                        index = index_fn(start=base_index[0],
                                         periods=len(index) + 1,
                                         freq=base_index.freq)

                    # To avoid possible inconsistencies with `get_loc` below,
                    # set the key directly equal to the last index location
                    key = index[-1]

        # Get the location
        if date_index:
            # (note that get_loc will throw a KeyError if key is invalid)
            loc = index.get_loc(key)
        elif int_index or range_index:
            # For Int64Index and RangeIndex, key is assumed to be the location
            # and not an index value (this assumption is required to support
            # RangeIndex)
            try:
                index[key]
            # We want to raise a KeyError in this case, to keep the exception
            # consistent across index types.
            # - Attempting to index with an out-of-bound location (e.g.
            #   index[10] on an index of length 9) will raise an IndexError
            #   (as of Pandas 0.22)
            # - Attempting to index with a type that cannot be cast to integer
            #   (e.g. a non-numeric string) will raise a ValueError if the
            #   index is RangeIndex (otherwise will raise an IndexError)
            #   (as of Pandas 0.22)
            except (IndexError, ValueError) as e:
                raise KeyError(str(e))
            loc = key
        else:
            loc = index.get_loc(key)

        # Check if we now have a modified index
        index_was_expanded = index is not base_index

        # Return the index through the end of the loc / slice
        if isinstance(loc, slice):
            end = loc.stop - 1
        else:
            end = loc

        return loc, index[:end + 1], index_was_expanded

    def _get_index_label_loc(self, key, base_index=None):
        """
        Get the location of a specific key in an index or model row labels

        Parameters
        ----------
        key : label
            The key for which to find the location if the underlying index is
            a DateIndex or is only being used as row labels, or a location if
            the underlying index is a RangeIndex or an Int64Index.
        base_index : pd.Index, optional
            Optionally the base index to search. If None, the model's index is
            searched.

        Returns
        -------
        loc : int
            The location of the key
        index : pd.Index
            The index including the key; this is a copy of the original index
            unless the index had to be expanded to accommodate `key`.
        index_was_expanded : bool
            Whether or not the index was expanded to accommodate `key`.

        Raises
        ------
        KeyError
            The KeyError from `_get_index_loc` is re-raised if the fallback
            lookup in the model row labels also fails.

        Notes
        -----
        This method expands on `_get_index_loc` by first trying the given
        base index (or the model's index if the base index was not given) and
        then falling back to try again with the model row labels as the base
        index.
        """
        try:
            loc, index, index_was_expanded = (
                self._get_index_loc(key, base_index))
        except KeyError as e:
            try:
                # Integer keys are always locations, never row labels, so
                # there is nothing to fall back to for them.
                if not isinstance(key, (int, np.integer)):
                    loc = self.data.row_labels.get_loc(key)
                else:
                    raise
                # Require scalar
                # Pandas may return a slice if there are multiple matching
                # locations that are monotonic increasing (otherwise it may
                # return an array of integer locations, see below).
                if isinstance(loc, slice):
                    loc = loc.start
                if isinstance(loc, np.ndarray):
                    # Pandas may return a mask (boolean array), for e.g.:
                    # pd.Index(list('abcb')).get_loc('b')
                    if loc.dtype == bool:
                        # Return the first True value
                        # (we know there is at least one True value if we're
                        # here because otherwise the get_loc call would have
                        # raised an exception)
                        loc = np.argmax(loc)
                    # Finally, Pandas may return an integer array of
                    # locations that match the given value, for e.g.
                    # pd.DatetimeIndex(['2001-02', '2001-01']).get_loc('2001')
                    # (this appears to be slightly undocumented behavior, since
                    # only int, slice, and mask are mentioned in docs for
                    # pandas.Index.get_loc as of 0.23.4)
                    else:
                        loc = loc[0]
                if not isinstance(loc, numbers.Integral):
                    raise

                index = self.data.row_labels[:loc + 1]
                index_was_expanded = False
            # Any ordinary failure of the fallback reports the original
            # KeyError; `Exception` (not a bare `except`) keeps
            # KeyboardInterrupt / SystemExit from being masked as a KeyError.
            except Exception:
                raise e
        return loc, index, index_was_expanded

    def _get_prediction_index(self, start, end, index=None, silent=False):
        """
        Get the start / end locations and the index to use for prediction

        Parameters
        ----------
        start : label
            The key at which to start prediction. Depending on the underlying
            model's index, may be an integer, a date (string, datetime object,
            pd.Timestamp, or pd.Period object), or some other object in the
            model's row labels.
        end : label
            The key at which to end prediction (note that this key will be
            *included* in prediction). Depending on the underlying
            model's index, may be an integer, a date (string, datetime object,
            pd.Timestamp, or pd.Period object), or some other object in the
            model's row labels.
        index : pd.Index, optional
            Optionally an index to associate the predicted results to. If None,
            an attempt is made to create an index for the predicted results
            from the model's index or model's row labels.
        silent : bool, optional
            Argument to silence warnings.

        Returns
        -------
        start : int
            The index / observation location at which to begin prediction.
        end : int
            The index / observation location at which to end in-sample
            prediction. The maximum value for this is nobs-1.
        out_of_sample : int
            The number of observations to forecast after the end of the sample.
        prediction_index : pd.Index or None
            The index associated with the prediction results. This index covers
            the range [start, end + out_of_sample]. If the model has no given
            index and no given row labels (i.e. endog/exog is not Pandas), then
            this will be None.

        Raises
        ------
        KeyError
            If `start` or `end` cannot be matched to a location.
        ValueError
            If `end` lies before `start`, or if a given `index` does not have
            a length consistent with `start` and `end`.

        Notes
        -----
        The arguments `start` and `end` behave differently, depending on if
        they are integer or not. If either is an integer, then it is assumed
        to refer to a *location* in the index, not to an index value. On the
        other hand, if it is a date string or some other type of object, then
        it is assumed to refer to an index *value*. In all cases, the returned
        `start` and `end` values refer to index *locations* (so in the former
        case, the given location is validated and returned whereas in the
        latter case a location is found that corresponds to the given index
        value).

        This difference in behavior is necessary to support `RangeIndex`. This
        is because integers for a RangeIndex could refer either to index values
        or to index locations in an ambiguous way (while for `Int64Index`,
        since we have required them to be full indexes, there is no ambiguity).

        As a side effect, `self.data.predict_start`, `self.data.predict_end`
        and `self.data.predict_dates` are set for backwards compatibility.
        """

        # Convert index keys (start, end) to index locations and get associated
        # indexes.
        try:
            start, start_index, start_oos = self._get_index_label_loc(start)
        except KeyError:
            raise KeyError('The `start` argument could not be matched to a'
                           ' location related to the index of the data.')
        if end is None:
            end = max(start, len(self._index) - 1)
        try:
            end, end_index, end_oos = self._get_index_label_loc(end)
        except KeyError:
            raise KeyError('The `end` argument could not be matched to a'
                           ' location related to the index of the data.')

        # Handle slices (if the given index keys cover more than one date)
        if isinstance(start, slice):
            start = start.start
        if isinstance(end, slice):
            end = end.stop - 1

        # Get the actual index for the prediction
        prediction_index = end_index[start:]

        # Validate prediction options
        if end < start:
            raise ValueError('Prediction must have `end` after `start`.')

        # Handle custom prediction index
        # First, if we were given an index, check that it's the right size and
        # use it if so
        if index is not None:
            if not len(prediction_index) == len(index):
                raise ValueError('Invalid `index` provided in prediction.'
                                 ' Must have length consistent with `start`'
                                 ' and `end` arguments.')
            # But if we weren't given Pandas input, this index will not be
            # used because the data will not be wrapped; in that case, issue
            # a warning
            if not isinstance(self.data, data.PandasData) and not silent:
                warnings.warn('Because the model data (`endog`, `exog`) were'
                              ' not given as Pandas objects, the prediction'
                              ' output will be Numpy arrays, and the given'
                              ' `index` argument will only be used'
                              ' internally.', ValueWarning)
            prediction_index = Index(index)
        # Now, if we *do not* have a supported index, but we were given some
        # kind of index...
        elif self._index_generated and not self._index_none:
            # If we are in sample, and have row labels, use them
            if self.data.row_labels is not None and not (start_oos or end_oos):
                prediction_index = self.data.row_labels[start:end + 1]
            # Otherwise, warn the user that they will get an Int64Index
            else:
                if not silent:
                    warnings.warn('No supported index is available.'
                                  ' Prediction results will be given with'
                                  ' an integer index beginning at `start`.',
                                  ValueWarning)
                warnings.warn('No supported index is available. In the next'
                              ' version, calling this method in a model'
                              ' without a supported index will result in an'
                              ' exception.', DeprecationWarning)
        elif self._index_none:
            prediction_index = None

        # For backwards compatibility, set `predict_*` values
        if prediction_index is not None:
            self.data.predict_start = prediction_index[0]
            self.data.predict_end = prediction_index[-1]
            self.data.predict_dates = prediction_index
        else:
            self.data.predict_start = None
            self.data.predict_end = None
            self.data.predict_dates = None

        # Compute out-of-sample observations
        nobs = len(self.endog)
        out_of_sample = max(end - (nobs - 1), 0)
        end -= out_of_sample

        return start, end, out_of_sample, prediction_index

    def _get_exog_names(self):
        """Return the exogenous variable names stored on the model data."""
        return self.data.xnames

    def _set_exog_names(self, vals):
        """Store exogenous variable names, wrapping a non-list in a list."""
        if not isinstance(vals, list):
            vals = [vals]
        self.data.xnames = vals

    # overwrite with writable property for (V)AR models
    exog_names = property(_get_exog_names, _set_exog_names, None,
                          'The names of the exogenous variables.')
class TimeSeriesModelResults(base.LikelihoodModelResults):
    """Results class for time-series models that keeps the model's data."""

    def __init__(self, model, params, normalized_cov_params, scale=1.):
        # Attach the model's data object before delegating to the parent
        # initializer.
        self.data = model.data
        parent = super(TimeSeriesModelResults, self)
        parent.__init__(model, params, normalized_cov_params, scale)
class TimeSeriesResultsWrapper(wrap.ResultsWrapper):
    """Results wrapper whose `predict` output is wrapped using 'dates'."""

    # No extra wrapped attributes beyond those of the likelihood wrapper.
    _attrs = {}
    _wrap_attrs = wrap.union_dicts(
        base.LikelihoodResultsWrapper._wrap_attrs, _attrs)

    # `predict` is the only additional wrapped method.
    _methods = {'predict': 'dates'}
    _wrap_methods = wrap.union_dicts(
        base.LikelihoodResultsWrapper._wrap_methods, _methods)


# Register the wrapper for the results class.
wrap.populate_wrapper(TimeSeriesResultsWrapper, TimeSeriesModelResults)
if __name__ == "__main__":
    # Small manual demo: build a model from the macrodata set using
    # 'year:quarter' strings as the row labels.
    import statsmodels.api as sm
    import pandas

    mdata = sm.datasets.macrodata.load(as_pandas=False)

    # make a DataFrame
    # TODO: attach a DataFrame to some of the datasets, for quicker use
    dates = []
    for x in mdata.data[['year', 'quarter']]:
        dates.append(':'.join((str(int(x[0])), str(int(x[1])))))

    columns = mdata.data[['realgdp', 'realinv', 'realcons']]
    df = pandas.DataFrame(columns, index=dates)
    ex_mod = TimeSeriesModel(df)