Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/iolib/summary2.py : 11%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from statsmodels.compat.python import (lrange, iterkeys, iteritems, lzip,
2 itervalues)
4from collections import OrderedDict
5import datetime
6from functools import reduce
7import re
8import textwrap
10import numpy as np
11import pandas as pd
13from .table import SimpleTable
14from .tableformatting import fmt_latex, fmt_txt
class Summary(object):
    '''Collect tables, free-text notes and a title, and render them together
    as ASCII text, HTML or LaTeX.

    Tables are stored as DataFrames in ``tables``; ``settings`` holds one
    formatting dict per table, in the same order.
    '''

    def __init__(self):
        self.title = None
        self.tables = []
        self.settings = []
        self.extra_txt = []
        # When True, ``as_latex`` fuses consecutive tabular environments.
        self._merge_latex = False

    def __str__(self):
        return self.as_text()

    def __repr__(self):
        return ''.join([str(type(self)), '\n"""\n', self.__str__(), '\n"""'])

    def _repr_html_(self):
        '''Return the HTML rendering (hook used by IPython notebooks).'''
        return self.as_html()

    def add_df(self, df, index=True, header=True, float_format='%.4f',
               align='r'):
        '''Append a DataFrame to the summary.

        Parameters
        ----------
        df : DataFrame
        index : bool
            Show the DataFrame row labels in the summary table
        header : bool
            Show the DataFrame column labels in the summary table
        float_format : str
            Format applied to float data columns
        align : str
            Data alignment (l/c/r)
        '''
        self.tables.append(df)
        self.settings.append(dict(index=index, header=header,
                                  float_format=float_format, align=align))

    def add_array(self, array, align='r', float_format="%.4f"):
        '''Append a 2D Numpy array to the summary.

        The array is wrapped in a DataFrame and stored without row or
        column labels.

        Parameters
        ----------
        array : numpy array (2D)
        float_format : str
            Format applied to the array if its type is float
        align : str
            Data alignment (l/c/r)
        '''
        self.add_df(pd.DataFrame(array), index=False, header=False,
                    float_format=float_format, align=align)

    def add_dict(self, d, ncols=2, align='l', float_format="%.4f"):
        '''Append the key/value pairs of a dict to the summary.

        Parameters
        ----------
        d : dict
            Keys and values are formatted with ``float_format`` when
            possible and otherwise coerced to strings with str().
            Users are encouraged to format them before using add_dict.
        ncols : int
            Number of key/value column pairs of the output table
        align : str
            Data alignment (l/c/r)
        float_format : str
            Format tried on every key and value
        '''
        pairs = np.array(lzip(
            [_formatter(key, float_format) for key in iterkeys(d)],
            [_formatter(val, float_format) for val in itervalues(d)]))

        # Pad with empty pairs so the rows divide evenly into ``ncols``.
        remainder = pairs.shape[0] % ncols
        if remainder != 0:
            filler = np.array((ncols - remainder) * [['', '']])
            pairs = np.vstack([pairs, filler])

        chunks = np.split(pairs, ncols)
        side_by_side = reduce(lambda left, right: np.hstack([left, right]),
                              chunks)
        self.add_array(side_by_side, align=align)

    def add_text(self, string):
        '''Append a note to the bottom of the summary table. In ASCII tables,
        the note will be wrapped to table width. Notes are not indented.
        '''
        self.extra_txt.append(string)

    def add_title(self, title=None, results=None):
        '''Set the title shown on top of the summary table.

        A string passed as ``title`` is used verbatim. Otherwise, if a
        results instance is given, the title is built from the class name of
        ``results.model`` (replaced by its long form when that name is a key
        of ``_model_types``). With neither, the title is the empty string.
        '''
        if isinstance(title, str):
            self.title = title
            return
        if results is None:
            self.title = ''
            return
        name = results.model.__class__.__name__
        self.title = 'Results: ' + _model_types.get(name, name)

    def add_base(self, results, alpha=0.05, float_format="%.4f", title=None,
                 xname=None, yname=None):
        '''Try to construct a basic summary instance.

        Adds the model information table, the parameter table and a title.

        Parameters
        ----------
        results : Model results instance
        alpha : float
            significance level for the confidence intervals (optional)
        float_format : str
            Float formatting for summary of parameters (optional)
        title : str
            Title of the summary table (optional)
        xname : list[str] of length equal to the number of parameters
            Names of the independent variables (optional)
        yname : str
            Name of the dependent variable (optional)
        '''
        param = summary_params(results, alpha=alpha, use_t=results.use_t)
        info = summary_model(results)
        if xname is not None:
            param.index = xname
        if yname is not None:
            info['Dependent Variable:'] = yname
        self.add_dict(info, align='l')
        self.add_df(param, float_format=float_format)
        self.add_title(title=title, results=results)

    def as_text(self):
        '''Generate ASCII Summary Table
        '''
        pad_col, pad_index, widest = _measure_tables(self.tables,
                                                     self.settings)
        rule = '=' * widest

        rendered = _simple_tables(self.tables, self.settings, pad_col,
                                  pad_index)
        lines = '\n'.join(t.as_text() for t in rendered).split('\n')
        # Frame the stacked tables with a full-width '=' rule.
        lines[0] = rule
        lines.append(rule)
        body = '\n'.join(lines)

        heading = self.title
        if heading is None:
            heading = ''
        elif len(heading) < widest:
            # Centre the title over the tables.
            heading = ' ' * int(widest/2 - len(heading)/2) + heading

        notes = '\n'.join('\n'.join(textwrap.wrap(note, widest))
                          for note in self.extra_txt)

        return '\n'.join([heading, body, notes])

    def as_html(self):
        '''Generate HTML Summary Table
        '''
        rendered = _simple_tables(self.tables, self.settings)
        return '\n'.join(t.as_html() for t in rendered)

    def as_latex(self):
        '''Generate LaTeX Summary Table
        '''
        if self.title is None:
            caption = '\\caption{}'
        else:
            caption = '\\caption{' + self.title + '}'

        rendered = _simple_tables(self.tables, self.settings)
        body = '\n\\hline\n'.join(t.as_latex_tabular() for t in rendered)

        if self._merge_latex:
            # create single tabular object for summary_col
            seam = ('\\\\hline\\n\\\\hline\\n\\\\'
                    'end{tabular}\\n\\\\begin{tabular}{.*}\\n')
            body = re.sub(seam, r'\\midrule\n', body)

        return '\n'.join(('\\begin{table}', caption, body, '\\end{table}'))
def _measure_tables(tables, settings):
    '''Work out the padding that brings every ASCII table to a common width.

    Each table is rendered once without padding and the length of its first
    line is taken as its width. The shortfall against the widest table is
    spread evenly over the column separators; whatever is left over is
    assigned to the first column.

    Returns
    -------
    pad_sep : list[int]
        Extra spaces per column separator, one entry per table
    pad_index : list[int]
        Extra spaces for the first column, one entry per table
    widest : int
        Width of the widest table
    '''
    rendered = [tbl.as_text() for tbl in _simple_tables(tables, settings)]
    widths = [len(text.splitlines()[0]) for text in rendered]
    widest = max(widths)

    pad_sep = []
    pad_index = []
    for table, width in zip(tables, widths):
        # Count at least one separator so the division is always defined.
        nsep = max(table.shape[1] - 1, 1)
        per_sep = int((widest - width) / nsep)
        pad_sep.append(per_sep)
        pad_index.append(widest - (width + nsep * per_sep))

    return pad_sep, pad_index, widest
254# Useful stuff # TODO: be more specific
# Maps a model class name to the long name used in summary titles.
_model_types = dict(
    OLS='Ordinary least squares',
    GLS='Generalized least squares',
    GLSAR='Generalized least squares with AR(p)',
    WLS='Weighted least squares',
    RLM='Robust linear model',
    NBin='Negative binomial model',
    GLM='Generalized linear model',
)
def summary_model(results):
    '''Create a dict with information about the model

    Every entry is computed by a small accessor applied to ``results``.
    Entries whose accessor raises AttributeError, KeyError or
    NotImplementedError are left out, so the output only contains what the
    given results instance can provide.

    Parameters
    ----------
    results : Model results instance
        Must expose ``k_constant``; it is read outside the error handling
        to choose the R-squared labels.

    Returns
    -------
    out : OrderedDict
        Maps a label (e.g. 'Model:') to its value, in display order.
    '''

    def time_now(*args, **kwds):
        now = datetime.datetime.now()
        return now.strftime('%Y-%m-%d %H:%M')

    info = OrderedDict()
    info['Model:'] = lambda x: x.model.__class__.__name__
    # ``__class__`` was misspelled as ``__class`` here, which raised an
    # AttributeError that the loop below swallowed, so this entry was
    # never reported.
    info['Model Family:'] = lambda x: x.family.__class__.__name__
    info['Link Function:'] = lambda x: x.family.link.__class__.__name__
    info['Dependent Variable:'] = lambda x: x.model.endog_names
    info['Date:'] = time_now
    info['No. Observations:'] = lambda x: "%#6d" % x.nobs
    info['Df Model:'] = lambda x: "%#6d" % x.df_model
    info['Df Residuals:'] = lambda x: "%#6d" % x.df_resid
    info['Converged:'] = lambda x: x.mle_retvals['converged']
    info['No. Iterations:'] = lambda x: x.mle_retvals['iterations']
    info['Method:'] = lambda x: x.method
    info['Norm:'] = lambda x: x.fit_options['norm']
    info['Scale Est.:'] = lambda x: x.fit_options['scale_est']
    info['Cov. Type:'] = lambda x: x.fit_options['cov']

    rsquared_type = '' if results.k_constant else ' (uncentered)'
    info['R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared
    info['Adj. R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared_adj  # noqa:E501
    info['Pseudo R-squared:'] = lambda x: "%#8.3f" % x.prsquared
    info['AIC:'] = lambda x: "%8.4f" % x.aic
    info['BIC:'] = lambda x: "%8.4f" % x.bic
    info['Log-Likelihood:'] = lambda x: "%#8.5g" % x.llf
    info['LL-Null:'] = lambda x: "%#8.5g" % x.llnull
    info['LLR p-value:'] = lambda x: "%#8.5g" % x.llr_pvalue
    info['Deviance:'] = lambda x: "%#8.5g" % x.deviance
    info['Pearson chi2:'] = lambda x: "%#6.3g" % x.pearson_chi2
    info['F-statistic:'] = lambda x: "%#8.4g" % x.fvalue
    info['Prob (F-statistic):'] = lambda x: "%#6.3g" % x.f_pvalue
    info['Scale:'] = lambda x: "%#8.5g" % x.scale

    out = OrderedDict()
    for key, func in info.items():
        try:
            out[key] = func(results)
        except (AttributeError, KeyError, NotImplementedError):
            # NOTE: some models do not have loglike defined (RLM),
            # so raise NotImplementedError
            pass
    return out
def summary_params(results, yname=None, xname=None, alpha=.05, use_t=True,
                   skip_header=False, float_format="%.4f"):
    '''Build a table of parameter estimates from a results instance

    Parameters
    ----------
    results : results instance or tuple
        Either a results instance, from which params, bse, tvalues, pvalues
        and conf_int(alpha) are read, or a 6-tuple
        ``(results, params, bse, tvalues, pvalues, conf_int)`` supplying
        those values directly.
    yname : {str, None}
        accepted for interface compatibility; not used by this function
    xname : {list[str], None}
        optional row labels. When empty or None, the labels come from
        ``results.model.data.param_names``, falling back to
        ``results.model.exog_names``.
    alpha : float
        significance level for the confidence intervals
    use_t : bool
        if True the statistic columns are labelled 't' and 'P>|t|',
        otherwise 'z' and 'P>|z|'
    skip_header : bool
        accepted for interface compatibility; not used by this function
    float_format : str
        accepted for interface compatibility; not used by this function

    Returns
    -------
    data : DataFrame
        One row per parameter with columns for the coefficient, standard
        error, test statistic, p-value and the two confidence bounds.
    '''
    if isinstance(results, tuple):
        results, params, bse, tvalues, pvalues, conf_int = results
    else:
        params, bse = results.params, results.bse
        tvalues, pvalues = results.tvalues, results.pvalues
        conf_int = results.conf_int(alpha)

    stats = np.array([params, bse, tvalues, pvalues]).T
    data = pd.DataFrame(np.hstack([stats, conf_int]))

    stat = 't' if use_t else 'z'
    data.columns = ['Coef.', 'Std.Err.', stat, 'P>|' + stat + '|',
                    '[' + str(alpha/2), str(1-alpha/2) + ']']

    if xname:
        data.index = xname
    else:
        try:
            data.index = results.model.data.param_names
        except AttributeError:
            data.index = results.model.exog_names

    return data
374# Vertical summary instance for multiple models
def _col_params(result, float_format='%.4f', stars=True):
    '''Stack coefficients and standard errors in single column

    Parameters
    ----------
    result : results instance
        Passed to ``summary_params``; ``rsquared`` and ``rsquared_adj`` are
        appended as extra rows when at least one of them is available.
    float_format : str
        Format applied to coefficients, standard errors and R-squared values
    stars : bool
        Append one '*' to the coefficient for each of the thresholds
        p < .1, p < .05 and p < .01 that the p-value satisfies

    Returns
    -------
    res : DataFrame
        A single column named after ``result.model.endog_names``, produced
        by stacking the formatted table.
    '''
    # Extract parameters
    res = summary_params(result)
    # Format float
    for col in res.columns[:2]:
        res[col] = res[col].apply(lambda x: float_format % x)
    # Std.Errors in parentheses
    res.iloc[:, 1] = '(' + res.iloc[:, 1] + ')'
    # Significance stars: one more star per threshold passed
    if stars:
        coef_col = res.columns[0]
        for threshold in (.1, .05, .01):
            idx = res.iloc[:, 3] < threshold
            res.loc[idx, coef_col] = res.loc[idx, coef_col] + '*'
    # Stack Coefs and Std.Errors
    res = res.iloc[:, :2]
    rsquared = getattr(result, 'rsquared', np.nan)
    rsquared_adj = getattr(result, 'rsquared_adj', np.nan)
    r_result = pd.DataFrame({'Basic': [rsquared], 'Adj.': [rsquared_adj]},
                            index=['R-squared'])
    if not np.all(np.isnan(np.asarray(r_result))):
        for col in r_result:
            r_result[col] = r_result[col].apply(lambda x: float_format % x)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # operation it delegated to and exists in every pandas version.
        frames = [pd.DataFrame(res), r_result]
        try:
            res = pd.concat(frames, sort=True)
        except TypeError:
            # TODO: Remove when min pandas >= 0.23 (no ``sort`` keyword)
            res = pd.concat(frames)
    res = res.stack()
    res = pd.DataFrame(res)
    res.columns = [str(result.model.endog_names)]
    return res
def _col_info(result, info_dict=None):
    '''Stack model info in a column

    Each non-dict value of ``info_dict`` is called with ``result``; a call
    that raises AttributeError contributes an empty string instead. Values
    that are themselves dicts are skipped.

    Returns
    -------
    DataFrame
        A single column named after ``result.model.endog_names``, indexed by
        the keys of ``info_dict`` that were evaluated.
    '''
    entries = {} if info_dict is None else info_dict

    labels = []
    values = []
    for label, getter in entries.items():
        if isinstance(getter, dict):
            # this is a specific model info_dict, but not for this result...
            continue
        try:
            value = getter(result)
        except AttributeError:
            value = ''
        values.append(value)
        labels.append(label)

    return pd.DataFrame({str(result.model.endog_names): values},
                        index=labels)
def _make_unique(list_of_names):
    '''Return names that are safe to use as distinct column labels.

    If the names are already unique the input object is returned unchanged.
    Otherwise every name gets a suffix of a space plus one 'I' per
    occurrence so far (``'y I'``, ``'y II'``, ...), because pandas does not
    like it if multiple columns have the same names.
    '''
    if len(set(list_of_names)) == len(list_of_names):
        return list_of_names

    occurrences = {}
    unique = []
    for name in list_of_names:
        occurrences[name] = occurrences.get(name, 0) + 1
        unique.append(name + " " + "I" * occurrences[name])
    return unique
def summary_col(results, float_format='%.4f', model_names=(), stars=False,
                info_dict=None, regressor_order=(), drop_omitted=False):
    """
    Summarize multiple results instances side-by-side (coefs and SEs)

    Parameters
    ----------
    results : statsmodels results instance or list of result instances
    float_format : str, optional
        float format for coefficients and standard errors
        Default : '%.4f'
    model_names : list[str], optional
        Must have same length as the number of results. If the names are not
        unique, a roman number will be appended to all model names
    stars : bool
        print significance stars
    info_dict : dict
        dict of functions to be applied to results instances to retrieve
        model info. To use specific information for different models, add a
        (nested) info_dict with model name as the key.
        Example: `info_dict = {"N":lambda x:(x.nobs), "R2": ..., "OLS":{
        "R2":...}}` would only show `R2` for OLS regression models, but
        additionally `N` for all other results.
        Default : None (use the info_dict specified in
        result.default_model_infos, if this property exists)
    regressor_order : list[str] or tuple[str], optional
        names of the regressors in the desired order. All regressors
        not specified will be appended to the end of the list.
    drop_omitted : bool, optional
        Controls regressors that are not specified in regressor_order. If
        False, regressors not specified will be appended to end of the list.
        If True, only regressors in regressor_order will be included.

    Returns
    -------
    smry : Summary
        Summary holding one table with a column per model, plus notes on
        standard errors and (if requested) significance stars.
    """

    if not isinstance(results, list):
        results = [results]

    def merg(x, y):
        # Outer join on the index so every row of either side is kept.
        return x.merge(y, how='outer', right_index=True,
                       left_index=True)

    cols = [_col_params(x, stars=stars, float_format=float_format) for x in
            results]

    # Unique column names (pandas has problems merging otherwise)
    if model_names:
        colnames = _make_unique(model_names)
    else:
        colnames = _make_unique([x.columns[0] for x in cols])
    for i, col in enumerate(cols):
        col.columns = [colnames[i]]

    summ = reduce(merg, cols)

    if regressor_order:
        # The default is a tuple, so tuples must work too: a tuple cannot be
        # concatenated with a list, and ``.loc`` would read a tuple as a
        # (row, column) key instead of a list of row labels.
        regressor_order = list(regressor_order)
        excluded = regressor_order + ['']
        varnames = summ.index.get_level_values(0).tolist()
        ordered = [x for x in regressor_order if x in varnames]
        unordered = [x for x in varnames if x not in excluded]
        order = ordered + list(np.unique(unordered))

        def f(idx):
            # Two rows per regressor: the coefficient and its std. error.
            return sum([[x + 'coef', x + 'stde'] for x in idx], [])

        summ.index = f(pd.unique(varnames))
        summ = summ.reindex(f(order))
        # Strip the 4-character 'coef'/'stde' suffix again.
        summ.index = [x[:-4] for x in summ.index]
        if drop_omitted:
            summ = summ.loc[regressor_order]

    # Blank the label on every second row (the std. error rows).
    idx = pd.Series(lrange(summ.shape[0])) % 2 == 1
    summ.index = np.where(idx, '', summ.index.get_level_values(0))

    # add infos about the models.
    if info_dict:
        cols = [_col_info(x, info_dict.get(x.model.__class__.__name__,
                                           info_dict)) for x in results]
    else:
        cols = [_col_info(x, getattr(x, "default_model_infos", None)) for x in
                results]
    # use unique column names, otherwise the merge will not succeed
    for df, name in zip(cols, _make_unique([df.columns[0] for df in cols])):
        df.columns = [name]

    info = reduce(merg, cols)
    dat = pd.DataFrame(np.vstack([summ, info]))  # pd.concat better, but error
    dat.columns = summ.columns
    dat.index = pd.Index(summ.index.tolist() + info.index.tolist())
    summ = dat

    summ = summ.fillna('')

    smry = Summary()
    smry._merge_latex = True
    smry.add_df(summ, header=True, align='l')
    smry.add_text('Standard errors in parentheses.')
    if stars:
        smry.add_text('* p<.1, ** p<.05, ***p<.01')

    return smry
def _formatter(element, float_format='%.4f'):
    '''Render ``element`` as a stripped string.

    ``float_format`` is applied with the ``%`` operator; if that raises
    ValueError or TypeError the element is converted with str() instead.
    Leading and trailing whitespace is removed from the result.
    '''
    try:
        text = float_format % element
    except (ValueError, TypeError):
        text = str(element)
    return text.strip()
def _df_to_simpletable(df, align='r', float_format="%.4f", header=True,
                       index=True, table_dec_above='-', table_dec_below=None,
                       header_dec_below='-', pad_col=0, pad_index=0):
    '''Convert a DataFrame into a SimpleTable with the given layout options.

    Every cell of a copy of ``df`` is turned into a string with
    ``_formatter``. ``pad_index`` spaces are appended to the row labels, or
    to the first data column when ``index`` is False. ``pad_col`` extra
    spaces are added to the text column separator.
    '''
    cells = df.copy().applymap(lambda cell: _formatter(cell, float_format))

    headers = None
    if header:
        headers = [str(label) for label in cells.columns.tolist()]

    if index:
        stubs = [str(label) + int(pad_index) * ' '
                 for label in cells.index.tolist()]
    else:
        # No row labels: the first data column takes the padding instead.
        cells.iloc[:, 0] = [str(value) + int(pad_index) * ' '
                            for value in cells.iloc[:, 0]]
        stubs = None

    table = SimpleTable(np.array(cells), headers=headers, stubs=stubs,
                        ltx_fmt=fmt_latex, txt_fmt=fmt_txt)
    table.output_formats['latex']['data_aligns'] = align
    txt_options = (
        ('data_aligns', align),
        ('table_dec_above', table_dec_above),
        ('table_dec_below', table_dec_below),
        ('header_dec_below', header_dec_below),
        ('colsep', ' ' * int(pad_col + 1)),
    )
    for option, value in txt_options:
        table.output_formats['txt'][option] = value
    return table
def _simple_tables(tables, settings, pad_col=None, pad_index=None):
    '''Convert every stored table into a SimpleTable.

    Parameters
    ----------
    tables : list of DataFrame
    settings : list of dict
        One dict per table with the keys 'index', 'header' and 'align',
        and optionally 'float_format' (default '%.4f').
    pad_col : list[int], optional
        Extra column-separator padding per table; defaults to no padding
    pad_index : list[int], optional
        Extra first-column padding per table; defaults to no padding

    Returns
    -------
    simple_tables : list of SimpleTable
    '''
    if pad_col is None:
        pad_col = [0] * len(tables)
    if pad_index is None:
        pad_index = [0] * len(tables)

    simple_tables = []
    for i, table in enumerate(tables):
        options = settings[i]
        # Use this table's own float format. Previously the format of the
        # first table was applied to every table, so a format given for a
        # later table was silently ignored.
        simple_tables.append(_df_to_simpletable(
            table,
            align=options['align'],
            float_format=options.get('float_format', '%.4f'),
            header=options['header'],
            index=options['index'],
            pad_col=pad_col[i],
            pad_index=pad_index[i]))
    return simple_tables