Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/iolib/foreign.py : 15%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Input/Output tools for working with binary data.
4The Stata input tools were originally written by Joe Presbrey as part of PyDTA.
6You can find more information here http://presbrey.mit.edu/PyDTA
8See Also
9--------
10numpy.lib.io
11"""
12import warnings
14from statsmodels.compat.python import (lzip, lmap, lrange,
15 lfilter, asbytes, asstr)
16from struct import unpack, calcsize, pack
17from struct import error as struct_error
18import datetime
19import sys
21import numpy as np
22import statsmodels.tools.data as data_util
23from pandas import isnull
24from pandas.io.stata import StataMissingValue
25from statsmodels.iolib.openfile import get_file_obj
# Stata "%t" display formats that mark a column as a date/time (SIF) value.
_date_formats = ["%tc", "%tC", "%td", "%tw", "%tm", "%tq", "%th", "%ty"]
29def _datetime_to_stata_elapsed(date, fmt):
30 """
31 Convert from datetime to SIF. http://www.stata.com/help.cgi?datetime
33 Parameters
34 ----------
35 date : datetime.datetime
36 The date to convert to the Stata Internal Format given by fmt
37 fmt : str
38 The format to convert to. Can be, tc, td, tw, tm, tq, th, ty
39 """
40 if not isinstance(date, datetime.datetime):
41 raise ValueError("date should be datetime.datetime format")
42 stata_epoch = datetime.datetime(1960, 1, 1)
43 if fmt in ["%tc", "tc"]:
44 delta = date - stata_epoch
45 return (delta.days * 86400000 + delta.seconds*1000 +
46 delta.microseconds/1000)
47 elif fmt in ["%tC", "tC"]:
48 from warnings import warn
49 warn("Stata Internal Format tC not supported.", UserWarning)
50 return date
51 elif fmt in ["%td", "td"]:
52 return (date- stata_epoch).days
53 elif fmt in ["%tw", "tw"]:
54 return (52*(date.year-stata_epoch.year) +
55 (date - datetime.datetime(date.year, 1, 1)).days / 7)
56 elif fmt in ["%tm", "tm"]:
57 return (12 * (date.year - stata_epoch.year) + date.month - 1)
58 elif fmt in ["%tq", "tq"]:
59 return 4*(date.year-stata_epoch.year) + int((date.month - 1)/3)
60 elif fmt in ["%th", "th"]:
61 return 2 * (date.year - stata_epoch.year) + int(date.month > 6)
62 elif fmt in ["%ty", "ty"]:
63 return date.year
64 else:
65 raise ValueError("fmt %s not understood" % fmt)
67def _stata_elapsed_date_to_datetime(date, fmt):
68 """
69 Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime
71 Parameters
72 ----------
73 date : int
74 The Stata Internal Format date to convert to datetime according to fmt
75 fmt : str
76 The format to convert to. Can be, tc, td, tw, tm, tq, th, ty
78 Examples
79 --------
80 >>> _stata_elapsed_date_to_datetime(52, "%tw") datetime.datetime(1961, 1, 1, 0, 0)
82 Notes
83 -----
84 datetime/c - tc
85 milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day
86 datetime/C - tC - NOT IMPLEMENTED
87 milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds
88 date - td
89 days since 01jan1960 (01jan1960 = 0)
90 weekly date - tw
91 weeks since 1960w1
92 This assumes 52 weeks in a year, then adds 7 * remainder of the weeks.
93 The datetime value is the start of the week in terms of days in the
94 year, not ISO calendar weeks.
95 monthly date - tm
96 months since 1960m1
97 quarterly date - tq
98 quarters since 1960q1
99 half-yearly date - th
100 half-years since 1960h1 yearly
101 date - ty
102 years since 0000
104 If you do not have pandas with datetime support, then you cannot do
105 milliseconds accurately.
106 """
107 #NOTE: we could run into overflow / loss of precision situations here
108 # casting to int, but I'm not sure what to do. datetime will not deal with
109 # numpy types and numpy datetime is not mature enough / we cannot rely on
110 # pandas version > 0.7.1
111 #TODO: IIRC relative delta does not play well with np.datetime?
112 date = int(date)
113 stata_epoch = datetime.datetime(1960, 1, 1)
114 if fmt in ["%tc", "tc"]:
115 from dateutil.relativedelta import relativedelta
116 return stata_epoch + relativedelta(microseconds=date*1000)
117 elif fmt in ["%tC", "tC"]:
118 from warnings import warn
119 warn("Encountered %tC format. Leaving in Stata Internal Format.",
120 UserWarning)
121 return date
122 elif fmt in ["%td", "td"]:
123 return stata_epoch + datetime.timedelta(int(date))
124 elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week
125 year = datetime.datetime(stata_epoch.year + date // 52, 1, 1)
126 day_delta = (date % 52 ) * 7
127 return year + datetime.timedelta(int(day_delta))
128 elif fmt in ["%tm", "tm"]:
129 year = stata_epoch.year + date // 12
130 month_delta = (date % 12 ) + 1
131 return datetime.datetime(year, month_delta, 1)
132 elif fmt in ["%tq", "tq"]:
133 year = stata_epoch.year + date // 4
134 month_delta = (date % 4) * 3 + 1
135 return datetime.datetime(year, month_delta, 1)
136 elif fmt in ["%th", "th"]:
137 year = stata_epoch.year + date // 2
138 month_delta = (date % 2) * 6 + 1
139 return datetime.datetime(year, month_delta, 1)
140 elif fmt in ["%ty", "ty"]:
141 if date > 0:
142 return datetime.datetime(date, 1, 1)
143 else: # do not do negative years bc cannot mix dtypes in column
144 raise ValueError("Year 0 and before not implemented")
145 else:
146 raise ValueError("Date fmt %s not understood" % fmt)
149### Helper classes for StataReader ###
151class _StataVariable(object):
152 """
153 A dataset variable. Not intended for public use.
155 Parameters
156 ----------
157 variable_data
159 Attributes
160 ----------
161 format : str
162 Stata variable format. See notes for more information.
163 index : int
164 Zero-index column index of variable.
165 label : str
166 Data Label
167 name : str
168 Variable name
169 type : str
170 Stata data type. See notes for more information.
171 value_format : str
172 Value format.
174 Notes
175 -----
176 More information: http://www.stata.com/help.cgi?format
177 """
178 def __init__(self, variable_data):
179 self._data = variable_data
181 def __int__(self):
182 """the variable's index within an observation"""
183 return self.index
185 def __str__(self):
186 """the name of the variable"""
187 return self.name
189 @property
190 def index(self):
191 """the variable's index within an observation"""
192 return self._data[0]
194 @property
195 def type(self):
196 """
197 The data type of variable
199 Possible types are:
200 {1..244:string, b:byte, h:int, l:long, f:float, d:double)
201 """
202 return self._data[1]
204 @property
205 def name(self):
206 """the name of the variable"""
207 return self._data[2]
209 @property
210 def format(self):
211 """the variable's Stata format"""
212 return self._data[4]
214 @property
215 def value_format(self):
216 """the variable's value format"""
217 return self._data[5]
219 @property
220 def label(self):
221 """The variable's label"""
222 return self._data[6]
class StataReader(object):
    """
    Stata .dta file reader.

    Provides methods to return the metadata of a Stata .dta file and
    a generator for the data itself.

    Parameters
    ----------
    file : file-like
        A file-like object representing a Stata .dta file.
    missing_values : bool
        If missing_values is True, parse missing_values and return a
        Missing Values object instead of None.
    encoding : str, optional
        Used for Python 3 only. Encoding to use when reading the .dta file.
        Defaults to `locale.getpreferredencoding`

    See Also
    --------
    statsmodels.iolib.foreign.genfromdta
    pandas.read_stata
    pandas.io.stata.StataReader

    Notes
    -----
    This is known only to work on file formats 113 (Stata 8/9), 114
    (Stata 10/11), and 115 (Stata 12). Needs to be tested on older versions.
    Known not to work on format 104, 108. If you have the documentation for
    older formats, please contact the developers.

    For more information about the .dta format see
    http://www.stata.com/help.cgi?dta
    http://www.stata.com/help.cgi?dta_113
    """

    # Parsed-state defaults, populated by _parse_header().
    # NOTE(review): these are class-level attributes; _header is a mutable
    # dict, so it is shared across instances until _parse_header rebinds
    # keys in it — confirm this is intentional before reusing readers.
    _header = {}
    _data_location = 0          # byte offset where the observation data begins
    _col_sizes = ()             # per-column record sizes in bytes (lazy, see _col_size)
    _has_string_data = False    # True if any column holds a str type
    _missing_values = False     # if True, yield StataMissingValue instead of None
    #type code
    #--------------------
    #str1        1 = 0x01
    #str2        2 = 0x02
    #...
    #str244   244 = 0xf4
    #byte     251 = 0xfb (sic)
    #int      252 = 0xfc
    #long     253 = 0xfd
    #float    254 = 0xfe
    #double   255 = 0xff
    #--------------------
    #NOTE: the byte type seems to be reserved for categorical variables
    # with a label, but the underlying variable is -127 to 100
    # we're going to drop the label and cast to int
    # Maps the Stata type code to the numpy dtype used for the column.
    DTYPE_MAP = dict(lzip(lrange(1,245), ['a' + str(i) for i in range(1,245)]) + \
                    [(251, np.int16),(252, np.int32),(253, int),
                     (254, np.float32), (255, np.float64)])
    # Maps the Stata type code to a struct format char ('b','h','l','f','d');
    # codes 1-244 (strings) map to their own int, used as the byte length.
    TYPE_MAP = lrange(251)+list('bhlfd')
    #NOTE: technically, some of these are wrong. there are more numbers
    # that can be represented. it's the 27 ABOVE and BELOW the max listed
    # numeric data type in [U] 12.2.2 of the 11.2 manual
    # (nmin, nmax) of the valid (non-missing) range per numeric type.
    MISSING_VALUES = { 'b': (-127,100), 'h': (-32767, 32740), 'l':
            (-2147483647, 2147483620), 'f': (-1.701e+38, +1.701e+38), 'd':
            (-1.798e+308, +8.988e+307) }

    def __init__(self, fname, missing_values=False, encoding=None):
        warnings.warn(
            "StataReader is deprecated as of 0.10.0 and will be removed in a "
            "future version. Use pandas.read_stata or "
            "pandas.io.stata.StataReader instead.",
            FutureWarning)

        if encoding is None:
            # Python 3: fall back to the platform's preferred text encoding.
            import locale
            self._encoding = locale.getpreferredencoding()
        else:
            self._encoding = encoding
        self._missing_values = missing_values
        # Reads all header/descriptor metadata eagerly; data rows stay on disk.
        self._parse_header(fname)

    def file_headers(self):
        """
        Returns all .dta file headers.

        out: dict
            Has keys typlist, data_label, lbllist, varlist, nvar, filetype,
            ds_format, nobs, fmtlist, vlblist, time_stamp, srtlist, byteorder
        """
        return self._header

    def file_format(self):
        """
        Returns the file format.

        Returns
        -------
        out : int

        Notes
        -----
        Format 113: Stata 8/9
        Format 114: Stata 10/11
        Format 115: Stata 12
        """
        return self._header['ds_format']

    def file_label(self):
        """
        Returns the dataset's label.

        Returns
        -------
        out: str
        """
        return self._header['data_label']

    def file_timestamp(self):
        """
        Returns the date and time Stata recorded on last file save.

        Returns
        -------
        out : str
        """
        return self._header['time_stamp']

    def variables(self):
        """
        Returns a list of the dataset's StataVariables objects.
        """
        # Zips the parallel header lists into one 7-tuple per variable,
        # matching the slot layout _StataVariable expects.
        return lmap(_StataVariable, zip(lrange(self._header['nvar']),
            self._header['typlist'], self._header['varlist'],
            self._header['srtlist'],
            self._header['fmtlist'], self._header['lbllist'],
            self._header['vlblist']))

    def dataset(self, as_dict=False):
        """
        Returns a Python generator object for iterating over the dataset.

        Parameters
        ----------
        as_dict : bool, optional
            If as_dict is True, yield each row of observations as a dict.
            If False, yields each row of observations as a list.

        Returns
        -------
        Generator object for iterating over the dataset. Yields each row of
        observations as a list by default.

        Notes
        -----
        If missing_values is True during instantiation of StataReader then
        observations with StataMissingValue(s) are not filtered and should
        be handled by your application.
        """
        # Rewind to the start of the data section; ignore failures so
        # non-seekable streams can still be read once front-to-back.
        try:
            self._file.seek(self._data_location)
        except Exception:
            pass

        if as_dict:
            vars = lmap(str, self.variables())
            for i in range(len(self)):
                yield dict(zip(vars, self._next()))
        else:
            for i in range(self._header['nobs']):
                yield self._next()

    ### Python special methods

    def __len__(self):
        """
        Return the number of observations in the dataset.

        This value is taken directly from the header and includes observations
        with missing values.
        """
        return self._header['nobs']

    def __getitem__(self, k):
        """
        Seek to an observation indexed k in the file and return it, ordered
        by Stata's output to the .dta file.

        k is zero-indexed.  Prefer using R.data() for performance.
        """
        if not (isinstance(k, int)) or k < 0 or k > len(self)-1:
            raise IndexError(k)
        # Records are fixed-width, so row k starts at a computable offset.
        loc = self._data_location + sum(self._col_size()) * k
        if self._file.tell() != loc:
            self._file.seek(loc)
        return self._next()

    # Private methods

    def _null_terminate(self, s, encoding):
        """Strip at/after the first null byte and decode `s` to text."""
        null_byte = asbytes('\x00')
        # NOTE(review): the index is taken on the *original* s after
        # lstrip'ing leading nulls, so a leading-null string is truncated
        # oddly — presumably fields never start with a null; verify.
        try:
            s = s.lstrip(null_byte)[:s.index(null_byte)]
        except Exception:
            pass
        return s.decode(encoding)

    def _parse_header(self, file_object):
        """Read and store the .dta header, descriptors and expansion fields,
        leaving the file positioned at (and _data_location pointing to) the
        first observation."""
        self._file = file_object
        encoding = self._encoding

        # parse headers
        self._header['ds_format'] = unpack('b', self._file.read(1))[0]

        if self._header['ds_format'] not in [113, 114, 115]:
            raise ValueError("Only file formats >= 113 (Stata >= 9)"
                             " are supported. Got format %s. Please report "
                             "if you think this error is incorrect." %
                             self._header['ds_format'])
        # 0x1 means big-endian; anything else little-endian.
        byteorder = self._header['byteorder'] = unpack('b',
                self._file.read(1))[0]==0x1 and '>' or '<'
        self._header['filetype'] = unpack('b', self._file.read(1))[0]
        self._file.read(1)  # unused padding byte
        nvar = self._header['nvar'] = unpack(byteorder+'h',
                self._file.read(2))[0]
        self._header['nobs'] = unpack(byteorder+'i', self._file.read(4))[0]
        self._header['data_label'] = self._null_terminate(self._file.read(81),
                encoding)
        self._header['time_stamp'] = self._null_terminate(self._file.read(18),
                encoding)

        # parse descriptors
        typlist =[ord(self._file.read(1)) for i in range(nvar)]
        self._header['typlist'] = [self.TYPE_MAP[typ] for typ in typlist]
        self._header['dtyplist'] = [self.DTYPE_MAP[typ] for typ in typlist]
        self._header['varlist'] = [self._null_terminate(self._file.read(33),
                encoding) for i in range(nvar)]
        # srtlist has nvar+1 entries on disk; the trailing terminator is dropped.
        self._header['srtlist'] = unpack(byteorder+('h'*(nvar+1)),
                self._file.read(2*(nvar+1)))[:-1]
        # Format strings grew from 12 to 49 bytes after format 113.
        if self._header['ds_format'] <= 113:
            self._header['fmtlist'] = \
                    [self._null_terminate(self._file.read(12), encoding) \
                    for i in range(nvar)]
        else:
            self._header['fmtlist'] = \
                    [self._null_terminate(self._file.read(49), encoding) \
                    for i in range(nvar)]
        self._header['lbllist'] = [self._null_terminate(self._file.read(33),
                encoding) for i in range(nvar)]
        self._header['vlblist'] = [self._null_terminate(self._file.read(81),
                encoding) for i in range(nvar)]

        # ignore expansion fields
        # When reading, read five bytes; the last four bytes now tell you the
        # size of the next read, which you discard.  You then continue like
        # this until you read 5 bytes of zeros.
        while True:
            data_type = unpack(byteorder+'b', self._file.read(1))[0]
            data_len = unpack(byteorder+'i', self._file.read(4))[0]
            if data_type == 0:
                break
            self._file.read(data_len)

        # other state vars
        self._data_location = self._file.tell()
        # String columns keep their int type code; any int in typlist means
        # at least one string column exists.
        self._has_string_data = len(lfilter(lambda x: isinstance(x, int),
            self._header['typlist'])) > 0
        self._col_size()  # prime the per-column size cache

    def _calcsize(self, fmt):
        """Byte width of one field: ints are string lengths, chars go
        through struct.calcsize with the file's byte order."""
        return isinstance(fmt, int) and fmt or \
                calcsize(self._header['byteorder']+fmt)

    def _col_size(self, k = None):
        """Calculate size of a data record."""
        if len(self._col_sizes) == 0:
            self._col_sizes = lmap(lambda x: self._calcsize(x),
                    self._header['typlist'])
        if k is None:
            return self._col_sizes
        else:
            return self._col_sizes[k]

    def _unpack(self, fmt, byt):
        """Unpack one numeric field, mapping out-of-range values to a
        StataMissingValue or None depending on self._missing_values."""
        d = unpack(self._header['byteorder']+fmt, byt)[0]
        if fmt[-1] in self.MISSING_VALUES:
            nmin, nmax = self.MISSING_VALUES[fmt[-1]]
            if d < nmin or d > nmax:
                if self._missing_values:
                    return StataMissingValue(nmax, d)
                else:
                    return None
        return d

    def _next(self):
        """Read and decode the next observation at the current file offset."""
        typlist = self._header['typlist']
        if self._has_string_data:
            # Mixed row: int type codes are fixed-length strings, the rest
            # are numeric fields.
            data = [None]*self._header['nvar']
            for i in range(len(data)):
                if isinstance(typlist[i], int):
                    data[i] = self._null_terminate(self._file.read(typlist[i]),
                                self._encoding)
                else:
                    data[i] = self._unpack(typlist[i],
                            self._file.read(self._col_size(i)))
            return data
        else:
            return lmap(lambda i: self._unpack(typlist[i],
                self._file.read(self._col_size(i))),
                lrange(self._header['nvar']))
539def _set_endianness(endianness):
540 if endianness.lower() in ["<", "little"]:
541 return "<"
542 elif endianness.lower() in [">", "big"]:
543 return ">"
544 else: # pragma : no cover
545 raise ValueError("Endianness %s not understood" % endianness)
547def _dtype_to_stata_type(dtype):
548 """
549 Converts dtype types to stata types. Returns the byte of the given ordinal.
550 See TYPE_MAP and comments for an explanation. This is also explained in
551 the dta spec.
552 1 - 244 are strings of this length
553 251 - chr(251) - for int8 and int16, byte
554 252 - chr(252) - for int32, int
555 253 - chr(253) - for int64, long
556 254 - chr(254) - for float32, float
557 255 - chr(255) - double, double
559 If there are dates to convert, then dtype will already have the correct
560 type inserted.
561 """
562 #TODO: expand to handle datetime to integer conversion
563 if dtype.type == np.string_:
564 return chr(dtype.itemsize)
565 elif dtype.type == np.object_:
566 # try to coerce it to the biggest string
567 # not memory efficient, what else could we do?
568 return chr(244)
569 elif dtype == np.float64:
570 return chr(255)
571 elif dtype == np.float32:
572 return chr(254)
573 elif dtype == np.int64:
574 return chr(253)
575 elif dtype == np.int32:
576 return chr(252)
577 elif dtype == np.int8 or dtype == np.int16: # ok to assume bytes?
578 return chr(251)
579 else: # pragma : no cover
580 raise ValueError("Data type %s not currently understood. "
581 "Please report an error to the developers." % dtype)
583def _dtype_to_default_stata_fmt(dtype):
584 """
585 Maps numpy dtype to stata's default format for this type. Not terribly
586 important since users can change this in Stata. Semantics are
588 string -> "%DDs" where DD is the length of the string
589 float64 -> "%10.0g"
590 float32 -> "%9.0g"
591 int64 -> "%9.0g"
592 int32 -> "%9.0g"
593 int16 -> "%9.0g"
594 int8 -> "%8.0g"
595 """
596 #TODO: expand this to handle a default datetime format?
597 if dtype.type == np.string_:
598 return "%" + str(dtype.itemsize) + "s"
599 elif dtype.type == np.object_:
600 return "%244s"
601 elif dtype == np.float64:
602 return "%10.0g"
603 elif dtype == np.float32:
604 return "%9.0g"
605 elif dtype == np.int64:
606 return "%9.0g"
607 elif dtype == np.int32:
608 return "%8.0g"
609 elif dtype == np.int8 or dtype == np.int16: # ok to assume bytes?
610 return "%8.0g"
611 else: # pragma : no cover
612 raise ValueError("Data type %s not currently understood. "
613 "Please report an error to the developers." % dtype)
615def _pad_bytes(name, length):
616 """
617 Takes a char string and pads it wih null bytes until it's length chars
618 """
619 return name + "\x00" * (length - len(name))
621def _default_names(nvar):
622 """
623 Returns default Stata names v1, v2, ... vnvar
624 """
625 return ["v%d" % i for i in range(1,nvar+1)]
627def _convert_datetime_to_stata_type(fmt):
628 """
629 Converts from one of the stata date formats to a type in TYPE_MAP
630 """
631 if fmt in ["tc", "%tc", "td", "%td", "tw", "%tw", "tm", "%tm", "tq",
632 "%tq", "th", "%th", "ty", "%ty"]:
633 return np.float64 # Stata expects doubles for SIFs
634 else:
635 raise ValueError("fmt %s not understood" % fmt)
637def _maybe_convert_to_int_keys(convert_dates, varlist):
638 new_dict = {}
639 for key in convert_dates:
640 if not convert_dates[key].startswith("%"): # make sure proper fmts
641 convert_dates[key] = "%" + convert_dates[key]
642 if key in varlist:
643 new_dict.update({varlist.index(key) : convert_dates[key]})
644 else:
645 if not isinstance(key, int):
646 raise ValueError("convery_dates key is not in varlist "
647 "and is not an int")
648 new_dict.update({key : convert_dates[key]})
649 return new_dict
651_type_converters = {253 : np.long, 252 : int}
class StataWriter(object):
    """
    A class for writing Stata binary dta files from array-like objects

    Parameters
    ----------
    fname : file path or buffer
        Where to save the dta file.
    data : array_like
        Array-like input to save. Pandas objects are also accepted.
    convert_dates : dict
        Dictionary mapping column of datetime types to the stata internal
        format that you want to use for the dates. Options are
        'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a
        number or a name.
    encoding : str
        Default is latin-1. Note that Stata does not support unicode.
    byteorder : str
        Can be ">", "<", "little", or "big". The default is None which uses
        `sys.byteorder`

    Returns
    -------
    writer : StataWriter instance
        The StataWriter instance has a write_file method, which will
        write the file to the given `fname`.

    Examples
    --------
    >>> writer = StataWriter('./data_file.dta', data)
    >>> writer.write_file()

    Or with dates

    >>> writer = StataWriter('./date_data_file.dta', date, {2 : 'tw'})
    >>> writer.write_file()
    """
    #type code
    #--------------------
    #str1        1 = 0x01
    #str2        2 = 0x02
    #...
    #str244   244 = 0xf4
    #byte     251 = 0xfb (sic)
    #int      252 = 0xfc
    #long     253 = 0xfd
    #float    254 = 0xfe
    #double   255 = 0xff
    #--------------------
    #NOTE: the byte type seems to be reserved for categorical variables
    # with a label, but the underlying variable is -127 to 100
    # we're going to drop the label and cast to int
    DTYPE_MAP = dict(lzip(lrange(1,245), ['a' + str(i) for i in range(1,245)]) + \
                    [(251, np.int16),(252, np.int32),(253, int),
                     (254, np.float32), (255, np.float64)])
    TYPE_MAP = lrange(251)+list('bhlfd')
    # Sentinel values written for missing numeric data, keyed by struct char.
    MISSING_VALUES = { 'b': 101,
                       'h': 32741,
                       'l' : 2147483621,
                       'f': 1.7014118346046923e+38,
                       'd': 8.98846567431158e+307}

    def __init__(self, fname, data, convert_dates=None, encoding="latin-1",
                 byteorder=None):
        # FIX: message previously pointed to nonexistent
        # "pandas.io.stata.StatWriter".
        warnings.warn(
            "StataWriter is deprecated as of 0.10.0 and will be removed in a "
            "future version. Use pandas.DataFrame.to_stata or "
            "pandas.io.stata.StataWriter instead.",
            FutureWarning)

        self._convert_dates = convert_dates
        # attach nobs, nvars, data, varlist, typlist
        if data_util._is_using_pandas(data, None):
            self._prepare_pandas(data)

        elif data_util._is_array_like(data, None):
            data = np.asarray(data)
            if data_util._is_structured_ndarray(data):
                self._prepare_structured_array(data)
            else:
                if convert_dates is not None:
                    raise ValueError("Not able to convert dates in a plain"
                                     " ndarray.")
                self._prepare_ndarray(data)

        else: # pragma : no cover
            raise ValueError("Type %s for data not understood" % type(data))

        if byteorder is None:
            byteorder = sys.byteorder
        self._byteorder = _set_endianness(byteorder)
        self._encoding = encoding
        self._file = get_file_obj(fname, 'wb', encoding)

    def _write(self, to_write):
        """
        Helper to call asbytes before writing to file for Python 3 compat.
        """
        self._file.write(asbytes(to_write))

    def _prepare_structured_array(self, data):
        """Set nobs/nvar/varlist/typlist/fmtlist from a structured ndarray,
        converting any date columns' dtypes per self._convert_dates."""
        self.nobs = len(data)
        self.nvar = len(data.dtype)
        self.data = data
        self.datarows = iter(data)
        dtype = data.dtype
        descr = dtype.descr
        if dtype.names is None:
            varlist = _default_names(self.nvar)
        else:
            varlist = dtype.names

        # check for datetime and change the type
        convert_dates = self._convert_dates
        if convert_dates is not None:
            convert_dates = _maybe_convert_to_int_keys(convert_dates,
                                                       varlist)
            self._convert_dates = convert_dates
            for key in convert_dates:
                descr[key] = (
                    descr[key][0],
                    _convert_datetime_to_stata_type(convert_dates[key])
                )
            dtype = np.dtype(descr)

        self.varlist = varlist
        self.typlist = [_dtype_to_stata_type(dtype[i])
                        for i in range(self.nvar)]
        self.fmtlist = [_dtype_to_default_stata_fmt(dtype[i])
                        for i in range(self.nvar)]
        # set the given format for the datetime cols
        if convert_dates is not None:
            for key in convert_dates:
                self.fmtlist[key] = convert_dates[key]

    def _prepare_ndarray(self, data):
        """Set nobs/nvar/varlist/typlist/fmtlist from a plain (homogeneous)
        ndarray; 1-d input is treated as a single column."""
        if data.ndim == 1:
            data = data[:,None]
        self.nobs, self.nvar = data.shape
        self.data = data
        self.datarows = iter(data)
        #TODO: this should be user settable
        dtype = data.dtype
        self.varlist = _default_names(self.nvar)
        self.typlist = [_dtype_to_stata_type(dtype) for i in range(self.nvar)]
        self.fmtlist = [_dtype_to_default_stata_fmt(dtype)
                        for i in range(self.nvar)]

    def _prepare_pandas(self, data):
        """Set nobs/nvar/varlist/typlist/fmtlist from a pandas DataFrame;
        the index is reset and written as an ordinary column."""
        #NOTE: we might need a different API / class for pandas objects so
        # we can set different semantics - handle this with a PR to pandas.io
        class DataFrameRowIter(object):
            def __init__(self, data):
                self.data = data

            def __iter__(self):
                for i, row in data.iterrows():
                    yield row

        data = data.reset_index()
        self.datarows = DataFrameRowIter(data)
        self.nobs, self.nvar = data.shape
        self.data = data
        self.varlist = data.columns.tolist()
        dtypes = data.dtypes
        convert_dates = self._convert_dates
        if convert_dates is not None:
            convert_dates = _maybe_convert_to_int_keys(convert_dates,
                                                       self.varlist)
            self._convert_dates = convert_dates
            for key in convert_dates:
                new_type = _convert_datetime_to_stata_type(convert_dates[key])
                dtypes[key] = np.dtype(new_type)
        self.typlist = [_dtype_to_stata_type(dt) for dt in dtypes]
        self.fmtlist = [_dtype_to_default_stata_fmt(dt) for dt in dtypes]
        # set the given format for the datetime cols
        if convert_dates is not None:
            for key in convert_dates:
                self.fmtlist[key] = convert_dates[key]

    def write_file(self):
        """Write the full .dta file: header, descriptors, labels, data."""
        self._write_header()
        self._write_descriptors()
        self._write_variable_labels()
        # write 5 zeros for expansion fields
        self._write(_pad_bytes("", 5))
        if self._convert_dates is None:
            self._write_data_nodates()
        else:
            self._write_data_dates()
        #self._write_value_labels()

    def _write_header(self, data_label=None, time_stamp=None):
        """Write the fixed-layout .dta header (format 114 / Stata 10-11)."""
        byteorder = self._byteorder
        # ds_format - just use 114
        self._write(pack("b", 114))
        # byteorder
        self._write(byteorder == ">" and "\x01" or "\x02")
        # filetype
        self._write("\x01")
        # unused
        self._write("\x00")
        # number of vars, 2 bytes
        self._write(pack(byteorder+"h", self.nvar)[:2])
        # number of obs, 4 bytes
        self._write(pack(byteorder+"i", self.nobs)[:4])
        # data label 81 bytes, char, null terminated
        if data_label is None:
            self._write(self._null_terminate(_pad_bytes("", 80),
                        self._encoding))
        else:
            self._write(self._null_terminate(_pad_bytes(data_label[:80],
                        80), self._encoding))
        # time stamp, 18 bytes, char, null terminated
        # format dd Mon yyyy hh:mm
        if time_stamp is None:
            time_stamp = datetime.datetime.now()
        # FIX: was isinstance(time_stamp, datetime) — testing against the
        # *module* raises TypeError instead of the intended ValueError.
        elif not isinstance(time_stamp, datetime.datetime):
            raise ValueError("time_stamp should be datetime type")
        self._write(self._null_terminate(
                    time_stamp.strftime("%d %b %Y %H:%M"),
                    self._encoding))

    def _write_descriptors(self, typlist=None, varlist=None, srtlist=None,
                           fmtlist=None, lbllist=None):
        """Write the variable descriptor section (types, names, sort order,
        formats and label names)."""
        nvar = self.nvar
        # typlist, length nvar, format byte array
        for typ in self.typlist:
            self._write(typ)

        # varlist, length 33*nvar, char array, null terminated
        for name in self.varlist:
            name = self._null_terminate(name, self._encoding)
            name = _pad_bytes(asstr(name[:32]), 33)
            self._write(name)

        # srtlist, 2*(nvar+1), int array, encoded by byteorder
        srtlist = _pad_bytes("", (2*(nvar+1)))
        self._write(srtlist)

        # fmtlist, 49*nvar, char array
        for fmt in self.fmtlist:
            self._write(_pad_bytes(fmt, 49))

        # lbllist, 33*nvar, char array
        #NOTE: this is where you could get fancy with pandas categorical type
        for i in range(nvar):
            self._write(_pad_bytes("", 33))

    def _write_variable_labels(self, labels=None):
        """Write the variable label section (81 null bytes per variable)."""
        nvar = self.nvar
        if labels is None:
            for i in range(nvar):
                self._write(_pad_bytes("", 81))

    def _write_data_nodates(self):
        """Write the observation records when no date conversion is needed."""
        data = self.datarows
        byteorder = self._byteorder
        TYPE_MAP = self.TYPE_MAP
        typlist = self.typlist
        for row in data:
            #row = row.squeeze().tolist() # needed for structured arrays
            for i,var in enumerate(row):
                typ = ord(typlist[i])
                if typ <= 244: # we've got a string
                    if len(var) < typ:
                        var = _pad_bytes(asstr(var), len(var) + 1)
                    self._write(var)
                else:
                    try:
                        if typ in _type_converters:
                            var = _type_converters[typ](var)
                        self._write(pack(byteorder+TYPE_MAP[typ], var))
                    except struct_error:
                        # have to be strict about type pack will not do any
                        # kind of casting
                        self._write(pack(byteorder+TYPE_MAP[typ],
                                    _type_converters[typ](var)))

    def _write_data_dates(self):
        """Write the observation records, converting date columns to SIF and
        mapping nulls to the Stata missing-value sentinels."""
        convert_dates = self._convert_dates
        data = self.datarows
        byteorder = self._byteorder
        TYPE_MAP = self.TYPE_MAP
        MISSING_VALUES = self.MISSING_VALUES
        typlist = self.typlist
        for row in data:
            #row = row.squeeze().tolist() # needed for structured arrays
            for i,var in enumerate(row):
                typ = ord(typlist[i])
                #NOTE: If anyone finds this terribly slow, there is
                # a vectorized way to convert dates, see genfromdta for going
                # from int to datetime and reverse it. will copy data though
                if i in convert_dates:
                    var = _datetime_to_stata_elapsed(var, self.fmtlist[i])
                if typ <= 244: # we've got a string
                    if isnull(var):
                        var = "" # missing string
                    if len(var) < typ:
                        var = _pad_bytes(var, len(var) + 1)
                    self._write(var)
                else:
                    if isnull(var): # this only matters for floats
                        var = MISSING_VALUES[typ]
                    self._write(pack(byteorder+TYPE_MAP[typ], var))

    def _null_terminate(self, s, encoding):
        """Append a null byte and encode `s` for writing."""
        null_byte = '\x00'
        s += null_byte
        return s.encode(encoding)
def genfromdta(fname, missing_flt=-999., encoding=None, pandas=False,
               convert_dates=True):
    """
    Returns an ndarray or DataFrame from a Stata .dta file.

    Parameters
    ----------
    fname : str or filehandle
        Stata .dta file.
    missing_flt : numeric
        The numeric value to replace missing values with. Will be used for
        any numeric value.
    encoding : str, optional
        Used for Python 3 only. Encoding to use when reading the .dta file.
        Defaults to `locale.getpreferredencoding`
    pandas : bool
        Optionally return a DataFrame instead of an ndarray
    convert_dates : bool
        If convert_dates is True, then Stata formatted dates will be converted
        to datetime types according to the variable's format.
    """
    warnings.warn(
        "genfromdta is deprecated as of 0.10.0 and will be removed in a "
        "future version. Use pandas.read_stata instead.",
        FutureWarning)

    # Track the handle only when *we* open it, so a caller-supplied
    # filehandle is never closed behind the caller's back.
    our_file = None
    if isinstance(fname, str):
        our_file = open(fname, 'rb')
        fhd = StataReader(our_file, missing_values=False, encoding=encoding)
    elif not hasattr(fname, 'read'):
        raise TypeError("The input should be a string or a filehandle. "\
                        "(got %s instead)" % type(fname))
    else:
        fhd = StataReader(fname, missing_values=False, encoding=encoding)

    #TODO: This needs to handle the byteorder?
    header = fhd.file_headers()
    types = header['dtyplist']
    nobs = header['nobs']
    varnames = header['varlist']
    fmtlist = header['fmtlist']
    dataname = header['data_label']
    labels = header['vlblist'] # labels are thrown away unless DataArray
                               # type is used
    stata_dta = fhd.dataset()

    dt = np.dtype(lzip(varnames, types))
    data = np.zeros((nobs), dtype=dt) # init final array

    for rownum,line in enumerate(stata_dta):
        # does not handle missing value objects, just casts
        # None will only work without missing value object.
        if None in line:
            for i,val in enumerate(line):
                #NOTE: This will only be scalar types because missing strings
                # are empty not None in Stata
                if val is None:
                    line[i] = missing_flt
        data[rownum] = tuple(line)

    # All rows have been materialized; close the handle we opened to avoid
    # leaking it (previously it was left open).
    if our_file is not None:
        our_file.close()

    if pandas:
        from pandas import DataFrame
        data = DataFrame.from_records(data)
        if convert_dates:
            cols = np.where(lmap(lambda x : x in _date_formats, fmtlist))[0]
            for col in cols:
                i = col
                col = data.columns[col]
                data[col] = data[col].apply(_stata_elapsed_date_to_datetime,
                                            args=(fmtlist[i],))
    elif convert_dates:
        # widen date columns to object dtype so they can hold datetimes
        cols = np.where(lmap(lambda x: x in _date_formats, fmtlist))[0]
        dtype = data.dtype.descr
        dtype = [(sub_dtype[0], object) if i in cols else sub_dtype
                 for i, sub_dtype in enumerate(dtype)]
        data = data.astype(dtype) # have to copy
        for col in cols:
            def convert(x):
                return _stata_elapsed_date_to_datetime(x, fmtlist[col])
            data[data.dtype.names[col]] = lmap(convert,
                                               data[data.dtype.names[col]])
    return data
def savetxt(fname, X, names=None, fmt='%.18e', delimiter=' '):
    """
    Save an array to a text file.

    This is just a copy of numpy.savetxt patched to support structured arrays
    or a header of names. Does not include py3 support now in savetxt.

    Parameters
    ----------
    fname : filename or file handle
        If the filename ends in ``.gz``, the file is automatically saved in
        compressed gzip format. `loadtxt` understands gzipped files
        transparently.
    X : array_like
        Data to be saved to a text file.
    names : list, optional
        If given names will be the column header in the text file. If None and
        X is a structured or recarray then the names are taken from
        X.dtype.names.
    fmt : str or sequence of strs
        A single format (%10.5f), a sequence of formats, or a
        multi-format string, e.g. 'Iteration %d -- %10.5f', in which
        case `delimiter` is ignored.
    delimiter : str
        Character separating columns.

    See Also
    --------
    save : Save an array to a binary file in NumPy ``.npy`` format
    savez : Save several arrays into a ``.npz`` compressed archive

    Notes
    -----
    Further explanation of the `fmt` parameter
    (``%[flag]width[.precision]specifier``):

    flags:
        ``-`` : left justify

        ``+`` : Forces to preceed result with + or -.

        ``0`` : Left pad the number with zeros instead of space (see width).

    width:
        Minimum number of characters to be printed. The value is not truncated
        if it has more characters.

    precision:
        - For integer specifiers (eg. ``d,i,o,x``), the minimum number of
          digits.
        - For ``e, E`` and ``f`` specifiers, the number of digits to print
          after the decimal point.
        - For ``g`` and ``G``, the maximum number of significant digits.
        - For ``s``, the maximum number of characters.

    specifiers:
        ``c`` : character

        ``d`` or ``i`` : signed decimal integer

        ``e`` or ``E`` : scientific notation with ``e`` or ``E``.

        ``f`` : decimal floating point

        ``g,G`` : use the shorter of ``e,E`` or ``f``

        ``o`` : signed octal

        ``s`` : str of characters

        ``u`` : unsigned decimal integer

        ``x,X`` : unsigned hexadecimal integer

    This explanation of ``fmt`` is not complete, for an exhaustive
    specification see [1]_.

    References
    ----------
    .. [1] `Format Specification Mini-Language
           <http://docs.python.org/library/string.html#
           format-specification-mini-language>`_, Python Documentation.

    Examples
    --------
    >>> savetxt('test.out', x, delimiter=',')   # x is an array
    >>> savetxt('test.out', (x,y,z))   # x,y,z equal sized 1D arrays
    >>> savetxt('test.out', x, fmt='%1.4e')   # use exponential notation
    """
    with get_file_obj(fname, 'w') as fh:
        X = np.asarray(X)

        # Handle 1-dimensional arrays
        if X.ndim == 1:
            # Common case -- 1d array of numbers
            if X.dtype.names is None:
                X = np.atleast_2d(X).T
                ncol = 1
            # Complex dtype -- each field indicates a separate column
            else:
                ncol = len(X.dtype.descr)
        else:
            ncol = X.shape[1]

        # `fmt` can be a string with multiple insertion points or a list of
        # formats.  E.g. '%10.5f\t%10d' or ('%10.5f', '%10d')
        # NOTE: `row_format` (not `format`) to avoid shadowing the builtin.
        if isinstance(fmt, (list, tuple)):
            if len(fmt) != ncol:
                raise AttributeError('fmt has wrong shape. %s' % str(fmt))
            row_format = delimiter.join(fmt)
        elif isinstance(fmt, str):
            if fmt.count('%') == 1:
                # single format -> repeat it for every column
                row_format = delimiter.join([fmt] * ncol)
            elif fmt.count('%') != ncol:
                raise AttributeError('fmt has wrong number of %% formats. %s'
                                     % fmt)
            else:
                row_format = fmt
        else:
            # Previously an unsupported fmt type fell through and raised a
            # confusing NameError at write time; fail early and clearly.
            raise ValueError('fmt must be a str, list, or tuple, got %s'
                             % type(fmt))

        # handle names
        if names is None and X.dtype.names:
            names = X.dtype.names
        if names is not None:
            fh.write(delimiter.join(names) + '\n')

        for row in X:
            fh.write(row_format % tuple(row) + '\n')