Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

#!/usr/bin/env python 

# -*- coding: UTF-8 -*- 

# 

# Copyright 2014 European Commission (JRC); 

# Licensed under the EUPL (the 'Licence'); 

# You may not use this work except in compliance with the Licence. 

# You may obtain a copy of the Licence at: http://ec.europa.eu/idabc/eupl 

""" 

Implements the *xlrd* backend of *xlasso* that reads in-file Excel-spreadsheets. 

""" 

 

import datetime 

from distutils.version import LooseVersion 

import logging 

from os import path 

from pandalone.xlasso._capture import ABCSheet, SheetId 

from pandalone.xlasso._parse import Coords 

 

from future.moves.urllib import request 

from future.moves.urllib.parse import urlsplit 

from xlrd import (xldate, XL_CELL_DATE, XL_CELL_EMPTY, XL_CELL_TEXT, 

                  XL_CELL_BLANK, XL_CELL_ERROR, XL_CELL_BOOLEAN, XL_CELL_NUMBER) 

import xlrd 

 

import numpy as np 

 

from .. import utils 

 

 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Flag telling whether the installed *xlrd* is at least version 0.9.3;
# `_parse_cell()` consults it to pick which `xldate` API decodes date-cells.
# noinspection PyUnresolvedReferences
_xlrd_0_9_3 = LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3")

 

 

def _parse_cell(xcell, epoch1904=False):
    """
    Convert an *xlrd* cell into a plain python value, according to its type.

    :param xcell: the excel cell to convert
    :type xcell: xlrd.sheet.Cell

    :param epoch1904:
        Which date system was in force when this file was last saved.
        False => 1900 system (the Excel for Windows default).
        True => 1904 system (the Excel for Macintosh default).
    :type epoch1904: bool

    :return: the converted cell value
    :rtype:
        int, float, datetime.datetime, bool, None, str, datetime.time,
        float('nan')
    :raises ValueError: if the cell's type is none of the handled ones


    Examples::

        >>> import xlrd
        >>> from xlrd.sheet import Cell
        >>> _parse_cell(Cell(xlrd.XL_CELL_NUMBER, 1.2))
        1.2

        >>> _parse_cell(Cell(xlrd.XL_CELL_DATE, 1.2))
        datetime.datetime(1900, 1, 1, 4, 48)

        >>> _parse_cell(Cell(xlrd.XL_CELL_TEXT, 'hi'))
        'hi'
    """
    kind = xcell.ctype
    raw = xcell.value

    if kind == XL_CELL_NUMBER:
        # GH5394 - Excel 'numbers' are always floats;
        # returning whole numbers as ints is a minimal perf hit
        # and less surprising.
        whole = int(raw)
        return whole if whole == raw else raw

    if kind in (XL_CELL_EMPTY, XL_CELL_BLANK):
        # `read_rect()` filters such cells out through the states-matrix,
        # so its loop never gets here.
        return None

    if kind == XL_CELL_TEXT:
        return raw

    if kind == XL_CELL_BOOLEAN:
        return bool(raw)

    if kind == XL_CELL_DATE:  # modified from Pandas library
        if _xlrd_0_9_3:
            # Use the newer xlrd datetime handling.
            stamp = xldate.xldate_as_datetime(raw, epoch1904)

            # Excel doesn't distinguish between dates and time, so values
            # falling on the epoch day are treated as times only.
            # Also, Excel supports both the 1900 and the 1904 epochs.
            epoch_day = (1904, 1, 1) if epoch1904 else (1899, 12, 31)
            if stamp.timetuple()[0:3] == epoch_day:
                return datetime.time(stamp.hour, stamp.minute,
                                     stamp.second, stamp.microsecond)
            return stamp

        # Use the xlrd <= 0.9.2 date handling.
        fields = xldate.xldate_as_tuple(xcell.value, epoch1904)
        if fields[0] < datetime.MINYEAR:
            # No valid year: keep only the time fields.
            return datetime.time(*fields[3:])
        return datetime.datetime(*fields)

    if kind == XL_CELL_ERROR:
        return float('nan')

    raise ValueError('Invalid XL-cell type(%s) for value(%s)!' %
                     (xcell.ctype, xcell.value))

 

 

def _open_sheet_by_name_or_index(xlrd_book, wb_id, sheet_id, opts=None):
    """
    Fetch a sheet out of an opened *xlrd* workbook and wrap it as :class:`XlrdSheet`.

    :param xlrd_book:
            the opened workbook to fetch the sheet from
    :param wb_id:
            handed over to the returned :class:`XlrdSheet` as its `book_fname`
    :param int or str or None sheet_id:
            If `None`, opens 1st sheet.
            If `int`, fetched by index only.
            Otherwise it is tried as sheet-name first, and if that fails,
            as an index after converting it with `int()`.
    :param dict opts:
            does nothing with them
    :return:
            the wrapped sheet, with its date-system taken from
            the workbook's `datemode`
    :raises Exception:
            the error of the by-name lookup, when `sheet_id` is neither
            an existing sheet-name nor convertible to an index
    """
    if sheet_id is None:
        sheet_id = 0
    if isinstance(sheet_id, int):
        xl_sh = xlrd_book.sheet_by_index(sheet_id)
    else:
        try:
            xl_sh = xlrd_book.sheet_by_name(sheet_id)
        except Exception as xl_ex:
            try:
                sheet_id = int(sheet_id)
            except (ValueError, TypeError):
                # Not an index either (`TypeError` for ids that `int()`
                # cannot handle at all): report the by-name failure,
                # which is the informative one.
                raise xl_ex
            else:
                xl_sh = xlrd_book.sheet_by_index(sheet_id)

    # Propagate the workbook's date-system (0: 1900-based, 1: 1904-based),
    # otherwise date-cells of 1904-based workbooks would be decoded
    # against the wrong epoch.
    epoch1904 = bool(getattr(xlrd_book, 'datemode', 0))
    return XlrdSheet(xl_sh, wb_id, epoch1904=epoch1904)

 

 

def open_sheet(wb_url, sheet_id, opts):
    """
    Opens the local or remote `wb_url` *xlrd* workbook wrapped as :class:`XlrdSheet`.

    :param str wb_url:
            A plain file-path or a ``file:`` URL for local workbooks;
            any other URL-scheme is downloaded with `urlopen()`.
    :param int or str or None sheet_id:
            which sheet to open, see `_open_sheet_by_name_or_index()`
    :param dict opts:
            Keys read here:

            - ``'read'``: keyword-args for `xlrd.open_workbook()`;
              its ``'http_opts'`` item (if any) is extracted and given
              as keyword-args to `urlopen()` instead, and its
              ``'on_demand'`` item is dropped for remote workbooks.
            - ``'verbose'``: when true, the default xlrd ``logfile``
              logs at INFO instead of DEBUG level.

            The caller's dictionaries are never modified.
    :return:
            the sheet wrapped as :class:`XlrdSheet`
    """
    assert wb_url, (wb_url, sheet_id, opts)
    if opts is None:
        opts = {}

    # Always work on a private copy (even for an empty 'read' dict),
    # since keys are added and removed below.
    ropts = dict(opts.get('read') or {})

    # `http_opts` are meant for `urlopen()` only; xlrd would reject them
    # as an unexpected keyword-argument.
    http_opts = ropts.pop('http_opts', None) or {}

    if 'logfile' not in ropts:
        level = logging.INFO if opts.get('verbose', None) else logging.DEBUG
        ropts['logfile'] = utils.LoggerWriter(log, level)

    parts = urlsplit(wb_url)
    if not parts.scheme or parts.scheme == 'file':
        fpath = path.abspath(path.expanduser(path.expandvars(parts.path)))
        book = xlrd.open_workbook(fpath, **ropts)
    else:
        ropts.pop('on_demand', None)
        with request.urlopen(wb_url, **http_opts) as response:
            # xlrd needs the actual bytes, not the response object.
            content = response.read()
        # With `file_contents` given, the filename serves only for messages.
        book = xlrd.open_workbook(wb_url, file_contents=content, **ropts)

    return _open_sheet_by_name_or_index(book, wb_url, sheet_id, opts)

 

 

class XlrdSheet(ABCSheet):
    """
    Adapts an *xlrd* sheet to the sheet-interface required by xlasso library.
    """

    def __init__(self, sheet, book_fname, epoch1904=False):
        """
        :param xlrd.sheet.Sheet sheet:
                the sheet to wrap
        :param book_fname:
                the workbook's identifier, reported by `get_sheet_ids()`
        :param bool epoch1904:
                the date-system given to `_parse_cell()` when reading cells
        :raises ValueError:
                if `sheet` is not an *xlrd* sheet
        """
        is_xlrd_sheet = isinstance(sheet, xlrd.sheet.Sheet)
        if not is_xlrd_sheet:
            raise ValueError("Invalid xlrd-sheet({})".format(sheet))
        self.book_fname = book_fname
        self._epoch1904 = epoch1904
        self._sheet = sheet

    def _close(self):
        """Unloads the wrapped sheet from its workbook."""
        xl_sheet = self._sheet
        xl_sheet.book.unload_sheet(xl_sheet.name)

    def _close_all(self):
        """Releases the resources of the workbook owning the wrapped sheet."""
        book = self._sheet.book
        book.release_resources()

    def get_sheet_ids(self):
        """
        :return:
                a `SheetId` made of the workbook's identifier
                (`book_fname`, or the book's `filestr` when that is falsy)
                and the ``[name, number]`` of the wrapped sheet
        """
        xl_sheet = self._sheet
        book_id = self.book_fname or xl_sheet.book.filestr
        sheet_ids = [xl_sheet.name, xl_sheet.number]
        return SheetId(book_id, sheet_ids)

    def open_sibling_sheet(self, sheet_id, opts=None):
        """Gets by-index only if `sheet_id` is `int`, otherwise tries both by name and index."""
        book = self._sheet.book
        return _open_sheet_by_name_or_index(book, self.book_fname,
                                            sheet_id, opts)

    def _read_states_matrix(self):
        """See super-method. """
        cell_types = np.asarray(self._sheet._cell_types)
        # A cell is "full" when its type is neither EMPTY nor BLANK.
        not_empty = cell_types != XL_CELL_EMPTY
        not_blank = cell_types != XL_CELL_BLANK
        return not_empty & not_blank

    def _read_margin_coords(self):
        """
        :return:
                a pair of `None` and the coordinates of the sheet's
                last row and last column (zero-based)
        """
        xl_sheet = self._sheet
        last_cell = Coords(xl_sheet.nrows - 1, xl_sheet.ncols - 1)
        return None, last_cell

    def read_rect(self, st, nd):
        """See super-method. """
        xl_sheet = self._sheet

        # Without an end-coordinate, just the single start-cell is read.
        if nd is None:
            return _parse_cell(xl_sheet.cell(*st), self._epoch1904)

        # Make the end-coordinates exclusive, as `range()` expects.
        bounds = np.array([st, nd]) + [[0, 0], [1, 1]]
        full_cells = self.get_states_matrix()
        row_indices = range(*bounds[:, 0])
        col_indices = range(*bounds[:, 1])

        table = []
        for r in row_indices:
            values = []
            table.append(values)
            for c in col_indices:
                # Empty cells and coordinates falling outside
                # the sheet's data both read as `None`.
                value = None
                try:
                    if full_cells[r, c]:
                        value = _parse_cell(xl_sheet.cell(r, c),
                                            self._epoch1904)
                except IndexError:
                    pass
                values.append(value)

        return table