Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

 

""" 

csv.py - read/write/investigate CSV files 

""" 

 

import re 

from _csv import Error, __version__, writer, reader, register_dialect, \ 

unregister_dialect, get_dialect, list_dialects, \ 

field_size_limit, \ 

QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \ 

__doc__ 

from _csv import Dialect as _Dialect 

 

from collections import OrderedDict 

from io import StringIO 

 

__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", 

"Error", "Dialect", "__doc__", "excel", "excel_tab", 

"field_size_limit", "reader", "writer", 

"register_dialect", "get_dialect", "list_dialects", "Sniffer", 

"unregister_dialect", "__version__", "DictReader", "DictWriter", 

"unix_dialect"] 

 

class Dialect: 

"""Describe a CSV dialect. 

 

This must be subclassed (see csv.excel). Valid attributes are: 

delimiter, quotechar, escapechar, doublequote, skipinitialspace, 

lineterminator, quoting. 

 

""" 

_name = "" 

_valid = False 

# placeholders 

delimiter = None 

quotechar = None 

escapechar = None 

doublequote = None 

skipinitialspace = None 

lineterminator = None 

quoting = None 

 

def __init__(self): 

if self.__class__ != Dialect: 

self._valid = True 

self._validate() 

 

def _validate(self): 

try: 

_Dialect(self) 

except TypeError as e: 

# We do this for compatibility with py2.3 

raise Error(str(e)) 

 

class excel(Dialect): 

"""Describe the usual properties of Excel-generated CSV files.""" 

delimiter = ',' 

quotechar = '"' 

doublequote = True 

skipinitialspace = False 

lineterminator = '\r\n' 

quoting = QUOTE_MINIMAL 

register_dialect("excel", excel) 

 

class excel_tab(excel): 

"""Describe the usual properties of Excel-generated TAB-delimited files.""" 

delimiter = '\t' 

register_dialect("excel-tab", excel_tab) 

 

class unix_dialect(Dialect): 

"""Describe the usual properties of Unix-generated CSV files.""" 

delimiter = ',' 

quotechar = '"' 

doublequote = True 

skipinitialspace = False 

lineterminator = '\n' 

quoting = QUOTE_ALL 

register_dialect("unix", unix_dialect) 

 

 

class DictReader: 

def __init__(self, f, fieldnames=None, restkey=None, restval=None, 

dialect="excel", *args, **kwds): 

self._fieldnames = fieldnames # list of keys for the dict 

self.restkey = restkey # key to catch long rows 

self.restval = restval # default value for short rows 

self.reader = reader(f, dialect, *args, **kwds) 

self.dialect = dialect 

self.line_num = 0 

 

def __iter__(self): 

return self 

 

@property 

def fieldnames(self): 

if self._fieldnames is None: 

try: 

self._fieldnames = next(self.reader) 

except StopIteration: 

pass 

self.line_num = self.reader.line_num 

return self._fieldnames 

 

@fieldnames.setter 

def fieldnames(self, value): 

self._fieldnames = value 

 

def __next__(self): 

if self.line_num == 0: 

# Used only for its side effect. 

self.fieldnames 

row = next(self.reader) 

self.line_num = self.reader.line_num 

 

# unlike the basic reader, we prefer not to return blanks, 

# because we will typically wind up with a dict full of None 

# values 

while row == []: 

row = next(self.reader) 

d = OrderedDict(zip(self.fieldnames, row)) 

lf = len(self.fieldnames) 

lr = len(row) 

if lf < lr: 

d[self.restkey] = row[lf:] 

elif lf > lr: 

for key in self.fieldnames[lr:]: 

d[key] = self.restval 

return d 

 

 

class DictWriter: 

def __init__(self, f, fieldnames, restval="", extrasaction="raise", 

dialect="excel", *args, **kwds): 

self.fieldnames = fieldnames # list of keys for the dict 

self.restval = restval # for writing short dicts 

if extrasaction.lower() not in ("raise", "ignore"): 

raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'" 

% extrasaction) 

self.extrasaction = extrasaction 

self.writer = writer(f, dialect, *args, **kwds) 

 

def writeheader(self): 

header = dict(zip(self.fieldnames, self.fieldnames)) 

self.writerow(header) 

 

def _dict_to_list(self, rowdict): 

if self.extrasaction == "raise": 

wrong_fields = rowdict.keys() - self.fieldnames 

if wrong_fields: 

raise ValueError("dict contains fields not in fieldnames: " 

+ ", ".join([repr(x) for x in wrong_fields])) 

return (rowdict.get(key, self.restval) for key in self.fieldnames) 

 

def writerow(self, rowdict): 

return self.writer.writerow(self._dict_to_list(rowdict)) 

 

def writerows(self, rowdicts): 

return self.writer.writerows(map(self._dict_to_list, rowdicts)) 

 

# Guard Sniffer's type checking against builds that exclude complex() 

try: 

complex 

except NameError: 

complex = float 

 

class Sniffer: 

''' 

"Sniffs" the format of a CSV file (i.e. delimiter, quotechar) 

Returns a Dialect object. 

''' 

def __init__(self): 

# in case there is more than one possible delimiter 

self.preferred = [',', '\t', ';', ' ', ':'] 

 

 

def sniff(self, sample, delimiters=None): 

""" 

Returns a dialect (or None) corresponding to the sample 

""" 

 

quotechar, doublequote, delimiter, skipinitialspace = \ 

self._guess_quote_and_delimiter(sample, delimiters) 

if not delimiter: 

delimiter, skipinitialspace = self._guess_delimiter(sample, 

delimiters) 

 

if not delimiter: 

raise Error("Could not determine delimiter") 

 

class dialect(Dialect): 

_name = "sniffed" 

lineterminator = '\r\n' 

quoting = QUOTE_MINIMAL 

# escapechar = '' 

 

dialect.doublequote = doublequote 

dialect.delimiter = delimiter 

# _csv.reader won't accept a quotechar of '' 

dialect.quotechar = quotechar or '"' 

dialect.skipinitialspace = skipinitialspace 

 

return dialect 

 

 

def _guess_quote_and_delimiter(self, data, delimiters): 

""" 

Looks for text enclosed between two identical quotes 

(the probable quotechar) which are preceded and followed 

by the same character (the probable delimiter). 

For example: 

,'some text', 

The quote with the most wins, same with the delimiter. 

If there is no quotechar the delimiter can't be determined 

this way. 

""" 

 

matches = [] 

for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?", 

r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?", 

r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?" 

r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space) 

regexp = re.compile(restr, re.DOTALL | re.MULTILINE) 

matches = regexp.findall(data) 

if matches: 

break 

 

if not matches: 

# (quotechar, doublequote, delimiter, skipinitialspace) 

return ('', False, None, 0) 

quotes = {} 

delims = {} 

spaces = 0 

groupindex = regexp.groupindex 

for m in matches: 

n = groupindex['quote'] - 1 

key = m[n] 

if key: 

quotes[key] = quotes.get(key, 0) + 1 

try: 

n = groupindex['delim'] - 1 

key = m[n] 

except KeyError: 

continue 

if key and (delimiters is None or key in delimiters): 

delims[key] = delims.get(key, 0) + 1 

try: 

n = groupindex['space'] - 1 

except KeyError: 

continue 

if m[n]: 

spaces += 1 

 

quotechar = max(quotes, key=quotes.get) 

 

if delims: 

delim = max(delims, key=delims.get) 

skipinitialspace = delims[delim] == spaces 

if delim == '\n': # most likely a file with a single column 

delim = '' 

else: 

# there is *no* delimiter, it's a single column of quoted data 

delim = '' 

skipinitialspace = 0 

 

# if we see an extra quote between delimiters, we've got a 

# double quoted format 

dq_regexp = re.compile( 

r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \ 

{'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE) 

 

 

 

if dq_regexp.search(data): 

doublequote = True 

else: 

doublequote = False 

 

return (quotechar, doublequote, delim, skipinitialspace) 

 

 

def _guess_delimiter(self, data, delimiters): 

""" 

The delimiter /should/ occur the same number of times on 

each row. However, due to malformed data, it may not. We don't want 

an all or nothing approach, so we allow for small variations in this 

number. 

1) build a table of the frequency of each character on every line. 

2) build a table of frequencies of this frequency (meta-frequency?), 

e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows, 

7 times in 2 rows' 

3) use the mode of the meta-frequency to determine the /expected/ 

frequency for that character 

4) find out how often the character actually meets that goal 

5) the character that best meets its goal is the delimiter 

For performance reasons, the data is evaluated in chunks, so it can 

try and evaluate the smallest portion of the data possible, evaluating 

additional chunks as necessary. 

""" 

 

data = list(filter(None, data.split('\n'))) 

 

ascii = [chr(c) for c in range(127)] # 7-bit ASCII 

 

# build frequency tables 

chunkLength = min(10, len(data)) 

iteration = 0 

charFrequency = {} 

modes = {} 

delims = {} 

start, end = 0, chunkLength 

while start < len(data): 

iteration += 1 

for line in data[start:end]: 

for char in ascii: 

metaFrequency = charFrequency.get(char, {}) 

# must count even if frequency is 0 

freq = line.count(char) 

# value is the mode 

metaFrequency[freq] = metaFrequency.get(freq, 0) + 1 

charFrequency[char] = metaFrequency 

 

for char in charFrequency.keys(): 

items = list(charFrequency[char].items()) 

if len(items) == 1 and items[0][0] == 0: 

continue 

# get the mode of the frequencies 

if len(items) > 1: 

modes[char] = max(items, key=lambda x: x[1]) 

# adjust the mode - subtract the sum of all 

# other frequencies 

items.remove(modes[char]) 

modes[char] = (modes[char][0], modes[char][1] 

- sum(item[1] for item in items)) 

else: 

modes[char] = items[0] 

 

# build a list of possible delimiters 

modeList = modes.items() 

total = float(min(chunkLength * iteration, len(data))) 

# (rows of consistent data) / (number of rows) = 100% 

consistency = 1.0 

# minimum consistency threshold 

threshold = 0.9 

while len(delims) == 0 and consistency >= threshold: 

for k, v in modeList: 

if v[0] > 0 and v[1] > 0: 

if ((v[1]/total) >= consistency and 

(delimiters is None or k in delimiters)): 

delims[k] = v 

consistency -= 0.01 

 

if len(delims) == 1: 

delim = list(delims.keys())[0] 

skipinitialspace = (data[0].count(delim) == 

data[0].count("%c " % delim)) 

return (delim, skipinitialspace) 

 

# analyze another chunkLength lines 

start = end 

end += chunkLength 

 

if not delims: 

return ('', 0) 

 

# if there's more than one, fall back to a 'preferred' list 

if len(delims) > 1: 

for d in self.preferred: 

if d in delims.keys(): 

skipinitialspace = (data[0].count(d) == 

data[0].count("%c " % d)) 

return (d, skipinitialspace) 

 

# nothing else indicates a preference, pick the character that 

# dominates(?) 

items = [(v,k) for (k,v) in delims.items()] 

items.sort() 

delim = items[-1][1] 

 

skipinitialspace = (data[0].count(delim) == 

data[0].count("%c " % delim)) 

return (delim, skipinitialspace) 

 

 

def has_header(self, sample): 

# Creates a dictionary of types of data in each column. If any 

# column is of a single type (say, integers), *except* for the first 

# row, then the first row is presumed to be labels. If the type 

# can't be determined, it is assumed to be a string in which case 

# the length of the string is the determining factor: if all of the 

# rows except for the first are the same length, it's a header. 

# Finally, a 'vote' is taken at the end for each column, adding or 

# subtracting from the likelihood of the first row being a header. 

 

rdr = reader(StringIO(sample), self.sniff(sample)) 

 

header = next(rdr) # assume first row is header 

 

columns = len(header) 

columnTypes = {} 

for i in range(columns): columnTypes[i] = None 

 

checked = 0 

for row in rdr: 

# arbitrary number of rows to check, to keep it sane 

if checked > 20: 

break 

checked += 1 

 

if len(row) != columns: 

continue # skip rows that have irregular number of columns 

 

for col in list(columnTypes.keys()): 

 

for thisType in [int, float, complex]: 

try: 

thisType(row[col]) 

break 

except (ValueError, OverflowError): 

pass 

else: 

# fallback to length of string 

thisType = len(row[col]) 

 

if thisType != columnTypes[col]: 

if columnTypes[col] is None: # add new column type 

columnTypes[col] = thisType 

else: 

# type is inconsistent, remove column from 

# consideration 

del columnTypes[col] 

 

# finally, compare results against first row and "vote" 

# on whether it's a header 

hasHeader = 0 

for col, colType in columnTypes.items(): 

if type(colType) == type(0): # it's a length 

if len(header[col]) != colType: 

hasHeader += 1 

else: 

hasHeader -= 1 

else: # attempt typecast 

try: 

colType(header[col]) 

except (ValueError, TypeError): 

hasHeader += 1 

else: 

hasHeader -= 1 

 

return hasHeader > 0