Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1"""A file interface for handling local and remote data files. 

2 

3The goal of datasource is to abstract some of the file system operations 

4when dealing with data files so the researcher doesn't have to know all the 

5low-level details. Through datasource, a researcher can obtain and use a 

6file with one function call, regardless of location of the file. 

7 

8DataSource is meant to augment standard python libraries, not replace them. 

9It should work seamlessly with standard file IO operations and the os 

10module. 

11 

12DataSource files can originate locally or remotely: 

13 

14- local files : '/home/guido/src/local/data.txt' 

15- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt' 

16 

17DataSource files can also be compressed or uncompressed. Currently only 

18gzip, bz2 and xz are supported. 

19 

20Example:: 

21 

22 >>> # Create a DataSource, use os.curdir (default) for local storage. 

23 >>> from numpy import DataSource 

24 >>> ds = DataSource() 

25 >>> 

26 >>> # Open a remote file. 

27 >>> # DataSource downloads the file, stores it locally in: 

28 >>> # './www.google.com/index.html' 

29 >>> # opens the file and returns a file object. 

30 >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP 

31 >>> 

32 >>> # Use the file as you normally would 

33 >>> fp.read() # doctest: +SKIP 

34 >>> fp.close() # doctest: +SKIP 

35 

36""" 

37import os 

38import shutil 

39import io 

40from contextlib import closing 

41 

42from numpy.core.overrides import set_module 

43 

44 

# Keep a reference to the builtin ``open`` under a private name: this module
# defines its own module-level ``open`` function further down, which shadows
# the builtin, while ``DataSource._cache`` still needs the original to write
# downloaded files to disk.
_open = open

46 

47 

def _check_mode(mode, encoding, newline):
    """Validate that `mode` is consistent with `encoding` and `newline`.

    Parameters
    ----------
    mode : str
        File open mode.
    encoding : str
        File encoding.
    newline : str
        Newline for text files.

    Raises
    ------
    ValueError
        If `mode` contains both ``'t'`` and ``'b'``, or if `mode` does not
        contain ``'t'`` while `encoding` or `newline` is not None.

    """
    text_requested = "t" in mode

    if text_requested:
        # Text and binary flags are mutually exclusive.
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
        return

    # Any mode without an explicit 't' is treated as binary here, so the
    # text-only arguments must be left unset.
    if encoding is not None:
        raise ValueError("Argument 'encoding' not supported in binary mode")
    if newline is not None:
        raise ValueError("Argument 'newline' not supported in binary mode")

69 

70 

71# Using a class instead of a module-level dictionary 

72# to reduce the initial 'import numpy' overhead by 

73# deferring the import of lzma, bz2 and gzip until needed 

74 

75# TODO: .zip support, .tar support? 

class _FileOpeners:
    """
    Lazy registry of callables used to open (un-)compressed files.

    `_FileOpeners` wraps a dictionary mapping a file extension to the
    function that opens files of that kind. The instance can be indexed
    directly with those extensions. The key ``None`` maps to `io.open` for
    uncompressed files; ``gzip``, ``bz2`` and ``xz``/``lzma`` openers are
    registered the first time the registry is queried, provided the
    corresponding module can be imported.

    Notes
    -----
    `_file_openers`, an instance of `_FileOpeners`, is made available for
    use in the `_datasource` module.

    Examples
    --------
    >>> import gzip
    >>> np.lib._datasource._file_openers.keys()
    [None, '.bz2', '.gz', '.xz', '.lzma']
    >>> np.lib._datasource._file_openers['.gz'] is gzip.open
    True

    """

    def __init__(self):
        # Only the plain opener is registered eagerly; the compression
        # modules are imported on first use by `_load`.
        self._file_openers = {None: io.open}
        self._loaded = False

    def _load(self):
        """Register the compressed-file openers once, skipping missing ones."""
        if not self._loaded:
            try:
                import bz2
            except ImportError:
                pass
            else:
                self._file_openers[".bz2"] = bz2.open

            try:
                import gzip
            except ImportError:
                pass
            else:
                self._file_openers[".gz"] = gzip.open

            try:
                import lzma
                # There are incompatible backports of lzma that do not have
                # the lzma.open attribute, so the lookup stays inside the
                # try block and AttributeError is caught as well.
                lzma_open = lzma.open
            except (ImportError, AttributeError):
                pass
            else:
                self._file_openers[".xz"] = lzma_open
                self._file_openers[".lzma"] = lzma_open

            self._loaded = True

    def keys(self):
        """
        Return the keys of currently supported file openers.

        Parameters
        ----------
        None

        Returns
        -------
        keys : list
            The keys are None for uncompressed files and the file extension
            strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression
            methods.

        """
        self._load()
        return list(self._file_openers)

    def __getitem__(self, key):
        """Return the opener registered for `key` (an extension or None)."""
        self._load()
        return self._file_openers[key]


# Module-wide registry shared by the DataSource machinery.
_file_openers = _FileOpeners()

156 

def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
    """
    Open `path` with `mode` and return the file object.

    If ``path`` is an URL, it will be downloaded, stored in the
    `DataSource` `destpath` directory and opened from there.

    Parameters
    ----------
    path : str
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path. Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.
    encoding : {None, str}, optional
        Open text file with given encoding. The default encoding will be
        what `io.open` uses.
    newline : {None, str}, optional
        Newline to use when reading text file.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    This is a convenience function: it builds a throw-away `DataSource`
    rooted at `destpath` and returns whatever ``DataSource.open`` returns
    for `path`.

    """
    source = DataSource(destpath)
    handle = source.open(path, mode, encoding=encoding, newline=newline)
    return handle

196 

197 

@set_module('numpy')
class DataSource:
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs.  The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = np.DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = np.DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/'
        >>> gfile = ds.open('http://www.google.com/')
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = np.DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/.../home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath."""
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories. The attribute may be missing if
        # __init__ raised before setting it, hence the getattr default.
        if getattr(self, '_istmpdest', False):
            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename is a zip file by looking at the file extension.

        """
        _fname, ext = os.path.splitext(filename)
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing."""

        # Only 'w' and '+' are treated as write markers here.
        _writemodes = ("w", "+")
        return any(c in _writemodes for c in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        Returns
        -------
        base, zip_ext : tuple
            ``zip_ext`` is None when `filename` has no recognised
            compression extension; in that case ``base`` is `filename`.

        """

        if self._iszip(filename):
            return os.path.splitext(filename)
        else:
            return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations.

        The first entry is always `filename` itself. When `filename` does
        not already carry a recognised compression extension, one entry per
        known extension is appended.

        """
        names = [filename]
        if not self._iszip(filename):
            # Skip the None key, which stands for "uncompressed".
            names.extend(filename + zipext
                         for zipext in _file_openers.keys() if zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test
        #       for compressed versions of files.

        parsed = urlparse(path)
        return bool(parsed.scheme and parsed.netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.

        Raises
        ------
        urllib.error.URLError
            If `path` is an URL that cannot be opened.

        """
        # We import these here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen
        from urllib.error import URLError

        upath = self.abspath(path)

        # Ensure the directory exists. exist_ok avoids the race between a
        # separate existence check and the creation.
        os.makedirs(os.path.dirname(upath), exist_ok=True)

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            try:
                with closing(urlopen(path)) as openedurl:
                    with _open(upath, 'wb') as f:
                        shutil.copyfileobj(openedurl, f)
            except URLError as err:
                raise URLError("URL not found: %s" % path) from err
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is an URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found. Returns None when nothing
        is found.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        parsed = urlparse(path)
        netloc = self._sanitize_relative_path(parsed.netloc)
        upath = self._sanitize_relative_path(parsed.path)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)
        """
        last = None
        path = os.path.normpath(path)
        # Repeat until a pass leaves the path unchanged.
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            # str.lstrip removes any leading characters from the given
            # set, so every leading '.' is dropped here.
            path = path.lstrip(os.pardir).lstrip('..')
            drive, path = os.path.splitdrive(path)  # for Windows
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """

        # First test for local path
        if os.path.exists(path):
            return True

        # We import this here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen
        from urllib.error import URLError

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Test remote url
        if self._isurl(path):
            try:
                # Opening is enough to prove the URL is reachable; close
                # the connection straight away.
                with closing(urlopen(path)):
                    return True
            except URLError:
                return False
        return False

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object.

        If `path` is an URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is an URL and `mode` requests writing.
        IOError
            If `path` cannot be found locally, in the cache or remotely.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if not found:
            raise IOError("%s not found." % path)

        _fname, ext = self._splitzipext(found)
        # The extension keeps its leading dot (it comes from
        # os.path.splitext). bz2.open does not accept update ('+') modes,
        # so strip the flag; str.replace returns a new string and must be
        # assigned back.
        if ext == '.bz2':
            mode = mode.replace("+", "")
        return _file_openers[ext](found, mode=mode,
                                  encoding=encoding, newline=newline)

536 

537 

class Repository(DataSource):
    """
    Repository(baseurl, destpath='.')

    A data repository where multiple DataSource's share a base
    URL/directory.

    `Repository` extends `DataSource` by prepending a base URL (or
    directory) to all the files it handles. Use `Repository` when you will
    be working with multiple files from one base URL.  Initialize
    `Repository` with the base URL, then refer to each file by its filename
    only.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        super().__init__(destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        # Cleanup is entirely the parent's job (temporary directories).
        super().__del__()

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary."""
        pieces = path.split(self._baseurl, 2)
        if len(pieces) != 1:
            # baseurl already occurs in path: hand it back untouched
            return path
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        target = self._fullpath(path)
        return super()._findfile(target)

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        target = self._fullpath(path)
        return super().abspath(target)

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        target = self._fullpath(path)
        return super().exists(target)

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object prepending Repository base URL.

        If `path` is an URL, it will be downloaded, stored in the
        DataSource directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """
        target = self._fullpath(path)
        return super().open(target, mode, encoding=encoding, newline=newline)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str
            List of file names (not containing a directory part).

        Raises
        ------
        NotImplementedError
            If the repository base is an URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if not self._isurl(self._baseurl):
            return os.listdir(self._baseurl)
        raise NotImplementedError(
            "Directory listing of URLs, not supported yet.")