Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1#!/usr/local/bin/python 

2# encoding: utf-8 

3""" 

4*Iterate through large line-based files in batches of lines* 

5 

6:Author: 

7 David Young 

8 

9:Date Created: 

10 December 4, 2017 

11""" 

12################# GLOBAL IMPORTS #################### 

13from builtins import range 

14from builtins import object 

15import sys 

16import os 

17os.environ['TERM'] = 'vt100' 

18from fundamentals import tools 

19import codecs 

20 

21 

class fileChunker(object):
    """
    *The fileChunker iterator - iterate over large line-based files to reduce memory footprint*

    **Key Arguments:**
        - ``filepath`` -- path to the large file to iterate over
        - ``batchSize`` -- size of the chunks to return in lines

    **Usage:**

        To setup your logger, settings and database connections, please use the ``fundamentals`` package (`see tutorial here <http://fundamentals.readthedocs.io/en/latest/#tutorial>`_).

        To initiate a fileChunker iterator and then process the file in batches of 100000 lines, use the following:

        .. code-block:: python

            from fundamentals.files import fileChunker
            fc = fileChunker(
                filepath="/path/to/large/file.csv",
                batchSize=100000
            )
            for i in fc:
                print(len(i))

        Each batch is a list of decoded (utf-8) lines with their line endings
        retained; the final batch may be shorter than ``batchSize``. The
        underlying file is closed as soon as the end of the file is reached.
        If iteration may be abandoned early, use the iterator as a context
        manager (or call ``close()``) so the file is still released:

        .. code-block:: python

            with fileChunker(filepath="/path/to/large/file.csv", batchSize=100000) as fc:
                for i in fc:
                    print(len(i))
    """

    def __init__(self, filepath, batchSize):
        """
        *Open ``filepath`` for utf-8 decoded reading*

        **Raises:**
            - ``IOError`` -- if the file cannot be opened
        """
        self.filepath = filepath
        self.batchSize = batchSize

        try:
            self.readFile = codecs.open(
                self.filepath, encoding='utf-8', mode='r')
        except IOError:
            message = 'could not open the file %s' % (self.filepath,)
            raise IOError(message)

    def __iter__(self):
        return self

    def __enter__(self):
        return self

    def __exit__(self, excType, excValue, traceback):
        self.close()
        # never suppress an exception raised inside the ``with`` body
        return False

    def close(self):
        """
        *Close the underlying file handle (safe to call more than once)*
        """
        if not self.readFile.closed:
            self.readFile.close()

    def __next__(self):
        """
        *Return the next batch of at most ``batchSize`` lines*

        **Return:**
            - ``batch`` -- list of lines read from the file

        **Raises:**
            - ``StopIteration`` -- once no further lines are available
        """
        # the handle is closed once EOF has been seen; reading from it again
        # would raise ValueError instead of cleanly ending the iteration
        if self.readFile.closed:
            raise StopIteration

        batch = []
        for _ in range(self.batchSize):
            line = self.readFile.readline()
            if not line:
                # readline() only returns an empty string at EOF, so stop
                # here rather than issuing the remaining (pointless) reads
                self.close()
                break
            batch.append(line)

        if not batch:
            # also covers a non-positive batchSize, which yields nothing
            self.close()
            raise StopIteration

        return batch

64 if len(l): 

65 batch.append(l) 

66 if len(batch) == 0: 

67 raise StopIteration 

68 

69 return batch