Coverage for lmcat\file_stats.py: 83%

42 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-29 16:42 -0700

from dataclasses import dataclass
from pathlib import Path
from typing import NamedTuple, Optional

# Handle Python 3.11+ vs older Python for TOML parsing
try:
    import tomllib
except ImportError:
    try:
        # `tomli` is the backport of the 3.11 stdlib module, same API
        import tomli as tomllib  # type: ignore
    except ImportError:
        # neither available: left as None so importing this module never fails
        tomllib = None  # type: ignore[assignment]


# tokenizers (optional dep)
# flag recording whether the third-party `tokenizers` package imported cleanly
TOKENIZERS_PRESENT: bool = False
try:
    import tokenizers  # type: ignore[import-untyped]

    TOKENIZERS_PRESENT = True
except ImportError:
    pass

23 

24 

class TokenizerWrapper:
    """Tokenizer wrapper: stores a tokenizer name and provides `n_tokens`.

    With the default name `"whitespace-split"` no real tokenizer is loaded
    and token counting falls back to `str.split()`; any other name is
    loaded via `tokenizers.Tokenizer.from_pretrained(name)`.
    """

    def __init__(self, name: str = "whitespace-split") -> None:
        self.name: str = name
        self.use_fallback: bool = name == "whitespace-split"
        # BUGFIX: the annotation is quoted so it is NOT evaluated at runtime.
        # For attribute targets (`self.x: ann = value`) PEP 526 evaluates
        # `ann`, so the unquoted `Optional[tokenizers.Tokenizer]` raised
        # NameError whenever the optional `tokenizers` package was absent --
        # even in whitespace-fallback mode, which never needs that package.
        self.tokenizer: "Optional[tokenizers.Tokenizer]" = (
            None if self.use_fallback else tokenizers.Tokenizer.from_pretrained(name)
        )

    def n_tokens(self, text: str) -> int:
        """Return number of tokens in text"""
        if self.use_fallback:
            # fallback: tokens are whitespace-separated words
            return len(text.split())
        # guaranteed by __init__ when not in fallback mode
        assert self.tokenizer is not None
        return len(self.tokenizer.encode(text).tokens)

44 

45 

@dataclass
class FileStats:
    """Statistics for a single file"""

    # number of lines (as counted by `str.splitlines`)
    lines: int
    # number of characters in the decoded text
    chars: int
    # token count; None when not computed
    tokens: Optional[int] = None

    @classmethod
    def from_file(
        cls,
        path: Path,
        tokenizer: "TokenizerWrapper",
    ) -> "FileStats":
        """Get statistics for a single file

        # Parameters:
        - `path : Path`
            Path to the file to analyze
        - `tokenizer : TokenizerWrapper`
            Tokenizer to use for counting tokens (may be the
            whitespace-split fallback)

        # Returns:
        - `FileStats`
            Statistics for the file
        """
        # errors="ignore" so undecodable bytes don't abort stats collection
        with path.open("r", encoding="utf-8", errors="ignore") as f:
            content: str = f.read()
        lines: int = len(content.splitlines())
        chars: int = len(content)
        tokens: int = tokenizer.n_tokens(content)
        # use cls(...) (not FileStats(...)) so subclasses constructed via
        # this classmethod get the correct type
        return cls(lines=lines, chars=chars, tokens=tokens)

78 

79 

class TreeEntry(NamedTuple):
    """One rendered line of the tree output together with optional stats."""

    # the formatted tree line as it will be printed
    line: str
    # statistics for the entry, when available; otherwise None
    stats: Optional["FileStats"] = None