lmcat.file_stats
from dataclasses import dataclass
from pathlib import Path
from typing import NamedTuple, Optional

# TOML parsing: stdlib `tomllib` on Python 3.11+, the `tomli` backport on
# older interpreters, or None when neither is available (callers must check
# before use).
try:
    import tomllib
except ImportError:
    try:
        import tomli as tomllib  # type: ignore
    except ImportError:
        tomllib = None  # type: ignore[assignment]


# `tokenizers` is an optional dependency; record whether it imported so the
# rest of the module can fall back to whitespace splitting.
TOKENIZERS_PRESENT: bool = False
try:
    import tokenizers  # type: ignore[import-untyped]

    TOKENIZERS_PRESENT = True
except ImportError:
    pass


class TokenizerWrapper:
    """tokenizer wrapper. stores name and provides `n_tokens` method.

    uses splitting by whitespace as a fallback -- `whitespace-split`

    # Parameters:
    - `name : str`
        either the sentinel `"whitespace-split"` (default, no dependencies)
        or a name loadable via `tokenizers.Tokenizer.from_pretrained`

    # Raises:
    - `ImportError` : if a real tokenizer is requested but the optional
        `tokenizers` package is not installed
    """

    def __init__(self, name: str = "whitespace-split") -> None:
        self.name: str = name
        self.use_fallback: bool = name == "whitespace-split"
        # BUG FIX: the original used an annotated assignment
        # `self.tokenizer: Optional[tokenizers.Tokenizer] = ...` -- PEP 526
        # evaluates annotations on attribute targets at runtime, so the mere
        # mention of `tokenizers` raised NameError when the optional package
        # was missing, even for the whitespace fallback. Guard explicitly.
        if self.use_fallback:
            self.tokenizer = None
        else:
            if not TOKENIZERS_PRESENT:
                raise ImportError(
                    f"tokenizer {name!r} requested but the optional 'tokenizers' package is not installed"
                )
            self.tokenizer = tokenizers.Tokenizer.from_pretrained(name)

    def n_tokens(self, text: str) -> int:
        """Return number of tokens in text"""
        if self.use_fallback:
            return len(text.split())
        else:
            assert self.tokenizer is not None
            return len(self.tokenizer.encode(text).tokens)


@dataclass
class FileStats:
    """Statistics for a single file"""

    lines: int  # number of lines (as counted by str.splitlines)
    chars: int  # number of characters
    tokens: Optional[int] = None  # token count, None when not computed

    @classmethod
    def from_file(
        cls,
        path: Path,
        tokenizer: TokenizerWrapper,
    ) -> "FileStats":
        """Get statistics for a single file

        # Parameters:
        - `path : Path`
            Path to the file to analyze
        - `tokenizer : TokenizerWrapper`
            Tokenizer wrapper used for counting tokens

        # Returns:
        - `FileStats`
            Statistics for the file
        """
        # undecodable bytes are silently dropped (`errors="ignore"`) so stats
        # are best-effort rather than raising on binary-ish files
        with path.open("r", encoding="utf-8", errors="ignore") as f:
            content: str = f.read()
        return cls(
            lines=len(content.splitlines()),
            chars=len(content),
            tokens=tokenizer.n_tokens(content),
        )
class TreeEntry(NamedTuple):
    """Entry in the tree output with optional stats"""

    # the rendered tree line for display
    line: str
    # per-file statistics; defaults to None when no stats are attached
    # (e.g. stats were not collected for this entry)
    stats: Optional[FileStats] = None
TOKENIZERS_PRESENT: bool = True
class TokenizerWrapper:
26class TokenizerWrapper: 27 """tokenizer wrapper. stores name and provides `n_tokens` method. 28 29 uses splitting by whitespace as a fallback -- `whitespace-split`""" 30 31 def __init__(self, name: str = "whitespace-split") -> None: 32 self.name: str = name 33 self.use_fallback: bool = name == "whitespace-split" 34 self.tokenizer: Optional[tokenizers.Tokenizer] = ( 35 None if self.use_fallback else tokenizers.Tokenizer.from_pretrained(name) 36 ) 37 38 def n_tokens(self, text: str) -> int: 39 """Return number of tokens in text""" 40 if self.use_fallback: 41 return len(text.split()) 42 else: 43 assert self.tokenizer is not None 44 return len(self.tokenizer.encode(text).tokens)
tokenizer wrapper. stores name and provides n_tokens method.
uses splitting by whitespace as a fallback -- whitespace-split
def n_tokens(self, text: str) -> int:
38 def n_tokens(self, text: str) -> int: 39 """Return number of tokens in text""" 40 if self.use_fallback: 41 return len(text.split()) 42 else: 43 assert self.tokenizer is not None 44 return len(self.tokenizer.encode(text).tokens)
Return number of tokens in text
@dataclass
class FileStats:
47@dataclass 48class FileStats: 49 """Statistics for a single file""" 50 51 lines: int 52 chars: int 53 tokens: Optional[int] = None 54 55 @classmethod 56 def from_file( 57 cls, 58 path: Path, 59 tokenizer: TokenizerWrapper, 60 ) -> "FileStats": 61 """Get statistics for a single file 62 63 # Parameters: 64 - `path : Path` 65 Path to the file to analyze 66 - `tokenizer : Optional[tokenizers.Tokenizer]` 67 Tokenizer to use for counting tokens, if any 68 69 # Returns: 70 - `FileStats` 71 Statistics for the file 72 """ 73 with path.open("r", encoding="utf-8", errors="ignore") as f: 74 content: str = f.read() 75 lines: int = len(content.splitlines()) 76 chars: int = len(content) 77 tokens: int = tokenizer.n_tokens(content) 78 return FileStats(lines=lines, chars=chars, tokens=tokens)
Statistics for a single file
55 @classmethod 56 def from_file( 57 cls, 58 path: Path, 59 tokenizer: TokenizerWrapper, 60 ) -> "FileStats": 61 """Get statistics for a single file 62 63 # Parameters: 64 - `path : Path` 65 Path to the file to analyze 66 - `tokenizer : Optional[tokenizers.Tokenizer]` 67 Tokenizer to use for counting tokens, if any 68 69 # Returns: 70 - `FileStats` 71 Statistics for the file 72 """ 73 with path.open("r", encoding="utf-8", errors="ignore") as f: 74 content: str = f.read() 75 lines: int = len(content.splitlines()) 76 chars: int = len(content) 77 tokens: int = tokenizer.n_tokens(content) 78 return FileStats(lines=lines, chars=chars, tokens=tokens)
Get statistics for a single file
Parameters:
path : Path
Path to the file to analyze
tokenizer : TokenizerWrapper
Tokenizer to use for counting tokens, if any
Returns:
FileStats
Statistics for the file
class TreeEntry(typing.NamedTuple):
81class TreeEntry(NamedTuple): 82 """Entry in the tree output with optional stats""" 83 84 line: str 85 stats: Optional[FileStats] = None
Entry in the tree output with optional stats
TreeEntry(line: str, stats: Optional[FileStats] = None)
Create new instance of TreeEntry(line, stats)
Inherited Members
- builtins.tuple
- index
- count