Coverage for src/edwh_files_plugin/compression.py: 82%

218 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-02-28 11:40 +0100

1import abc 

2import os 

3import shutil 

4import typing 

5import warnings 

6from pathlib import Path 

7from subprocess import run 

8from typing import Optional, Self 

9 

10from plumbum import local 

11from plumbum.commands.processes import CommandNotFound 

12from rich import print # noqa: A004 

13 

14PathLike: typing.TypeAlias = str | Path 

15 

16DEFAULT_COMPRESSION_LEVEL = 5 

17 

18 

def run_ok(command: str) -> bool:
    """
    Executes a command and returns whether it ended successfully (with return code 0).

    The command is split on single spaces and started without a shell; its stdout and stderr are discarded.

    Args:
        command (str): The command to run.

    Returns:
        bool: True if the command ended successfully, False otherwise
            (including when the program could not be started at all).
    """
    from subprocess import DEVNULL

    try:
        completed = run(command.split(" "), stdout=DEVNULL, stderr=DEVNULL)
    except OSError:
        # FileNotFoundError, PermissionError, ...: the executable is missing or can not be started.
        # That is 'not ok' for the caller, not a reason to crash.
        return False

    return completed.returncode == 0

31 

32 

def is_installed(program: str) -> bool:
    """
    Checks if a given program is installed on the system.

    The lookup is done in-process with `shutil.which`, so it does not depend on an external
    `which` binary being present.

    Args:
        program (str): The name of the program to check.

    Returns:
        bool: True if the program is installed, False otherwise.
    """
    return shutil.which(program) is not None

44 

45 

46# FileLike: typing.TypeAlias = PathLike | typing.BinaryIO | typing.TextIO 

47# def filelike_to_binaryio(fl: FileLike) -> typing.BinaryIO: ... 

48 

49 

class Compression(abc.ABC):
    """
    Abstract base class and registry for the compression backends.

    A backend subclasses `Compression` and passes `extension` (a single extension or a tuple of them,
    without leading dot) and optionally `prio` as class keyword arguments. Every (prio, extension)
    combination is stored in a registry that `registrations`, `best` and `for_extension` query.
    """

    # (priority, extension) -> backend class; one registry shared by all subclasses.
    _registrations: dict[tuple[int, str], typing.Type[Self]] = {}
    # extension(s) of the backend, without leading dot; filled in by `__init_subclass__`.
    extension: str | tuple[str, ...]

    def __init_subclass__(cls, extension: str | tuple[str, ...] = "", prio: int = 0) -> None:
        """
        Register a new backend for each of its extensions.

        Args:
            extension (str | tuple[str, ...]): extension(s) handled by the backend.
            prio (int): priority of the backend; `registrations` sorts higher priorities first.
        """
        super().__init_subclass__()
        # always store the extension on the class, `filepath` reads it.
        cls.extension = extension

        if not extension:
            warnings.warn("Defined compression algorithm without extension, it will be ignored.", stacklevel=2)
            # make the warning true: a backend without extension must not end up in the registry.
            return

        extensions = (extension,) if isinstance(extension, str) else extension
        for ext in extensions:
            Compression._registrations[(prio, ext)] = cls

    @abc.abstractmethod
    def _compress(
        self, source: Path, target: Path, level: int = DEFAULT_COMPRESSION_LEVEL, overwrite: bool = True
    ) -> bool:
        """
        Compresses the source file or directory to the target location.

        Args:
            source (Path): Path to the source file or directory to compress.
            target (Path): Path where the compressed file will be saved.
            level (int, optional): Compression level (1-9), where higher numbers indicate higher compression.
                                   Defaults to 5.
            overwrite (bool, optional): Whether to overwrite the target file if it already exists. Defaults to True.
        """

    def compress(
        self,
        source: PathLike,
        target: Optional[PathLike] = None,
        level: int = DEFAULT_COMPRESSION_LEVEL,
        overwrite: bool = True,
    ) -> bool:
        """
        Compress `source` with this backend.

        Args:
            source (PathLike): file or directory to compress; `~` is expanded and the path is made absolute.
            target (PathLike, optional): output path. Defaults to `self.filepath(source)`.
            level (int): compression level, passed on to the backend.
            overwrite (bool): passed on to the backend.

        Returns:
            bool: the result of the backend; False if the backend raised (the error is printed, not re-raised).
        """
        source = Path(source).expanduser().absolute()
        target = self.filepath(source) if target is None else Path(target)

        try:
            return self._compress(
                source,
                target,
                level=level,
                overwrite=overwrite,
            )
        except Exception as e:
            # deliberate best-effort: report the problem and signal failure via the return value.
            print("[red] Something went wrong during compression [/red]")
            print(e)
            return False

    @abc.abstractmethod
    def _decompress(self, source: Path, target: Path, overwrite: bool = True) -> bool:
        """
        Decompresses the source file to the target location.

        Args:
            source (Path): Path to the compressed file.
            target (Path): Path where the decompressed contents will be saved.
            overwrite (bool, optional): Whether to overwrite the target files if they already exist. Defaults to True.
        """

    def decompress(self, source: PathLike, target: Optional[PathLike] = None, overwrite: bool = True) -> bool:
        """
        Decompress `source` with this backend.

        Args:
            source (PathLike): compressed file; `~` is expanded and the path is made absolute.
            target (PathLike, optional): output path. When omitted, the last extension of `source` is
                removed if it is one of .tgz/.tar/.gz/.zip (`data.txt.gz` -> `data.txt`, `folder.tgz` -> `folder`);
                for any other extension the target equals the source.
            overwrite (bool): passed on to the backend.

        Returns:
            bool: the result of the backend; False if the backend raised (the error is printed, not re-raised).
        """
        source = Path(source).expanduser().absolute()

        if target is not None:
            target = Path(target)
        elif source.suffix in (".tgz", ".tar", ".gz", ".zip"):
            # strip only the last extension (e.g. .tgz); other extensions (.txt) are retained.
            # NB: `Path.suffixes` already contain their leading dot, so re-joining them with "." would
            # produce names like `data.txt..txt`; `with_suffix("")` does exactly what is needed.
            target = source.with_suffix("")
        else:
            target = source

        try:
            return self._decompress(
                source,
                target,
                overwrite=overwrite,
            )
        except Exception as e:
            # deliberate best-effort: report the problem and signal failure via the return value.
            print("[red] Something went wrong during decompression [/red]")
            print(e)
            return False

    @classmethod
    @abc.abstractmethod
    def is_available(cls) -> bool:
        """
        Checks if the required compression tool is available.

        Returns:
            bool: True if the compression tool is available, False otherwise.
        """

    @classmethod
    def registrations(
        cls, extension_filter: Optional[str] = None
    ) -> list[tuple[tuple[int, str], typing.Type["Compression"]]]:
        """
        List the registered backends that are currently available.

        Args:
            extension_filter (str, optional): only return backends registered for exactly this extension.

        Returns:
            list: ((prio, extension), backend class) pairs, sorted descending on (prio, extension).
        """
        return sorted(
            (
                (key, CompressionClass)
                for (key, CompressionClass) in cls._registrations.items()
                if CompressionClass.is_available() and extension_filter in (None, key[1])
            ),
            key=lambda registration: registration[0],
            reverse=True,
        )

    @classmethod
    def available(cls) -> set[str]:
        """
        Return every registered extension (this does not check `is_available` of the backends).
        """
        return {extension for (_, extension) in cls._registrations}

    @classmethod
    def best(cls) -> Self | None:
        """
        Find the absolute best (by priority) available compression method.
        """
        if registrations := cls.registrations():
            CompressionClass = registrations[0][1]  # noqa: N806
            return typing.cast(Self, CompressionClass())

        return None

    @classmethod
    def for_extension(cls, extension: str) -> Self | None:
        """
        Find the best (by priority) available compression method for a specific extension (zip, gz).

        Surrounding whitespace and dots are ignored, so "gz", ".gz" and " .gz " are equivalent.
        """
        # strip whitespace first as well, otherwise " .gz" keeps its dot and never matches.
        normalized = extension.strip().strip(".").strip()

        if registrations := cls.registrations(normalized):
            CompressionClass = registrations[0][1]  # noqa: N806
            return typing.cast(Self, CompressionClass())

        return None

    @classmethod
    def filepath(cls, filepath: str | Path) -> Path:
        """
        Generate an output filepath with the right extension

        For an existing file the extension is appended after the current suffix (`a.txt` -> `a.txt.zip`),
        for anything else the current suffix is replaced. `cls.extension` is formatted into the name as-is,
        so backends with multiple extensions (Nocompression, Gzip) override this method.
        """
        filepath = Path(filepath)
        extension = f"{filepath.suffix}.{cls.extension}" if filepath.is_file() else f".{cls.extension}"
        return filepath.with_suffix(extension)

    @classmethod
    def filename(cls, filepath: str | Path) -> str:
        """
        Generate an output filename with the right extension
        """
        return cls.filepath(filepath).name

207 

208 

class Nocompression(Compression, extension=("none", "tar"), prio=0):
    """
    Backend that stores data without compressing it.

    Directories are bundled into a plain tar archive (using the `tar` command), single files are copied as-is.
    """

    def _compress(
        self, source: Path, target: Path, level: int = DEFAULT_COMPRESSION_LEVEL, overwrite: bool = True
    ) -> bool:
        """
        Store `source` at `target` without compression.

        Args:
            source (Path): file or directory to store.
            target (Path): output path (a tar archive for directories, a copy for files).
            level (int): unused, only present for interface compatibility.
            overwrite (bool): when False, an existing target is left alone and False is returned.

        Returns:
            bool: True on success (also when there was nothing to do), False if the target exists
                and may not be overwritten.
        """
        is_directory = source.is_dir()

        if not is_directory and source == target:
            # an uncompressed file 'compresses' onto itself (see `filepath`): nothing to do.
            return True

        if target.exists() and not overwrite:
            return False

        if is_directory:
            tar = local["tar"]
            # -C parent + folder name: store only the folder name in the archive, not the whole path.
            cmd = tar["-cf", "-", "-C", source.parent, source.name] > str(target)
            cmd()
        else:
            shutil.copyfile(source, target)

        return True

    def _decompress(self, source: Path, target: Path, overwrite: bool = True) -> bool:
        """
        Restore `source` to `target`.

        Args:
            source (Path): `.tar` archive or plain file.
            target (Path): directory to unpack a tar archive into, or file path to copy a plain file to.
            overwrite (bool): when False, an existing target is left alone and False is returned.

        Returns:
            bool: True on success (also when there was nothing to do), False if the target exists
                and may not be overwritten.
        """
        is_tarball = source.suffix == ".tar"

        if not is_tarball and source == target:
            # plain file that is already in place: nothing to do.
            return True

        if target.exists() and not overwrite:
            return False

        if is_tarball:
            target.mkdir(parents=True, exist_ok=True)
            tar = local["tar"]
            # the archive holds one top-level folder (see `_compress`); strip it so the contents
            # end up directly in `target`.
            cmd = tar["-xvf", source, "--strip-components=1", "-C", target]
            cmd()
        else:
            shutil.copyfile(source, target)

        return True

    @classmethod
    def is_available(cls) -> bool:
        """
        'No compression' is always reported as available.
        """
        return True

    @classmethod
    def filepath(cls, filepath: str | Path) -> Path:
        """
        Generate the output path: `<folder>.tar` for a directory, the unchanged path for anything else.
        """
        filepath = Path(filepath)
        return filepath.with_suffix(".tar") if filepath.is_dir() else filepath

246 

247 

class Zip(Compression, extension="zip"):
    """
    Zip backend based on the standard library `zipfile` module (deflate compression).
    """

    def _compress(
        self, source: Path, target: Path, level: int = DEFAULT_COMPRESSION_LEVEL, overwrite: bool = True
    ) -> bool:
        """
        Write `source` into a new zip archive at `target`.

        Args:
            source (Path): file or directory to compress. For a directory, all files below it are
                added with a path relative to that directory.
            target (Path): path of the zip archive to create.
            level (int): deflate compression level, passed to `ZipFile` as `compresslevel`.
            overwrite (bool): when False, an existing target is left alone and False is returned.

        Returns:
            bool: True if the archive was written, False if the target exists and may not be overwritten.
        """
        from zipfile import ZIP_DEFLATED, ZipFile

        if target.exists() and not overwrite:
            return False

        with ZipFile(target, "w", compression=ZIP_DEFLATED, compresslevel=level) as zip_object:
            if source.is_dir():
                # Traverse all files in directory
                for file_path in source.rglob("*"):
                    if file_path.is_file():
                        # Add files to zip file with the correct relative path
                        zip_object.write(file_path, file_path.relative_to(source))
            else:
                zip_object.write(source, source.name)

        return True

    def _decompress(self, source: Path, target: Path, overwrite: bool = True) -> bool:
        """
        Extract the zip archive `source` to `target`.

        An archive with exactly one (non-directory) member is extracted to `target` as a file (or into
        `target` if that is an existing directory); any other archive is extracted into the directory `target`.

        Args:
            source (Path): the zip archive.
            target (Path): output file or directory.
            overwrite (bool): when False, existing files are not replaced. For a single-file archive this
                returns False, for a multi-file archive the existing members are skipped.

        Returns:
            bool: True on success, False if `source` is not a file or the single output file already
                exists and may not be overwritten.
        """
        if not source.is_file():
            # also covers a source that does not exist
            return False

        from zipfile import ZipFile

        with ZipFile(source, "r") as zip_object:
            namelist = zip_object.namelist()

            # Check if the archive contains exactly one file
            if len(namelist) == 1 and not namelist[0].endswith("/"):
                # The archive contains a single file; treat target as a file
                first_file = namelist[0]

                # If the target is a directory, ensure we create the file inside
                if target.is_dir():
                    target = target / Path(first_file).name

                # Handle overwrite behavior
                if target.exists() and not overwrite:
                    return False

                # Ensure the parent directory exists
                target.parent.mkdir(parents=True, exist_ok=True)

                # Stream the single file to the target, so a large member is never held in memory as a whole
                with zip_object.open(first_file) as member_stream, target.open("wb") as f:
                    shutil.copyfileobj(member_stream, f)

            else:
                # Treat target as a directory and extract all files
                target.mkdir(parents=True, exist_ok=True)

                for member in namelist:
                    # Resolve full path of the extracted file
                    file_path = target / member

                    # Check if file already exists and handle overwrite
                    if file_path.exists() and not overwrite:
                        continue

                    # Ensure parent directories exist
                    file_path.parent.mkdir(parents=True, exist_ok=True)

                    # Extract the file
                    zip_object.extract(member, target)

        return True

    @classmethod
    def is_available(cls) -> bool:
        """
        Check if the `zipfile` module can be imported.
        """
        try:
            import zipfile  # noqa: F401

            return True
        except ImportError:
            return False

328 

329 

class Gzip(Compression, extension=("tgz", "gz"), prio=1):
    """
    Gzip backend that shells out to the `gzip`, `gunzip` and `tar` commands.

    Single files become `<name>.gz`, directories become a gzipped tarball (`.tgz`).
    """

    def gzip_compress(
        self, source: Path, target: Path, level: int = DEFAULT_COMPRESSION_LEVEL, _tar: str = "tar", _gzip: str = "gzip"
    ) -> bool:
        """
        Compress data using gzip.

        This function compresses data from a source to a target path using the gzip tool.

        Args:
            source (Path): Path to the file or data to be compressed.
            target (Path): Path where the compressed data will be saved.
            level (int): compression level, where 0 is fastest and 9 is strongest but slowest.
                         Defaults to DEFAULT_COMPRESSION_LEVEL.
            _tar (str): For internal usage
            _gzip (str): For internal usage

        Returns:
            bool: True if compression was successful; failures of the underlying commands are raised.
        """
        tar = local[_tar]
        gzip = local[_gzip]

        if source.is_dir():
            # .tar.gz
            # `-C parent name` stores only the folder name in the tar, not the whole path
            cmd = tar["-cf", "-", "-C", source.parent, source.name] | gzip[f"-{level}"] > str(target)
        else:
            cmd = gzip[f"-{level}", "-c", source] > str(target)

        cmd()
        return True

    def _compress(
        self, source: Path, target: Path, level: int = DEFAULT_COMPRESSION_LEVEL, overwrite: bool = True
    ) -> bool:
        """
        Compress `source` to `target` using gzip.

        Returns:
            bool: True on success; False if the target exists and may not be overwritten,
                or if compression failed (the error is printed).
        """
        if target.exists() and not overwrite:
            return False

        try:
            self.gzip_compress(source, target, level=level)
        except Exception as e:
            # keep the 'False on failure' contract of this method, but don't hide why it failed.
            print("[red] Something went wrong during compression [/red]")
            print(e)
            return False

        return True

    def gzip_decompress(self, source: Path, target: Path, _tar: str = "tar", _gunzip: str = "gunzip") -> bool:
        """
        Decompresses a gzipped file and extracts it into the specified target directory.

        Args:
            source (Path): The path to the gzipped file.
            target (Path): The directory to extract a tarball into, or the output file for a plain .gz.
            _tar (str): For internal usage
            _gunzip (str): For internal usage

        Returns:
            bool: True if the decompression and extraction were successful;
                failures of the underlying commands are raised.
        """
        gunzip = local[_gunzip]
        tar = local[_tar]

        if ".tar" in source.suffixes or ".tgz" in source.suffixes:
            # tar gz
            target.mkdir(parents=True, exist_ok=True)
            # the tarball holds one top-level folder (see `gzip_compress`); strip it so the contents
            # end up directly in `target`.
            cmd = tar["-xvf", source, "--strip-components=1", f"--use-compress-program={_gunzip}", "-C", target]
        else:
            # assume just a .gz
            cmd = gunzip["-c", source] > str(target)

        cmd()
        return True

    def _decompress(self, source: Path, target: Path, overwrite: bool = True) -> bool:
        """
        Decompress `source` to `target` using gunzip.

        Returns:
            bool: True on success, False if the target exists and may not be overwritten.
        """
        if target.exists() and not overwrite:
            return False

        self.gzip_decompress(source, target)
        return True

    @classmethod
    def is_available(cls) -> bool:
        """
        Check if 'gzip' and 'gunzip' are available in the local context.

        Returns:
            bool: The return value is True if 'gzip' and 'gunzip' are found,
                  False otherwise.
        """
        try:
            # plain lookups instead of an `assert`: asserts are removed when Python runs with -O,
            # which would skip the lookups and report the tools as available unconditionally.
            for program in ("gzip", "gunzip"):
                _ = local[program]
        except CommandNotFound:
            return False

        return True

    @classmethod
    def filepath(cls, filepath: str | Path) -> Path:
        """
        Return a Path object with either '.gz' or '.tgz' appended as file extension based on whether
        the provided file path is a file or not.

        Args:
            filepath (str | Path): The input file path in string or Path format.

        Returns:
            Path: The updated file path with appended file extension.
        """
        filepath = Path(filepath)
        extension = f"{filepath.suffix}.gz" if filepath.is_file() else ".tgz"
        return filepath.with_suffix(extension)

438 

439 

class Pigz(Gzip, extension=("tgz", "gz"), prio=2):
    """
    The Pigz class inherits from the Gzip base class.

    Its priority is higher than that of the base class, as indicated by the value 2.

    Pigz (Parallel Implementation of GZip) is a fully functional replacement for gzip
    that exploits multiple processors and multiple cores to the hilt when compressing data.
    Pigz can be a good choice when you're handling large amounts of data,
    and your machine has multiple cores/processors.

    Advantages of pigz over classic gzip:
    - Multithreading: Pigz can split the input data into chunks and process them in parallel.
                      This utilizes multiple cores on your machine,
                      leading to faster compression times.
    - Compatibility: Pigz maintains backward compatibility with gzip, so it can handle any file that gzip can.
    - Speed: In multi-core systems, pigz can be significantly faster than gzip
             because of its ability to process different parts of the data simultaneously.
    """

    def _compress(
        self, source: Path, target: Path, level: int = DEFAULT_COMPRESSION_LEVEL, overwrite: bool = True
    ) -> bool:
        """
        Compress `source` to `target` like Gzip does, but with the `pigz` command.

        Returns:
            bool: True on success, False if the target exists and may not be overwritten.
        """
        if target.exists() and not overwrite:
            return False

        self.gzip_compress(source, target, level=level, _gzip="pigz")
        return True

    def _decompress(self, source: Path, target: Path, overwrite: bool = True) -> bool:
        """
        Decompress `source` to `target` like Gzip does, but with the `unpigz` command.

        Returns:
            bool: True on success, False if the target exists and may not be overwritten.
        """
        if target.exists() and not overwrite:
            return False

        self.gzip_decompress(source, target, _gunzip="unpigz")
        return True

    @classmethod
    def is_available(cls) -> bool:
        """
        Check if 'pigz' and 'unpigz' commands are available in the local environment.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        try:
            # plain lookups instead of an `assert`: asserts are removed when Python runs with -O,
            # which would skip the lookups and report pigz as available unconditionally.
            for program in ("pigz", "unpigz"):
                _ = local[program]
        except CommandNotFound:
            return False

        return True