Coverage for src/edwh_files_plugin/compression.py: 82%
218 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-02-28 11:40 +0100
« prev ^ index » next coverage.py v7.6.10, created at 2025-02-28 11:40 +0100
1import abc
2import os
3import shutil
4import typing
5import warnings
6from pathlib import Path
7from subprocess import run
8from typing import Optional, Self
10from plumbum import local
11from plumbum.commands.processes import CommandNotFound
12from rich import print # noqa: A004
14PathLike: typing.TypeAlias = str | Path
16DEFAULT_COMPRESSION_LEVEL = 5
def run_ok(command: str) -> bool:
    """
    Executes a command and returns whether it ended successfully (with return code 0).

    The command line is tokenised with shell-like quoting rules (so quoted arguments and repeated
    whitespace are handled) and is executed without a shell. Its stdout and stderr are discarded.

    Args:
        command (str): The command to run.

    Returns:
        bool: True if the command ended successfully, False otherwise. An empty or unparsable command
            and an executable that cannot be started also yield False instead of raising.
    """
    import shlex
    from subprocess import DEVNULL

    try:
        args = shlex.split(command)
    except ValueError:
        # e.g. unbalanced quotes: there is nothing sensible to execute
        return False

    if not args:
        return False

    try:
        return run(args, stdout=DEVNULL, stderr=DEVNULL).returncode == 0
    except OSError:
        # the executable is missing or not runnable (FileNotFoundError, PermissionError, ...)
        return False
def is_installed(program: str) -> bool:
    """
    Checks if a given program is installed on the system.

    The lookup is performed in-process with `shutil.which` (a search for an executable on ``PATH``),
    so it does not depend on an external ``which`` binary and never spawns a subprocess.

    Args:
        program (str): The name of the program to check.

    Returns:
        bool: True if the program is installed, False otherwise (also for an empty name).
    """
    return bool(program) and shutil.which(program) is not None
46# FileLike: typing.TypeAlias = PathLike | typing.BinaryIO | typing.TextIO
47# def filelike_to_binaryio(fl: FileLike) -> typing.BinaryIO: ...
class Compression(abc.ABC):
    """
    Abstract base class and registry for compression back-ends.

    A concrete back-end subclasses ``Compression`` and passes the file extension(s) it handles, plus an
    optional priority, as class keyword arguments (``class Foo(Compression, extension="foo", prio=1)``).
    Every such subclass is stored in one shared registry; `best` and `for_extension` pick the
    highest-priority registered class whose `is_available` check passes.
    """

    # (priority, extension) -> implementing class; a single registry shared by the whole hierarchy
    _registrations: dict[tuple[int, str], typing.Type[Self]] = {}
    extension: str | tuple[str, ...]

    def __init_subclass__(
        cls, extension: str | tuple[str, ...] = "", prio: int = 0, **kwargs: typing.Any
    ) -> None:
        """
        Register a new subclass for its extension(s).

        Args:
            extension (str | tuple[str, ...]): Extension(s) without leading dot handled by the subclass.
            prio (int): Priority of the subclass; higher values win when several classes share an extension.
            **kwargs: Forwarded to the next ``__init_subclass__`` in the MRO.
        """
        super().__init_subclass__(**kwargs)
        cls.extension = extension

        if not extension:
            warnings.warn("Defined compression algorithm without extension, it will be ignored.")
            # actually ignore it, as the warning promises: no registry entry under the empty extension
            return

        extensions = (extension,) if isinstance(extension, str) else extension
        for ext in extensions:
            Compression._registrations[(prio, ext)] = cls

    @abc.abstractmethod
    def _compress(
        self, source: Path, target: Path, level: int = DEFAULT_COMPRESSION_LEVEL, overwrite: bool = True
    ) -> bool:
        """
        Compresses the source file or directory to the target location.

        Args:
            source (Path): Path to the source file or directory to compress.
            target (Path): Path where the compressed file will be saved.
            level (int, optional): Compression level (1-9), where higher numbers indicate higher compression.
                                   Defaults to 5.
            overwrite (bool, optional): Whether to overwrite the target file if it already exists. Defaults to True.
        """

    def compress(
        self,
        source: PathLike,
        target: Optional[PathLike] = None,
        level: int = DEFAULT_COMPRESSION_LEVEL,
        overwrite: bool = True,
    ) -> bool:
        """
        Compress `source` into `target`, reporting failures instead of raising.

        Args:
            source (PathLike): File or directory to compress; ``~`` is expanded and the path made absolute.
            target (PathLike, optional): Output path. When omitted it is derived from `source` via `filepath`.
            level (int, optional): Compression level passed on to the back-end.
            overwrite (bool, optional): Whether an existing target may be replaced.

        Returns:
            bool: The result of the back-end, or False when the back-end raised (the error is printed).
        """
        source = Path(source).expanduser().absolute()
        target = self.filepath(source) if target is None else Path(target)

        try:
            return self._compress(
                source,
                target,
                level=level,
                overwrite=overwrite,
            )
        except Exception as e:
            # deliberate best-effort boundary: report the problem and signal failure via the return value
            print("[red] Something went wrong during compression [/red]")
            print(e)
            return False

    @abc.abstractmethod
    def _decompress(self, source: Path, target: Path, overwrite: bool = True) -> bool:
        """
        Decompresses the source file to the target location.

        Args:
            source (Path): Path to the compressed file.
            target (Path): Path where the decompressed contents will be saved.
            overwrite (bool, optional): Whether to overwrite the target files if they already exist. Defaults to True.
        """

    def decompress(self, source: PathLike, target: Optional[PathLike] = None, overwrite: bool = True) -> bool:
        """
        Decompress `source` into `target`, reporting failures instead of raising.

        Args:
            source (PathLike): Compressed file; ``~`` is expanded and the path made absolute.
            target (PathLike, optional): Output path. When omitted and `source` ends in ``.tgz``, ``.tar``,
                ``.gz`` or ``.zip``, only that last extension is dropped (``file.txt.gz`` -> ``file.txt``,
                ``archive.tgz`` -> ``archive``); for any other suffix the source path itself is used.
            overwrite (bool, optional): Whether existing output may be replaced.

        Returns:
            bool: The result of the back-end, or False when the back-end raised (the error is printed).
        """
        source = Path(source).expanduser().absolute()

        if target is not None:
            target = Path(target)
        elif source.suffix in (".tgz", ".tar", ".gz", ".zip"):
            # strip last extension (e.g. .tgz); retain other extension (.txt)
            target = source.with_suffix("")
        else:
            target = source

        try:
            return self._decompress(
                source,
                target,
                overwrite=overwrite,
            )
        except Exception as e:
            # deliberate best-effort boundary: report the problem and signal failure via the return value
            print("[red] Something went wrong during decompression [/red]")
            print(e)
            return False

    @classmethod
    @abc.abstractmethod
    def is_available(cls) -> bool:
        """
        Checks if the required compression tool is available.

        Returns:
            bool: True if the compression tool is available, False otherwise.
        """

    @classmethod
    def registrations(
        cls, extension_filter: Optional[str] = None
    ) -> list[tuple[tuple[int, str], typing.Type["Compression"]]]:
        """
        List the usable registered back-ends, best first.

        Args:
            extension_filter (str, optional): When given, only entries registered for exactly this extension.

        Returns:
            list: ``((prio, extension), class)`` pairs whose class reports `is_available`,
                sorted by ``(prio, extension)`` in descending order.
        """
        return sorted(
            (
                (key, CompressionClass)
                for (key, CompressionClass) in cls._registrations.items()
                if CompressionClass.is_available() and extension_filter in (None, key[1])
            ),
            key=lambda registration: registration[0],
            reverse=True,
        )

    @classmethod
    def available(cls) -> set[str]:
        """
        Return every registered extension.

        Note that this lists all registrations; it does not call `is_available` on the back-ends.
        """
        return {extension for (_, extension) in cls._registrations}

    @classmethod
    def best(cls) -> Self | None:
        """
        Find the absolute best (by priority) available compression method.
        """
        if registrations := cls.registrations():
            CompressionClass = registrations[0][1]  # noqa: N806
            return typing.cast(Self, CompressionClass())

        return None

    @classmethod
    def for_extension(cls, extension: str) -> Self | None:
        """
        Find the best (by priority) available compression method for a specific extension (zip, gz).

        Surrounding whitespace and leading/trailing dots are ignored, so ``"gz"``, ``".gz"`` and ``" .gz "``
        are equivalent.
        """
        # whitespace first, then dots: otherwise " .gz" would keep its dot and never match
        if registrations := cls.registrations(extension.strip().strip(".")):
            CompressionClass = registrations[0][1]  # noqa: N806
            return typing.cast(Self, CompressionClass())

        return None

    @classmethod
    def filepath(cls, filepath: str | Path) -> Path:
        """
        Generate an output filepath with the right extension

        For an existing file the extension is appended to the current suffix (``a.txt`` -> ``a.txt.zip``);
        otherwise the suffix is replaced. If the class registered several extensions, the first one is used.
        """
        filepath = Path(filepath)
        # a tuple of extensions cannot be formatted into a suffix directly; the first entry is the canonical one
        ext = cls.extension if isinstance(cls.extension, str) else cls.extension[0]
        suffix = f"{filepath.suffix}.{ext}" if filepath.is_file() else f".{ext}"
        return filepath.with_suffix(suffix)

    @classmethod
    def filename(cls, filepath: str | Path) -> str:
        """
        Generate an output filename with the right extension
        """
        return cls.filepath(filepath).name
class Nocompression(Compression, extension=("none", "tar"), prio=0):
    """
    Pass-through back-end: directories are bundled into a plain tar archive, single files are copied as-is.
    """

    def _compress(
        self, source: Path, target: Path, level: int = DEFAULT_COMPRESSION_LEVEL, overwrite: bool = True
    ) -> bool:
        """
        Store `source` at `target` without compressing it.

        Args:
            source (Path): Directory (written as a tar archive) or file (copied).
            target (Path): Output path.
            level (int, optional): Unused; accepted for interface compatibility.
            overwrite (bool, optional): When False, an existing target is left alone and False is returned.

        Returns:
            bool: True on success or when there is nothing to do (file source equal to target),
                False when the target exists and `overwrite` is False.
        """
        is_directory = source.is_dir()

        if not is_directory and source == target:
            # nothing to do
            return True

        if target.exists() and not overwrite:
            return False

        if is_directory:
            tar = local["tar"]
            # -C parent + folder name: store only the folder name in the archive, not the whole path
            cmd = tar["-cf", "-", "-C", source.parent, source.name] > str(target)
            cmd()
        else:
            shutil.copyfile(source, target)

        return True

    def _decompress(self, source: Path, target: Path, overwrite: bool = True) -> bool:
        """
        Restore `source` to `target`.

        Args:
            source (Path): A ``.tar`` archive (extracted into the `target` directory with its top-level
                folder stripped) or any other file (copied).
            target (Path): Output directory or file.
            overwrite (bool, optional): When False, an existing target is left alone and False is returned.

        Returns:
            bool: True on success or when there is nothing to do (non-tar source equal to target),
                False when the target exists and `overwrite` is False.
        """
        is_tar = source.suffix == ".tar"

        if not is_tar and source == target:
            # nothing to do
            return True

        if target.exists() and not overwrite:
            return False

        if is_tar:
            target.mkdir(parents=True, exist_ok=True)
            tar = local["tar"]
            cmd = tar["-xvf", source, "--strip-components=1", "-C", target]
            cmd()
        else:
            shutil.copyfile(source, target)

        return True

    @classmethod
    def is_available(cls) -> bool:
        """
        This back-end is always reported as available.
        """
        return True

    @classmethod
    def filepath(cls, filepath: str | Path) -> Path:
        """
        Generate the output path: ``<dir>.tar`` for an existing directory, the unchanged path otherwise.
        """
        filepath = Path(filepath)
        return filepath.with_suffix(".tar") if filepath.is_dir() else filepath
class Zip(Compression, extension="zip"):
    """
    Zip back-end built on the standard library ``zipfile`` module (deflate compression).
    """

    def _compress(
        self, source: Path, target: Path, level: int = DEFAULT_COMPRESSION_LEVEL, overwrite: bool = True
    ) -> bool:
        """
        Write `source` into a new zip archive at `target`.

        Args:
            source (Path): File, or directory whose files are added recursively with paths relative to it.
            target (Path): Archive to create.
            level (int, optional): Deflate compression level.
            overwrite (bool, optional): When False, an existing target is left alone and False is returned.

        Returns:
            bool: True on success, False when the target exists and `overwrite` is False.
        """
        from zipfile import ZIP_DEFLATED, ZipFile

        if target.exists() and not overwrite:
            return False

        # resolved once so the archive can be recognised when it is located inside `source`
        archive = target.resolve()

        with ZipFile(target, "w", compression=ZIP_DEFLATED, compresslevel=level) as zip_object:
            if source.is_dir():
                # Traverse all files in directory
                for file_path in source.rglob("*"):
                    if not file_path.is_file():
                        continue

                    if file_path.resolve() == archive:
                        # the archive was just created by ZipFile(...); never add it to itself
                        continue

                    # Add files to zip file with the correct relative path
                    zip_object.write(file_path, file_path.relative_to(source))
            else:
                zip_object.write(source, source.name)

        return True

    def _decompress(self, source: Path, target: Path, overwrite: bool = True) -> bool:
        """
        Extract the zip archive `source`.

        An archive holding exactly one file is extracted to `target` as a file (or into `target` when it
        is an existing directory); any other archive is extracted into `target` as a directory.

        Args:
            source (Path): Zip archive to read.
            target (Path): Output file or directory.
            overwrite (bool, optional): When False, existing files are not replaced: a single-file archive
                returns False, a multi-member archive skips the members that already exist.

        Returns:
            bool: True on success, False when `source` is not an existing file or the single-file
                target exists and `overwrite` is False.
        """
        if not source.is_file():
            # covers both "does not exist" and "is not a regular file"
            return False

        from zipfile import ZipFile

        with ZipFile(source, "r") as zip_object:
            namelist = zip_object.namelist()

            # Check if the archive contains exactly one file
            if len(namelist) == 1 and not namelist[0].endswith("/"):
                # The archive contains a single file; treat target as a file
                first_file = namelist[0]

                # If the target is a directory, ensure we create the file inside
                if target.is_dir():
                    target = target / Path(first_file).name

                # Handle overwrite behavior
                if target.exists() and not overwrite:
                    return False

                # Ensure the parent directory exists
                target.parent.mkdir(parents=True, exist_ok=True)

                # Stream the single member to the target instead of loading it into memory at once
                with zip_object.open(first_file) as member_stream, target.open("wb") as f:
                    shutil.copyfileobj(member_stream, f)
            else:
                # Treat target as a directory and extract all files
                target.mkdir(parents=True, exist_ok=True)

                for member in namelist:
                    # Resolve full path of the extracted file
                    file_path = target / member

                    # Check if file already exists and handle overwrite
                    if file_path.exists() and not overwrite:
                        continue

                    # Ensure parent directories exist
                    file_path.parent.mkdir(parents=True, exist_ok=True)

                    # Extract the file
                    zip_object.extract(member, target)

        return True

    @classmethod
    def is_available(cls) -> bool:
        """
        Check whether the ``zipfile`` module can be imported.
        """
        try:
            import zipfile  # noqa: F401
        except ImportError:
            return False

        return True
class Gzip(Compression, extension=("tgz", "gz"), prio=1):
    """
    Gzip back-end that shells out to the ``gzip``/``gunzip`` (and ``tar``) command line tools.
    """

    def gzip_compress(
        self, source: Path, target: Path, level: int = DEFAULT_COMPRESSION_LEVEL, _tar: str = "tar", _gzip: str = "gzip"
    ) -> bool:
        """
        Compress data using gzip.

        A directory is piped through tar and then the gzip tool; a single file is gzipped directly.
        In both cases the output is redirected to `target`.

        Args:
            source (Path): Path to the file or directory to be compressed.
            target (Path): Path where the compressed data will be saved.
            level (int): compression level, where 0 is fastest and 9 is strongest but slowest.
                         Defaults to DEFAULT_COMPRESSION_LEVEL.
            _tar (str): For internal usage
            _gzip (str): For internal usage

        Returns:
            bool: True if compression was successful; failures surface as exceptions from the command lookup
                or from running the command.
        """
        tar = local[_tar]
        gzip = local[_gzip]

        if source.is_dir():
            # .tar.gz
            # -C parent + folder name: stores only the folder name in the tar, not the whole path
            cmd = tar["-cf", "-", "-C", source.parent, source.name] | gzip[f"-{level}"] > str(target)
        else:
            cmd = gzip[f"-{level}", "-c", source] > str(target)

        cmd()
        return True

    def _compress(
        self, source: Path, target: Path, level: int = DEFAULT_COMPRESSION_LEVEL, overwrite: bool = True
    ) -> bool:
        """
        Compress `source` to `target` with gzip.

        Errors are not swallowed here, so the public `compress` wrapper can report them.

        Returns:
            bool: True on success, False when the target exists and `overwrite` is False.
        """
        if target.exists() and not overwrite:
            return False

        return self.gzip_compress(source, target, level=level)

    def gzip_decompress(self, source: Path, target: Path, _tar: str = "tar", _gunzip: str = "gunzip") -> bool:
        """
        Decompresses a gzipped file, extracting tar archives into the specified target directory.

        Args:
            source (Path): The path to the gzipped file.
            target (Path): The directory to extract to (``.tar``/``.tgz`` sources) or the output file
                (any other source, treated as a plain ``.gz``).
            _tar (str): For internal usage
            _gunzip (str): For internal usage

        Returns:
            bool: True if the decompression and extraction were successful; failures surface as exceptions
                from the command lookup or from running the command.
        """
        gunzip = local[_gunzip]
        tar = local[_tar]

        if ".tar" in source.suffixes or ".tgz" in source.suffixes:
            # tar gz
            target.mkdir(parents=True, exist_ok=True)
            cmd = tar["-xvf", source, "--strip-components=1", f"--use-compress-program={_gunzip}", "-C", target]
        else:
            # assume just a .gz
            cmd = gunzip["-c", source] > str(target)

        cmd()
        return True

    def _decompress(self, source: Path, target: Path, overwrite: bool = True) -> bool:
        """
        Decompress `source` to `target` with gunzip.

        Returns:
            bool: True on success, False when the target exists and `overwrite` is False.
        """
        if target.exists() and not overwrite:
            return False

        return self.gzip_decompress(source, target)

    @classmethod
    def is_available(cls) -> bool:
        """
        Check if 'gzip' and 'gunzip' are available in the local context.

        Returns:
            bool: The return value is True if 'gzip' and 'gunzip' are found,
                  False otherwise.
        """
        try:
            # plain lookups instead of `assert`: asserts are removed under `python -O`,
            # which would skip the check and always report True
            local["gzip"]
            local["gunzip"]
        except CommandNotFound:
            return False

        return True

    @classmethod
    def filepath(cls, filepath: str | Path) -> Path:
        """
        Return a Path object with either '.gz' or '.tgz' appended as file extension based on whether
        the provided file path is a file or not.

        Args:
            filepath (str | Path): The input file path in string or Path format.

        Returns:
            Path: The updated file path with appended file extension.
        """
        filepath = Path(filepath)
        extension = f"{filepath.suffix}.gz" if filepath.is_file() else ".tgz"
        return filepath.with_suffix(extension)
class Pigz(Gzip, extension=("tgz", "gz"), prio=2):
    """
    The Pigz class inherits from the Gzip base class.

    Its priority is higher than that of the base class, as indicated by the value 2.

    Pigz (Parallel Implementation of GZip) is a fully functional replacement for gzip
    that exploits multiple processors and multiple cores to the hilt when compressing data.
    Pigz can be a good choice when you're handling large amounts of data,
    and your machine has multiple cores/processors.

    Advantages of pigz over classic gzip:
    - Multithreading: Pigz can split the input data into chunks and process them in parallel.
                      This utilizes multiple cores on your machine,
                      leading to faster compression times.
    - Compatibility: Pigz maintains backward compatibility with gzip, so it can handle any file that gzip can.
    - Speed: In multi-core systems, pigz can be significantly faster than gzip
             because of its ability to process different parts of the data simultaneously.
    """

    def _compress(
        self, source: Path, target: Path, level: int = DEFAULT_COMPRESSION_LEVEL, overwrite: bool = True
    ) -> bool:
        """
        Compress `source` to `target`, using ``pigz`` in place of ``gzip``.

        Returns:
            bool: True on success, False when the target exists and `overwrite` is False.
        """
        if target.exists() and not overwrite:
            return False

        self.gzip_compress(source, target, level=level, _gzip="pigz")
        return True

    def _decompress(self, source: Path, target: Path, overwrite: bool = True) -> bool:
        """
        Decompress `source` to `target`, using ``unpigz`` in place of ``gunzip``.

        Returns:
            bool: True on success, False when the target exists and `overwrite` is False.
        """
        if target.exists() and not overwrite:
            return False

        self.gzip_decompress(source, target, _gunzip="unpigz")
        return True

    @classmethod
    def is_available(cls) -> bool:
        """
        Check if 'pigz' and 'unpigz' commands are available in the local environment.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        try:
            # plain lookups instead of `assert`: asserts are removed under `python -O`,
            # which would skip the check and always report True
            local["pigz"]
            local["unpigz"]
        except CommandNotFound:
            return False

        return True