Coverage for src/artemis_sg/img_downloader.py: 86% (93 statements)
#!/usr/bin/env python

import json
import logging
import os
import tempfile

import isbnlib
import puremagic
import requests
from rich.console import Console
from rich.text import Text

from artemis_sg.config import CFG

MODULE = os.path.splitext(os.path.basename(__file__))[0]
console = Console()


class ImgDownloader:
    # constants
    MAX_FILESIZE = 1048576  # 1 MB

    def is_image(self, path):
        """Check given filepath to see if it is an image.
        If so, return extension type, else return None."""
        namespace = f"{type(self).__name__}.{self.is_image.__name__}"
        try:
            kind = puremagic.from_file(path)
        except (puremagic.main.PureError, ValueError):
            logging.warning(f"{namespace}: non-image file found")
            kind = None
        if kind not in [".jpg", ".png"]:
            kind = None
        return kind
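
    # Illustrative note (not part of the original listing): is_image() relies on
    # puremagic's magic-byte detection rather than the filename extension, so a
    # misnamed file is still classified by its contents.  The paths below are
    # hypothetical.
    #
    #   ImgDownloader().is_image("cover.jpg")  # ".jpg" if puremagic sees JPEG bytes
    #   ImgDownloader().is_image("notes.txt")  # None for unsupported types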

    def download(self, image_dict, target_dir=""):  # noqa: C901
        namespace = f"{type(self).__name__}.{self.download.__name__}"

        if not target_dir:
            target_dir = tempfile.mkdtemp(prefix="ImgDownloader-")
            logging.warning(f"{namespace}: Creating target directory at {target_dir}")
        if not os.path.isdir(target_dir):  # coverage: condition never true in test run
            os.mkdir(target_dir)

        for key in image_dict:
            for i, url in enumerate(image_dict[key]):
                isbn = isbnlib.to_isbn13(key)
                if not isbn:
                    isbn = key
                suffix = "" if i == 0 else f"-{i}"
                image = f"{isbn}{suffix}.jpg"
                image_path = os.path.join(target_dir, image)
                if not os.path.isfile(image_path) or not self.is_image(image_path):
                    logging.debug(f"{namespace}: Downloading '{url}' to '{target_dir}'")
                    with open(image_path, "wb") as fp:
                        r = requests.get(url, timeout=10)
                        fp.write(r.content)

                    # validate file and name it in accordance with its type
                    fmt = self.is_image(image_path)
                    if fmt == ".jpg":
                        pass
                    elif fmt == ".png":
                        # rename file with png suffix
                        old_path = image_path
                        image_path = os.path.splitext(old_path)[0] + ".png"
                        if os.path.isfile(image_path):
                            logging.warning(
                                f"{namespace}: Overwriting existing file "
                                f"'{image_path}'."
                            )
                            os.remove(image_path)
                        os.rename(old_path, image_path)
                    else:
                        os.remove(image_path)
                        logging.warning(
                            f"{namespace}: Skipping unsupported file type in '{url}'"
                        )
                    # validate file size
                    if os.path.isfile(image_path):
                        file_size = os.path.getsize(image_path)
                        if file_size > self.MAX_FILESIZE:  # coverage: condition never true in test run
                            os.remove(image_path)
                            logging.warning(
                                f"{namespace}: Skipping file too large at '{url}'"
                            )
                    logging.info(f"{namespace}: Saved '{image_path}'")

        return target_dir
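
# Illustrative usage sketch (not part of the original listing): download() takes
# a dict mapping ISBN-like keys to lists of image URLs and returns the directory
# the files were written to.  The key and URL below are hypothetical.
#
#   dloader = ImgDownloader()
#   image_dict = {"9780306406157": ["https://example.com/cover.jpg"]}
#   dest = dloader.download(image_dict)  # no target_dir, so a temp dir is created
#   # files within dest are named from the key's ISBN-13: "<isbn13>.jpg",
#   # "<isbn13>-1.jpg", ... for additional URLs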


def main():
    scraped_datafile = CFG["asg"]["data"]["file"]["scraped"]
    saved_images_dir = CFG["asg"]["data"]["dir"]["images"]
    if not os.path.isdir(saved_images_dir):  # coverage: condition never true in test run
        dest = None

    dloader = ImgDownloader()

    def get_json_data_from_file(datafile):
        namespace = f"{MODULE}.main.{get_json_data_from_file.__name__}"
        try:
            with open(datafile) as filepointer:  # coverage: 2 missed branches in test run
                data = json.load(filepointer)
            filepointer.close()
            return data
        except FileNotFoundError:  # coverage: partial branch coverage in test run
            logging.error(f"{namespace}: Datafile '{datafile}' not found")
            return {}
        except json.decoder.JSONDecodeError:
            logging.error(
                f"{namespace}: Datafile '{datafile}' did not contain valid JSON"
            )
            return {}

    def get_image_url_dict(data):
        url_dict = {}
        for key in data:  # coverage: loop never started in test run
            url_dict[key] = data[key]["image_urls"]
        return url_dict

    scraped_data = get_json_data_from_file(scraped_datafile)
    img_dict = get_image_url_dict(scraped_data)
    dest = dloader.download(img_dict, saved_images_dir)
    dest_text = Text(f"Images downloaded to {dest}.")
    dest_text.stylize("green")
    console.print(dest_text)


if __name__ == "__main__":
    main()
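
# Note on the scraped datafile format (inferred from get_image_url_dict, which
# reads data[key]["image_urls"]): the JSON is assumed to map item keys (ISBNs)
# to objects containing an "image_urls" list; any other fields are ignored
# here.  The key and URLs below are hypothetical.
#
#   {
#       "9780306406157": {
#           "image_urls": [
#               "https://example.com/front.jpg",
#               "https://example.com/back.jpg"
#           ]
#       }
#   }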