Coverage for src/pdfbaker/pdf.py: 85%

73 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-20 04:55 +1200

1"""PDF-related functions.""" 

2 

3import logging 

4import os 

5import select 

6import subprocess 

7from collections.abc import Sequence 

8from pathlib import Path 

9 

10import pypdf 

11from cairosvg import svg2pdf 

12 

13from .errors import ( 

14 PDFCombineError, 

15 PDFCompressionError, 

16 SVGConversionError, 

17) 

18 

19__all__ = [ 

20 "combine_pdfs", 

21 "compress_pdf", 

22 "convert_svg_to_pdf", 

23] 

24 

25logger = logging.getLogger(__name__) 

26 

27 

def combine_pdfs(pdf_files: Sequence[Path], output_file: Path) -> Path:
    """Combine multiple PDF files into a single PDF.

    Every input is merged into an in-memory writer first; the output file is
    only created once all inputs have been merged, so a failing input does
    not leave an empty or truncated output file behind.

    Args:
        pdf_files: List of paths to PDF files to combine
        output_file: Path where the combined PDF will be written

    Returns:
        Path to the combined PDF file

    Raises:
        PDFCombineError: If no PDF files provided, or if an input file
            cannot be opened, read or merged
    """
    if not pdf_files:
        raise PDFCombineError("No PDF files provided to combine")

    pdf_writer = pypdf.PdfWriter()

    for pdf_file in pdf_files:
        try:
            with open(pdf_file, "rb") as file_obj:
                pdf_reader = pypdf.PdfReader(file_obj)
                try:
                    pdf_writer.append(pdf_reader)
                except KeyError as exc:
                    # Only the missing-/Subtype case is recoverable; any
                    # other KeyError is wrapped by the outer handler.
                    if str(exc) != "'/Subtype'":
                        raise
                    # PDF has broken annotations with missing /Subtype
                    logger.warning(
                        "Broken annotations in PDF: %s. "
                        "Falling back to page-by-page method.",
                        pdf_file,
                    )
                    for page in pdf_reader.pages:
                        pdf_writer.add_page(page)
        except Exception as exc:
            raise PDFCombineError(f"Failed to combine PDFs: {exc}") from exc

    # Open the destination last so it is never truncated by a failed merge.
    with open(output_file, "wb") as output_stream:
        pdf_writer.write(output_stream)

    return output_file

72 

73 

def _run_subprocess_logged(cmd: list[str], env: dict[str, str] | None = None) -> int:
    """Run a subprocess with output redirected to logging.

    Lines written to the child's stdout are logged with ``logger.info`` and
    lines written to its stderr with ``logger.warning``. Blank lines are
    skipped.

    Args:
        cmd: Command and arguments to run
        env: Optional environment for the child process. When omitted (or
            empty) a copy of the current process environment is used. The
            mapping passed in is never modified.

    Returns:
        0 if successful, otherwise raises CalledProcessError

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            return code
    """
    # Work on a private copy so that adding PYTHONUNBUFFERED below does not
    # leak into the caller's mapping.
    env = dict(env) if env else os.environ.copy()
    env["PYTHONUNBUFFERED"] = "True"

    with subprocess.Popen(
        cmd,
        bufsize=1,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        env=env,
    ) as proc:
        # Map each pipe's file descriptor to its stream and log function.
        streams = {
            proc.stdout.fileno(): (proc.stdout, logger.info),
            proc.stderr.fileno(): (proc.stderr, logger.warning),
        }
        pending = set(streams)

        # Read until both pipes report EOF. A pipe at EOF is dropped from the
        # select set; otherwise select() would keep reporting it as ready and
        # the loop would spin while the child is still running.
        while pending:
            ready, _, _ = select.select(list(pending), [], [])

            for fd in ready:
                stream, log = streams[fd]
                line = stream.readline()
                if not line:
                    pending.discard(fd)
                elif line.strip():
                    log(line.rstrip())

        # Both pipes are closed; collect the exit status.
        ret_code = proc.wait()

    if ret_code != 0:
        raise subprocess.CalledProcessError(ret_code, cmd)

    return 0

121 

122 

def compress_pdf(input_pdf: Path, output_pdf: Path, dpi: int = 300) -> Path:
    """Compress a PDF file using Ghostscript.

    Runs the ``gs`` executable with the ``pdfwrite`` device and the
    ``/printer`` preset, logging its output.

    Args:
        input_pdf: Path to the input PDF file
        output_pdf: Path where the compressed PDF will be written
        dpi: Resolution in dots per inch (default: 300)

    Returns:
        Path to the compressed PDF file

    Raises:
        PDFCompressionError: If Ghostscript is not installed, cannot be
            started, or exits with an error
    """
    gs_command = [
        "gs",
        "-sDEVICE=pdfwrite",
        "-dCompatibilityLevel=1.7",
        "-dPDFSETTINGS=/printer",
        f"-r{dpi}",
        "-dNOPAUSE",
        "-dQUIET",
        "-dBATCH",
        f"-sOutputFile={output_pdf}",
        str(input_pdf),
    ]

    try:
        _run_subprocess_logged(gs_command)
    except FileNotFoundError as exc:
        raise PDFCompressionError(f"Ghostscript not found: {exc}") from exc
    except (OSError, subprocess.SubprocessError) as exc:
        # Other launch failures (e.g. permission denied) are wrapped too, so
        # callers only need to handle PDFCompressionError.
        raise PDFCompressionError(f"Ghostscript compression failed: {exc}") from exc

    return output_pdf

159 

160 

def convert_svg_to_pdf(
    svg_path: Path,
    pdf_path: Path,
    backend: str = "cairosvg",
) -> Path:
    """Convert an SVG file to PDF.

    Args:
        svg_path: Path to the input SVG file
        pdf_path: Path where the PDF will be written
        backend: Conversion backend to use, either "cairosvg" or "inkscape"
            (default: "cairosvg"). Any other value logs a warning and falls
            back to "cairosvg".

    Returns:
        Path to the converted PDF file

    Raises:
        SVGConversionError: If SVG conversion fails, includes the backend used and cause
    """
    if backend == "inkscape":
        try:
            _run_subprocess_logged(
                [
                    "inkscape",
                    f"--export-filename={pdf_path}",
                    str(svg_path),
                ]
            )
        except (OSError, subprocess.SubprocessError) as exc:
            # OSError covers a missing or non-executable inkscape binary,
            # which would otherwise escape as a raw FileNotFoundError.
            raise SVGConversionError(svg_path, backend, str(exc)) from exc
        return pdf_path

    if backend != "cairosvg":
        logger.warning(
            "Unknown svg2pdf backend: %s - falling back to cairosvg",
            backend,
        )
        # Report the backend that actually runs if the conversion fails.
        backend = "cairosvg"

    try:
        with open(svg_path, "rb") as svg_file:
            svg2pdf(file_obj=svg_file, write_to=str(pdf_path))
    except Exception as exc:
        raise SVGConversionError(svg_path, backend, str(exc)) from exc

    return pdf_path