Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

# Copyright (C) 2001-2006 Python Software Foundation 

# Author: Ben Gertzfield 

# Contact: email-sig@python.org 

 

"""Quoted-printable content transfer encoding per RFCs 2045-2047. 

 

This module handles the content transfer encoding method defined in RFC 2045 

to encode US ASCII-like 8-bit data called `quoted-printable'.  It is used to 

safely encode text that is in a character set similar to the 7-bit US ASCII 

character set, but that includes some 8-bit characters that are normally not 

allowed in email bodies or headers. 

 

Quoted-printable is very space-inefficient for encoding binary files; use the 

email.base64mime module for that instead. 

 

This module provides an interface to encode and decode both headers and bodies 

with quoted-printable encoding. 

 

RFC 2047 defines a method for including character set information in an

`encoded-word' in a header.  This method is commonly used for 8-bit real names 

in To:/From:/Cc: etc. fields, as well as Subject: lines. 

 

This module does not do the line wrapping or end-of-line character 

conversion necessary for proper internationalized headers; it only 

does dumb encoding and decoding.  To deal with the various line 

wrapping issues, use the email.header module. 

""" 

 

# Public names of this module, i.e. what `from <module> import *` exports.
# Every entry is defined further down in this file.
__all__ = [

    'body_decode',

    'body_encode',

    'body_quopri_check',

    'body_quopri_len',

    'decode',

    'decodestring',

    'encode',

    'encodestring',

    'header_decode',

    'header_encode',

    'header_quopri_check',

    'header_quopri_len',

    'quote',

    'unquote',

    ]

 

import re 

 

from string import hexdigits 

from email.utils import fix_eols 

 

# Line terminators: the canonical email line ending and the bare newline.
CRLF = '\r\n'

NL = '\n'



# Number of fixed "chrome" characters header_encode() wraps around every
# encoded chunk: '=?' + '?q?' + '?=' (the charset name is counted separately).
# See also Charset.py

MISC_LEN = 7



# Matches one character that is NOT an ASCII letter, a digit, a space or one
# of - ! * + /  -- i.e. a character header_encode() writes as =XX.
hqre = re.compile(r'[^-a-zA-Z0-9!*+/ ]')

# Matches one character that is NOT a space, a tab, or a printable ASCII
# character in the ranges '!'..'<' and '>'..'~' (everything printable except
# '=') -- i.e. a character encode() writes as =XX.
bqre = re.compile(r'[^ !-<>-~\t]')

 

 

 

# Helpers 

def header_quopri_check(c):
    """Tell whether character c has to be escaped in header quopri.

    Returns True when c matches hqre (it is outside the characters allowed
    verbatim in an encoded header), False otherwise.
    """
    return hqre.match(c) is not None

 

 

def body_quopri_check(c):
    """Tell whether character c has to be escaped in body quopri.

    Returns True when c matches bqre (it is neither a space, a tab, nor a
    printable ASCII character other than '='), False otherwise.
    """
    return bqre.match(c) is not None

 

 

def header_quopri_len(s):
    """Return the length of s once it is encoded with header quopri.

    A character matched by hqre is written as a three character =XX escape;
    any other character takes a single position.
    """
    return sum(3 if hqre.match(ch) else 1 for ch in s)

 

 

def body_quopri_len(str):
    """Return the length of str once it is encoded with body quopri.

    A character matched by bqre is written as a three character =XX escape;
    any other character takes a single position.
    """
    return sum(3 if bqre.match(ch) else 1 for ch in str)

 

 

def _max_append(L, s, maxlen, extra=''):
    """Add s to the list of chunks L, keeping chunks within maxlen.

    If L already has a chunk and that chunk plus s is at most maxlen long,
    s is glued onto it, preceded by extra (the length of extra itself is not
    counted in that check).  Otherwise s, with leading whitespace removed,
    starts a new chunk.  L is modified in place; nothing is returned.
    """
    if L and len(L[-1]) + len(s) <= maxlen:
        L[-1] += extra + s
    else:
        L.append(s.lstrip())

 

 

def unquote(s):
    """Convert an escape of the form =AB to the character with value 0xAB.

    Only s[1:3] is looked at: the first character is skipped without being
    checked and anything after the two hex digits is ignored.
    """
    hex_pair = s[1:3]
    return chr(int(hex_pair, 16))

 

 

def quote(c):
    """Return the =XX escape for character c (two upper-case hex digits)."""
    code = ord(c)
    return "=%02X" % code

 

 

 

def header_encode(header, charset="iso-8859-1", keep_eols=False,
                  maxlinelen=76, eol=NL):
    """Encode a single header line with quoted-printable (like) encoding.

    Defined in RFC 2047, this `Q' encoding is similar to quoted-printable, but
    used specifically for email header fields to allow charsets with mostly 7
    bit characters (and some 8 bit) to remain more or less readable in mail
    clients that do not understand the encoding.

    charset names the character set to use to encode the header.  It defaults
    to iso-8859-1.

    The resulting string will be in the form:

    "=?charset?q?I_f=E2rt_in_your_g=E8n=E8ral_dire=E7tion?=\\n
      =?charset?q?Silly_=C8nglish_Kn=EEghts?="

    with each line wrapped safely at, at most, maxlinelen characters (defaults
    to 76 characters).  If maxlinelen is None, the entire string is encoded in
    one chunk with no splitting, however long the header is.

    Unless keep_eols is True (the default is False), the header is first
    passed through fix_eols() so that end-of-line characters (\\r, \\n,
    \\r\\n) are converted to the canonical email line separator \\r\\n.

    Each line of the header will be terminated in the value of eol, which
    defaults to "\\n".  Set this to "\\r\\n" if you are using the result of
    this function directly in email.

    An empty header is returned unchanged.
    """
    # Return empty headers unchanged
    if not header:
        return header

    if not keep_eols:
        header = fix_eols(header)

    # Quopri encode each line, in encoded chunks no greater than maxlinelen in
    # length, after the RFC chrome is added in.
    if maxlinelen is None:
        # No wrapping requested: no chunk can ever outgrow an infinite limit.
        max_encoded = float('inf')
    else:
        # Room left for payload once '=?charset?q?...?=' is wrapped around it.
        # NOTE(review): the extra 1 presumably reserves the leading space of
        # continuation lines (see joiner below) -- confirm.
        max_encoded = maxlinelen - len(charset) - MISC_LEN - 1

    quoted = []
    for c in header:
        if c == ' ':
            # Space may be represented as _ instead of =20 for readability
            piece = '_'
        elif hqre.match(c):
            # Not allowed verbatim: replace with hex value like =E2
            piece = quote(c)
        else:
            # These characters can be included verbatim
            piece = c
        _max_append(quoted, piece, max_encoded)

    # Now add the RFC chrome to each encoded chunk and glue the chunks
    # together.  BAW: should we be able to specify the leading whitespace in
    # the joiner?
    joiner = eol + ' '
    return joiner.join(['=?%s?q?%s?=' % (charset, line) for line in quoted])

 

 

 

def encode(body, binary=False, maxlinelen=76, eol=NL):
    """Encode with quoted-printable, wrapping at maxlinelen characters.

    If binary is False (the default), body is first passed through
    fix_eols() so that end-of-line characters are converted to the canonical
    email end-of-line sequence \\r\\n.  In either case the text is then
    processed line by line and every line ending found is written to the
    output as eol.

    Each line of encoded text will end with eol, which defaults to "\\n".  Set
    this to "\\r\\n" if you will be using the result of this function directly
    in an email.

    Each line will be wrapped at, at most, maxlinelen characters (defaults to
    76 characters).  Long lines will have the `soft linefeed' quoted-printable
    character "=" appended to them, so the decoded text will be identical to
    the original text.

    A space or tab at the end of a line is protected: at the very end of the
    body it is written as =20 / =09, elsewhere it is followed by a soft line
    break.

    An empty body is returned unchanged.
    """
    if not body:
        return body

    if not binary:
        body = fix_eols(body)

    # Finished output fragments; they are joined once at the end so the cost
    # stays linear in the size of the body.
    chunks = []
    # Preserve line endings here so we can check later to see whether an eol
    # needs to be added to the output.
    lines = body.splitlines(True)
    last_lineno = len(lines) - 1
    for lineno, raw_line in enumerate(lines):
        # splitlines() never yields an empty string, so [-1] is safe.
        has_eol = raw_line[-1] in CRLF
        # But strip off line-endings for processing this line.
        if raw_line.endswith(CRLF):
            line = raw_line[:-2]
        elif has_eol:
            line = raw_line[:-1]
        else:
            line = raw_line

        encoded_line = ''
        # Literal space/tab found in the last position of the line, if any.
        trailing_ws = None
        linelen = len(line)
        # Now we need to examine every character to see if it needs to be
        # quopri encoded.
        for j, c in enumerate(line):
            if bqre.match(c):
                c = quote(c)
            elif j + 1 == linelen:
                # Last character of the line and not escaped.  Whitespace is
                # held back for the special handling below; anything else is
                # appended as is.
                if c in ' \t':
                    trailing_ws = c
                else:
                    encoded_line += c
                break
            # Check to see if the line has reached its maximum length
            # (">=" leaves room for the "=" of the soft line break).
            if len(encoded_line) + len(c) >= maxlinelen:
                chunks.append(encoded_line + '=' + eol)
                encoded_line = ''
            encoded_line += c

        # Now at end of line..
        if trailing_ws is not None:
            if lineno == last_lineno:
                # Special case for whitespace at end of file: nothing can
                # follow it, so it has to be escaped.
                escaped = quote(trailing_ws)
                if len(encoded_line) + len(escaped) > maxlinelen:
                    chunks.append(encoded_line + '=' + eol + escaped)
                else:
                    chunks.append(encoded_line + escaped)
            else:
                # Just normal whitespace at end of line: follow it with a
                # soft line break.  Wrap first if the whitespace plus the
                # "=" would push the line past maxlinelen.
                if len(encoded_line) + 2 > maxlinelen:
                    chunks.append(encoded_line + '=' + eol)
                    encoded_line = ''
                chunks.append(encoded_line + trailing_ws + '=' + eol)
            encoded_line = ''

        # If the line we just finished had a line ending, we need to add eol
        # to the end of the encoded line.
        if has_eol:
            chunks.append(encoded_line + eol)
        else:
            chunks.append(encoded_line)
    return ''.join(chunks)

 

 

# Aliases of encode(), provided for convenience and for backwards
# compatibility with the interface of the standard base64 module.

body_encode = encode

encodestring = encode

 

 

 

# BAW: I'm not sure if the intent was for the signature of this function to be 

# the same as base64MIME.decode() or not... 

def decode(encoded, eol=NL):
    """Decode a quoted-printable string.

    Lines are separated with eol, which defaults to \\n.

    Trailing whitespace on every encoded line is dropped, a "=" at the end
    of a line is a soft line break (the line is joined to the next one
    without eol), and =XX sequences with two hex digits are replaced by the
    character they denote.  Any other "=" is passed through literally.

    The result ends with eol only if the input ended with a line break (and
    that last line did not end in a soft line break).  An empty input is
    returned unchanged.
    """
    if not encoded:
        return encoded

    # Decoded fragments; joined once at the end instead of growing a string.
    chunks = []
    # True when the last fragment in chunks is an eol that was added for the
    # final input line (as opposed to decoded data).
    added_eol = False

    for line in encoded.splitlines():
        line = line.rstrip()
        i = 0
        n = len(line)
        soft_break = False
        while i < n:
            c = line[i]
            if c != '=':
                chunks.append(c)
                i += 1
            # Otherwise, c == "=".  Are we at the end of the line?  If so,
            # this is a soft line break: no eol is emitted for this line.
            elif i + 1 == n:
                soft_break = True
                break
            # Decode if in form =AB
            elif (i + 2 < n and line[i+1] in hexdigits
                  and line[i+2] in hexdigits):
                chunks.append(unquote(line[i:i+3]))
                i += 3
            # Otherwise, not in form =AB, pass literally
            else:
                chunks.append(c)
                i += 1

        added_eol = not soft_break
        if added_eol:
            chunks.append(eol)

    # Special case if the original string did not end with a line break:
    # drop the eol that was added after the last line.  Popping the whole
    # fragment handles an eol of any length and never touches decoded data.
    if added_eol and encoded[-1] not in CRLF:
        chunks.pop()
    return ''.join(chunks)

 

 

# Aliases of decode(), provided for convenience and for backwards
# compatibility with the interface of the standard base64 module.

body_decode = decode

decodestring = decode

 

 

 

def _unquote_match(match):
    """Replacement callback for re.sub(): decode a matched =AB escape.

    The whole matched text is handed to unquote(), which returns the
    character with value 0xAB.
    """
    return unquote(match.group(0))

 

 

# Header decoding is done a bit differently 

def header_decode(s):
    """Decode a string encoded with RFC 2047 MIME header `Q' encoding.

    Underscores become spaces and every =XX escape (two hex digits) is
    replaced by the character it denotes.

    This function does not parse a full MIME header value encoded with
    quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use
    the high level email.header class for that functionality.
    """
    hex_escape = re.compile(r'=[a-fA-F0-9]{2}')
    return hex_escape.sub(_unquote_match, s.replace('_', ' '))