Coverage for /Users/davegaeddert/Development/dropseed/plain/plain/plain/utils/encoding.py: 33%
91 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-10-16 22:04 -0500
« prev ^ index » next coverage.py v7.6.1, created at 2024-10-16 22:04 -0500
1import codecs
2import datetime
3import locale
4from decimal import Decimal
5from types import NoneType
6from urllib.parse import quote
8from plain.utils.functional import Promise
class PlainUnicodeDecodeError(UnicodeDecodeError):
    """UnicodeDecodeError that also reports the value being converted.

    ``obj`` is the object the caller was trying to convert. The remaining
    positional arguments are forwarded unchanged to ``UnicodeDecodeError``
    (encoding, object, start, end, reason).
    """

    def __init__(self, obj, *args):
        self.obj = obj
        super().__init__(*args)

    def __str__(self):
        return f"{super().__str__()}. You passed in {self.obj!r} ({type(self.obj)})"

    def __reduce__(self):
        # The default exception reduction rebuilds the instance from
        # ``self.args``, which is not guaranteed to match this class's
        # ``(obj, *args)`` constructor. Spell the constructor arguments out so
        # copy/pickle round-trips keep ``obj`` and the decode details.
        return (
            type(self),
            (
                self.obj,
                self.encoding,
                self.object,
                self.start,
                self.end,
                self.reason,
            ),
            self.__dict__,
        )
20_PROTECTED_TYPES = (
21 NoneType,
22 int,
23 float,
24 Decimal,
25 datetime.datetime,
26 datetime.date,
27 datetime.time,
28)
31def is_protected_type(obj):
32 """Determine if the object instance is of a protected type.
34 Objects of protected types are preserved as-is when passed to
35 force_str(strings_only=True).
36 """
37 return isinstance(obj, _PROTECTED_TYPES)
def force_str(s, encoding="utf-8", strings_only=False, errors="strict"):
    """
    Return ``s`` as a ``str``.

    ``str`` instances (including subclasses) are returned unchanged, ``bytes``
    are decoded with ``encoding``/``errors``, and anything else is passed
    through ``str()`` -- which is also how lazy objects get resolved to real
    strings.

    If strings_only is True, don't convert (some) non-string-like objects:
    values for which is_protected_type() is true are returned as-is.

    Raise PlainUnicodeDecodeError (a UnicodeDecodeError that also carries the
    offending object) if the conversion fails to decode.
    """
    # Handle the common case first for performance reasons.
    if issubclass(type(s), str):
        return s
    if strings_only and is_protected_type(s):
        return s
    try:
        if isinstance(s, bytes):
            return str(s, encoding, errors)
        return str(s)
    except UnicodeDecodeError as e:
        # ``s`` is still the original input here, so the error reports the
        # value the caller actually passed in.
        raise PlainUnicodeDecodeError(s, *e.args) from e
def force_bytes(s, encoding="utf-8", strings_only=False, errors="strict"):
    """
    Return ``s`` as ``bytes``.

    ``bytes`` input is returned unchanged when ``encoding`` is "utf-8";
    for any other encoding it is decoded as UTF-8 and re-encoded. A
    ``memoryview`` is copied into ``bytes``. Anything else is passed through
    ``str()`` (which is also how lazy objects get resolved) and then encoded
    with ``encoding``/``errors``.

    If strings_only is True, don't convert (some) non-string-like objects:
    values for which is_protected_type() is true are returned as-is.
    """
    # Handle the common case first for performance reasons.
    if isinstance(s, bytes):
        if encoding == "utf-8":
            return s
        # Existing bytes are taken to be UTF-8; transcode to the target.
        return s.decode("utf-8", errors).encode(encoding, errors)
    if strings_only and is_protected_type(s):
        return s
    if isinstance(s, memoryview):
        return bytes(s)
    return str(s).encode(encoding, errors)
def iri_to_uri(iri):
    """
    Convert an Internationalized Resource Identifier (IRI) portion to a URI
    portion that is suitable for inclusion in a URL.

    This is the algorithm from RFC 3987 Section 3.1, slightly simplified since
    the input is assumed to be a string rather than an arbitrary byte stream.

    Take an IRI (string or UTF-8 bytes, e.g. '/I ♥ Plain/' or
    b'/I \xe2\x99\xa5 Plain/') and return a string containing the encoded
    result with ASCII chars only (e.g. '/I%20%E2%99%A5%20Plain/').

    ``None`` is returned unchanged; a lazy ``Promise`` is resolved with
    ``str()`` before quoting.
    """
    # The list of safe characters here is constructed from the "reserved" and
    # "unreserved" characters specified in RFC 3986 Sections 2.2 and 2.3:
    #     reserved    = gen-delims / sub-delims
    #     gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    #     sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
    #                   / "*" / "+" / "," / ";" / "="
    #     unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
    # Of the unreserved characters, urllib.parse.quote() already considers all
    # but the ~ safe.
    # The % character is also added to the list of safe characters here, as the
    # end of RFC 3987 Section 3.1 specifically mentions that % must not be
    # converted.
    if iri is None:
        return iri
    # Handle the common case first: plain str/bytes go straight to quote(),
    # only other objects need the lazy-object check.
    if not isinstance(iri, (str, bytes)) and isinstance(iri, Promise):
        iri = str(iri)
    return quote(iri, safe="/#%[]=:;$&()+,!?*@'~")
# List of byte values that uri_to_iri() decodes from percent encoding.
# First, the unreserved characters from RFC 3986 Section 2.3:
#     unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
_ascii_ranges = [
    [45, 46, 95, 126],  # "-", ".", "_", "~"
    range(48, 58),  # DIGIT
    range(65, 91),  # A-Z
    range(97, 123),  # a-z
]
# Keys are the two hex digits that follow "%", in lower- and upper-case
# spelling; values are the single byte they stand for.
_hextobyte = {
    (fmt % char).encode(): bytes((char,))
    for ascii_range in _ascii_ranges
    for char in ascii_range
    for fmt in ["%02x", "%02X"]
}
# And then everything from 128 (0x80) upwards, because bytes ≥ 128 are part
# of multibyte Unicode characters. Slicing _hexdig at 8 keeps the first hex
# digits "8".."F"/"f"; the second digit may be any hex digit in either case.
_hexdig = "0123456789ABCDEFabcdef"
_hextobyte.update(
    {(a + b).encode(): bytes.fromhex(a + b) for a in _hexdig[8:] for b in _hexdig}
)
def uri_to_iri(uri):
    """
    Convert a Uniform Resource Identifier (URI) into an Internationalized
    Resource Identifier (IRI).

    This is the algorithm from RFC 3987 Section 3.2, excluding step 4.

    Take a URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Plain/') and return
    a string containing the encoded result (e.g. '/I%20♥%20Plain/').

    ``None`` is returned unchanged. Only the percent-escapes listed in
    ``_hextobyte`` are decoded; every other escape is kept as written.
    """
    if uri is None:
        return uri
    uri = force_bytes(uri)
    # Fast selective unquote: First, split on '%' and then starting with the
    # second block, decode the first 2 bytes if they represent a hex code to
    # decode. The rest of the block is the part after '%AB', not containing
    # any '%'. Add that to the output without further processing.
    head, *escaped = uri.split(b"%")
    if not escaped:
        # No '%' at all: nothing to unquote.
        iri = uri
    else:
        parts = [head]
        lookup = _hextobyte.get
        for chunk in escaped:
            decoded = lookup(chunk[:2])
            if decoded is None:
                # Not an escape we decode: put the '%' back untouched.
                parts += (b"%", chunk)
            else:
                parts += (decoded, chunk[2:])
        iri = b"".join(parts)
    # Decoded octets that do not form valid UTF-8 are percent-encoded again
    # before the final decode.
    return repercent_broken_unicode(iri).decode()
def escape_uri_path(path):
    """
    Escape the unsafe characters from the path portion of a Uniform Resource
    Identifier (URI).

    Return the percent-encoded path as a string. "/" is left alone because
    the whole path is being escaped, not a single segment.
    """
    # The safe list is built from the "reserved" and "unreserved" character
    # classes in the notation of RFC 2396 Sections 2.2 and 2.3 (the
    # predecessor of RFC 3986):
    #     reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ","
    #     unreserved  = alphanum | mark
    #     mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
    # ";", "=", and "?" are subtracted because they are reserved inside a path
    # segment (RFC 2396 Section 3.3), so they get escaped here.
    # The reason for not subtracting and escaping "/" is that we are escaping
    # the entire path, not a path segment.
    return quote(path, safe="/:@&+$,-_.!~*'()")
def punycode(domain: str) -> str:
    """Return the Punycode (IDNA ASCII form) of the given domain.

    The domain is always run through the "idna" codec; non-ASCII labels come
    back in their "xn--" form. Errors raised by the codec propagate to the
    caller.
    """
    return domain.encode("idna").decode("ascii")
def repercent_broken_unicode(path: bytes) -> bytes:
    """
    As per RFC 3987 Section 3.2, step three of converting a URI into an IRI,
    repercent-encode any octet produced that is not part of a strictly legal
    UTF-8 octet sequence.

    Return ``path`` itself when it already decodes as UTF-8; otherwise return
    new bytes in which every undecodable octet is written as ``%XX``.
    """
    finished = []  # Validated/re-encoded chunks, in order.
    while True:
        try:
            path.decode()
        except UnicodeDecodeError as e:
            # CVE-2019-14235: A recursion shouldn't be used since the exception
            # handling uses massive amounts of memory
            repercent = quote(path[e.start : e.end], safe=b"/#%[]=:;$&()+,!?*@'~")
            # Everything before e.start decoded fine and the replacement is
            # pure ASCII, so set both aside and keep validating only the tail
            # instead of re-decoding the whole string on every bad octet.
            finished.append(path[: e.start] + repercent.encode())
            path = path[e.end :]
        else:
            if not finished:
                return path
            finished.append(path)
            return b"".join(finished)
def filepath_to_uri(path):
    """Convert a file system path to a URI portion that is suitable for
    inclusion in a URL.

    Encode certain chars that would normally be recognized as special chars
    for URIs. Do not encode the ' character, as it is a valid character
    within URIs. See the encodeURIComponent() JavaScript function for details.

    ``None`` is returned unchanged. Any other value is converted with
    ``str()`` and has its backslashes turned into forward slashes before
    quoting.
    """
    if path is None:
        return path
    # I know about `os.sep` and `os.altsep` but I want to leave
    # some flexibility for hardcoding separators.
    return quote(str(path).replace("\\", "/"), safe="/~!*()'")
def get_system_encoding() -> str:
    """
    The encoding for the character type functions. Fallback to 'ascii' if the
    encoding is unsupported by Python or could not be determined. See tickets
    #10335 and #5846.
    """
    try:
        encoding = locale.getlocale()[1] or "ascii"
        # Raises if Python has no codec registered under that name.
        codecs.lookup(encoding)
    except Exception:
        # Deliberately broad: any failure to determine or validate the locale
        # encoding falls back to ASCII rather than propagating.
        encoding = "ascii"
    return encoding
# Computed once when the module is imported; later locale changes are not
# reflected here.
DEFAULT_LOCALE_ENCODING = get_system_encoding()