# This file is part of Patsy
# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.

# This file defines a parser for a simple language based on S/R "formulas"
# (which are described in sections 2.3 and 2.4 in Chambers & Hastie, 1992). It
# uses the machinery in patsy.infix_parser to do the heavy lifting -- its
# biggest job is to handle tokenization.
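#
# A few formulas this parser accepts (all taken from the tests below); the
# evaluator elsewhere in patsy is what gives them meaning:
#
#     a ~ b
#     a + b ~ c * d
#     a + np.log(a, base=10)    # embedded Python expressions are terms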

from __future__ import print_function

__all__ = ["parse_formula"]

# The Python tokenizer
import tokenize

import six
from six.moves import cStringIO as StringIO

from patsy import PatsyError
from patsy.origin import Origin
from patsy.infix_parser import Token, Operator, infix_parse, ParseNode
from patsy.tokens import python_tokenize, pretty_untokenize
from patsy.util import PushbackAdapter

_atomic_token_types = ["PYTHON_EXPR", "ZERO", "ONE", "NUMBER"]

def _is_a(f, v):
    try:
        f(v)
    except ValueError:
        return False
    else:
        return True
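
# For example, _is_a(int, "10") is True, _is_a(int, "1.5") is False, and
# _is_a(float, "1.5") is True; _read_python_expr below uses this to classify
# numeric literals as NUMBER tokens.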

# Helper function for _tokenize_formula:
def _read_python_expr(it, end_tokens):
    # Read out a full python expression, stopping when we hit an
    # unnested end token.
    pytypes = []
    token_strings = []
    origins = []
    bracket_level = 0
    for pytype, token_string, origin in it:
        assert bracket_level >= 0
        if bracket_level == 0 and token_string in end_tokens:
            it.push_back((pytype, token_string, origin))
            break
        if token_string in ("(", "[", "{"):
            bracket_level += 1
        if token_string in (")", "]", "}"):
            bracket_level -= 1
        if bracket_level < 0:
            raise PatsyError("unmatched close bracket", origin)
        pytypes.append(pytype)
        token_strings.append(token_string)
        origins.append(origin)
    # Either we found an end_token, or we hit the end of the string
    if bracket_level == 0:
        expr_text = pretty_untokenize(zip(pytypes, token_strings))
        if expr_text == "0":
            token_type = "ZERO"
        elif expr_text == "1":
            token_type = "ONE"
        elif _is_a(int, expr_text) or _is_a(float, expr_text):
            token_type = "NUMBER"
        else:
            token_type = "PYTHON_EXPR"
        return Token(token_type, Origin.combine(origins), extra=expr_text)
    else:
        raise PatsyError("unclosed bracket in embedded Python "
                         "expression",
                         Origin.combine(origins))
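
# For example, with "+" among the end tokens, reading "foo(b, c + 2) + a"
# stops at the top-level "+" and yields a single PYTHON_EXPR token for
# "foo(b, c + 2)" -- the "+" inside the call is bracketed, so it is included.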

def _tokenize_formula(code, operator_strings):
    assert "(" not in operator_strings
    assert ")" not in operator_strings
    magic_token_types = {"(": Token.LPAREN,
                         ")": Token.RPAREN,
                         }
    for operator_string in operator_strings:
        magic_token_types[operator_string] = operator_string
    # Once we enter a Python expression, a ( does not end it, but any other
    # "magic" token does:
    end_tokens = set(magic_token_types)
    end_tokens.remove("(")

    it = PushbackAdapter(python_tokenize(code))
    for pytype, token_string, origin in it:
        if token_string in magic_token_types:
            yield Token(magic_token_types[token_string], origin)
        else:
            it.push_back((pytype, token_string, origin))
            yield _read_python_expr(it, end_tokens)

def test__tokenize_formula():
    # The extra internal spaces are deliberate; the Origin offsets below
    # depend on them.
    code = "y ~ a + (foo(b,c  +  2)) + -1 + 0 + 10"
    tokens = list(_tokenize_formula(code, ["+", "-", "~"]))
    expecteds = [("PYTHON_EXPR", Origin(code, 0, 1), "y"),
                 ("~", Origin(code, 2, 3), None),
                 ("PYTHON_EXPR", Origin(code, 4, 5), "a"),
                 ("+", Origin(code, 6, 7), None),
                 (Token.LPAREN, Origin(code, 8, 9), None),
                 ("PYTHON_EXPR", Origin(code, 9, 23), "foo(b, c + 2)"),
                 (Token.RPAREN, Origin(code, 23, 24), None),
                 ("+", Origin(code, 25, 26), None),
                 ("-", Origin(code, 27, 28), None),
                 ("ONE", Origin(code, 28, 29), "1"),
                 ("+", Origin(code, 30, 31), None),
                 ("ZERO", Origin(code, 32, 33), "0"),
                 ("+", Origin(code, 34, 35), None),
                 ("NUMBER", Origin(code, 36, 38), "10"),
                 ]
    for got, expected in zip(tokens, expecteds):
        assert isinstance(got, Token)
        assert got.type == expected[0]
        assert got.origin == expected[1]
        assert got.extra == expected[2]

_unary_tilde = Operator("~", 1, -100)
_default_ops = [
    _unary_tilde,
    Operator("~", 2, -100),

    Operator("+", 2, 100),
    Operator("-", 2, 100),
    Operator("*", 2, 200),
    Operator("/", 2, 200),
    Operator(":", 2, 300),
    Operator("**", 2, 500),

    Operator("+", 1, 100),
    Operator("-", 1, 100),
]
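
# Each Operator pairs a token string with an arity and a precedence; higher
# precedence binds tighter, so e.g. "a + b:c" parses as "a + (b:c)" and
# "-a**2" as "-(a**2)" (see the "Check precedence" cases in _parser_tests
# below).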

def parse_formula(code, extra_operators=[]):
    if not code.strip():
        code = "~ 1"

    for op in extra_operators:
        if op.precedence < 0:
            raise ValueError("all operators must have precedence >= 0")

    operators = _default_ops + extra_operators
    operator_strings = [op.token_type for op in operators]
    tree = infix_parse(_tokenize_formula(code, operator_strings),
                       operators,
                       _atomic_token_types)
    if not isinstance(tree, ParseNode) or tree.type != "~":
        tree = ParseNode("~", None, [tree], tree.origin)
    return tree
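
# A minimal usage sketch, mirroring what the tests below exercise: the result
# is always a ParseNode tree rooted at "~", with .type, .args, .token, and
# .origin attributes.
#
#     tree = parse_formula("a ~ b + c")
#     assert tree.type == "~"
#     lhs, rhs = tree.args        # "a", and the "+" node for "b + c"
#     assert lhs.token.extra == "a"
#     assert rhs.type == "+"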

#############

_parser_tests = {
    "": ["~", "1"],
    " ": ["~", "1"],
    " \n ": ["~", "1"],

    "1": ["~", "1"],
    "a": ["~", "a"],
    "a ~ b": ["~", "a", "b"],

    "(a ~ b)": ["~", "a", "b"],
    "a ~ ((((b))))": ["~", "a", "b"],
    "a ~ ((((+b))))": ["~", "a", ["+", "b"]],

    "a + b + c": ["~", ["+", ["+", "a", "b"], "c"]],
    "a + (b ~ c) + d": ["~", ["+", ["+", "a", ["~", "b", "c"]], "d"]],

    "a + np.log(a, base=10)": ["~", ["+", "a", "np.log(a, base=10)"]],
    # Note different spacing:
    "a + np . log(a , base = 10)": ["~", ["+", "a", "np.log(a, base=10)"]],

    # Check precedence
    "a + b ~ c * d": ["~", ["+", "a", "b"], ["*", "c", "d"]],
    "a + b * c": ["~", ["+", "a", ["*", "b", "c"]]],
    "-a**2": ["~", ["-", ["**", "a", "2"]]],
    "-a:b": ["~", ["-", [":", "a", "b"]]],
    "a + b:c": ["~", ["+", "a", [":", "b", "c"]]],
    "(a + b):c": ["~", [":", ["+", "a", "b"], "c"]],
    "a*b:c": ["~", ["*", "a", [":", "b", "c"]]],

    "a+b / c": ["~", ["+", "a", ["/", "b", "c"]]],
    "~ a": ["~", "a"],

    "-1": ["~", ["-", "1"]],
    }

def _compare_trees(got, expected):
    assert isinstance(got, ParseNode)
    if got.args:
        assert got.type == expected[0]
        for arg, expected_arg in zip(got.args, expected[1:]):
            _compare_trees(arg, expected_arg)
    else:
        assert got.type in _atomic_token_types
        assert got.token.extra == expected

def _do_parse_test(test_cases, extra_operators):
    for code, expected in six.iteritems(test_cases):
        actual = parse_formula(code, extra_operators=extra_operators)
        print(repr(code), repr(expected))
        print(actual)
        _compare_trees(actual, expected)

def test_parse_formula():
    _do_parse_test(_parser_tests, [])

def test_parse_origin():
    tree = parse_formula("a ~ b + c")
    assert tree.origin == Origin("a ~ b + c", 0, 9)
    assert tree.token.origin == Origin("a ~ b + c", 2, 3)
    assert tree.args[0].origin == Origin("a ~ b + c", 0, 1)
    assert tree.args[1].origin == Origin("a ~ b + c", 4, 9)
    assert tree.args[1].token.origin == Origin("a ~ b + c", 6, 7)
    assert tree.args[1].args[0].origin == Origin("a ~ b + c", 4, 5)
    assert tree.args[1].args[1].origin == Origin("a ~ b + c", 8, 9)

# <> mark off where the error should be reported:
_parser_error_tests = [
    "a <+>",
    "a + <(>",

    "a + b <# asdf>",

    "<)>",
    "a + <)>",
    "<*> a",
    "a + <*>",

    "a + <foo[bar>",
    "a + <foo{bar>",
    "a + <foo(bar>",

    "a + <[bar>",
    "a + <{bar>",

    "a + <{bar[]>",

    "a + foo<]>bar",
    "a + foo[]<]>bar",
    "a + foo{}<}>bar",
    "a + foo<)>bar",

    "a + b<)>",
    "(a) <.>",

    "<(>a + b",

    "a +< >'foo", # Not the best placement for the error
]
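
# For example, "a <+>" means: parsing "a +" must raise a PatsyError whose
# origin covers offsets 2-3, i.e. the trailing "+".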

# Split out so it can also be used by tests of the evaluator (which also
# raises PatsyErrors)
def _parsing_error_test(parse_fn, error_descs): # pragma: no cover
    for error_desc in error_descs:
        letters = []
        start = None
        end = None
        for letter in error_desc:
            if letter == "<":
                start = len(letters)
            elif letter == ">":
                end = len(letters)
            else:
                letters.append(letter)
        bad_code = "".join(letters)
        assert start is not None and end is not None
        print(error_desc)
        print(repr(bad_code), start, end)
        try:
            parse_fn(bad_code)
        except PatsyError as e:
            print(e)
            assert e.origin.code == bad_code
            assert e.origin.start == start
            assert e.origin.end == end
        else:
            assert False, "parser failed to report an error!"

def test_parse_errors(extra_operators=[]):
    def parse_fn(code):
        return parse_formula(code, extra_operators=extra_operators)
    _parsing_error_test(parse_fn, _parser_error_tests)

_extra_op_parser_tests = {
    "a | b": ["~", ["|", "a", "b"]],
    "a * b|c": ["~", ["*", "a", ["|", "b", "c"]]],
    }

def test_parse_extra_op():
    extra_operators = [Operator("|", 2, 250)]
    _do_parse_test(_parser_tests,
                   extra_operators=extra_operators)
    _do_parse_test(_extra_op_parser_tests,
                   extra_operators=extra_operators)
    test_parse_errors(extra_operators=extra_operators)