# This file is part of Patsy
# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.

# This file implements a simple "shunting yard algorithm" parser for infix
# languages with parentheses. It is used as the core of our parser for
# formulas, but is generic enough to be used for other purposes as well
# (e.g. parsing linear constraints). It just builds a parse tree; semantics
# are somebody else's problem.
#
# Plus it spends energy on tracking where each item in the parse tree comes
# from, to allow high-quality error reporting.
#
# You are expected to provide a collection of Operators, a collection of
# atomic types, and an iterator that provides Tokens. Each Operator should
# have a unique token_type (which is an arbitrary Python object), and each
# Token should have a matching token_type, or one of the special types
# Token.LPAREN, Token.RPAREN. Each Token is required to have a valid Origin
# attached, for error reporting.
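#
# A minimal sketch of that setup (illustrative names only; see
# test_infix_parse() at the bottom of this file for a runnable version):
#
#   ops = [Operator("+", 2, 10), Operator("*", 2, 20)]
#   atomic = ["ATOM"]
#   tokens = [Token("ATOM", some_origin, "a"),
#             Token("+", some_origin, "+"),
#             Token("ATOM", some_origin, "b")]
#   tree = infix_parse(tokens, ops, atomic)  # ParseNode tree for "a + b"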

# XX: still seriously consider putting the magic intercept handling into the
# tokenizer. we'd still need separate term-sets that get pasted together by ~
# to create the modeldesc, though... heck maybe we should just have a
# modeldesc be 1-or-more termsets, with the convention that if it's 1, then
# it's a rhs, and if it's 2, it's (lhs, rhs), and otherwise you're on your
# own. Test: would this be useful for multiple-group log-linear models,
# maybe? Answer: Perhaps. outcome ~ x1 + x2 ~ group. But lots of other
# plausible, maybe better ways to write this -- (outcome | group) ~ x1 + x2?
# "outcome ~ x1 + x2", group="group"? etc.

from __future__ import print_function

__all__ = ["Token", "ParseNode", "Operator", "infix_parse"]

from patsy import PatsyError
from patsy.origin import Origin
from patsy.util import (repr_pretty_delegate, repr_pretty_impl,
                        no_pickling, assert_no_pickling)

class _UniqueValue(object):
    def __init__(self, print_as):
        self._print_as = print_as

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self._print_as)

    __getstate__ = no_pickling

class Token(object):
    """A token with possible payload.

    .. attribute:: type

       An arbitrary object indicating the type of this token. Should be
       :term:`hashable`, but otherwise it can be whatever you like.
    """
    LPAREN = _UniqueValue("LPAREN")
    RPAREN = _UniqueValue("RPAREN")

    def __init__(self, type, origin, extra=None):
        self.type = type
        self.origin = origin
        self.extra = extra

    __repr__ = repr_pretty_delegate
    def _repr_pretty_(self, p, cycle):
        assert not cycle
        kwargs = []
        if self.extra is not None:
            kwargs = [("extra", self.extra)]
        return repr_pretty_impl(p, self, [self.type, self.origin], kwargs)

    __getstate__ = no_pickling
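
# For example (an illustrative sketch, mirroring the usage in
# test_infix_parse() below), a token for the atom "x" in the source string
# "x + y" could be built as:
#   Token("ATOM", Origin("x + y", 0, 1), extra="x")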

class ParseNode(object):
    def __init__(self, type, token, args, origin):
        self.type = type
        self.token = token
        self.args = args
        self.origin = origin

    __repr__ = repr_pretty_delegate
    def _repr_pretty_(self, p, cycle):
        return repr_pretty_impl(p, self, [self.type, self.token, self.args])

    __getstate__ = no_pickling

class Operator(object):
    def __init__(self, token_type, arity, precedence):
        self.token_type = token_type
        self.arity = arity
        self.precedence = precedence

    def __repr__(self):
        return "%s(%r, %r, %r)" % (self.__class__.__name__,
                                   self.token_type, self.arity,
                                   self.precedence)

    __getstate__ = no_pickling
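
# For example, the operators used by test_infix_parse() below give "*" a
# higher precedence than "+", so "a + b * c" groups as "a + (b * c)":
#   Operator("+", 2, 10)   # binary, low precedence
#   Operator("*", 2, 20)   # binary, binds tighter than "+"
#   Operator("-", 1, 30)   # unary prefix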

class _StackOperator(object):
    def __init__(self, op, token):
        self.op = op
        self.token = token

    __getstate__ = no_pickling

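# A pseudo-operator that marks an open paren on the operator stack. Its
# precedence is lower than any real operator's (infix_parse asserts this),
# so no reduction triggered by a real operator can pop past a "(".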
_open_paren = Operator(Token.LPAREN, -1, -9999999)

class _ParseContext(object):
    def __init__(self, unary_ops, binary_ops, atomic_types, trace):
        self.op_stack = []
        self.noun_stack = []
        self.unary_ops = unary_ops
        self.binary_ops = binary_ops
        self.atomic_types = atomic_types
        self.trace = trace

    __getstate__ = no_pickling

def _read_noun_context(token, c):
    # Returns True if the next token should again be a noun (we just consumed
    # a prefix operator or an open paren), and False if it should be an
    # operator.
    if token.type == Token.LPAREN:
        if c.trace:
            print("Pushing open-paren")
        c.op_stack.append(_StackOperator(_open_paren, token))
        return True
    elif token.type in c.unary_ops:
        if c.trace:
            print("Pushing unary op %r" % (token.type,))
        c.op_stack.append(_StackOperator(c.unary_ops[token.type], token))
        return True
    elif token.type in c.atomic_types:
        if c.trace:
            print("Pushing noun %r (%r)" % (token.type, token.extra))
        c.noun_stack.append(ParseNode(token.type, token, [],
                                      token.origin))
        return False
    else:
        raise PatsyError("expected a noun, not '%s'"
                         % (token.origin.relevant_code(),),
                         token)

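# Pop the operator on top of the operator stack, together with its arguments
# from the noun stack, and push the resulting ParseNode (whose Origin spans
# the operator token and all of its arguments) back onto the noun stack.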
def _run_op(c):
    assert c.op_stack
    stackop = c.op_stack.pop()
    args = []
    for i in range(stackop.op.arity):
        args.append(c.noun_stack.pop())
    args.reverse()
    if c.trace:
        print("Reducing %r (%r)" % (stackop.op.token_type, args))
    node = ParseNode(stackop.op.token_type, stackop.token, args,
                     Origin.combine([stackop.token] + args))
    c.noun_stack.append(node)

def _read_op_context(token, c):
    if token.type == Token.RPAREN:
        if c.trace:
            print("Found close-paren")
        while c.op_stack and c.op_stack[-1].op.token_type != Token.LPAREN:
            _run_op(c)
        if not c.op_stack:
            raise PatsyError("missing '(' or extra ')'", token)
        assert c.op_stack[-1].op.token_type == Token.LPAREN
        # Expand the origin of the item on top of the noun stack to include
        # the open and close parens:
        combined = Origin.combine([c.op_stack[-1].token,
                                   c.noun_stack[-1].token,
                                   token])
        c.noun_stack[-1].origin = combined
        # Pop the open-paren
        c.op_stack.pop()
        return False
    elif token.type in c.binary_ops:
        if c.trace:
            print("Found binary operator %r" % (token.type,))
        stackop = _StackOperator(c.binary_ops[token.type], token)
        # The "<=" means that equal-precedence operators reduce left to
        # right, i.e., binary operators are left-associative.
        while (c.op_stack
               and stackop.op.precedence <= c.op_stack[-1].op.precedence):
            _run_op(c)
        if c.trace:
            print("Pushing binary operator %r" % (token.type,))
        c.op_stack.append(stackop)
        return True
    else:
        raise PatsyError("expected an operator, not '%s'"
                         % (token.origin.relevant_code(),),
                         token)

def infix_parse(tokens, operators, atomic_types, trace=False):
    token_source = iter(tokens)

    unary_ops = {}
    binary_ops = {}
    for op in operators:
        assert op.precedence > _open_paren.precedence
        if op.arity == 1:
            unary_ops[op.token_type] = op
        elif op.arity == 2:
            binary_ops[op.token_type] = op
        else:
            raise ValueError("operators must be unary or binary")

    c = _ParseContext(unary_ops, binary_ops, atomic_types, trace)

    # This is an implementation of Dijkstra's shunting yard algorithm:
    #   http://en.wikipedia.org/wiki/Shunting_yard_algorithm
    #   http://www.engr.mun.ca/~theo/Misc/exp_parsing.htm

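    # The parser alternates between two states: expecting a "noun" (an atom,
    # a unary operator, or an open paren), and expecting an operator (a
    # binary operator or a close paren). want_noun tracks which state we
    # are in.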
    want_noun = True
    for token in token_source:
        if c.trace:
            print("Reading next token (want_noun=%r)" % (want_noun,))
        if want_noun:
            want_noun = _read_noun_context(token, c)
        else:
            want_noun = _read_op_context(token, c)
    if c.trace:
        print("End of token stream")

    if want_noun:
        raise PatsyError("expected a noun, but instead the expression ended",
                         c.op_stack[-1].token.origin)

    while c.op_stack:
        if c.op_stack[-1].op.token_type == Token.LPAREN:
            raise PatsyError("Unmatched '('", c.op_stack[-1].token)
        _run_op(c)

    assert len(c.noun_stack) == 1
    return c.noun_stack.pop()

# Much more thorough tests are in parse_formula.py; this is just a smoke test:
def test_infix_parse():
    ops = [Operator("+", 2, 10),
           Operator("*", 2, 20),
           Operator("-", 1, 30)]
    atomic = ["ATOM1", "ATOM2"]
    # a + -b * (c + d)
    mock_origin = Origin("asdf", 2, 3)
    tokens = [Token("ATOM1", mock_origin, "a"),
              Token("+", mock_origin, "+"),
              Token("-", mock_origin, "-"),
              Token("ATOM2", mock_origin, "b"),
              Token("*", mock_origin, "*"),
              Token(Token.LPAREN, mock_origin, "("),
              Token("ATOM1", mock_origin, "c"),
              Token("+", mock_origin, "+"),
              Token("ATOM2", mock_origin, "d"),
              Token(Token.RPAREN, mock_origin, ")")]
    tree = infix_parse(tokens, ops, atomic)
    def te(tree, type, extra):
        assert tree.type == type
        assert tree.token.extra == extra
    te(tree, "+", "+")
    te(tree.args[0], "ATOM1", "a")
    assert tree.args[0].args == []
    te(tree.args[1], "*", "*")
    te(tree.args[1].args[0], "-", "-")
    assert len(tree.args[1].args[0].args) == 1
    te(tree.args[1].args[0].args[0], "ATOM2", "b")
    te(tree.args[1].args[1], "+", "+")
    te(tree.args[1].args[1].args[0], "ATOM1", "c")
    te(tree.args[1].args[1].args[1], "ATOM2", "d")

    from nose.tools import assert_raises
    # No ternary ops
    assert_raises(ValueError,
                  infix_parse, [], [Operator("+", 3, 10)], ["ATOMIC"])

    # Smoke test just to make sure there are no egregious bugs in 'trace':
    infix_parse(tokens, ops, atomic, trace=True)