Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""":func:`~pandas.eval` source string parsing functions 

2""" 

3 

4from io import StringIO 

5from keyword import iskeyword 

6import token 

7import tokenize 

8from typing import Iterator, Tuple 

9 

# A token value Python's tokenizer probably will never use.
# Sentinel token type used to tag backtick-quoted strings so they can be
# recognized later by clean_backtick_quoted_toks.
BACKTICK_QUOTED_STRING = 100

12 

13 

def create_valid_python_identifier(name: str) -> str:
    """
    Create valid Python identifiers from any string.

    Check if name contains any special characters. If it contains any
    special characters, the special characters will be replaced by
    a special string and a prefix is added.

    Raises
    ------
    SyntaxError
        If the returned name is not a Python valid identifier, raise an exception.
        This can happen if there is a hashtag in the name, as the tokenizer will
        then terminate and not find the backtick.
        But also for characters that fall out of the range of (U+0001..U+007F).
    """
    # Already a valid, non-reserved identifier: nothing to do.
    if name.isidentifier() and not iskeyword(name):
        return name

    # Build the replacement table for special characters.
    # tokenize.EXACT_TOKEN_TYPES supplies the operator characters and
    # token.tok_name a readable description used as the replacement string.
    # The ignore here is because of a bug in mypy that is resolved in 0.740.
    replacements = {
        char: f"_{token.tok_name[tok_type]}_"
        for char, tok_type in tokenize.EXACT_TOKEN_TYPES.items()  # type: ignore
    }
    replacements[" "] = "_"
    replacements["?"] = "_QUESTIONMARK_"
    replacements["!"] = "_EXCLAMATIONMARK_"
    replacements["$"] = "_DOLLARSIGN_"
    replacements["€"] = "_EUROSIGN_"
    # Including quotes works, but there are exceptions.
    replacements["'"] = "_SINGLEQUOTE_"
    replacements['"'] = "_DOUBLEQUOTE_"
    # "#" is currently not possible: it terminates the tokenizer before the
    # closing backtick is found.

    # str.maketrans only accepts single-character keys; the multi-character
    # operator keys from EXACT_TOKEN_TYPES (e.g. "==") can never match a
    # single character anyway, so dropping them preserves behavior.
    table = str.maketrans({c: r for c, r in replacements.items() if len(c) == 1})
    cleaned = "BACKTICK_QUOTED_STRING_" + name.translate(table)

    if not cleaned.isidentifier():
        raise SyntaxError(f"Could not convert '{cleaned}' to a valid Python identifier.")

    return cleaned

63 

64 

def clean_backtick_quoted_toks(tok: Tuple[int, str]) -> Tuple[int, str]:
    """
    Clean up a column name if surrounded by backticks.

    Backtick quoted strings are tagged with a special tokval value
    (BACKTICK_QUOTED_STRING). Such a token is run through
    :func:`create_valid_python_identifier` so that the parser can find the
    name when the query is executed, and its token type is rewritten to NAME.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    tok : Tuple[int, str]
        Either the input token or the replacement values
    """
    toknum, tokval = tok
    if toknum != BACKTICK_QUOTED_STRING:
        # Not a backtick-quoted string: pass through unchanged.
        return toknum, tokval
    return tokenize.NAME, create_valid_python_identifier(tokval)

89 

90 

def clean_column_name(name: str) -> str:
    """
    Function to emulate the cleaning of a backtick quoted name.

    The purpose of this function is to see what happens to the name of an
    identifier when it goes through the process of being parsed as Python
    code inside a backtick quoted string and then being cleaned
    (stripped of any special characters).

    Parameters
    ----------
    name : str
        Name to be cleaned.

    Returns
    -------
    name : str
        Returns the name after tokenizing and cleaning.

    Notes
    -----
    For some cases, a name cannot be converted to a valid Python identifier.
    In that case :func:`tokenize_string` raises a SyntaxError.
    In that case, we just return the name unmodified.

    If this name was used in the query string (this makes the query call
    impossible) an error will be raised by
    :func:`tokenize_backtick_quoted_string` instead, which is not caught and
    propagates to the user level.
    """
    try:
        # Wrap the name in backticks and tokenize it as if it appeared in a
        # query string; the first token is the backtick-quoted payload.
        token_stream = tokenize_string(f"`{name}`")
        _, tokval = next(token_stream)
        return create_valid_python_identifier(tokval)
    except SyntaxError:
        # Name cannot be converted; return it unmodified.
        return name

126 

127 

def tokenize_backtick_quoted_string(
    token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
) -> Tuple[int, str]:
    """
    Creates a token from a backtick quoted string.

    Moves the token_generator forwards till right after the next backtick.

    Parameters
    ----------
    token_generator : Iterator[tokenize.TokenInfo]
        The generator that yields the tokens of the source string (Tuple[int, str]).
        The generator is at the first token after the backtick (`)

    source : str
        The Python source code string.

    string_start : int
        This is the start of backtick quoted string inside the source string.

    Returns
    -------
    tok : Tuple[int, str]
        The token that represents the backtick quoted string.
        The integer is equal to BACKTICK_QUOTED_STRING (100).

    Raises
    ------
    SyntaxError
        If the closing backtick is never found.
    """
    for _, tokval, start, _, _ in token_generator:
        if tokval == "`":
            string_end = start[1]
            break
    else:
        # Bug fix: previously an unmatched backtick fell through with
        # `string_end` unbound, crashing with NameError on the return below.
        # Raise an explicit SyntaxError instead (callers convert/propagate it).
        raise SyntaxError(f"Failed to find closing backtick in '{source}'.")

    return BACKTICK_QUOTED_STRING, source[string_start:string_end]

160 

161 

def tokenize_string(source: str) -> Iterator[Tuple[int, str]]:
    """
    Tokenize a Python source code string.

    Parameters
    ----------
    source : str
        The Python source code string.

    Returns
    -------
    tok_generator : Iterator[Tuple[int, str]]
        An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).

    Raises
    ------
    SyntaxError
        If a backtick quoted string in `source` cannot be parsed.
    """
    line_reader = StringIO(source).readline
    token_generator = tokenize.generate_tokens(line_reader)

    # Loop over all tokens till a backtick (`) is found.
    # Then, take all tokens till the next backtick to form a backtick quoted string.
    for toknum, tokval, start, _, _ in token_generator:
        if tokval == "`":
            try:
                yield tokenize_backtick_quoted_string(
                    token_generator, source, string_start=start[1] + 1
                )
            except Exception as err:
                # Bug fix: chain the original exception (PEP 3134) so the
                # root cause is visible instead of being silently discarded.
                raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err
        else:
            yield toknum, tokval