phml.core.formats.parse
Pythonic Hypertext Markup Language (phml) parser.
1"""Pythonic Hypertext Markup Language (phml) parser.""" 2from copy import deepcopy 3from operator import itemgetter 4import re 5 6from phml.core.nodes import ( 7 AST, 8 Comment, 9 DocType, 10 Element, 11 Point, 12 Position, 13 Root, 14 Text, 15 Node 16) 17 18def parse_hypertest_markup(data: str, class_name: str, auto_close: bool = True) -> AST: 19 """Parse a string as a hypertest markup document.""" 20 21 phml_parser = HypertextMarkupParser() 22 23 if isinstance(data, str): 24 return phml_parser.parse(data, auto_close=auto_close) 25 raise Exception(f"Data passed to {class_name}.parse must be a str") 26 27def strip_blank_lines(data_lines: list[str]) -> list[str]: 28 """Strip the blank lines at the start and end of a list.""" 29 data_lines = [line.replace("\r\n", "\n") for line in data_lines] 30 # remove leading blank lines 31 for idx in range(0, len(data_lines)): # pylint: disable=consider-using-enumerate 32 if data_lines[idx].strip() != "": 33 data_lines = data_lines[idx:] 34 break 35 if idx == len(data_lines) - 1: 36 data_lines = [] 37 break 38 39 # Remove trailing blank lines 40 if len(data_lines) > 0: 41 for idx in range(len(data_lines) - 1, -1, -1): 42 if data_lines[idx].replace("\n", " ").strip() != "": 43 data_lines = data_lines[: idx + 1] 44 break 45 46 return data_lines 47 48def strip(data: str, cur_tags: list[str]) -> tuple[str, int, int]: 49 """This function takes a possibly mutliline string and strips leading and trailing 50 blank lines. Given the current position it will also calculate the line and column 51 taht the data ends at. 
52 """ 53 if "pre" not in cur_tags: 54 data_lines = data.split("\n") 55 56 # If multiline data block 57 if len(data_lines) > 1: 58 data_lines = strip_blank_lines(data_lines) 59 60 data = "\n".join(data_lines) 61 # Else it is a single line data block 62 else: 63 data = data_lines[0] 64 65 return data 66 67 68self_closing = [ 69 "area", 70 "base", 71 "br", 72 "col", 73 "embed", 74 "hr", 75 "img", 76 "input", 77 "link", 78 "meta", 79 "param", 80 "source", 81 "track", 82 "wbr", 83 "command", 84 "keygen", 85 "menuitem", 86] 87 88# Main form of tokenization 89class RE: 90 tag_start = re.compile(r"<(?!!--)(?P<opening>!|\/)?(?P<name>([\w:\.]+\-?)+)|<(?P<opening2>/)?(?=\s+>|>)") 91 """Matches the start of a tag `<!name|</name|<name`""" 92 93 tag_end = re.compile(r"(?P<closing>/?)>") 94 """Matches the end of a tag `/>|>`.""" 95 96 comment = re.compile(r"<!--((?:.|\s)*)-->") 97 """Matches all html style comments `<!--Comment-->`.""" 98 99 attribute = re.compile(r"(?P<name>[\w:\-@]+)(?:=(?P<value>\{(?P<curly>[^\}]*)\/\}|\"(?P<double>[^\"]*)\"|'(?P<single>[^']*)'|(?P<open>[^>'\"]+)))?") 100 """Matches a tags attributes `attr|attr=value|attr='value'|attr="value"`.""" 101 102 bracket_attributte = re.compile(r"^\s*\{((?:\s|.)*)\/\}\s*$") 103 104class HypertextMarkupParser: 105 """Parse html/xml like source code strings.""" 106 107 tag_stack = [] 108 """Current stack of tags in order of when they are opened.""" 109 110 def __calc_line_col(self, source: str, start: int) -> tuple[int, int]: 111 """Calculate the number of lines and columns that lead to the starting point int he source 112 string. 113 """ 114 source = source[:start] 115 return source.count("\n"), len(source.split("\n")[-1]) if len(source.split("\n")) > 0 else 0 116 117 def __calc_col(self, num_lines: int, num_cols: int, init_cols: int) -> int: 118 """Calculate whether the number of columns should be added to the current column or be 119 treated as if it is starting from zero based on whether new lines exist. 
120 """ 121 return num_cols if num_lines != 0 else init_cols + num_cols 122 123 def __parse_text_comment(self, text: str, pos: Position) -> list[Node]: 124 """Parse the comments and general text found in the provided source.""" 125 126 elements = [] # List of text and comment elements. 127 128 # For each comment add it to the list of elements 129 while RE.comment.search(text) is not None: 130 comment = RE.comment.search(text) 131 line_s, col_s = self.__calc_line_col(text, comment.start()) 132 line_e, col_e = self.__calc_line_col(comment.group(0), len(comment.group(0))) 133 134 pos.start = Point( 135 pos.start.line + line_s, 136 self.__calc_col(line_s, col_s, pos.start.column) 137 ) 138 pos.end = Point( 139 pos.start.line + line_e, 140 self.__calc_col(line_e, col_e, pos.start.column) 141 ) 142 143 # If there is text between two comments then add a text element 144 if comment.start() > 0: 145 elements.append(Text( 146 text[:comment.span()[0]], 147 position=deepcopy(pos) 148 )) 149 150 text = text[comment.span()[1]:] 151 elements.append( 152 Comment(comment.group(1), position=deepcopy(pos)) 153 ) 154 155 # remaining text is added as a text element 156 if len(text) > 0: 157 line, col = self.__calc_line_col(text, len(text)) 158 pos.start.line += line 159 pos.start.column = col 160 161 elements.append(Text( 162 text, 163 position=Position( 164 deepcopy(pos.end), 165 (pos.end.line + line, self.__calc_col(line, col, pos.end.column)) 166 ) 167 )) 168 return elements 169 170 def __parse_attributes(self, attrs: str) -> dict: 171 """Parse a tags attributes from the text found between the tag start and the tag end. 
172 173 Example: 174 `<name (attributes)>` 175 """ 176 attributes = {} 177 for attr in RE.attribute.finditer(attrs): 178 ( 179 name, 180 value, 181 _, 182 double, 183 single, 184 no_bracket 185 ) = itemgetter('name', 'value', 'curly', 'double', 'single', 'open')(attr.groupdict()) 186 187 if value is not None and RE.bracket_attributte.match(value) is not None: 188 if not name.startswith(":"): 189 name = ":" + name 190 value = RE.bracket_attributte.match(value).group(1) 191 else: 192 value = double or single or no_bracket 193 194 if value in ["yes", "true", None]: 195 value = True 196 elif value in ["no", "false"]: 197 value = False 198 199 attributes[name] = value 200 return attributes 201 202 def __parse_tag(self, source, position: Position): 203 """Parse a tag from the given source. This includes the tag start, attributes and tag end. 204 It will also parse any comments and text from the start of the source to the start of the 205 tag. 206 """ 207 begin = RE.tag_start.search(source) 208 begin = (begin.start(), begin.group(0), begin.groupdict()) 209 begin[2]["opening"] = begin[2]["opening"] or begin[2]["opening2"] 210 211 elems = [] 212 if begin[0] > 0: 213 elems = self.__parse_text_comment(source[:begin[0]], position) 214 position.end.column = position.start.column + len(begin[1]) 215 source = source[begin[0] + len(begin[1]):] 216 217 end = RE.tag_end.search(source) 218 if end is None: 219 raise Exception(f"Expected tag {begin} to be closed with symbol '>'. Was not closed.") 220 end = (end.start(), end.group(0), end.groupdict()) 221 222 line, col = self.__calc_line_col(source, end[0] + len(end[1])) 223 position.end.line = position.start.line + line 224 position.end.column = position.end.column + col 225 226 attributes = self.__parse_attributes(source[:end[0]]) 227 return source[end[0] + len(end[1]):], begin, attributes, end, elems 228 229 def is_self_closing(self, name: str, auto_closing: bool) -> bool: 230 """Check if the tag is self closing. 
Only check if auto_closing is toggled on.""" 231 232 if auto_closing: 233 return name in self_closing 234 return False 235 236 def parse(self, source: str, auto_close: bool = True) -> Root: 237 """Parse a given html or phml string into it's corresponding phml ast. 238 239 Args: 240 source (str): The html or phml source to parse. 241 242 Returns: 243 AST: A phml AST representing the parsed code source. 244 """ 245 246 self.tag_stack = [] 247 current = Root() 248 position = Position((0, 0), (0, 0)) 249 250 while RE.tag_start.search(source) is not None: 251 source, begin, attr, end, elems = self.__parse_tag(source, position) 252 253 if len(elems) > 0: 254 current.extend(elems) 255 256 name = begin[2]["name"] or '' 257 if begin[2]["opening"] == "/": 258 if name != self.tag_stack[-1]: 259 input(self.tag_stack) 260 raise Exception( 261 f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}" 262 ) 263 264 self.tag_stack.pop() 265 current.position.end.line = position.end.line 266 current.position.end.column = position.end.column 267 268 current = current.parent 269 elif begin[2]["opening"] == "!": 270 current.append(DocType(attr.get("lang", "html"), position=deepcopy(position))) 271 elif ( 272 end[2]["closing"] != "/" 273 and not self.is_self_closing(name, auto_close) 274 and begin[2]["opening"] is None 275 ): 276 self.tag_stack.append(name) 277 current.append(Element(name, attr, position=deepcopy(position))) 278 current = current.children[-1] 279 else: 280 current.append(Element(name, attr, position=deepcopy(position), startend=True)) 281 282 position.start = deepcopy(position.end) 283 284 return AST(current)
def
parse_hypertest_markup( data: str, class_name: str, auto_close: bool = True) -> phml.core.nodes.AST.AST:
def parse_hypertest_markup(data: str, class_name: str, auto_close: bool = True) -> AST:
    """Parse a string as a hypertext markup document.

    Args:
        data (str): The markup source to parse.
        class_name (str): Name of the calling format class; only used in the error message.
        auto_close (bool): Forwarded to `HypertextMarkupParser.parse`.

    Returns:
        AST: The result of `HypertextMarkupParser.parse`.

    Raises:
        Exception: If `data` is not a `str`.
    """
    # Guard clause: anything that is not text is rejected up front.
    if not isinstance(data, str):
        raise Exception(f"Data passed to {class_name}.parse must be a str")

    return HypertextMarkupParser().parse(data, auto_close=auto_close)
Parse a string as a hypertext markup document.
def
strip_blank_lines(data_lines: list[str]) -> list[str]:
def strip_blank_lines(data_lines: list[str]) -> list[str]:
    """Drop the blank lines that lead and trail a list of lines.

    Every `\\r\\n` inside a line is normalised to `\\n` first. Blank lines that sit
    between the first and last non-blank line are kept.

    Args:
        data_lines (list[str]): Lines to trim.

    Returns:
        list[str]: A new list; empty when every line is blank.
    """
    normalised = [line.replace("\r\n", "\n") for line in data_lines]

    # Positions of the lines that still contain something once whitespace is removed.
    with_content = [pos for pos, line in enumerate(normalised) if line.strip()]
    if not with_content:
        return []

    first, last = with_content[0], with_content[-1]
    return normalised[first: last + 1]
Strip the blank lines at the start and end of a list.
def
strip(data: str, cur_tags: list[str]) -> tuple[str, int, int]:
def strip(data: str, cur_tags: list[str]) -> str:
    """Strip leading and trailing blank lines from a possibly multiline string.

    Nothing is stripped when `"pre"` is one of the currently open tags. No line or
    column information is computed; only the (possibly trimmed) text is returned.

    Args:
        data (str): The text to trim.
        cur_tags (list[str]): Names of the currently open tags.

    Returns:
        str: The trimmed text, or `data` unchanged when inside a `pre` tag or when it
        contains no newline.
    """
    # Whitespace inside <pre> is significant, so leave it alone.
    if "pre" in cur_tags:
        return data

    data_lines = data.split("\n")
    if len(data_lines) > 1:
        return "\n".join(strip_blank_lines(data_lines))
    # A single line has no blank lines around it to remove.
    return data_lines[0]
This function takes a possibly multiline string and strips leading and trailing blank lines. Given the current position it will also calculate the line and column that the data ends at.
class
RE:
class RE:
    """Compiled regular expressions used to tokenize the markup."""

    # Matches the start of a tag `<!name|</name|<name`
    tag_start = re.compile(r"<(?!!--)(?P<opening>!|\/)?(?P<name>([\w:\.]+\-?)+)|<(?P<opening2>/)?(?=\s+>|>)")

    # Matches the end of a tag `/>|>`.
    tag_end = re.compile(r"(?P<closing>/?)>")

    # Matches one html style comment `<!--Comment-->` and captures its content.
    # The body is non-greedy so two comments in the same text are matched one at a
    # time instead of being merged, and DOTALL replaces `(?:.|\s)` so whitespace has
    # a single way to match (the alternation backtracked exponentially on failure).
    comment = re.compile(r"<!--(.*?)-->", re.DOTALL)

    # Matches a tags attributes `attr|attr=value|attr='value'|attr="value"`.
    attribute = re.compile(r"(?P<name>[\w:\-@]+)(?:=(?P<value>\{(?P<curly>[^\}]*)\/\}|\"(?P<double>[^\"]*)\"|'(?P<single>[^']*)'|(?P<open>[^>'\"]+)))?")

    # Matches a whole `{ ... /}` attribute value and captures what is between the
    # brackets. Uses DOTALL for the same backtracking reason as `comment`.
    bracket_attributte = re.compile(r"^\s*\{(.*)\/\}\s*$", re.DOTALL)
class
HypertextMarkupParser:
class HypertextMarkupParser:
    """Parse html/xml like source code strings."""

    # Current stack of tags in order of when they are opened. `parse` rebinds this
    # on the instance at the start of every run.
    tag_stack = []

    def __calc_line_col(self, source: str, start: int) -> tuple[int, int]:
        """Calculate the number of lines and columns that lead to `start` in `source`.

        Returns:
            tuple[int, int]: The count of newline characters before `start` and the
            number of characters that follow the last of them.
        """
        head = source[:start]
        # rpartition yields the whole string as the tail when there is no newline.
        return head.count("\n"), len(head.rpartition("\n")[2])

    def __calc_col(self, num_lines: int, num_cols: int, init_cols: int) -> int:
        """Calculate whether the number of columns should be added to the current column or be
        treated as if it is starting from zero based on whether new lines exist.
        """
        return num_cols if num_lines != 0 else init_cols + num_cols

    def __parse_text_comment(self, text: str, pos: Position) -> list[Node]:
        """Parse the comments and general text found in the provided source.

        `pos` is updated in place as comments are consumed.
        """

        elements = []  # List of text and comment elements.

        # For each comment add it to the list of elements
        while (comment := RE.comment.search(text)) is not None:
            line_s, col_s = self.__calc_line_col(text, comment.start())
            line_e, col_e = self.__calc_line_col(comment.group(0), len(comment.group(0)))

            pos.start = Point(
                pos.start.line + line_s,
                self.__calc_col(line_s, col_s, pos.start.column)
            )
            pos.end = Point(
                pos.start.line + line_e,
                self.__calc_col(line_e, col_e, pos.start.column)
            )

            # If there is text in front of the comment then add a text element
            if comment.start() > 0:
                elements.append(Text(
                    text[:comment.start()],
                    position=deepcopy(pos)
                ))

            text = text[comment.end():]
            elements.append(
                Comment(comment.group(1), position=deepcopy(pos))
            )

        # remaining text is added as a text element
        if len(text) > 0:
            line, col = self.__calc_line_col(text, len(text))
            pos.start.line += line
            pos.start.column = col

            elements.append(Text(
                text,
                position=Position(
                    deepcopy(pos.end),
                    (pos.end.line + line, self.__calc_col(line, col, pos.end.column))
                )
            ))
        return elements

    def __parse_attributes(self, attrs: str) -> dict:
        """Parse a tags attributes from the text found between the tag start and the tag end.

        Example:
            `<name (attributes)>`

        Returns:
            dict: Attribute name to value. `{.../}` values get a `:` prefixed name and
            `yes`/`true`/missing values become `True`, `no`/`false` become `False`.
        """
        attributes = {}
        for attr in RE.attribute.finditer(attrs):
            name, value, double, single, no_bracket = itemgetter(
                'name', 'value', 'double', 'single', 'open'
            )(attr.groupdict())

            # Match the bracket form once and reuse the result.
            bracket = RE.bracket_attributte.match(value) if value is not None else None
            if bracket is not None:
                if not name.startswith(":"):
                    name = ":" + name
                value = bracket.group(1)
            else:
                # NOTE(review): an empty quoted value (`attr=""`) is falsy here and
                # therefore ends up as `True` below - confirm this is intended.
                value = double or single or no_bracket

            if value in ["yes", "true", None]:
                value = True
            elif value in ["no", "false"]:
                value = False

            attributes[name] = value
        return attributes

    def __parse_tag(self, source, position: Position):
        """Parse a tag from the given source. This includes the tag start, attributes and tag end.
        It will also parse any comments and text from the start of the source to the start of the
        tag.

        Returns:
            tuple: The remaining source, the tag start as `(index, text, groups)`, the
            attribute dict, the tag end as `(index, text, groups)` and the text/comment
            nodes found in front of the tag.

        Raises:
            Exception: If no closing `>` is found for the tag.
        """
        begin = RE.tag_start.search(source)
        begin = (begin.start(), begin.group(0), begin.groupdict())
        begin[2]["opening"] = begin[2]["opening"] or begin[2]["opening2"]

        elems = []
        if begin[0] > 0:
            elems = self.__parse_text_comment(source[:begin[0]], position)
        position.end.column = position.start.column + len(begin[1])
        source = source[begin[0] + len(begin[1]):]

        end = RE.tag_end.search(source)
        if end is None:
            raise Exception(f"Expected tag {begin} to be closed with symbol '>'. Was not closed.")
        end = (end.start(), end.group(0), end.groupdict())

        line, col = self.__calc_line_col(source, end[0] + len(end[1]))
        position.end.line = position.start.line + line
        position.end.column = position.end.column + col

        attributes = self.__parse_attributes(source[:end[0]])
        return source[end[0] + len(end[1]):], begin, attributes, end, elems

    def is_self_closing(self, name: str, auto_closing: bool) -> bool:
        """Check if the tag is self closing. Only check if auto_closing is toggled on."""

        if auto_closing:
            return name in self_closing
        return False

    def parse(self, source: str, auto_close: bool = True) -> AST:
        """Parse a given html or phml string into its corresponding phml ast.

        Args:
            source (str): The html or phml source to parse.
            auto_close (bool): When true, tags named in `self_closing` are treated as
                self closing even without a trailing `/`.

        Returns:
            AST: A phml AST representing the parsed code source.

        Raises:
            Exception: If a closing tag does not match the most recently opened tag, or
                if a tag is never terminated with `>`.
        """

        self.tag_stack = []
        current = Root()
        position = Position((0, 0), (0, 0))

        while RE.tag_start.search(source) is not None:
            source, begin, attr, end, elems = self.__parse_tag(source, position)

            if elems:
                current.extend(elems)

            name = begin[2]["name"] or ''
            opening = begin[2]["opening"]
            if opening == "/":
                # A closing tag with nothing open used to surface as a bare IndexError.
                if not self.tag_stack:
                    raise Exception(
                        f"Unbalanced tags: {name!r} is closed at {position} but no tag is open"
                    )
                if name != self.tag_stack[-1]:
                    raise Exception(
                        f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}"
                    )

                self.tag_stack.pop()
                current.position.end.line = position.end.line
                current.position.end.column = position.end.column

                current = current.parent
            elif opening == "!":
                current.append(DocType(attr.get("lang", "html"), position=deepcopy(position)))
            elif (
                end[2]["closing"] != "/"
                and not self.is_self_closing(name, auto_close)
                and opening is None
            ):
                self.tag_stack.append(name)
                current.append(Element(name, attr, position=deepcopy(position)))
                current = current.children[-1]
            else:
                current.append(Element(name, attr, position=deepcopy(position), startend=True))

            position.start = deepcopy(position.end)

        # NOTE(review): text after the last tag is not turned into a node, and when tags
        # are left open `current` is the innermost open element rather than the root.
        return AST(current)
Parse html/xml like source code strings.
def
is_self_closing(self, name: str, auto_closing: bool) -> bool:
def is_self_closing(self, name: str, auto_closing: bool) -> bool:
    """Report whether `name` is a self closing tag.

    The lookup in the module level `self_closing` list only happens when
    `auto_closing` is truthy; otherwise the answer is always `False`.
    """
    return name in self_closing if auto_closing else False
Check if the tag is self closing. Only check if auto_closing is toggled on.
def parse(self, source: str, auto_close: bool = True) -> AST:
    """Parse a given html or phml string into its corresponding phml ast.

    Args:
        source (str): The html or phml source to parse.
        auto_close (bool): When true, tags named in `self_closing` are treated as
            self closing even without a trailing `/`.

    Returns:
        AST: A phml AST representing the parsed code source.

    Raises:
        Exception: If a closing tag does not match the most recently opened tag, or
            if a tag is never terminated with `>`.
    """

    self.tag_stack = []
    current = Root()
    position = Position((0, 0), (0, 0))

    while RE.tag_start.search(source) is not None:
        source, begin, attr, end, elems = self.__parse_tag(source, position)

        if elems:
            current.extend(elems)

        name = begin[2]["name"] or ''
        opening = begin[2]["opening"]
        if opening == "/":
            # A closing tag with nothing open used to surface as a bare IndexError.
            if not self.tag_stack:
                raise Exception(
                    f"Unbalanced tags: {name!r} is closed at {position} but no tag is open"
                )
            # The interactive `input()` debugging pause that used to sit here is gone;
            # it blocked on stdin before the error could be raised.
            if name != self.tag_stack[-1]:
                raise Exception(
                    f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}"
                )

            self.tag_stack.pop()
            current.position.end.line = position.end.line
            current.position.end.column = position.end.column

            current = current.parent
        elif opening == "!":
            current.append(DocType(attr.get("lang", "html"), position=deepcopy(position)))
        elif (
            end[2]["closing"] != "/"
            and not self.is_self_closing(name, auto_close)
            and opening is None
        ):
            self.tag_stack.append(name)
            current.append(Element(name, attr, position=deepcopy(position)))
            current = current.children[-1]
        else:
            current.append(Element(name, attr, position=deepcopy(position), startend=True))

        position.start = deepcopy(position.end)

    # NOTE(review): text after the last tag is not turned into a node, and when tags
    # are left open `current` is the innermost open element rather than the root.
    return AST(current)
Parse a given html or phml string into its corresponding phml ast.
Arguments:
- source (str): The html or phml source to parse.
Returns:
AST: A phml AST representing the parsed code source.