phml.core.formats.parse

Pythonic Hypertext Markup Language (phml) parser.

  1"""Pythonic Hypertext Markup Language (phml) parser."""
  2from copy import deepcopy
  3from operator import itemgetter
  4import re
  5
  6from phml.core.nodes import (
  7    AST,
  8    Comment,
  9    DocType,
 10    Element,
 11    Point,
 12    Position,
 13    Root,
 14    Text,
 15    Node
 16)
 17
 18def parse_hypertest_markup(data: str, class_name: str, auto_close: bool = True) -> AST:
 19    """Parse a string as a hypertest markup document."""
 20
 21    phml_parser = HypertextMarkupParser()
 22
 23    if isinstance(data, str):
 24        return phml_parser.parse(data, auto_close=auto_close)
 25    raise Exception(f"Data passed to {class_name}.parse must be a str")
 26
 27def strip_blank_lines(data_lines: list[str]) -> list[str]:
 28    """Strip the blank lines at the start and end of a list."""
 29    data_lines = [line.replace("\r\n", "\n") for line in data_lines]
 30    # remove leading blank lines
 31    for idx in range(0, len(data_lines)):  # pylint: disable=consider-using-enumerate
 32        if data_lines[idx].strip() != "":
 33            data_lines = data_lines[idx:]
 34            break
 35        if idx == len(data_lines) - 1:
 36            data_lines = []
 37            break
 38
 39    # Remove trailing blank lines
 40    if len(data_lines) > 0:
 41        for idx in range(len(data_lines) - 1, -1, -1):
 42            if data_lines[idx].replace("\n", " ").strip() != "":
 43                data_lines = data_lines[: idx + 1]
 44                break
 45
 46    return data_lines
 47
 48def strip(data: str, cur_tags: list[str]) -> tuple[str, int, int]:
 49    """This function takes a possibly mutliline string and strips leading and trailing
 50    blank lines. Given the current position it will also calculate the line and column
 51    taht the data ends at.
 52    """
 53    if "pre" not in cur_tags:
 54        data_lines = data.split("\n")
 55
 56        # If multiline data block
 57        if len(data_lines) > 1:
 58            data_lines = strip_blank_lines(data_lines)
 59
 60            data = "\n".join(data_lines)
 61        # Else it is a single line data block
 62        else:
 63            data = data_lines[0]
 64
 65    return data
 66
 67
 68self_closing = [
 69    "area",
 70    "base",
 71    "br",
 72    "col",
 73    "embed",
 74    "hr",
 75    "img",
 76    "input",
 77    "link",
 78    "meta",
 79    "param",
 80    "source",
 81    "track",
 82    "wbr",
 83    "command",
 84    "keygen",
 85    "menuitem",
 86]
 87
 88# Main form of tokenization
 89class RE:
 90    tag_start = re.compile(r"<(?!!--)(?P<opening>!|\/)?(?P<name>([\w:\.]+\-?)+)|<(?P<opening2>/)?(?=\s+>|>)")
 91    """Matches the start of a tag `<!name|</name|<name`"""
 92
 93    tag_end = re.compile(r"(?P<closing>/?)>")
 94    """Matches the end of a tag `/>|>`."""
 95
 96    comment = re.compile(r"<!--((?:.|\s)*)-->")
 97    """Matches all html style comments `<!--Comment-->`."""
 98
 99    attribute = re.compile(r"(?P<name>[\w:\-@]+)(?:=(?P<value>\{(?P<curly>[^\}]*)\/\}|\"(?P<double>[^\"]*)\"|'(?P<single>[^']*)'|(?P<open>[^>'\"]+)))?")
100    """Matches a tags attributes `attr|attr=value|attr='value'|attr="value"`."""
101    
102    bracket_attributte = re.compile(r"^\s*\{((?:\s|.)*)\/\}\s*$")
103
class HypertextMarkupParser:
    """Parse html/xml like source code strings."""

    tag_stack = []
    """Current stack of tags in order of when they are opened."""

    def __calc_line_col(self, source: str, start: int) -> tuple[int, int]:
        """Calculate the number of lines and columns that lead to the starting point in the
        source string.

        Returns:
            tuple[int, int]: The number of newlines in `source[:start]` and the number of
                characters that follow the last of those newlines.
        """
        source = source[:start]
        # `str.split` always returns at least one item, so the last item always exists.
        return source.count("\n"), len(source.split("\n")[-1])

    def __calc_col(self, num_lines: int, num_cols: int, init_cols: int) -> int:
        """Calculate whether the number of columns should be added to the current column or be
        treated as if it is starting from zero based on whether new lines exist.
        """
        return num_cols if num_lines != 0 else init_cols + num_cols

    def __parse_text_comment(self, text: str, pos: Position) -> list[Node]:
        """Parse the comments and general text found in the provided source.

        `pos` is updated in place while the text is consumed.

        Returns:
            list[Node]: The `Text` and `Comment` nodes in the order they were found.
        """

        elements = []  # List of text and comment elements.

        # For each comment add it to the list of elements
        while (comment := RE.comment.search(text)) is not None:
            line_s, col_s = self.__calc_line_col(text, comment.start())
            line_e, col_e = self.__calc_line_col(comment.group(0), len(comment.group(0)))

            pos.start = Point(
                pos.start.line + line_s,
                self.__calc_col(line_s, col_s, pos.start.column)
            )
            pos.end = Point(
                pos.start.line + line_e,
                self.__calc_col(line_e, col_e, pos.start.column)
            )

            # If there is text in front of the comment then add a text element.
            # NOTE(review): this text node receives the same position as the comment that
            # follows it - confirm whether that is intended.
            if comment.start() > 0:
                elements.append(Text(
                    text[:comment.start()],
                    position=deepcopy(pos)
                ))

            text = text[comment.end():]
            elements.append(
                Comment(comment.group(1), position=deepcopy(pos))
            )

        # remaining text is added as a text element
        if len(text) > 0:
            line, col = self.__calc_line_col(text, len(text))
            pos.start.line += line
            pos.start.column = col

            elements.append(Text(
                text,
                position=Position(
                    deepcopy(pos.end),
                    (pos.end.line + line, self.__calc_col(line, col, pos.end.column))
                )
            ))
        return elements

    def __parse_attributes(self, attrs: str) -> dict:
        """Parse a tags attributes from the text found between the tag start and the tag end.

        Values wrapped as `{.../}` are stored under a name prefixed with `:`. The values
        `yes`, `true` and a missing value become `True`; `no` and `false` become `False`.

        Example:
            `<name (attributes)>`
        """
        attributes = {}
        for attr in RE.attribute.finditer(attrs):
            name = attr.group("name")
            value = attr.group("value")

            bracket = RE.bracket_attributte.match(value) if value is not None else None
            if bracket is not None:
                if not name.startswith(":"):
                    name = ":" + name
                value = bracket.group(1)
            else:
                # NOTE(review): an empty quoted value (`attr=""`) is falsy here and therefore
                # ends up as `True` below - confirm whether that is intended.
                value = attr.group("double") or attr.group("single") or attr.group("open")

            if value in ["yes", "true", None]:
                value = True
            elif value in ["no", "false"]:
                value = False

            attributes[name] = value
        return attributes

    def __parse_tag(self, source: str, position: Position):
        """Parse a tag from the given source. This includes the tag start, attributes and tag end.
        It will also parse any comments and text from the start of the source to the start of the
        tag.

        The caller must make sure that `RE.tag_start` matches somewhere in `source`.

        Returns:
            tuple: The source left after the tag, the tag start as `(index, text, groups)`,
                the attribute dict, the tag end as `(index, text, groups)` and the list of
                text/comment nodes found in front of the tag.

        Raises:
            Exception: If no `>` is found after the tag start.
        """
        begin = RE.tag_start.search(source)
        begin = (begin.start(), begin.group(0), begin.groupdict())
        # A nameless closing tag is captured by the second alternative of the pattern.
        begin[2]["opening"] = begin[2]["opening"] or begin[2]["opening2"]

        elems = []
        if begin[0] > 0:
            elems = self.__parse_text_comment(source[:begin[0]], position)
        position.end.column = position.start.column + len(begin[1])
        source = source[begin[0] + len(begin[1]):]

        end = RE.tag_end.search(source)
        if end is None:
            raise Exception(f"Expected tag {begin} to be closed with symbol '>'. Was not closed.")
        end = (end.start(), end.group(0), end.groupdict())

        # NOTE(review): the column is accumulated even when the tag spans several lines -
        # confirm against how positions are consumed.
        line, col = self.__calc_line_col(source, end[0] + len(end[1]))
        position.end.line = position.start.line + line
        position.end.column = position.end.column + col

        attributes = self.__parse_attributes(source[:end[0]])
        return source[end[0] + len(end[1]):], begin, attributes, end, elems

    def is_self_closing(self, name: str, auto_closing: bool) -> bool:
        """Check if the tag is self closing. Only check if auto_closing is toggled on."""

        if auto_closing:
            return name in self_closing
        return False

    def parse(self, source: str, auto_close: bool = True) -> AST:
        """Parse a given html or phml string into its corresponding phml ast.

        Args:
            source (str): The html or phml source to parse.
            auto_close (bool): When on, tags whose name is listed in `self_closing` are
                treated as self closing even without a trailing `/>`.

        Returns:
            AST: A phml AST representing the parsed code source.

        Raises:
            Exception: If a closing tag does not match the most recently opened tag, or if
                a tag start is never closed with `>`.
        """

        self.tag_stack = []
        current = Root()
        position = Position((0, 0), (0, 0))

        while RE.tag_start.search(source) is not None:
            source, begin, attr, end, elems = self.__parse_tag(source, position)

            if elems:
                current.extend(elems)

            name = begin[2]["name"] or ''
            if begin[2]["opening"] == "/":
                # A closing tag without any open tag is unbalanced as well; report it with
                # the same error instead of failing on the empty stack.
                expected = self.tag_stack[-1] if self.tag_stack else None
                if name != expected:
                    raise Exception(
                        f"Unbalanced tags: {name!r} | {expected!r} at {position}"
                    )

                self.tag_stack.pop()
                current.position.end.line = position.end.line
                current.position.end.column = position.end.column

                current = current.parent
            elif begin[2]["opening"] == "!":
                current.append(DocType(attr.get("lang", "html"), position=deepcopy(position)))
            elif (
                end[2]["closing"] != "/"
                and not self.is_self_closing(name, auto_close)
                and begin[2]["opening"] is None
            ):
                self.tag_stack.append(name)
                current.append(Element(name, attr, position=deepcopy(position)))
                # Descend into the element that was just opened.
                current = current.children[-1]
            else:
                current.append(Element(name, attr, position=deepcopy(position), startend=True))

            position.start = deepcopy(position.end)

        # NOTE(review): text left in `source` after the last tag is not added to the tree,
        # and unclosed tags leave `current` below the root - confirm this is intended.
        return AST(current)
def parse_hypertest_markup( data: str, class_name: str, auto_close: bool = True) -> phml.core.nodes.AST.AST:
def parse_hypertest_markup(data: str, class_name: str, auto_close: bool = True) -> AST:
    """Parse a string as a hypertext markup document.

    Args:
        data (str): The html or phml source to parse.
        class_name (str): Name of the calling class. Only used to build the error message.
        auto_close (bool): Forwarded unchanged to `HypertextMarkupParser.parse`.

    Returns:
        AST: The result of `HypertextMarkupParser.parse`.

    Raises:
        TypeError: If `data` is not a str. `TypeError` is a subclass of `Exception`, so
            callers that catch `Exception` keep working.
    """
    # Validate first so invalid input fails before any parser is created.
    if not isinstance(data, str):
        raise TypeError(f"Data passed to {class_name}.parse must be a str")

    return HypertextMarkupParser().parse(data, auto_close=auto_close)

Parse a string as a hypertext markup document.

def strip_blank_lines(data_lines: list[str]) -> list[str]:
def strip_blank_lines(data_lines: list[str]) -> list[str]:
    """Drop the blank lines at the start and at the end of a list of lines.

    Every `\\r\\n` inside a line is first replaced with `\\n`. A line counts as blank when
    it is empty or holds only whitespace. Blank lines between two non-blank lines are kept.
    An empty list is returned when every line is blank.
    """
    normalized = [entry.replace("\r\n", "\n") for entry in data_lines]

    # Index of the first line holding something other than whitespace.
    first = next((pos for pos, entry in enumerate(normalized) if entry.strip()), None)
    if first is None:
        return []

    # Walk backwards to the last non-blank line; `first` guarantees that one exists.
    last = next(
        pos for pos in range(len(normalized) - 1, first - 1, -1) if normalized[pos].strip()
    )
    return normalized[first : last + 1]

Strip the blank lines at the start and end of a list of lines.

def strip(data: str, cur_tags: list[str]) -> tuple[str, int, int]:
def strip(data: str, cur_tags: list[str]) -> str:
    """Strip the leading and trailing blank lines of a possibly multiline string.

    Args:
        data (str): The text to strip.
        cur_tags (list[str]): Names of the currently open tags. When it contains `"pre"`
            the data is returned untouched.

    Returns:
        str: `data` without its leading and trailing blank lines. Single line data and
            data inside of a `pre` tag are returned as they are.
    """
    # Nothing to strip for preformatted text or for data without a line break.
    if "pre" in cur_tags or "\n" not in data:
        return data

    return "\n".join(strip_blank_lines(data.split("\n")))

This function takes a possibly multiline string and strips leading and trailing blank lines. Given the current position it will also calculate the line and column that the data ends at.

class RE:
 90class RE:
 91    tag_start = re.compile(r"<(?!!--)(?P<opening>!|\/)?(?P<name>([\w:\.]+\-?)+)|<(?P<opening2>/)?(?=\s+>|>)")
 92    """Matches the start of a tag `<!name|</name|<name`"""
 93
 94    tag_end = re.compile(r"(?P<closing>/?)>")
 95    """Matches the end of a tag `/>|>`."""
 96
 97    comment = re.compile(r"<!--((?:.|\s)*)-->")
 98    """Matches all html style comments `<!--Comment-->`."""
 99
100    attribute = re.compile(r"(?P<name>[\w:\-@]+)(?:=(?P<value>\{(?P<curly>[^\}]*)\/\}|\"(?P<double>[^\"]*)\"|'(?P<single>[^']*)'|(?P<open>[^>'\"]+)))?")
101    """Matches a tags attributes `attr|attr=value|attr='value'|attr="value"`."""
102    
103    bracket_attributte = re.compile(r"^\s*\{((?:\s|.)*)\/\}\s*$")
RE()
tag_start = re.compile('<(?!!--)(?P<opening>!|\\/)?(?P<name>([\\w:\\.]+\\-?)+)|<(?P<opening2>/)?(?=\\s+>|>)')

Matches the start of a tag <!name|</name|<name

tag_end = re.compile('(?P<closing>/?)>')

Matches the end of a tag />|>.

comment = re.compile('<!--((?:.|\\s)*)-->')

Matches all html style comments <!--Comment-->.

attribute = re.compile('(?P<name>[\\w:\\-@]+)(?:=(?P<value>\\{(?P<curly>[^\\}]*)\\/\\}|\\"(?P<double>[^\\"]*)\\"|\'(?P<single>[^\']*)\'|(?P<open>[^>\'\\"]+)))?')

Matches a tags attributes attr|attr=value|attr='value'|attr="value".

class HypertextMarkupParser:
class HypertextMarkupParser:
    """Parse html/xml like source code strings."""

    tag_stack = []
    """Current stack of tags in order of when they are opened."""

    def __calc_line_col(self, source: str, start: int) -> tuple[int, int]:
        """Calculate the number of lines and columns that lead to the starting point in the
        source string.

        Returns:
            tuple[int, int]: The number of newlines in `source[:start]` and the number of
                characters that follow the last of those newlines.
        """
        source = source[:start]
        # `str.split` always returns at least one item, so the last item always exists.
        return source.count("\n"), len(source.split("\n")[-1])

    def __calc_col(self, num_lines: int, num_cols: int, init_cols: int) -> int:
        """Calculate whether the number of columns should be added to the current column or be
        treated as if it is starting from zero based on whether new lines exist.
        """
        return num_cols if num_lines != 0 else init_cols + num_cols

    def __parse_text_comment(self, text: str, pos: Position) -> list[Node]:
        """Parse the comments and general text found in the provided source.

        `pos` is updated in place while the text is consumed.

        Returns:
            list[Node]: The `Text` and `Comment` nodes in the order they were found.
        """

        elements = []  # List of text and comment elements.

        # For each comment add it to the list of elements
        while (comment := RE.comment.search(text)) is not None:
            line_s, col_s = self.__calc_line_col(text, comment.start())
            line_e, col_e = self.__calc_line_col(comment.group(0), len(comment.group(0)))

            pos.start = Point(
                pos.start.line + line_s,
                self.__calc_col(line_s, col_s, pos.start.column)
            )
            pos.end = Point(
                pos.start.line + line_e,
                self.__calc_col(line_e, col_e, pos.start.column)
            )

            # If there is text in front of the comment then add a text element.
            # NOTE(review): this text node receives the same position as the comment that
            # follows it - confirm whether that is intended.
            if comment.start() > 0:
                elements.append(Text(
                    text[:comment.start()],
                    position=deepcopy(pos)
                ))

            text = text[comment.end():]
            elements.append(
                Comment(comment.group(1), position=deepcopy(pos))
            )

        # remaining text is added as a text element
        if len(text) > 0:
            line, col = self.__calc_line_col(text, len(text))
            pos.start.line += line
            pos.start.column = col

            elements.append(Text(
                text,
                position=Position(
                    deepcopy(pos.end),
                    (pos.end.line + line, self.__calc_col(line, col, pos.end.column))
                )
            ))
        return elements

    def __parse_attributes(self, attrs: str) -> dict:
        """Parse a tags attributes from the text found between the tag start and the tag end.

        Values wrapped as `{.../}` are stored under a name prefixed with `:`. The values
        `yes`, `true` and a missing value become `True`; `no` and `false` become `False`.

        Example:
            `<name (attributes)>`
        """
        attributes = {}
        for attr in RE.attribute.finditer(attrs):
            name = attr.group("name")
            value = attr.group("value")

            bracket = RE.bracket_attributte.match(value) if value is not None else None
            if bracket is not None:
                if not name.startswith(":"):
                    name = ":" + name
                value = bracket.group(1)
            else:
                # NOTE(review): an empty quoted value (`attr=""`) is falsy here and therefore
                # ends up as `True` below - confirm whether that is intended.
                value = attr.group("double") or attr.group("single") or attr.group("open")

            if value in ["yes", "true", None]:
                value = True
            elif value in ["no", "false"]:
                value = False

            attributes[name] = value
        return attributes

    def __parse_tag(self, source: str, position: Position):
        """Parse a tag from the given source. This includes the tag start, attributes and tag end.
        It will also parse any comments and text from the start of the source to the start of the
        tag.

        The caller must make sure that `RE.tag_start` matches somewhere in `source`.

        Returns:
            tuple: The source left after the tag, the tag start as `(index, text, groups)`,
                the attribute dict, the tag end as `(index, text, groups)` and the list of
                text/comment nodes found in front of the tag.

        Raises:
            Exception: If no `>` is found after the tag start.
        """
        begin = RE.tag_start.search(source)
        begin = (begin.start(), begin.group(0), begin.groupdict())
        # A nameless closing tag is captured by the second alternative of the pattern.
        begin[2]["opening"] = begin[2]["opening"] or begin[2]["opening2"]

        elems = []
        if begin[0] > 0:
            elems = self.__parse_text_comment(source[:begin[0]], position)
        position.end.column = position.start.column + len(begin[1])
        source = source[begin[0] + len(begin[1]):]

        end = RE.tag_end.search(source)
        if end is None:
            raise Exception(f"Expected tag {begin} to be closed with symbol '>'. Was not closed.")
        end = (end.start(), end.group(0), end.groupdict())

        # NOTE(review): the column is accumulated even when the tag spans several lines -
        # confirm against how positions are consumed.
        line, col = self.__calc_line_col(source, end[0] + len(end[1]))
        position.end.line = position.start.line + line
        position.end.column = position.end.column + col

        attributes = self.__parse_attributes(source[:end[0]])
        return source[end[0] + len(end[1]):], begin, attributes, end, elems

    def is_self_closing(self, name: str, auto_closing: bool) -> bool:
        """Check if the tag is self closing. Only check if auto_closing is toggled on."""

        if auto_closing:
            return name in self_closing
        return False

    def parse(self, source: str, auto_close: bool = True) -> AST:
        """Parse a given html or phml string into its corresponding phml ast.

        Args:
            source (str): The html or phml source to parse.
            auto_close (bool): When on, tags whose name is listed in `self_closing` are
                treated as self closing even without a trailing `/>`.

        Returns:
            AST: A phml AST representing the parsed code source.

        Raises:
            Exception: If a closing tag does not match the most recently opened tag, or if
                a tag start is never closed with `>`.
        """

        self.tag_stack = []
        current = Root()
        position = Position((0, 0), (0, 0))

        while RE.tag_start.search(source) is not None:
            source, begin, attr, end, elems = self.__parse_tag(source, position)

            if elems:
                current.extend(elems)

            name = begin[2]["name"] or ''
            if begin[2]["opening"] == "/":
                # A closing tag without any open tag is unbalanced as well; report it with
                # the same error instead of failing on the empty stack.
                expected = self.tag_stack[-1] if self.tag_stack else None
                if name != expected:
                    raise Exception(
                        f"Unbalanced tags: {name!r} | {expected!r} at {position}"
                    )

                self.tag_stack.pop()
                current.position.end.line = position.end.line
                current.position.end.column = position.end.column

                current = current.parent
            elif begin[2]["opening"] == "!":
                current.append(DocType(attr.get("lang", "html"), position=deepcopy(position)))
            elif (
                end[2]["closing"] != "/"
                and not self.is_self_closing(name, auto_close)
                and begin[2]["opening"] is None
            ):
                self.tag_stack.append(name)
                current.append(Element(name, attr, position=deepcopy(position)))
                # Descend into the element that was just opened.
                current = current.children[-1]
            else:
                current.append(Element(name, attr, position=deepcopy(position), startend=True))

            position.start = deepcopy(position.end)

        # NOTE(review): text left in `source` after the last tag is not added to the tree,
        # and unclosed tags leave `current` below the root - confirm this is intended.
        return AST(current)

Parse html/xml like source code strings.

HypertextMarkupParser()
tag_stack = []

Current stack of tags in order of when they are opened.

def is_self_closing(self, name: str, auto_closing: bool) -> bool:
230    def is_self_closing(self, name: str, auto_closing: bool) -> bool:
231        """Check if the tag is self closing. Only check if auto_closing is toggled on."""
232
233        if auto_closing:
234            return name in self_closing
235        return False

Check if the tag is self closing. Only check if auto_closing is toggled on.

def parse(self, source: str, auto_close: bool = True) -> phml.core.nodes.nodes.Root:
237    def parse(self, source: str, auto_close: bool = True) -> Root:
238        """Parse a given html or phml string into it's corresponding phml ast.
239
240        Args:
241            source (str): The html or phml source to parse.
242
243        Returns:
244            AST: A phml AST representing the parsed code source.
245        """
246
247        self.tag_stack = []
248        current = Root()
249        position = Position((0, 0), (0, 0))
250
251        while RE.tag_start.search(source) is not None:
252            source, begin, attr, end, elems = self.__parse_tag(source, position)
253
254            if len(elems) > 0:
255                current.extend(elems)
256
257            name = begin[2]["name"] or ''
258            if begin[2]["opening"] == "/":
259                if name != self.tag_stack[-1]:
260                    input(self.tag_stack)
261                    raise Exception(
262                        f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}"
263                    )
264
265                self.tag_stack.pop()
266                current.position.end.line = position.end.line
267                current.position.end.column = position.end.column
268
269                current = current.parent
270            elif begin[2]["opening"] == "!":
271                current.append(DocType(attr.get("lang", "html"), position=deepcopy(position)))
272            elif (
273                end[2]["closing"] != "/"
274                and not self.is_self_closing(name, auto_close)
275                and begin[2]["opening"] is None
276            ):
277                self.tag_stack.append(name)
278                current.append(Element(name, attr, position=deepcopy(position)))
279                current = current.children[-1]
280            else:
281                current.append(Element(name, attr, position=deepcopy(position), startend=True))
282
283            position.start = deepcopy(position.end)
284
285        return AST(current)

Parse a given html or phml string into its corresponding phml ast.

Arguments:
  • source (str): The html or phml source to parse.
Returns:

AST: A phml AST representing the parsed code source.