phml.utils.transform.sanitize

phml.utils.transform.sanitize

Logic for sanitizing a phml AST.

1"""phml.utils.transform.sanatize
2
3Logic for sanatizing a phml ast.
4"""
5from .clean import sanatize
6from .schema import Schema
7
8__all__ = ["sanatize", "Schema"]
def sanatize( tree: phml.nodes.AST.AST | phml.nodes.root.Root | phml.nodes.element.Element, schema: Optional[phml.utils.transform.sanitize.Schema] = Schema(strip=['script'], ancestors={'tbody': ['table'], 'tfoot': ['table'], 'thead': ['table'], 'td': ['table'], 'th': ['table'], 'tr': ['table']}, protocols={'href': ['http', 'https', 'mailto', 'xmpp', 'irc', 'ircs'], 'cite': ['http', 'https'], 'src': ['http', 'https'], 'longDesc': ['http', 'https']}, tag_names=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br', 'b', 'i', 'strong', 'em', 'a', 'pre', 'code', 'img', 'tt', 'div', 'ins', 'del', 'sup', 'sub', 'p', 'ol', 'ul', 'table', 'thead', 'tbody', 'tfoot', 'blockquote', 'dl', 'dt', 'dd', 'kbd', 'q', 'samp', 'var', 'hr', 'ruby', 'rt', 'rp', 'li', 'tr', 'td', 'th', 's', 'strike', 'summary', 'details', 'caption', 'figure', 'figcaption', 'abbr', 'bdo', 'cite', 'dfn', 'mark', 'small', 'span', 'time', 'wbr', 'input'], attributes={'a': ['href'], 'img': ['src', 'longDesc'], 'input': [['type', 'checkbox'], ['disabled', True]], 'li': [['class', 'task-list-item']], 'div': ['itemScope', 'itemType'], 'blockquote': ['cite'], 'del': ['cite'], 'ins': ['cite'], 'q': ['cite'], '*': ['abbr', 'accept', 'acceptCharset', 'accessKey', 'action', 'align', 'alt', 'ariaDescribedBy', 'ariaHidden', 'ariaLabel', 'ariaLabelledBy', 'axis', 'border', 'cellPadding', 'cellSpacing', 'char', 'charOff', 'charSet', 'checked', 'clear', 'cols', 'colSpan', 'color', 'compact', 'coords', 'dateTime', 'dir', 'disabled', 'encType', 'htmlFor', 'frame', 'headers', 'height', 'hrefLang', 'hSpace', 'isMap', 'id', 'label', 'lang', 'maxLength', 'media', 'method', 'multiple', 'name', 'noHref', 'noShade', 'noWrap', 'open', 'prompt', 'readOnly', 'rel', 'rev', 'rows', 'rowSpan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'start', 'summary', 'tabIndex', 'target', 'title', 'type', 'useMap', 'vAlign', 'value', 'vSpace', 'width', 'itemProp']}, required={'input': {'type': 'checkbox', 'disabled': True}})):
 def sanatize(tree: AST | Root | Element, schema: Optional[Schema] = None):
     """Sanitize the elements and attributes of a phml tree in place.

     Intended for trees built from an unknown source. Run it on an AST that has
     already been compiled to html so that no unknown values go unchecked.

     By default the sanitization schema is the github schema and the behaviour
     follows the hast sanitize utility.

     * [github schema](https://github.com/syntax-tree/hast-util-sanitize/blob/main/lib/schema.js)
     * [hast sanitize](https://github.com/syntax-tree/hast-util-sanitize)

     The passes run in this order:

     1. Elements whose tag is listed in `schema.strip` are removed with `remove_nodes`.
     2. Elements whose tag is not accepted by `schema.tag_names` are removed.
     3. Elements whose tag is a key of `schema.ancestors` are removed unless one of
        their ancestors carries one of the listed tags.
     4. Properties are removed when their name is not allowed for the element
        (its own `schema.attributes` entry combined with the `'*'` entry), when
        their value is not one of the values the entry permits, or when the name
        is a key of `schema.protocols` and the value does not start with one of
        the listed protocols.
     5. Properties listed in `schema.required` are added when missing.

     Note:
         This utility edits the tree in place and returns nothing. Only the
         descendants of `tree` are inspected; the node passed in is not itself
         removed or modified.

     Args:
         tree (AST | Root | Element): The root of the tree that will be sanitized.
         schema (Optional[Schema], optional): User defined schema. `None` (the
             default) uses a fresh `Schema()`, which is the github schema.
     """

     from phml.utils import (  # pylint: disable=import-outside-toplevel
         check,
         is_element,
         remove_nodes,
     )

     # Build the default per call instead of sharing one instance created when
     # the function was defined; this also makes an explicit `None` usable.
     if schema is None:
         schema = Schema()

     src = tree.tree if isinstance(tree, AST) else tree

     for tag in schema.strip:
         remove_nodes(src, ["element", {"tag": tag}])

     def child_elements(node):
         """Return a snapshot of the direct children of `node` that are elements."""
         return [child for child in (node.children or []) if check(child, "element")]

     def tag_lineage(node):
         """Collect the tag of `node` and of every ancestor reachable via `parent`."""
         tags = []
         while node is not None:
             tag = getattr(node, "tag", None)  # a root node may not carry a tag
             if tag is not None:
                 tags.append(tag)
             node = getattr(node, "parent", None)
         return tuple(tags)

     def prune_tags(node):
         """Remove child elements with a disallowed tag; descend into the others."""
         rejected = []
         for child in child_elements(node):
             if is_element(child, schema.tag_names):
                 prune_tags(child)
             else:
                 rejected.append(child)

         for element in rejected:
             node.children.remove(element)

     def prune_ancestors(node, lineage):
         """Remove child elements that lack a required ancestor.

         `lineage` holds the tags of `node` and everything above it, so a `td`
         nested in `table > tr` is kept even though its direct parent is `tr`.
         """
         rejected = []
         for child in child_elements(node):
             needed = schema.ancestors.get(child.tag)
             if needed is not None and not any(tag in needed for tag in lineage):
                 rejected.append(child)
             else:
                 prune_ancestors(child, (*lineage, child.tag))

         for element in rejected:
             node.children.remove(element)

     allowed_by_tag = {}

     def allowed_attributes(tag):
         """Map each allowed property name for `tag` to its permitted values.

         A value of `None` means any value is accepted. Entries for the tag
         itself take precedence over the `'*'` entries of the same name.
         """
         if tag not in allowed_by_tag:
             allowed = {}
             definitions = (
                 *schema.attributes.get("*", []),
                 *schema.attributes.get(tag, []),
             )
             for definition in definitions:
                 if isinstance(definition, str):
                     allowed[definition] = None
                 else:
                     allowed[definition[0]] = list(definition[1:])
             allowed_by_tag[tag] = allowed
         return allowed_by_tag[tag]

     def has_allowed_protocol(name, value):
         """Check `value` against the protocols registered for property `name`."""
         protocols = schema.protocols.get(name)
         if protocols is None:
             return True
         # Non-string values cannot carry a protocol, so they are rejected.
         return isinstance(value, str) and value.startswith(
             tuple(f"{protocol}:" for protocol in protocols)
         )

     def clean_attributes(node):
         """Drop disallowed properties from every element below `node`."""
         for child in child_elements(node):
             allowed = allowed_attributes(child.tag)
             # Iterate over a copy of the names because properties are popped.
             for name in list(child.properties):
                 value = child.properties[name]
                 permitted = (
                     name in allowed
                     and (allowed[name] is None or value in allowed[name])
                     and has_allowed_protocol(name, value)
                 )
                 if not permitted:
                     child.properties.pop(name, None)
             clean_attributes(child)

     def add_required(node):
         """Add missing required properties to every element below `node`."""
         for child in child_elements(node):
             for name, value in schema.required.get(child.tag, {}).items():
                 if name not in child.properties:
                     child[name] = value
             add_required(child)

     prune_tags(src)
     prune_ancestors(src, tag_lineage(src))
     clean_attributes(src)
     add_required(src)

Sanitize elements and attributes in the phml tree. Should be used when using data from an unknown source. It should be used with an AST that has already been compiled to html so that no unknown values go unchecked.

By default the sanitization schema uses the GitHub schema and follows the hast sanitize utility.

Note

This utility will edit the tree in place.

Args
  • tree (AST | Root | Element): The root of the tree that will be sanitized.
  • schema (Optional[Schema], optional): User defined schema. Defaults to github schema.
@dataclass
class Schema:
  6@dataclass
  7class Schema:
  8    """Dataclass of information on how to sanatize a phml tree.
  9
 10    `strip (list[str])`: The elements to strip from the tree.
 11    `protocols (dict[str, list])`: Collection of element name and allowed protocal value list
 12    `tag_names (list[str])`: List of allowed tag names.
 13    `attributes (dict[str, list[str | list[str]]])`: Collection of element name and allowed property
 14    names.
 15    `required (dict[str, str | list[str]])`: Collection of element names and their required
 16    properties and required property values.
 17    """
 18
 19    strip: list[str] = field(default_factory=lambda: ['script'])
 20    ancestors: dict[str, list] = field(
 21        default_factory=lambda: {
 22            "tbody": ['table'],
 23            "tfoot": ['table'],
 24            "thead": ['table'],
 25            "td": ['table'],
 26            "th": ['table'],
 27            "tr": ['table'],
 28        }
 29    )
 30    protocols: dict[str, list] = field(
 31        default_factory=lambda: {
 32            "href": ['http', 'https', 'mailto', 'xmpp', 'irc', 'ircs'],
 33            "cite": ['http', 'https'],
 34            "src": ['http', 'https'],
 35            "longDesc": ['http', 'https'],
 36        }
 37    )
 38    tag_names: list[str] = field(
 39        default_factory=lambda: [
 40            'h1',
 41            'h2',
 42            'h3',
 43            'h4',
 44            'h5',
 45            'h6',
 46            'br',
 47            'b',
 48            'i',
 49            'strong',
 50            'em',
 51            'a',
 52            'pre',
 53            'code',
 54            'img',
 55            'tt',
 56            'div',
 57            'ins',
 58            'del',
 59            'sup',
 60            'sub',
 61            'p',
 62            'ol',
 63            'ul',
 64            'table',
 65            'thead',
 66            'tbody',
 67            'tfoot',
 68            'blockquote',
 69            'dl',
 70            'dt',
 71            'dd',
 72            'kbd',
 73            'q',
 74            'samp',
 75            'var',
 76            'hr',
 77            'ruby',
 78            'rt',
 79            'rp',
 80            'li',
 81            'tr',
 82            'td',
 83            'th',
 84            's',
 85            'strike',
 86            'summary',
 87            'details',
 88            'caption',
 89            'figure',
 90            'figcaption',
 91            'abbr',
 92            'bdo',
 93            'cite',
 94            'dfn',
 95            'mark',
 96            'small',
 97            'span',
 98            'time',
 99            'wbr',
100            'input',
101        ]
102    )
103    attributes: dict[str, list[str | list[str]]] = field(
104        default_factory=lambda: {
105            "a": ['href'],
106            "img": ['src', 'longDesc'],
107            "input": [['type', 'checkbox'], ['disabled', True]],
108            "li": [['class', 'task-list-item']],
109            "div": ['itemScope', 'itemType'],
110            "blockquote": ['cite'],
111            "del": ['cite'],
112            "ins": ['cite'],
113            "q": ['cite'],
114            '*': [
115                'abbr',
116                'accept',
117                'acceptCharset',
118                'accessKey',
119                'action',
120                'align',
121                'alt',
122                'ariaDescribedBy',
123                'ariaHidden',
124                'ariaLabel',
125                'ariaLabelledBy',
126                'axis',
127                'border',
128                'cellPadding',
129                'cellSpacing',
130                'char',
131                'charOff',
132                'charSet',
133                'checked',
134                'clear',
135                'cols',
136                'colSpan',
137                'color',
138                'compact',
139                'coords',
140                'dateTime',
141                'dir',
142                'disabled',
143                'encType',
144                'htmlFor',
145                'frame',
146                'headers',
147                'height',
148                'hrefLang',
149                'hSpace',
150                'isMap',
151                'id',
152                'label',
153                'lang',
154                'maxLength',
155                'media',
156                'method',
157                'multiple',
158                'name',
159                'noHref',
160                'noShade',
161                'noWrap',
162                'open',
163                'prompt',
164                'readOnly',
165                'rel',
166                'rev',
167                'rows',
168                'rowSpan',
169                'rules',
170                'scope',
171                'selected',
172                'shape',
173                'size',
174                'span',
175                'start',
176                'summary',
177                'tabIndex',
178                'target',
179                'title',
180                'type',
181                'useMap',
182                'vAlign',
183                'value',
184                'vSpace',
185                'width',
186                'itemProp',
187            ],
188        }
189    )
190    required: dict[str, str | list[str]] = field(
191        default_factory=lambda: {
192            "input": {
193                "type": 'checkbox',
194                "disabled": True,
195            }
196        }
197    )

Dataclass of information on how to sanitize a phml tree.

strip (list[str]): The elements to strip from the tree. protocols (dict[str, list]): Collection of attribute names and the list of protocols allowed in their values. tag_names (list[str]): List of allowed tag names. attributes (dict[str, list[str | list[str]]]): Collection of element names and allowed property names. required (dict[str, str | list[str]]): Collection of element names and their required properties and required property values.

Schema( strip: list[str] = <factory>, ancestors: dict[str, list] = <factory>, protocols: dict[str, list] = <factory>, tag_names: list[str] = <factory>, attributes: dict[str, list[str | list[str]]] = <factory>, required: dict[str, str | list[str]] = <factory>)