phml.utilities.transform.sanitize

phml.utilities.transform.sanitize

Logic for sanitizing a phml AST.

1"""phml.utilities.transform.sanatize
2
3Logic for sanatizing a phml ast.
4"""
5from .clean import sanatize
6from .schema import Schema
7
8__all__ = ["sanatize", "Schema"]
def sanatize( tree: phml.core.nodes.AST.AST | phml.core.nodes.nodes.Root | phml.core.nodes.nodes.Element, schema: Optional[phml.utilities.transform.sanitize.Schema] = Schema(strip=['script'], ancestors={'tbody': ['table'], 'tfoot': ['table'], 'thead': ['table'], 'td': ['table'], 'th': ['table'], 'tr': ['table']}, protocols={'href': ['http', 'https', 'mailto', 'xmpp', 'irc', 'ircs'], 'cite': ['http', 'https'], 'src': ['http', 'https'], 'longDesc': ['http', 'https']}, tag_names=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br', 'b', 'i', 'strong', 'em', 'a', 'pre', 'code', 'img', 'tt', 'div', 'ins', 'del', 'sup', 'sub', 'p', 'ol', 'ul', 'table', 'thead', 'tbody', 'tfoot', 'blockquote', 'dl', 'dt', 'dd', 'kbd', 'q', 'samp', 'var', 'hr', 'ruby', 'rt', 'rp', 'li', 'tr', 'td', 'th', 's', 'strike', 'summary', 'details', 'caption', 'figure', 'figcaption', 'abbr', 'bdo', 'cite', 'dfn', 'mark', 'small', 'span', 'time', 'wbr', 'input'], attributes={'a': ['href'], 'img': ['src', 'longDesc'], 'input': [['type', 'checkbox'], ['disabled', True]], 'li': [['class', 'task-list-item']], 'div': ['itemScope', 'itemType'], 'blockquote': ['cite'], 'del': ['cite'], 'ins': ['cite'], 'q': ['cite'], '*': ['abbr', 'accept', 'acceptCharset', 'accessKey', 'action', 'align', 'alt', 'ariaDescribedBy', 'ariaHidden', 'ariaLabel', 'ariaLabelledBy', 'axis', 'border', 'cellPadding', 'cellSpacing', 'char', 'charOff', 'charSet', 'checked', 'clear', 'cols', 'colSpan', 'color', 'compact', 'coords', 'dateTime', 'dir', 'disabled', 'encType', 'htmlFor', 'frame', 'headers', 'height', 'hrefLang', 'hSpace', 'isMap', 'id', 'label', 'lang', 'maxLength', 'media', 'method', 'multiple', 'name', 'noHref', 'noShade', 'noWrap', 'open', 'prompt', 'readOnly', 'rel', 'rev', 'rows', 'rowSpan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'start', 'summary', 'tabIndex', 'target', 'title', 'type', 'useMap', 'vAlign', 'value', 'vSpace', 'width', 'itemProp']}, required={'input': {'type': 'checkbox', 'disabled': True}})):
 10def sanatize(tree: AST | Root | Element, schema: Optional[Schema] = Schema()):
 11    """Sanatize elements and attributes in the phml tree. Should be used when using
 12    data from an unkown source. It should be used with an AST that has already been
 13    compiled to html to no unkown values are unchecked.
 14
 15    By default the sanatization schema uses the github schema and follows the hast
 16    sanatize utility.
 17
 18    * [github schema](https://github.com/syntax-tree/hast-util-sanitize/blob/main/lib/schema.js)
 19    * [hast sanatize](https://github.com/syntax-tree/hast-util-sanitize)
 20
 21    Note:
 22        This utility will edit the tree in place.
 23
 24    Args:
 25        tree (AST | Root | Element): The root of the tree that will be sanatized.
 26        schema (Optional[Schema], optional): User defined schema. Defaults to github schema.
 27    """
 28
 29    from phml import check, is_element, remove_nodes  # pylint: disable=import-outside-toplevel
 30
 31    if isinstance(tree, AST):
 32        src = tree.tree
 33    else:
 34        src = tree
 35
 36    for strip in schema.strip:
 37        remove_nodes(src, ["element", {"tag": strip}])
 38
 39    def recurse_check_tag(node: Root | Element):
 40        pop_els = []
 41        for idx, child in enumerate(node.children):
 42            if check(child, "element") and not is_element(child, schema.tag_names):
 43                pop_els.append(child)
 44            elif check(node.children[idx], "element"):
 45                recurse_check_tag(node.children[idx])
 46
 47        for element in pop_els:
 48            node.children.remove(element)
 49
 50    def recurse_check_ancestor(node: Root | Element):
 51        pop_els = []
 52        for idx, child in enumerate(node.children):
 53            if (
 54                check(child, "element")
 55                and child.tag in schema.ancestors.keys()
 56                and (
 57                    check(child.parent, "root")
 58                    or child.parent.tag not in schema.ancestors[child.tag]
 59                )
 60            ):
 61                pop_els.append(child)
 62            elif check(node.children[idx], "element"):
 63                recurse_check_ancestor(node.children[idx])
 64
 65        for element in pop_els:
 66            node.children.remove(element)
 67
 68    def build_valid_attributes(attributes: list) -> list[str]:
 69        """Extract attributes from schema."""
 70        valid_attrs = []
 71        for attribute in attributes:
 72            valid_attrs = (
 73                [*valid_attrs, attribute]
 74                if isinstance(attribute, str)
 75                else [*valid_attrs, attribute[0]]
 76            )
 77        return valid_attrs
 78
 79    def build_remove_attr_list(properties: dict, attributes: dict, valid_attributes: list):
 80        """Build the list of attributes to remove from a dict of attributes."""
 81        result = []
 82        for attribute in properties:
 83            if attribute not in valid_attributes:
 84                result.append(attribute)
 85            else:
 86                for attr in attributes:
 87                    if isinstance(attr, list) and attr[0] == attribute and len(attr) > 1:
 88                        if not all(val == properties[attribute] for val in attr[1:]) or (
 89                            attribute in schema.protocols
 90                            and not check_protocols(
 91                                properties[attribute], schema.protocols[attribute]
 92                            )
 93                        ):
 94                            result.append(attribute)
 95                            break
 96                    elif (
 97                        attr == attribute
 98                        and attr in schema.protocols
 99                        and not check_protocols(properties[attribute], schema.protocols[attribute])
100                    ):
101                        result.append(attribute)
102                        break
103
104        return result
105
106    def recurse_check_attributes(node: Root | Element):
107        for idx, child in enumerate(node.children):
108            if check(child, "element"):
109                if child.tag in schema.attributes:
110                    valid_attributes = build_valid_attributes(schema.attributes[child.tag])
111
112                    pop_attrs = build_remove_attr_list(
113                        node.children[idx].properties,
114                        schema.attributes[child.tag],
115                        valid_attributes,
116                    )
117
118                    for attribute in pop_attrs:
119                        node.children[idx].properties.pop(attribute, None)
120
121                recurse_check_attributes(node.children[idx])
122
123    def recurse_check_required(node: Root | Element):
124        for idx, child in enumerate(node.children):
125            if check(child, "element") and child.tag in schema.required:
126                for attr, value in schema.required[child.tag].items():
127                    if attr not in child.properties:
128                        node.children[idx][attr] = value
129
130            elif check(node.children[idx], "element"):
131                recurse_check_required(node.children[idx])
132
133    def check_protocols(value: str, protocols: list[str]):
134        for protocol in protocols:
135            if match(f"{protocol}:.*", value) is not None:
136                return True
137        return False
138
139    recurse_check_tag(src)
140    recurse_check_ancestor(src)
141    recurse_check_attributes(src)
142    recurse_check_required(src)

Sanitize elements and attributes in the phml tree. Use this when working with data from an unknown source. It should be applied to an AST that has already been compiled to HTML so that no unknown values go unchecked.

By default the sanitization schema is the GitHub schema, and the behavior follows the hast sanitize utility.

Note:

This utility will edit the tree in place.

Arguments:
  • tree (AST | Root | Element): The root of the tree that will be sanitized.
  • schema (Optional[Schema], optional): User defined schema. Defaults to github schema.
@dataclass
class Schema:
  6@dataclass
  7class Schema:
  8    """Dataclass of information on how to sanatize a phml tree.
  9
 10    `strip (list[str])`: The elements to strip from the tree.
 11    `protocols (dict[str, list])`: Collection of element name and allowed protocal value list
 12    `tag_names (list[str])`: List of allowed tag names.
 13    `attributes (dict[str, list[str | list[str]]])`: Collection of element name and allowed property
 14    names.
 15    `required (dict[str, str | list[str]])`: Collection of element names and their required
 16    properties and required property values.
 17    """
 18
 19    strip: list[str] = field(default_factory=lambda: ['script'])
 20    ancestors: dict[str, list] = field(
 21        default_factory=lambda: {
 22            "tbody": ['table'],
 23            "tfoot": ['table'],
 24            "thead": ['table'],
 25            "td": ['table'],
 26            "th": ['table'],
 27            "tr": ['table'],
 28        }
 29    )
 30    protocols: dict[str, list] = field(
 31        default_factory=lambda: {
 32            "href": ['http', 'https', 'mailto', 'xmpp', 'irc', 'ircs'],
 33            "cite": ['http', 'https'],
 34            "src": ['http', 'https'],
 35            "longDesc": ['http', 'https'],
 36        }
 37    )
 38    tag_names: list[str] = field(
 39        default_factory=lambda: [
 40            'h1',
 41            'h2',
 42            'h3',
 43            'h4',
 44            'h5',
 45            'h6',
 46            'br',
 47            'b',
 48            'i',
 49            'strong',
 50            'em',
 51            'a',
 52            'pre',
 53            'code',
 54            'img',
 55            'tt',
 56            'div',
 57            'ins',
 58            'del',
 59            'sup',
 60            'sub',
 61            'p',
 62            'ol',
 63            'ul',
 64            'table',
 65            'thead',
 66            'tbody',
 67            'tfoot',
 68            'blockquote',
 69            'dl',
 70            'dt',
 71            'dd',
 72            'kbd',
 73            'q',
 74            'samp',
 75            'var',
 76            'hr',
 77            'ruby',
 78            'rt',
 79            'rp',
 80            'li',
 81            'tr',
 82            'td',
 83            'th',
 84            's',
 85            'strike',
 86            'summary',
 87            'details',
 88            'caption',
 89            'figure',
 90            'figcaption',
 91            'abbr',
 92            'bdo',
 93            'cite',
 94            'dfn',
 95            'mark',
 96            'small',
 97            'span',
 98            'time',
 99            'wbr',
100            'input',
101        ]
102    )
103    attributes: dict[str, list[str | list[str]]] = field(
104        default_factory=lambda: {
105            "a": ['href'],
106            "img": ['src', 'longDesc'],
107            "input": [['type', 'checkbox'], ['disabled', True]],
108            "li": [['class', 'task-list-item']],
109            "div": ['itemScope', 'itemType'],
110            "blockquote": ['cite'],
111            "del": ['cite'],
112            "ins": ['cite'],
113            "q": ['cite'],
114            '*': [
115                'abbr',
116                'accept',
117                'acceptCharset',
118                'accessKey',
119                'action',
120                'align',
121                'alt',
122                'ariaDescribedBy',
123                'ariaHidden',
124                'ariaLabel',
125                'ariaLabelledBy',
126                'axis',
127                'border',
128                'cellPadding',
129                'cellSpacing',
130                'char',
131                'charOff',
132                'charSet',
133                'checked',
134                'clear',
135                'cols',
136                'colSpan',
137                'color',
138                'compact',
139                'coords',
140                'dateTime',
141                'dir',
142                'disabled',
143                'encType',
144                'htmlFor',
145                'frame',
146                'headers',
147                'height',
148                'hrefLang',
149                'hSpace',
150                'isMap',
151                'id',
152                'label',
153                'lang',
154                'maxLength',
155                'media',
156                'method',
157                'multiple',
158                'name',
159                'noHref',
160                'noShade',
161                'noWrap',
162                'open',
163                'prompt',
164                'readOnly',
165                'rel',
166                'rev',
167                'rows',
168                'rowSpan',
169                'rules',
170                'scope',
171                'selected',
172                'shape',
173                'size',
174                'span',
175                'start',
176                'summary',
177                'tabIndex',
178                'target',
179                'title',
180                'type',
181                'useMap',
182                'vAlign',
183                'value',
184                'vSpace',
185                'width',
186                'itemProp',
187            ],
188        }
189    )
190    required: dict[str, str | list[str]] = field(
191        default_factory=lambda: {
192            "input": {
193                "type": 'checkbox',
194                "disabled": True,
195            }
196        }
197    )

Dataclass of information on how to sanitize a phml tree.

strip (list[str]): The elements to strip from the tree.
ancestors (dict[str, list]): Collection of element names and the elements they have to be nested in.
protocols (dict[str, list]): Collection of property names and the list of protocols allowed in their values.
tag_names (list[str]): List of allowed tag names.
attributes (dict[str, list[str | list[str]]]): Collection of element names and allowed property names.
required (dict[str, str | list[str]]): Collection of element names and their required properties and required property values.

Schema( strip: list[str] = <factory>, ancestors: dict[str, list] = <factory>, protocols: dict[str, list] = <factory>, tag_names: list[str] = <factory>, attributes: dict[str, list[str | list[str]]] = <factory>, required: dict[str, str | list[str]] = <factory>)