Source code for badsnakes.libs.extractor

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides abstract syntax tree node visitation
            and attribute extraction functionality.

:Platform:  Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

:Example:
    Example code use::

        >>> from badsnakes.libs.parser import Parser
        >>> from badsnakes.libs.extractor import Extractor

        >>> p = Parser()
        >>> e = Extractor()
        >>> p.parse(path='hello.py')
        >>> e.extract(node=p.ast_)

        # Display the extracted nodes.
        >>> e.display()

"""
# pylint: disable=import-error

import ast
import logging
# locals
from badsnakes.libs.containers import (Assign,
                                       Attribute,
                                       Call,
                                       Constant,
                                       FunctionDef,
                                       Import)

logger = logging.getLogger(__name__)


[docs] class Extractor(ast.NodeVisitor): """Inspect, extract and store relevant AST node attributes.""" def __init__(self): """Node visitor class initialiser.""" super().__init__() self._args = [] # Storage for ast.Constant used as function arguments. self._assigns = [] # Storage for ast.Assign self._attrs = [] # Storage for ast.Attribute self._calls = [] # Storage for ast.Call self._constants = [] # Storage for ast.Constant self._funcdefs = [] # Storage for ast.FunctionDef self._imports = [] # Storage for ast.Import and ast.ImportFrom self._docs = set() # Extracted docstrings used to filter constants.
[docs] def display(self, name: str=None): """Display the extracted contents. The extracted attributes for each of the following AST nodes are displayed here: - ast.Assign - ast.Attribute - ast.Call - ast.Constants - ast.FunctionDef - ast.Import - ast.ImportFrom Args: name (str, optional): Name of the Python module being displayed. Defaults to None. """ title = f'Extracted attributes from: {name}' sep = '-' * len(title) # This is intentionally verbose for easy readability and maintenance. print('', sep, title, sep, '', 'Function arguments:', *(n for n in self._args), '', 'Assignments:', *(n for n in self._assigns), '', 'Attributes:', *(n for n in self._attrs), '', 'Function calls:', *(n for n in self._calls), '', 'Constants:', *(n for n in self._constants), '', 'Function definitions:', *(n for n in self._funcdefs), '', 'Imports:', *(n for n in self._imports), sep='\n')
[docs] def extract(self, node: ast.Module): """Extract and store relevant attributes from a parsed AST. This method is an alias for the :meth:`ast.NodeVisitor.visit` which is called directly, after the docstrings have been extracted. Args: node (ast.Module): Starting node to be visited from which attributes are to be extracted. """ if node: self._extract_docstrings(node=node) self.visit(node=node)
[docs] def visit_Assign(self, node: ast.Assign): """Extract attributes of interest from ``ast.Assign`` nodes. Generally, the assignments are used by the analyser to detect (very) long strings, or suspicious module or function aliasing. For example: - [A very very long string which may be base64 encoded code] - A URL including 'http' - cexe = exec - lave = eval - _i = __import__ Args: node (ast.Assign): A node of type ``ast.Assign``. """ # Capture assigned values. (e.g.: callables, constants, tuple assignments) for target, value in zip(node.targets, [node.value]): # Assignment of a callable. (e.g.: c = __import__('builtins').compile) if isinstance(value, ast.Attribute): if isinstance(value.value, ast.Call): a = Assign(name=getattr(target, 'id', None) or getattr(target, 'attr', None), value=value.attr, line=node.lineno, line_end=node.end_lineno) self._assigns.append(a) # Assignment of a 'normal' constant. (e.g.: x = 'a string') elif isinstance(value, ast.Constant): a = Assign(name=getattr(target, 'id', None), value=value.value, line=node.lineno, line_end=node.end_lineno) self._assigns.append(a) elif isinstance(value, ast.Tuple): if hasattr(target, 'elts'): # Assignment from tuple unpacking. (e.g.: a, b = 'thingA', 'thingB') for t_elt, v_elt in zip(target.elts, value.elts): a = Assign(name=getattr(t_elt, 'id', None), value=getattr(v_elt, 'value', None), line=node.lineno, line_end=node.end_lineno) self._assigns.append(a) else: # Assignment from tuple packing. (e.g.: a = ('thingA', 'thingB')) for v in value.elts: a = Assign(name=(getattr(target, 'id', None) or getattr(target, 'attr', None)), value=getattr(v, 'value', None), line=node.lineno, line_end=node.end_lineno) self._assigns.append(a) self.generic_visit(node=node)
[docs] def visit_Attribute(self, node: ast.Attribute): """Extract attributes of interest from ``ast.Attribute`` nodes. For example: - ``__builtins__.__getattribute__`` - ``ctypes.windll`` - ``os.system`` Args: node (ast.Attribute): A node of type ``ast.Attribute``. """ if isinstance(node.value, ast.Name): a = Attribute(name=node.value.id, value=node.attr, line=node.value.lineno, line_end=node.end_lineno) self._attrs.append(a) self.generic_visit(node=node)
[docs] def visit_Call(self, node: ast.Call): """Extract attributes of interest from ``ast.Call`` nodes. Generally, function calls are used by the analyser to detect calls to functions which are generally considered unsafe, or used for suspicious activity. Additionally, any arguments into these function calls are stored into the :attr:`_args` class attribute, to be later added to the ``Module.arguments`` object. For example: - Calls ``compile``, ``exec`` or ``eval`` - Disguised imports using ``__import__`` - Calls to ``requests.post`` Args: node (ast.Call): A node of type ``ast.Call``. """ # Function calls which are attributes of objects. (e.g.: str.join()) if isinstance(node.func, ast.Attribute): c = Call(name=node.func.attr, module=getattr(node.func.value, 'id', None), line=node.lineno, line_end=node.end_lineno) self._calls.append(c) # Function calls. (e.g.: print(), or test1()) elif isinstance(node.func, ast.Name): c = Call(name=node.func.id, line=node.lineno, line_end=node.end_lineno) self._calls.append(c) # Capture function call arguments. for arg in node.args: if isinstance(arg, ast.Constant): cst = Constant(name=node.func.id, value=arg.value, line=node.lineno, line_end=node.end_lineno) self._args.append(cst) self.generic_visit(node=node)
[docs] def visit_Constant(self, node: ast.Constant): """Extract attributes of interest from ``ast.Constant`` nodes. Generally, the constants of interest here are *strings*. The extracted strings will be compared against the blacklisted strings to determine if any suspicious activities are being attempted. :Docstrings: Often times, a docstring containing benign text such as a semi-colon or the term 'execute' can flag a module as dangerous during a string search. Because of this, the AST is walked to collect and store all docstrings when :meth:`extract` method is called. A constant node is only stored by this method for analysis if the constant's value was *not* found in the stored docstrings. For further rationale on this, please refer to the :meth:`_extract_docstrings` method. For example: - Calls to cmd.exe or powershell - References to Bitcoin or other payment demands - Windows registry key paths Args: node (ast.Constant): A node of type ``ast.Constant``. """ if node.value not in self._docs: # i.e.: Verify string is not a docstring. c = Constant(value=node.value, line=node.lineno, line_end=node.end_lineno) self._constants.append(c) self.generic_visit(node=node)
[docs] def visit_FunctionDef(self, node: ast.FunctionDef): """Extract attributes of interest from ``ast.FunctionDef`` nodes. Generally, the analyser will use these nodes in search of obfuscated function names, indicating suspicious activity. For example: - ``_`` - ``__`` - ``_0xb1`` - ``_00OO00OO`` - ``_01001001`` Args: node (ast.FunctionDef): A node of type ``ast.FunctionDef``. """ f = FunctionDef(name=node.name, line=node.lineno, line_end=node.end_lineno) self._funcdefs.append(f) self.generic_visit(node=node)
[docs] def visit_Import(self, node: ast.Import): """Extract attributes of interest from ``ast.Import`` nodes. Generally, the analyser will use these nodes in search of module imports which may indicate suspicious activity. For example: - import requests - import winreg - import ctypes as ct - import win32api as _win32api - import win32con as _win32con Args: node (ast.Import): A node of type ``ast.Import``. """ for n in node.names: i = Import(module=n.name, asname=n.asname, line=n.lineno, line_end=n.end_lineno) self._imports.append(i) self.generic_visit(node=node)
[docs] def visit_ImportFrom(self, node: ast.ImportFrom): """Extract attributes of interest from ``ast.ImportFrom`` nodes. Generally, the analyser will use these nodes in search of module imports which may indicate suspicious activity. For example: - from win32api import SetFileAttributes - from win32con import SRCAND, FILE_ATTRIBUTE_HIDDEN - from win32file import CreateFileW, WriteFile, CloseHandle Args: node (ast.ImportFrom): A node of type ``ast.ImportFrom``. """ for n in node.names: i = Import(module=node.module, name=n.name, asname=n.asname, line=n.lineno, line_end=n.end_lineno) self._imports.append(i) self.generic_visit(node=node)
[docs] def _extract_docstrings(self, node: ast.Module): """Collect all docstrings in the module and store. Args: node (ast.Module): Top-level AST node to be searched. The extracted (uncleaned) docstrings are stored into the :attr:`_docs` attribute. A constant is only tested if the value is *not* in the ``_docs`` attribute. :Rationale: Extracting and storing docstrings lets us put simple strings such as ``';'`` and ``'()'`` in ``config.toml`` under the ``[analyser.constant.dangerous]`` and ``[analyser.constant.suspect]`` tables without having a false-positive trigger for the string being somewhere in the docstring. """ fns = [n for n in ast.walk(node) if isinstance(n, (ast.ClassDef, ast.FunctionDef, ast.Module))] self._docs = {ast.get_docstring(f, clean=False) for f in fns}