Module implicit_word_network.build_graph

#
# ---------------------------------------------------------------------------- #
#                              Build Word Network                              #
# ---------------------------------------------------------------------------- #
#
# Extract implicit word networks from a user-specified
# set of documents, as described in:
#
#   1. Spitz, “Implicit Entity Networks: A Versatile Document Model.”
#   2. Spitz and Gertz, “Exploring Entity-Centric Networks in Entangled News Streams.”
#

# ---------------------------------- Imports --------------------------------- #

import itertools
import math
from sklearn.cluster import DBSCAN
from tqdm import tqdm

# -------------------------------- Build Graph ------------------------------- #
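
# Note on input format: the functions below assume a nested corpus ``D``
# along the following lines (a sketch inferred from the accessors below,
# not an official schema). Documents, sentences, and tokens are keyed by
# id, and every token carries back-references to its document and
# sentence:
#
#     D = {
#         "doc_0": {                      # d_id -> document
#             0: {                        # s_id -> sentence (int, contiguous)
#                 0: {                    # t_id -> token
#                     "text": "Heidelberg",
#                     "is_entity": True,  # entities also carry "entity_type"
#                     "entity_type": "LOC",
#                     "pos": "PROPN",     # non-entities are keyed by "pos"
#                     "d_id": "doc_0",
#                     "s_id": 0,
#                 },
#             },
#         },
#     }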


def getEntitiesInDoc(d_id, D):
    """
    Return all tokens marked as entities in document.
    """

    d = D[d_id]  # Current document
    E_d = {}  # Init entity dic

    # Iterate over all sentences and tokens
    # in document to find entity instances
    for s_id in d.keys():
        s = d[s_id]

        for t_id in s.keys():
            t = s[t_id]

            if t["is_entity"]:
                # Add instance of entity to dictionary of entities
                e_key = (t["text"].lower(), t["entity_type"])
                if e_key not in E_d:
                    E_d[e_key] = []
                E_d[e_key].append(t)

    return E_d


def getSentencesInDoc(d_id, D):
    """
    Return all sentences in document.
    """

    d = D[d_id]  # Current document
    S_d = [{"d_id": d_id, "s_id": s_id, "type": "s"} for s_id in d.keys()]

    return S_d


def getTermsInSent(s):
    """
    Return all terms in sentence.
    """

    T_s = {}  # Init token dic

    # Iterate over all tokens in sentence
    # to find terms (non entities)
    for t_id in s.keys():
        t = s[t_id]

        if not t["is_entity"]:
            # Add instance of token to dictionary
            e_key = (t["text"].lower(), t["pos"])
            if e_key not in T_s:
                T_s[e_key] = []
            T_s[e_key].append(t)

    return T_s


def getEntsInSent(s):
    """
    Return all entities in sentence.
    """

    E_s = {}  # Init entities dic

    # Iterate over all tokens in sentence
    # to find entities
    for t_id in s.keys():
        t = s[t_id]

        if t["is_entity"]:
            # Add instance of entities to dictionary
            e_key = (t["text"].lower(), t["entity_type"])
            if e_key not in E_s:
                E_s[e_key] = []
            E_s[e_key].append(t)

    return E_s


def extendEntities(V_e, E_d):
    """
    Merge two dictionaries of entity nodes. If an entity is already present, its instance lists are combined.
    """

    # If entity already present
    # add list of instances
    for e_key in E_d:
        if e_key not in V_e:
            V_e[e_key] = E_d[e_key]
        else:
            V_e[e_key].extend(E_d[e_key])


def extendTerms(V_t, T_s):
    """
    Merge two dictionaries of term nodes. If a term is already present, its instance lists are combined.
    """

    # If token already present
    # add new instances
    for e_key in T_s:
        if e_key not in V_t:
            V_t[e_key] = T_s[e_key]
        else:
            V_t[e_key].extend(T_s[e_key])


def linkDocToSent(d, s, Ep_d_s):
    """
    Create edge between document and sentence.
    """

    ep_key = (d["d_id"], s["d_id"], s["s_id"])
    Ep_d_s[ep_key] = {"w": 1}


def linkEntToSent(E_s, Ep_s_e):
    """
    Create edge between entity and sentence.
    """

    for e_key in E_s.keys():

        e = E_s[e_key]
        ep_key = (e[0]["d_id"], e[0]["s_id"], e[0]["text"].lower(), e[0]["entity_type"])
        Ep_s_e[ep_key] = e


def linkTokenToSent(T_s, Ep_s_t):
    """
    Create edge between token and sentence.
    """

    for t_id in T_s.keys():

        t = T_s[t_id]
        ep_key = (t[0]["d_id"], t[0]["s_id"], t[0]["text"].lower(), t[0]["pos"])
        Ep_s_t[ep_key] = t


def linkEntToTerm(E_s, T_s, Ep_e_t):
    """
    Create edge between entity and term.
    """

    # Iterate over every entity and term in sentence
    for t_s_id, e_s_id in (
        (t_s_id, e_s_id) for t_s_id in T_s.keys() for e_s_id in E_s.keys()
    ):
        t = T_s[t_s_id]
        e = E_s[e_s_id]

        # Iterate over every instance of term and entity
        for t_i, e_i in ((t_i, e_i) for t_i in t for e_i in e):

            e_key = (
                e_i["text"].lower(),
                e_i["entity_type"],
                t_i["text"].lower(),
                t_i["pos"],
            )
            e_instance = {"entity": e_i, "term": t_i}
            if e_key not in Ep_e_t:
                Ep_e_t[e_key] = []
            Ep_e_t[e_key].append(e_instance)


def linkEntToEnt(E_d, Ep_e_e, c):
    """
    Create edge between entity and entity.
    """

    # Iterate over every unordered pair of distinct entities in document
    for e_1_id, e_2_id in itertools.combinations(E_d.keys(), 2):
        e_1 = E_d[e_1_id]
        e_2 = E_d[e_2_id]

        e_key = (
            e_1[0]["text"].lower(),
            e_1[0]["entity_type"],
            e_2[0]["text"].lower(),
            e_2[0]["entity_type"],
        )

        # Iterate over every instance of both entities
        for e_1_i, e_2_i in ((e_1_i, e_2_i) for e_1_i in e_1 for e_2_i in e_2):

            # Sentence distance between the two instances
            delta = abs(e_1_i["s_id"] - e_2_i["s_id"])

            # Keep only pairs within the cutoff c and weight them with
            # exponential decay: same sentence -> exp(0) = 1, one
            # sentence apart -> exp(-1) ~ 0.37, and so on.
            if delta <= c:
                w = math.exp(-delta)
                e_instance = {"entity_1": e_1_i, "entity_2": e_2_i, "w": w}
                if e_key not in Ep_e_e:
                    Ep_e_e[e_key] = []
                Ep_e_e[e_key].append(e_instance)


def combineVerticies(V_d, V_s, V_e, V_t):
    """
    Combine type-specific node dictionaries into one.
    """

    # Add document and sentence nodes
    V = {"documents": V_d, "sentences": V_s, "entities": [], "terms": []}

    # Add entity nodes
    for v in V_e.values():
        V["entities"].append(
            {
                "text": v[0]["text"],
                "entity_type": v[0]["entity_type"],
                "type": "e",
                "instances": v,
            }
        )

    # Add term nodes
    for v in V_t.values():
        V["terms"].append(
            {"text": v[0]["text"], "pos": v[0]["pos"], "type": "t", "instances": v,}
        )

    return V
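
# A sketch of one resulting entity node (fields taken from the code above):
#
#     {
#         "text": "Heidelberg",
#         "entity_type": "LOC",
#         "type": "e",
#         "instances": [<token dicts>],
#     }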


def combindeEdges(Ep_d_s, Ep_s_e, Ep_s_t, Ep_e_t, Ep_e_e):
    """
    Combine type-specific edge dictionaries into one.
    """

    # Lists of edges, one per node-type pair
    Ep = {
        ("d", "s"): [],
        ("s", "e"): [],
        ("s", "t"): [],
        ("e", "t"): [],
        ("e", "e"): [],
    }

    # Add document-sentence edges
    for key in Ep_d_s:

        e = Ep_d_s[key]
        e_new = {
            "vertex_1": {"type": "d", "d_id": key[0],},
            "vertex_2": {"type": "s", "d_id": key[1], "s_id": key[2]},
            "w": e["w"],
        }
        Ep[("d", "s")].append(e_new)

    # Add sentence-entity edges
    for key in Ep_s_e:

        e = Ep_s_e[key]
        e_new = {
            "vertex_1": {"type": "s", "d_id": key[0], "s_id": key[1]},
            "vertex_2": {
                "type": "e",
                "text": e[0]["text"],
                "entity_type": e[0]["entity_type"],
            },
            "instances": e,
        }
        Ep[("s", "e")].append(e_new)

    # Add sentence-term edges
    for key in Ep_s_t:

        e = Ep_s_t[key]
        e_new = {
            "vertex_1": {"type": "s", "d_id": key[0], "s_id": key[1]},
            "vertex_2": {"type": "t", "text": e[0]["text"], "pos": e[0]["pos"],},
            "instances": e,
        }
        Ep[("s", "t")].append(e_new)

    # Add entity-term edges
    for key in Ep_e_t:

        e = Ep_e_t[key]
        e_new = {
            "vertex_1": {"type": "e", "text": key[0], "entity_type": key[1],},
            "vertex_2": {"type": "t", "text": key[2], "pos": key[3],},
            "instances": e,
        }
        Ep[("e", "t")].append(e_new)

    # Add entity-entity edges
    for key in Ep_e_e:

        e = Ep_e_e[key]
        e_new = {
            "vertex_1": {"type": "e", "text": key[0], "entity_type": key[1],},
            "vertex_2": {"type": "e", "text": key[2], "entity_type": key[3],},
            "instances": e,
        }
        Ep[("e", "e")].append(e_new)

    return Ep
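
# A sketch of one resulting ("e", "e") edge (fields taken from the code
# above; the text fields hold the lower-cased entity keys):
#
#     {
#         "vertex_1": {"type": "e", "text": "heidelberg", "entity_type": "LOC"},
#         "vertex_2": {"type": "e", "text": "neckar", "entity_type": "LOC"},
#         "instances": [{"entity_1": {...}, "entity_2": {...}, "w": 1.0}],
#     }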


def buildGraph(D, c, show_progress=True):
    """
    Extract implicit word network from corpus. Returns nodes and edges.
    """

    # Edge dictionaries, one per node-type pair
    Ep_d_s = {}
    Ep_s_e = {}
    Ep_s_t = {}
    Ep_e_t = {}
    Ep_e_e = {}

    # Node containers, one per node type
    V_d = []
    V_s = []
    V_e = {}
    V_t = {}

    # Iterate over all documents
    for d_id in tqdm(D, desc="Documents", disable=(not show_progress)):

        d = D[d_id]
        d_i = {"d_id": d_id, "type": "d"}
        S_d_i = getSentencesInDoc(d_id, D)  # Sentences in d
        E_d = getEntitiesInDoc(d_id, D)  # Entities in d

        # Extend list of nodes
        V_d.append(d_i)
        V_s.extend(S_d_i)
        extendEntities(V_e, E_d)

        # Iterate over all sentences in document
        for s_i in S_d_i:

            s = d[s_i["s_id"]]

            # Link document to sentence
            linkDocToSent(d_i, s_i, Ep_d_s)

            # Identify entities in sentence
            # and link them to sentence node
            E_s = getEntsInSent(s)
            linkEntToSent(E_s, Ep_s_e)

            # Identify terms in sentence
            # and extend list of token nodes
            # and link them to sentence node
            T_s = getTermsInSent(s)
            extendTerms(V_t, T_s)
            linkTokenToSent(T_s, Ep_s_t)

            # Link term to entity
            linkEntToTerm(E_s, T_s, Ep_e_t)

        # Link entity to entity
        linkEntToEnt(E_d, Ep_e_e, c)

    # Construct final nodes and edges lists
    V = combineVerticies(V_d, V_s, V_e, V_t)
    Ep = combindeEdges(Ep_d_s, Ep_s_e, Ep_s_t, Ep_e_t, Ep_e_e)

    return V, Ep
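
# Example usage (a sketch; ``D`` is a corpus in the format noted above and
# ``c`` is the sentence-distance cutoff for entity-entity edges):
#
#     V, Ep = buildGraph(D, c=2)
#     print(len(V["entities"]), "entity nodes")
#     print(len(Ep[("e", "e")]), "entity-entity edges")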


# ------------------------------- Cluster Edges ------------------------------ #


def constructEdgeContexts(e, D):
    """
    Return the contexts in which two entity instances occurred together.
    """

    contexts = []

    # Iterate over all edge instances
    for e_i in e["instances"]:

        e_1 = e_i["entity_1"]
        e_2 = e_i["entity_2"]
        d = D[e_1["d_id"]]  # Both entities are in the same document
        s_id_from = min(e_1["s_id"], e_2["s_id"])
        s_id_to = max(e_1["s_id"], e_2["s_id"])

        # Build context string out of individual tokens
        context = []
        for s_id in range(s_id_from, s_id_to + 1):
            s = d[s_id]
            for t_id in s:
                context.append(s[t_id]["text"])

        contexts.append(" ".join(context))

    return contexts


def getContextEmbeddings(contexts, model):
    """
    Use a BERT-based sentence embedding model to compute context embeddings.
    """

    cont_embeddings = model.encode(contexts, show_progress_bar=False)
    return cont_embeddings
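
# ``model`` is expected to provide a sentence-transformers style
# ``encode(sentences, show_progress_bar=...)`` method that returns one
# embedding vector per context (an assumption based on the call above).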


def clusterContextEmbeddings(embeddings, metric, eps, min_samples):
    """
    Use DBSCAN to cluster context embeddings.
    """

    clustering = DBSCAN(metric=metric, eps=eps, min_samples=min_samples).fit(embeddings)
    return clustering.labels_
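
# ``labels_`` assigns one integer cluster id per embedding; DBSCAN labels
# noise points -1, though with the min_samples=1 default used by
# clusterEdges every point is a core point, so no noise label occurs.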


def groupEdgeInstances(instances, clusters):
    """
    Sort edge instances into clusters.
    """

    # Iterate over all edge instances
    # to group them according to clusters
    e_i_clustered = {}
    for i, e_i in enumerate(instances):
        cluster = clusters[i]
        if cluster not in e_i_clustered:
            e_i_clustered[cluster] = []
        e_i_clustered[cluster].append(e_i)

    return list(e_i_clustered.values())


def clusterEdges(
    Ep, D, model="", metric="cosine", eps=0.25, min_samples=1, show_progress=True
):
    """
    Cluster edge instances between entities by applying DBSCAN to embeddings of the contexts in which the two entities occurred together.
    """

    Ep_clustered = Ep.copy()
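    # Note: ``.copy()`` is shallow, so the edge dicts handled below are
    # shared with the input Ep and are annotated in place.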

    # Iterate over all edges
    for e in tqdm(Ep_clustered[("e", "e")], desc="Edges", disable=(not show_progress)):

        if len(e["instances"]) > 1:  # Clustering if mulltiple edge instances

            # Conctruct context for each edge instance
            # compute embeddings and cluster them
            contexts = constructEdgeContexts(e, D)
            embeddings = getContextEmbeddings(contexts, model)
            clusters = clusterContextEmbeddings(embeddings, metric, eps, min_samples)
            instances_clustered = groupEdgeInstances(e["instances"], clusters)

        else:
            # If the edge has only one instance,
            # create a single cluster containing it
            # instead of running the clustering
            instances_clustered = [e["instances"]]

        # Attach clustered instances to edge
        e["instances_clustered"] = instances_clustered

    return Ep_clustered
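
# Example usage (a sketch; assumes the ``sentence-transformers`` package,
# with an illustrative model name):
#
#     from sentence_transformers import SentenceTransformer
#
#     model = SentenceTransformer("all-MiniLM-L6-v2")
#     Ep_clustered = clusterEdges(Ep, D, model=model)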

Functions

def buildGraph(D, c, show_progress=True)

Extract implicit word network from corpus. Returns nodes and edges.

def clusterContextEmbeddings(embeddings, metric, eps, min_samples)

Use DBSCAN to cluster context embeddings.

def clusterEdges(Ep, D, model='', metric='cosine', eps=0.25, min_samples=1, show_progress=True)

Cluster edge instances between entities by applying DBSCAN to embeddings of the contexts in which the two entities occurred together.

def combindeEdges(Ep_d_s, Ep_s_e, Ep_s_t, Ep_e_t, Ep_e_e)

Combine type-specific edge dictionaries into one.

def combineVerticies(V_d, V_s, V_e, V_t)

Combine type-specific node dictionaries into one.

def constructEdgeContexts(e, D)

Return the contexts in which two entity instances occurred together.

def extendEntities(V_e, E_d)

Merge two dictionaries of entity nodes. If an entity is already present, its instance lists are combined.

def extendTerms(V_t, T_s)

Merge two dictionaries of term nodes. If a term is already present, its instance lists are combined.

def getContextEmbeddings(contexts, model)

Use a BERT-based sentence embedding model to compute context embeddings.

def getEntitiesInDoc(d_id, D)

Return all tokens marked as entities in document.

def getEntsInSent(s)

Return all entities in sentence.

def getSentencesInDoc(d_id, D)

Return all sentences in document.

def getTermsInSent(s)

Return all terms in sentence.

def groupEdgeInstances(instances, clusters)

Sort edge instances into clusters.

def linkDocToSent(d, s, Ep_d_s)

Create edge between document and sentence.

def linkEntToEnt(E_d, Ep_e_e, c)

Create edge between entity and entity.

def linkEntToSent(E_s, Ep_s_e)

Create edge between entity and sentence.

def linkEntToTerm(E_s, T_s, Ep_e_t)

Create edge between entity and term.

def linkTokenToSent(T_s, Ep_s_t)

Create edge between token and sentence.
