---
title: BERT
keywords: fastai
sidebar: home_sidebar
summary: "Bidirectional Encoder Representations from Transformers."
description: "Bidirectional Encoder Representations from Transformers."
nb_path: "nbs/models/bert.ipynb"
---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

class BERT[source]

BERT(args) :: Module

Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing them to be nested in a tree structure. You can assign the submodules as regular attributes:

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call to(), etc.
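
For example, a minimal check (reusing the Model class above) that parameters of registered submodules follow the parent module's to() call:

import torch

model = Model()
model.to(torch.float64)                 # conv1 and conv2 parameters are converted along with the parent
print(next(model.parameters()).dtype)   # torch.float64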

Variables: training (bool): whether this module is in training or evaluation mode.
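
For instance, train() and eval() set this flag on the module and all of its submodules:

model.train()
print(model.training)   # True
model.eval()
print(model.training)   # False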

{% endraw %} {% raw %}
{% endraw %} {% raw %}
class Args:
    num_items = 10            # size of the item vocabulary
    bert_max_len = 8          # maximum sequence length
    bert_hidden_units = 4     # hidden (embedding) dimension
    bert_num_heads = 2        # attention heads per transformer block
    bert_head_size = 4        # dimension of each attention head
    bert_num_blocks = 4       # number of transformer blocks
    bert_dropout = 0.2        # dropout on embeddings and sublayer outputs
    bert_attn_dropout = 0.2   # dropout on attention weights

args = Args()
model = BERT(args)
model.parameters
<bound method Module.parameters of BERT(
  (embedding): BERTEmbedding(
    (token): TokenEmbedding(12, 4, padding_idx=0)
    (position): PositionalEmbedding(
      (pe): Embedding(9, 4)
    )
    (layer_norm): LayerNorm()
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (model): BERTModel(
    (transformer_blocks): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadedAttention(
          (linear_layers): ModuleList(
            (0): Linear(in_features=4, out_features=8, bias=True)
            (1): Linear(in_features=4, out_features=8, bias=True)
            (2): Linear(in_features=4, out_features=8, bias=True)
          )
          (attention): Attention()
          (dropout): Dropout(p=0.2, inplace=False)
          (output_linear): Linear(in_features=8, out_features=4, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=4, out_features=16, bias=True)
          (w_2): Linear(in_features=16, out_features=4, bias=True)
          (activation): GELU()
        )
        (input_sublayer): SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (output_sublayer): SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
      (1): TransformerBlock(
        (attention): MultiHeadedAttention(
          (linear_layers): ModuleList(
            (0): Linear(in_features=4, out_features=8, bias=True)
            (1): Linear(in_features=4, out_features=8, bias=True)
            (2): Linear(in_features=4, out_features=8, bias=True)
          )
          (attention): Attention()
          (dropout): Dropout(p=0.2, inplace=False)
          (output_linear): Linear(in_features=8, out_features=4, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=4, out_features=16, bias=True)
          (w_2): Linear(in_features=16, out_features=4, bias=True)
          (activation): GELU()
        )
        (input_sublayer): SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (output_sublayer): SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
      (2): TransformerBlock(
        (attention): MultiHeadedAttention(
          (linear_layers): ModuleList(
            (0): Linear(in_features=4, out_features=8, bias=True)
            (1): Linear(in_features=4, out_features=8, bias=True)
            (2): Linear(in_features=4, out_features=8, bias=True)
          )
          (attention): Attention()
          (dropout): Dropout(p=0.2, inplace=False)
          (output_linear): Linear(in_features=8, out_features=4, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=4, out_features=16, bias=True)
          (w_2): Linear(in_features=16, out_features=4, bias=True)
          (activation): GELU()
        )
        (input_sublayer): SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (output_sublayer): SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
      (3): TransformerBlock(
        (attention): MultiHeadedAttention(
          (linear_layers): ModuleList(
            (0): Linear(in_features=4, out_features=8, bias=True)
            (1): Linear(in_features=4, out_features=8, bias=True)
            (2): Linear(in_features=4, out_features=8, bias=True)
          )
          (attention): Attention()
          (dropout): Dropout(p=0.2, inplace=False)
          (output_linear): Linear(in_features=8, out_features=4, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=4, out_features=16, bias=True)
          (w_2): Linear(in_features=16, out_features=4, bias=True)
          (activation): GELU()
        )
        (input_sublayer): SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (output_sublayer): SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (linear): Linear(in_features=4, out_features=4, bias=True)
    (activation): GELU()
  )
)>
{% endraw %}
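
A hypothetical forward pass, as a minimal sketch: it assumes BERT.forward accepts a (batch, bert_max_len) LongTensor of item ids (with 0 as the padding index) and returns per-position hidden states of size bert_hidden_units.

{% raw %}
import torch

seqs = torch.randint(1, args.num_items + 1, (2, args.bert_max_len))  # two random item-id sequences
out = model(seqs)
print(out.shape)   # expected under these assumptions: torch.Size([2, 8, 4])
{% endraw %}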