BERT(
  (embedding): BERTEmbedding(
    (token): TokenEmbedding(12, 4, padding_idx=0)
    (position): PositionalEmbedding(
      (pe): Embedding(9, 4)
    )
    (layer_norm): LayerNorm()
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (model): BERTModel(
    (transformer_blocks): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadedAttention(
          (linear_layers): ModuleList(
            (0): Linear(in_features=4, out_features=8, bias=True)
            (1): Linear(in_features=4, out_features=8, bias=True)
            (2): Linear(in_features=4, out_features=8, bias=True)
          )
          (attention): Attention()
          (dropout): Dropout(p=0.2, inplace=False)
          (output_linear): Linear(in_features=8, out_features=4, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=4, out_features=16, bias=True)
          (w_2): Linear(in_features=16, out_features=4, bias=True)
          (activation): GELU()
        )
        (input_sublayer): SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (output_sublayer): SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
      (1): TransformerBlock(
        (attention): MultiHeadedAttention(
          (linear_layers): ModuleList(
            (0): Linear(in_features=4, out_features=8, bias=True)
            (1): Linear(in_features=4, out_features=8, bias=True)
            (2): Linear(in_features=4, out_features=8, bias=True)
          )
          (attention): Attention()
          (dropout): Dropout(p=0.2, inplace=False)
          (output_linear): Linear(in_features=8, out_features=4, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=4, out_features=16, bias=True)
          (w_2): Linear(in_features=16, out_features=4, bias=True)
          (activation): GELU()
        )
        (input_sublayer): SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (output_sublayer): SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
      (2): TransformerBlock(
        (attention): MultiHeadedAttention(
          (linear_layers): ModuleList(
            (0): Linear(in_features=4, out_features=8, bias=True)
            (1): Linear(in_features=4, out_features=8, bias=True)
            (2): Linear(in_features=4, out_features=8, bias=True)
          )
          (attention): Attention()
          (dropout): Dropout(p=0.2, inplace=False)
          (output_linear): Linear(in_features=8, out_features=4, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=4, out_features=16, bias=True)
          (w_2): Linear(in_features=16, out_features=4, bias=True)
          (activation): GELU()
        )
        (input_sublayer): SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (output_sublayer): SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
      (3): TransformerBlock(
        (attention): MultiHeadedAttention(
          (linear_layers): ModuleList(
            (0): Linear(in_features=4, out_features=8, bias=True)
            (1): Linear(in_features=4, out_features=8, bias=True)
            (2): Linear(in_features=4, out_features=8, bias=True)
          )
          (attention): Attention()
          (dropout): Dropout(p=0.2, inplace=False)
          (output_linear): Linear(in_features=8, out_features=4, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=4, out_features=16, bias=True)
          (w_2): Linear(in_features=16, out_features=4, bias=True)
          (activation): GELU()
        )
        (input_sublayer): SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (output_sublayer): SublayerConnection(
          (layer_norm): LayerNorm()
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (linear): Linear(in_features=4, out_features=4, bias=True)
    (activation): GELU()
  )
)
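The dimensions visible in this summary describe a deliberately tiny configuration: a vocabulary of 12 tokens, hidden size 4, a maximum sequence length of 9, query/key/value projections of width 8, a position-wise feed-forward inner size of 16, and four identical transformer blocks. The module classes themselves (BERTEmbedding, MultiHeadedAttention, SublayerConnection, and so on) are the custom implementations this model is built from, not `torch.nn` layers. As a sanity check, here is a minimal sketch, using only standard PyTorch calls and assuming the instantiated model is bound to a variable named `model`, of how to print this summary and inspect the parameters it implies:

```python
# Sketch only: assumes `model` holds the BERT instance summarized above.
# Printing the module itself yields the nested architecture summary;
# model.parameters() yields the underlying weight tensors.
print(model)

# Total number of trainable parameters.
n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"trainable parameters: {n_trainable}")

# Per-parameter shapes, handy for confirming the toy dimensions
# (e.g. the token embedding is 12 x 4, each q/k/v projection 8 x 4).
for name, p in model.named_parameters():
    print(f"{name:50s} {tuple(p.shape)}")
```

Note that filtering on `requires_grad` counts only trainable tensors; any frozen weights or registered buffers (for instance, a fixed sinusoidal positional table, if one were used instead of the learned `Embedding(9, 4)` shown here) would be excluded from the total.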