# MoE

## 一、原理介绍

$\mathrm{MoE}(x)=\sum_{i=1}^{n} G(x)_i \, E_i(x)$
$G(x)=\mathrm{TopK}\left(\mathrm{softmax}\left(W_g \cdot x+\epsilon\right)\right)$

## 三、动态图使用方法

import numpy as np

import paddle
from paddle.distributed import fleet
from paddle.incubate.distributed.models.moe import MoELayer  # TODO(review): confirm import path for your Paddle version
from paddle.nn import Dropout, Layer, LayerList, Linear


num_experts = 8
d_model = 512
d_hidden = 2048

class ExpertLayer(Layer):
    """One expert: a two-layer feed-forward block, d_model -> d_hidden -> d_model.

    NOTE(review): there is no activation between the two Linear layers, so the
    expert collapses to a single linear map — confirm whether a GELU/ReLU was
    intended between ``htoh4`` and ``h4toh``.
    """

    def __init__(self, d_model, d_hidden, name=None):
        super().__init__()
        # Expand to the hidden width, then project back to the model width.
        self.htoh4 = Linear(d_model, d_hidden)
        self.h4toh = Linear(d_hidden, d_model)

    def forward(self, x):
        """Apply the expert FFN to ``x`` and return a tensor of the same width."""
        x = self.htoh4(x)
        x = self.h4toh(x)
        return x


# Initialize the collective (multi-process / multi-GPU) environment.
# NOTE(review): requires `from paddle.distributed import fleet` at the top of
# the file — the original snippet never imported it.
fleet.init(is_collective=True)

# Gate configuration: GShard-style routing that dispatches each token to its
# top-2 experts.
gate_config = {
    "type": "gshard",
    "top_k": 2,
}

# Pool of experts shared by the MoE layer below.
experts_list = LayerList()
for _ in range(num_experts):
    experts_list.append(ExpertLayer(d_model, d_hidden))


class Model(Layer):
    """Toy network around a MoE layer: Linear -> MoE -> Linear -> Dropout.

    NOTE(review): ``MoELayer``, ``experts_list``, ``gate_config`` and
    ``moe_group`` must exist at module level before instantiation. ``moe_group``
    (the expert-parallel communication group) is never defined in this snippet —
    the original tutorial presumably builds it from the fleet topology; confirm
    against the full example.
    """

    def __init__(self, d_model, d_hidden, name=None):
        super().__init__()
        self.linear1 = Linear(d_model, d_model)
        # recompute_interval=0 disables activation recomputation inside the
        # MoE layer.
        self.moe_layer = MoELayer(
            d_model=d_model,
            experts=experts_list,
            gate=gate_config,
            moe_group=moe_group,
            recompute_interval=0,
        )
        self.linear2 = Linear(d_model, d_model)
        self.dropout = Dropout(p=0.1)

    def forward(self, x):
        """Run the full stack on ``x``; output has the same width as the input."""
        x = self.linear1(x)
        x = self.moe_layer(x)
        x = self.linear2(x)
        x = self.dropout(x)
        return x

model = Model(d_model, d_hidden)

# NOTE(review): the input batch ``x`` and the optimizer ``optim`` are not
# defined anywhere in this snippet — the full tutorial creates them before the
# loop (e.g. x = paddle.rand([batch, d_model]); optim = paddle.optimizer...).
for step in range(1, 100):
    y = model(x)
    loss = y.mean()
    loss.backward()
    optim.step()
    # In Paddle dynamic graph, gradients accumulate across steps unless
    # cleared; the original snippet omitted this call.
    optim.clear_grad()

    print("=== step : {}, loss : {}".format(step, loss.numpy()))


# Launch the training script with 8 processes (one per GPU); per-process logs
# are written under ./logs.
python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 --log_dir logs train_moe.py