Mu Li, Dive into Deep Learning (d2l)
1. Concise GRU implementation
num_inputs = vocab_size
gru_layer = nn.GRU(num_inputs, num_hiddens)  # input size, hidden size
model = d2l.RNNModel(gru_layer, len(vocab))  # wrap the GRU layer into a language model
model = model.to(device)
d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)
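This snippet assumes the data iterator, vocabulary, and hyperparameters are already defined. A minimal setup sketch, using the same values as Section 3 below:

import torch
from torch import nn
from d2l import torch as d2l

batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)
vocab_size, num_hiddens, device = len(vocab), 256, d2l.try_gpu()
num_epochs, lr = 500, 1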
2. Concise LSTM implementation
num_inputs = vocab_size
lstm_layer = nn.LSTM(num_inputs, num_hiddens)
model = d2l.RNNModel(lstm_layer, len(vocab))
model = model.to(device)
d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)
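After training, text can be generated from a prefix with d2l.predict_ch8; a usage sketch (the prefix string and the number of predicted characters here are illustrative, not from the original notes):

# Generate 50 characters following the given prefix
print(d2l.predict_ch8('time traveller', 50, model, vocab, device))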
3. Implementation from scratch
3.1 GRU
import torch
from torch import nn
from d2l import torch as d2l

batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)

def get_params(vocab_size, num_hiddens, device):
    num_inputs = num_outputs = vocab_size

    # Initialize weights with small random values
    def normal(shape):
        return torch.randn(size=shape, device=device) * 0.01

    # Return the (input-to-hidden, hidden-to-hidden, bias) triple for one gate
    def three():
        return (normal((num_inputs, num_hiddens)),
                normal((num_hiddens, num_hiddens)),
                torch.zeros(num_hiddens, device=device))

    W_xz, W_hz, b_z = three()  # update gate parameters
    W_xr, W_hr, b_r = three()  # reset gate parameters
    W_xh, W_hh, b_h = three()  # candidate hidden state parameters
    W_hq = normal((num_hiddens, num_outputs))      # output layer weights
    b_q = torch.zeros(num_outputs, device=device)  # output layer bias
    params = [W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:
        param.requires_grad_(True)
    return params

# Define the model
def init_gru_state(batch_size, num_hiddens, device):
    return (torch.zeros((batch_size, num_hiddens), device=device),)
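The gru function below follows the standard GRU update equations, which the code mirrors term by term:

$$
\begin{aligned}
Z_t &= \sigma(X_t W_{xz} + H_{t-1} W_{hz} + b_z),\\
R_t &= \sigma(X_t W_{xr} + H_{t-1} W_{hr} + b_r),\\
\tilde{H}_t &= \tanh(X_t W_{xh} + (R_t \odot H_{t-1}) W_{hh} + b_h),\\
H_t &= Z_t \odot H_{t-1} + (1 - Z_t) \odot \tilde{H}_t.
\end{aligned}
$$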
def gru(inputs, state, params):
    W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        Z = torch.sigmoid((X @ W_xz) + (H @ W_hz) + b_z)           # update gate
        R = torch.sigmoid((X @ W_xr) + (H @ W_hr) + b_r)           # reset gate
        H_tilda = torch.tanh((X @ W_xh) + ((R * H) @ W_hh) + b_h)  # candidate state
        H = Z * H + (1 - Z) * H_tilda                               # new hidden state
        Y = H @ W_hq + b_q
        outputs.append(Y)
    return torch.cat(outputs, dim=0), (H,)
# Training
vocab_size, num_hiddens, device = len(vocab), 256, d2l.try_gpu()
num_epochs, lr = 500, 1
model = d2l.RNNModelScratch(len(vocab), num_hiddens, device, get_params,
                            init_gru_state, gru)
d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)
3.2 LSTM
import torch
from torch import nn
from d2l import torch as d2l

batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)

def get_lstm_params(vocab_size, num_hiddens, device):
    num_inputs = num_outputs = vocab_size

    # Initialize weights with small random values
    def normal(shape):
        return torch.randn(size=shape, device=device) * 0.01

    # Return the (input-to-hidden, hidden-to-hidden, bias) triple for one gate
    def three():
        return (normal((num_inputs, num_hiddens)),
                normal((num_hiddens, num_hiddens)),
                torch.zeros(num_hiddens, device=device))

    W_xi, W_hi, b_i = three()  # input gate parameters
    W_xf, W_hf, b_f = three()  # forget gate parameters
    W_xo, W_ho, b_o = three()  # output gate parameters
    W_xc, W_hc, b_c = three()  # candidate memory cell parameters
    W_hq = normal((num_hiddens, num_outputs))      # output layer weights
    b_q = torch.zeros(num_outputs, device=device)  # output layer bias
    params = [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o,
              W_xc, W_hc, b_c, W_hq, b_q]
    for param in params:
        param.requires_grad_(True)
    return params

def init_lstm_state(batch_size, num_hiddens, device):
    # Both the hidden state and the memory cell start at zero
    return (torch.zeros((batch_size, num_hiddens), device=device),
            torch.zeros((batch_size, num_hiddens), device=device))
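The lstm function below implements the standard LSTM updates; note that every gate combines the current input X_t with the previous hidden state H_{t-1}:

$$
\begin{aligned}
I_t &= \sigma(X_t W_{xi} + H_{t-1} W_{hi} + b_i),\\
F_t &= \sigma(X_t W_{xf} + H_{t-1} W_{hf} + b_f),\\
O_t &= \sigma(X_t W_{xo} + H_{t-1} W_{ho} + b_o),\\
\tilde{C}_t &= \tanh(X_t W_{xc} + H_{t-1} W_{hc} + b_c),\\
C_t &= F_t \odot C_{t-1} + I_t \odot \tilde{C}_t,\\
H_t &= O_t \odot \tanh(C_t).
\end{aligned}
$$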
def lstm(inputs, state, params):
    [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o,
     W_xc, W_hc, b_c, W_hq, b_q] = params
    (H, C) = state
    outputs = []
    for X in inputs:
        I = torch.sigmoid((X @ W_xi) + (H @ W_hi) + b_i)     # input gate
        F = torch.sigmoid((X @ W_xf) + (H @ W_hf) + b_f)     # forget gate
        O = torch.sigmoid((X @ W_xo) + (H @ W_ho) + b_o)     # output gate
        C_tilda = torch.tanh((X @ W_xc) + (H @ W_hc) + b_c)  # candidate memory cell
        C = F * C + I * C_tilda                               # new memory cell
        H = O * torch.tanh(C)                                 # new hidden state
        Y = (H @ W_hq) + b_q
        outputs.append(Y)
    return torch.cat(outputs, dim=0), (H, C)
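A training call mirroring Section 3.1, with the same hyperparameters (a sketch; this step is not in the original notes):

# Train the from-scratch LSTM the same way as the GRU above
vocab_size, num_hiddens, device = len(vocab), 256, d2l.try_gpu()
num_epochs, lr = 500, 1
model = d2l.RNNModelScratch(len(vocab), num_hiddens, device, get_lstm_params,
                            init_lstm_state, lstm)
d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)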