# IMDB 数据集使用BOW网络的文本分类¶

## 一、环境配置¶

import paddle
import numpy as np

2.1.0


## 二、加载数据¶

IMDB数据集是一个对电影评论标注为正向评论与负向评论的数据集，共有25000条文本数据作为训练集，25000条文本数据作为测试集。 该数据集的官方地址为： http://ai.stanford.edu/~amaas/data/sentiment/

# NOTE(review): only the progress message is visible in this excerpt; the statements
# that construct `train_dataset` (read further down) are not shown here.
print('loading dataset...')

loading dataset...


# Token -> integer id mapping exposed by the training set.
word_dict = train_dataset.word_idx

# Build the key list once and reuse it for both previews.
_vocab_keys = list(word_dict)

# Keys may be either bytes or str, so both previews use the same guard before
# decoding (previously only the tail preview did).
for k in _vocab_keys[:5]:
    print("{}:{}".format(k if isinstance(k, str) else k.decode('ASCII'), word_dict[k]))

print("...")

for k in _vocab_keys[-5:]:
    print("{}:{}".format(k if isinstance(k, str) else k.decode('ASCII'), word_dict[k]))

print("totally {} words".format(len(word_dict)))

the:0
and:1
a:2
of:3
to:4
...
virtual:5143
warriors:5144
widely:5145
<unk>:5146
totally 5148 words


### 2.1 参数设置¶

# Hyper-parameters shared by the rest of the script.
# One more than the number of vocabulary entries.
# NOTE(review): presumably the extra id is reserved for padding — TODO confirm.
vocab_size = len(word_dict) + 1
# NOTE(review): presumably the embedding width; its use is not visible in this excerpt.
emb_size = 256
# NOTE(review): presumably the fixed sentence length; consistent with the
# (25000, 200) shapes printed later — TODO confirm.
seq_len = 200
batch_size = 32  # forwarded as batch_size= in the loader and fit fragments below
epochs = 2  # number of passes in train() and in the fit call fragment

# Human-readable class names, indexed by the integer label id.
classes = ['negative', 'positive']

def ids_to_str(ids, vocab=None):
    """Convert a sequence of token ids back into a space-joined sentence.

    Each id is used as a position into the vocabulary's key order, so the
    mapping is expected to list its tokens in id order.

    Args:
        ids: Iterable of integer token ids.
        vocab: Optional token -> id mapping whose keys are ``str`` or
            ASCII-encoded ``bytes``. Defaults to the module-level
            ``word_dict``.

    Returns:
        The decoded tokens joined by single spaces ("" for empty input).
    """
    if vocab is None:
        vocab = word_dict
    # Materialise the key list once; rebuilding it for every id made the
    # conversion O(len(ids) * len(vocab)).
    tokens = list(vocab)
    words = [
        w if isinstance(w, str) else w.decode('ASCII')
        for w in (tokens[k] for k in ids)
    ]
    return " ".join(words)


# Take a look at the first training sample.
sent = train_dataset.docs[0]
# Read the label at the same index as the sentence so the pair belongs together
# (the label was previously taken from index 1).
label = train_dataset.labels[0]
print('sentence list id is:', sent)
print('sentence label id is:', label)
print('--------------------------')
print('sentence list is: ', ids_to_str(sent))
print('sentence label is: ', classes[label])

sentence list id is: [5146, 43, 71, 6, 1092, 14, 0, 878, 130, 151, 5146, 18, 281, 747, 0, 5146, 3, 5146, 2165, 37, 5146, 46, 5, 71, 4089, 377, 162, 46, 5, 32, 1287, 300, 35, 203, 2136, 565, 14, 2, 253, 26, 146, 61, 372, 1, 615, 5146, 5, 30, 0, 50, 3290, 6, 2148, 14, 0, 5146, 11, 17, 451, 24, 4, 127, 10, 0, 878, 130, 43, 2, 50, 5146, 751, 5146, 5, 2, 221, 3727, 6, 9, 1167, 373, 9, 5, 5146, 7, 5, 1343, 13, 2, 5146, 1, 250, 7, 98, 4270, 56, 2316, 0, 928, 11, 11, 9, 16, 5, 5146, 5146, 6, 50, 69, 27, 280, 27, 108, 1045, 0, 2633, 4177, 3180, 17, 1675, 1, 2571]
sentence label id is: 0
--------------------------
sentence list is:  <unk> has much in common with the third man another <unk> film set among the <unk> of <unk> europe like <unk> there is much inventive camera work there is an innocent american who gets emotionally involved with a woman he doesnt really understand and whose <unk> is all the more striking in contrast with the <unk> br but id have to say that the third man has a more <unk> storyline <unk> is a bit disjointed in this respect perhaps this is <unk> it is presented as a <unk> and making it too coherent would spoil the effect br br this movie is <unk> <unk> in more than one sense one never sees the sun shine grim but intriguing and frightening
sentence label is:  negative


# Iterates a dataset of (sentence, label) items and collects the labels.
# NOTE(review): this excerpt shows no padding logic and no return statement, and
# the block's indentation has been lost. The shapes printed afterwards
# ((25000, 200) and (25000, 1)) suggest the full version pads every sentence to
# seq_len and returns sentence/label arrays — TODO confirm against the original
# notebook before relying on this function.
def create_padded_dataset(dataset):
labels = []
for batch_id, data in enumerate(dataset):
# data[0] is read as the sentence and data[1] as its label.
sent, label = data[0], data[1]
labels.append(label)

# Report the shape of each sentence/label array, train split first, one per line.
print(
    *(arr.shape for arr in (train_sents, train_labels, test_sents, test_labels)),
    sep="\n",
)

# Decode the first three training sentences back to text as a sanity check.
for sent in train_sents[:3]:
    print(ids_to_str(sent))

(25000, 200)
(25000, 1)
(25000, 200)
(25000, 1)
<unk> von <unk> is never <unk> in trying out new techniques some of them are very original while others are best <unk> br he depicts <unk> germany as a <unk> train journey with so many cities lying in ruins <unk> <unk> a young american of german descent feels <unk> to help in their <unk> it is not a simple task as he quickly finds outbr br his uncle finds him a job as a night <unk> on the <unk> <unk> line his job is to <unk> to the needs of the passengers when the shoes are <unk> a <unk> mark is made on the <unk> a terrible argument <unk> when a passengers shoes are not <unk> despite the fact they have been <unk> there are many <unk> to the german <unk> of <unk> to such stupid <unk> br the <unk> journey is like an <unk> <unk> mans <unk> through life with all its <unk> and <unk> in one sequence <unk> <unk> through the back <unk> to discover them filled with <unk> bodies appearing to have just escaped from <unk> these images horrible as they are are <unk> as in a dream each with its own terrible impact yet <unk> br


### 2.3 用Dataset 与 DataLoader 加载¶

class IMDBDataset(paddle.io.Dataset):
    """Index-addressable pairing of sentences with their labels."""

    def __init__(self, sents, labels):
        # Both containers are stored as given; no copy is made.
        self.labels = labels
        self.sents = sents

    def __getitem__(self, index):
        """Return the ``(sentence, label)`` pair stored at ``index``."""
        return self.sents[index], self.labels[index]

    def __len__(self):
        """Return the number of sentences held by the dataset."""
        return len(self.sents)

# Wrap the sentence/label arrays in IMDBDataset objects.
# This rebinds `train_dataset`, which earlier held the object exposing
# `word_idx`, `docs` and `labels`; IMDBDataset only stores `sents` and `labels`,
# so re-running those earlier statements after this point would fail.
train_dataset = IMDBDataset(train_sents, train_labels)
test_dataset = IMDBDataset(test_sents, test_labels)

# NOTE(review): the two lines below are only the tails of two calls; the call
# heads (presumably the train/test data loader constructions) are missing from
# this excerpt — TODO restore from the original notebook.
batch_size=batch_size, drop_last=True)
batch_size=batch_size, drop_last=True)


## 三、组建网络¶

# Network whose forward pass applies self.emb, self.dropout and self.fc in order
# (presumably an embedding, a dropout and a fully connected layer — TODO confirm).
# NOTE(review): none of these three attributes is constructed in the visible
# __init__, and the block's indentation is lost; the layer definitions (and
# possibly other forward steps) appear to be missing from this excerpt and must
# be restored from the original notebook before this class can run.
class MyNet(paddle.nn.Layer):
def __init__(self):
super(MyNet, self).__init__()

def forward(self, x):
x = self.emb(x)
x = self.dropout(x)
x = self.fc(x)
return x


## 四、方式1：用高层API训练与验证¶

使用 paddle.Model 封装模型，依次调用 prepare、fit 完成模型的配置、训练与验证。

[9]:

model = paddle.Model(MyNet()) # Wrap MyNet with the high-level paddle.Model API

# Model configuration
# NOTE(review): the configuration call itself is not visible in this excerpt.

# Model training
# NOTE(review): only the trailing keyword arguments of the training call are
# visible below; the call head and its leading arguments are missing.
epochs=epochs,
batch_size=batch_size,
verbose=1)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/2
step 781/781 [==============================] - loss: 0.3242 - 5ms/step
Eval begin...
step 570/781 [====================>.........] - loss: 0.3247 - ETA: 0s - 3ms/st


## 五、方式2：用底层API训练与验证¶

# Manual training loop: runs `epochs` passes over the training data and prints
# the mean validation accuracy/loss after each pass.
# NOTE(review): indentation is lost and several statements are missing from this
# excerpt — the optimizer (`opt`), the batch loops that bind `batch_id`/`data`,
# and the computation of `loss`/`acc` are not visible. Restore them from the
# original notebook before running.
def train(model):

# Put the model in training mode before the first epoch.
model.train()

for epoch in range(epochs):

# data[0] is read as the sentence batch and data[1] as the label batch.
sent = data[0]
label = data[1]

logits = model(sent)

# Log the current loss every 500 batches.
if batch_id % 500 == 0:
print("epoch: {}, batch_id: {}, loss is: {}".format(epoch, batch_id, loss.numpy()))

loss.backward()
opt.step()

# evaluate model after one epoch
model.eval()
accuracies = []
losses = []

sent = data[0]
label = data[1]

logits = model(sent)

# Collect per-batch metrics as numpy values so they can be averaged below.
accuracies.append(acc.numpy())
losses.append(loss.numpy())

avg_acc, avg_loss = np.mean(accuracies), np.mean(losses)
print("[validation] accuracy/loss: {}/{}".format(avg_acc, avg_loss))

# Switch back to training mode after the evaluation pass.
model.train()

# Rebind `model` (bound earlier to a paddle.Model wrapper) to a bare MyNet
# instance and run the manual training loop on it.
model = MyNet()
train(model)

epoch: 0, batch_id: 0, loss is: [0.6925806]
epoch: 0, batch_id: 500, loss is: [0.2938326]
[validation] accuracy/loss: 0.8513924479484558/0.3604280650615692
epoch: 1, batch_id: 0, loss is: [0.31649047]
epoch: 1, batch_id: 500, loss is: [0.47300753]
[validation] accuracy/loss: 0.8644366264343262/0.32513079047203064