import random
import torch
from d2l import torch as d2l
tokens = d2l.tokenize(d2l.read_time_machine())
corpus = [token for line in tokens for token in line]
vocab = d2l.Vocab(corpus)
vocab.token_freqs[:10]
[('the', 2261), ('i', 1267), ('and', 1245), ('of', 1155), ('a', 816),
 ('to', 695), ('was', 552), ('in', 541), ('that', 443), ('my', 440)]
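These ten tokens already account for a sizable share of the corpus. As a quick sanity check (a minimal sketch of our own, not part of d2l), we can compute the fraction they cover:

# Fraction of the corpus covered by the ten most frequent tokens
# (illustrative check; variable names are our own).
top10 = sum(freq for _, freq in vocab.token_freqs[:10])
total = sum(freq for _, freq in vocab.token_freqs)
print(f'top-10 tokens cover {top10 / total:.1%} of the corpus')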
As we can see, the most frequent words are so-called stop words; they are often filtered out, but they still carry meaning and we keep them here. Note also how quickly the word frequency decays. Let us plot the token frequencies:
freqs = [freq for token, freq in vocab.token_freqs]
d2l.plot(freqs, xlabel='token: x', ylabel='frequency: n(x)',
         xscale='log', yscale='log')
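The log-log plot suggests that word frequency roughly follows Zipf's law: the frequency $n_i$ of the $i$-th most frequent word satisfies $n_i \propto 1/i^\alpha$, i.e. $\log n_i = -\alpha \log i + c$. As a minimal sketch (our own fit, assuming numpy is available; not part of d2l), we can estimate the exponent $\alpha$ by least squares on the log-log data:

import numpy as np

# Least-squares fit of log(freq) = -alpha * log(rank) + c.
# We skip the very top ranks, where Zipf's law tends to fit poorly.
ranks = np.arange(1, len(freqs) + 1)
slope, intercept = np.polyfit(np.log(ranks[10:]), np.log(freqs[10:]), 1)
print(f'estimated Zipf exponent alpha: {-slope:.2f}')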
What about other token combinations, such as bigrams, trigrams, and beyond?
bigram_tokens = [pair for pair in zip(corpus[:-1], corpus[1:])]
bigram_vocab = d2l.Vocab(bigram_tokens)
bigram_vocab.token_freqs[:10]
[(('of', 'the'), 309), (('in', 'the'), 169), (('i', 'had'), 130),
 (('i', 'was'), 112), (('and', 'the'), 109), (('the', 'time'), 102),
 (('it', 'was'), 99), (('to', 'the'), 85), (('as', 'i'), 78),
 (('of', 'a'), 73)]
trigram_tokens = [
    triple for triple in zip(corpus[:-2], corpus[1:-1], corpus[2:])]
trigram_vocab = d2l.Vocab(trigram_tokens)
trigram_vocab.token_freqs[:10]
[(('the', 'time', 'traveller'), 59), (('the', 'time', 'machine'), 30),
 (('the', 'medical', 'man'), 24), (('it', 'seemed', 'to'), 16),
 (('it', 'was', 'a'), 15), (('here', 'and', 'there'), 15),
 (('seemed', 'to', 'me'), 14), (('i', 'did', 'not'), 14),
 (('i', 'saw', 'the'), 13), (('i', 'began', 'to'), 13)]
Now let us compare the token frequencies of the three models directly against each other:
bigram_freqs = [freq for token, freq in bigram_vocab.token_freqs]
trigram_freqs = [freq for token, freq in trigram_vocab.token_freqs]
d2l.plot([freqs, bigram_freqs, trigram_freqs], xlabel='token: x',
         ylabel='frequency: n(x)', xscale='log', yscale='log',
         legend=['unigram', 'bigram', 'trigram'])
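The plot also shows that many bigrams and trigrams occur very rarely, which is why n-gram models quickly become data-hungry as n grows. As a rough sketch (our own check, not from the book), we can count how many of them appear exactly once:

# Count n-grams that occur exactly once, a rough measure of sparsity
# (illustrative only).
for name, v in [('bigram', bigram_vocab), ('trigram', trigram_vocab)]:
    singletons = sum(1 for _, freq in v.token_freqs if freq == 1)
    print(f'{name}s occurring once: {singletons} / {len(v.token_freqs)}')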
Next, we randomly generate the features and labels of a minibatch of data for reading. In random sampling, each example is a subsequence captured arbitrarily from the original long sequence.
def seq_data_iter_random(corpus, batch_size, num_steps):
    """Generate a minibatch of subsequences using random sampling."""
    # Start with a random offset (up to num_steps - 1) to partition a
    # sequence
    corpus = corpus[random.randint(0, num_steps - 1):]
    # Subtract 1 since we need to account for the labels
    num_subseqs = (len(corpus) - 1) // num_steps
    # The starting indices of subsequences of length num_steps
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    # In random sampling, subsequences from two adjacent minibatches are
    # not necessarily adjacent on the original sequence
    random.shuffle(initial_indices)

    def data(pos):
        # Return a subsequence of length num_steps starting from pos
        return corpus[pos:pos + num_steps]

    num_batches = num_subseqs // batch_size
    for i in range(0, batch_size * num_batches, batch_size):
        # The randomized starting indices for this minibatch
        initial_indices_per_batch = initial_indices[i:i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)
Let us generate a sequence from $0$ to $34$, and sample minibatches with a batch size of 2 and 5 time steps per subsequence:
my_seq = list(range(35))
for X, Y in seq_data_iter_random(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)
X:  tensor([[ 0,  1,  2,  3,  4],
        [20, 21, 22, 23, 24]])
Y: tensor([[ 1,  2,  3,  4,  5],
        [21, 22, 23, 24, 25]])
X:  tensor([[25, 26, 27, 28, 29],
        [15, 16, 17, 18, 19]])
Y: tensor([[26, 27, 28, 29, 30],
        [16, 17, 18, 19, 20]])
X:  tensor([[ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14]])
Y: tensor([[ 6,  7,  8,  9, 10],
        [11, 12, 13, 14, 15]])
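Note that within each pair, Y is simply X shifted one position to the right: the label at every time step is the next token of the input. A minimal sanity check (our own, not part of d2l) makes this explicit:

# The label sequence is the input sequence shifted by one token
# (illustrative check).
for X, Y in seq_data_iter_random(my_seq, batch_size=2, num_steps=5):
    assert torch.equal(X[:, 1:], Y[:, :-1])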
In contrast, the following strategy ensures that the subsequences from two adjacent minibatches are also adjacent on the original sequence.
def seq_data_iter_sequential(corpus, batch_size, num_steps):
    """Generate a minibatch of subsequences using sequential partitioning."""
    # Start with a random offset to partition a sequence
    offset = random.randint(0, num_steps)
    num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
    Xs = torch.tensor(corpus[offset:offset + num_tokens])
    Ys = torch.tensor(corpus[offset + 1:offset + 1 + num_tokens])
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    num_batches = Xs.shape[1] // num_steps
    for i in range(0, num_steps * num_batches, num_steps):
        X = Xs[:, i:i + num_steps]
        Y = Ys[:, i:i + num_steps]
        yield X, Y
Using the same sequence, let us read the features X and labels Y for each minibatch of subsequences produced by sequential partitioning:
for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)
X:  tensor([[ 0,  1,  2,  3,  4],
        [17, 18, 19, 20, 21]])
Y: tensor([[ 1,  2,  3,  4,  5],
        [18, 19, 20, 21, 22]])
X:  tensor([[ 5,  6,  7,  8,  9],
        [22, 23, 24, 25, 26]])
Y: tensor([[ 6,  7,  8,  9, 10],
        [23, 24, 25, 26, 27]])
X:  tensor([[10, 11, 12, 13, 14],
        [27, 28, 29, 30, 31]])
Y: tensor([[11, 12, 13, 14, 15],
        [28, 29, 30, 31, 32]])
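Here, consecutive minibatches really do continue each other along the original sequence: within each row, the first token of the next X immediately follows the last label of the current minibatch. A small check (our own sketch, not part of d2l) verifies this adjacency:

# Consecutive minibatches are adjacent on the original sequence:
# the next X starts where the previous Y left off (illustrative check).
prev_Y = None
for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    if prev_Y is not None:
        assert torch.equal(prev_Y[:, -1], X[:, 0])
    prev_Y = Y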
Now we wrap the two sampling functions above into a class, so that later we can use it as a data iterator.

class SeqDataLoader:
    """An iterator for loading sequence data."""
    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        if use_random_iter:
            self.data_iter_fn = d2l.seq_data_iter_random
        else:
            self.data_iter_fn = d2l.seq_data_iter_sequential
        self.corpus, self.vocab = d2l.load_corpus_time_machine(max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps

    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)
Finally, we define a function load_data_time_machine that returns both the data iterator and the vocabulary of the time machine dataset.
def load_data_time_machine(batch_size, num_steps,
                           use_random_iter=False, max_tokens=10000):
    """Return the iterator and the vocabulary of the time machine dataset."""
    data_iter = SeqDataLoader(batch_size, num_steps, use_random_iter,
                              max_tokens)
    return data_iter, data_iter.vocab
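As a quick usage sketch (the hyperparameters below are arbitrary, and the first call downloads the time machine dataset), we can load one minibatch and inspect its shape:

# Load one minibatch and inspect its shape; both tensors should be
# of shape (batch_size, num_steps) = (32, 35).
data_iter, vocab = load_data_time_machine(batch_size=32, num_steps=35)
for X, Y in data_iter:
    print('X shape:', X.shape, 'Y shape:', Y.shape)
    break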