Imports

import collections
import os
import re

from d2l import torch as d2l

Reading the dataset

d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
                                '090b5e7e70c295757f55df93cb0a180b9691891a')

def read_time_machine():
    """Load the time machine dataset into a list of text lines."""
    with open(d2l.download('time_machine', cache_dir=os.path.join('..', 'pytorch', 'data')), 'r') as f:
        lines = f.readlines()  # lines is a list; each line of the file occupies one index
    # Turn every run of non-letter characters into a single space,
    # then strip leading/trailing spaces and lowercase everything
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]

lines = read_time_machine()

print(lines[:10])
print(f'# total number of text lines: {len(lines)}')

Output:

['the time machine by h g wells', '', '', '', '', 'i', '', '', 'the time traveller for so it will be convenient to speak of him', 'was expounding a recondite matter to us his grey eyes shone and']
# total number of text lines: 3221
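
To make the cleaning rule concrete, here is a minimal sketch on a made-up string (not a line from the dataset): each run of non-letter characters collapses into one space, and the result is then stripped and lowercased.

print(re.sub('[^A-Za-z]+', ' ', "It's 1895!").strip().lower())  # prints: it s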

Tokenization

tokenize takes the list of text lines (lines) as input; each element of the list is a text sequence (for example, one line of text).
Each text sequence is split into a list of tokens; a token is the basic unit of text.
Finally, it returns a list of token lists, where every token is a string.

def tokenize(lines, token='word'):
    """Split text lines into word or character tokens."""
    if token == 'word':
        return [line.split() for line in lines if line != ""]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('ERROR: unknown token type: ' + token)

tokens = tokenize(lines)
print(tokens[:10])

In fact, this example only ever uses whole words as tokens, so the token == 'char' branch is never taken (a short demo of both branches follows the output below). tokenize can therefore be simplified here to:

def tokenize(lines):
    return [line.split() for line in lines if line != ""]

Output:

[['the', 'time', 'machine', 'by', 'h', 'g', 'wells'], 
['i'],
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him'],
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and'],
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the'],
['fire', 'burned', 'brightly', 'and', 'the', 'soft', 'radiance', 'of', 'the', 'incandescent'],
['lights', 'in', 'the', 'lilies', 'of', 'silver', 'caught', 'the', 'bubbles', 'that', 'flashed', 'and'],
['passed', 'in', 'our', 'glasses', 'our', 'chairs', 'being', 'his', 'patents', 'embraced', 'and'],
['caressed', 'us', 'rather', 'than', 'submitted', 'to', 'be', 'sat', 'upon', 'and', 'there', 'was', 'that'],
['luxurious', 'after', 'dinner', 'atmosphere', 'when', 'thought', 'roams', 'gracefully']]
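
For comparison, here is a quick sketch of what the two branches of the original two-argument tokenize (before the simplification) produce on a made-up one-line input:

print(tokenize(['hello world'], token='word'))  # [['hello', 'world']]
print(tokenize(['hello world'], token='char'))  # [['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd']]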

Counting token frequencies

isinstance(A, B): tests whether A is an instance of type B (see the small demo after the output below).

def count_corpus(tokens):
    """Count token frequencies."""
    # tokens is either a 1-D list of tokens or a 2-D list of token lists
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # flatten the 2-D list into a single 1-D list of tokens
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)
# counter = count_corpus(tokens)  
# print(dict(counter))

Output:

{'the': 2261, 'time': 200, 'machine': 85, 'by': 103, 'h': 1, 'g': 1, 'wells': 9, 'i': 1267, 'traveller': 61, 'for': 221, 'so': 112, 'it': 437, 'will': 37, 'be': 93, 'convenient': 5, 'to': 695, 'speak': 6, 'of': 1155, 'him': 40, 'was': 552, 'expounding': 2, 'a': 816, 'recondite': 1, 'matter': 6, 'us': 35, 'his': 129, 'grey': 11, 'eyes': 35, 'shone': 8, 'and': 1245, 'twinkled': 1, 'usually': 3, 'pale': 10, 'face': 38, 'flushed': 2, 'animated': 3, 'fire': 30, 'burned': 6, 'brightly': 4, 'soft': 16, 'radiance': 1, 'incandescent': 1, 'lights': 1, 'in': 541, 'lilies': 1, 'silver': 6, 'caught': 10, 'bubbles': 1, 'that': 443, 'flashed': 4, 'passed': 13, 'our': 57, 'glasses': 1, 'chairs': 2, 'being': 14, 'patents': 1, 'embraced': 1, 'caressed': 2, 'rather': 18, 'than': 34, 'submitted': 1, 'sat': 22, 'upon': 113, 'there': 127, 'luxurious': 1, 'after': 37, 'dinner': 13, 'atmosphere': 2, 'when': 55, 'thought': 57, 'roams': 1, 'gracefully': 1,……}
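
A minimal illustration of the isinstance check and the flattening, on toy data rather than the corpus:

print(count_corpus([['a', 'b'], ['a']]))  # Counter({'a': 2, 'b': 1}); the 2-D input is flattened first
print(count_corpus(['a', 'b', 'a']))      # Counter({'a': 2, 'b': 1}); a 1-D input is counted directly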

Building the vocabulary

tokens: a nested list; each inner list represents one line and holds that line's words
min_freq: the minimum number of occurrences; words appearing fewer times are dropped
counter: the token-frequency statistics computed from the original text
_token_freqs: counter.items() sorted by frequency in descending order; note this is a list of (token, count) pairs, not a dict
idx_to_token: a list recording the tokens whose frequency meets the threshold
enumerate(): yields each item together with its index; here it maps every token in idx_to_token to its position (see the short sketch below)
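
A short sketch of how enumerate() builds token_to_idx from idx_to_token, using a toy list for illustration:

idx_to_token = ['<unk>', 'the', 'time']
token_to_idx = {token: idx for idx, token in enumerate(idx_to_token)}
print(token_to_idx)  # {'<unk>': 0, 'the': 1, 'time': 2}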

class Vocab:
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        counter = count_corpus(tokens)

        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}

        for token, freq in self._token_freqs:
            # The pairs are already sorted, so once a token's count drops below
            # min_freq, every later count is guaranteed to be smaller too
            if freq < min_freq:
                break
            # The token is frequent enough and not yet in token_to_idx
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):  # a single token rather than a sequence
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs
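
As a tiny sanity check of min_freq and the <unk> fallback, on made-up tokens rather than the corpus:

v = Vocab([['a', 'a', 'b']], min_freq=2)
print(v.idx_to_token)  # ['<unk>', 'a']; 'b' occurs only once and is dropped
print(v['b'])          # 0, the index of '<unk>'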

Below is a small modification I made based on my own idea:

class Vocab:
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        counter = count_corpus(tokens)

        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        self.idx_to_token = ['<unk>'] + reserved_tokens

        for token, freq in self._token_freqs:
            # The pairs are already sorted, so once a token's count drops below
            # min_freq, every later count is guaranteed to be smaller too
            if freq < min_freq:
                break
            # The token is frequent enough and not yet in idx_to_token
            if token not in self.idx_to_token:
                self.idx_to_token.append(token)

        # Build token_to_idx in one place, after idx_to_token is complete
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):  # a single token rather than a sequence
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs

Usage

Note: to inspect the contents of vocab, call the mapping's items() method and convert the result to a list, i.e. list(vocab.token_to_idx.items()).

vocab = Vocab(tokens)   
print(list(vocab.token_to_idx.items()))

Output:

[('<unk>', 0), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9), ('my', 10), ('it', 11), ('had', 12), ('me', 13), ('as', 14), ('at', 15), ('for', 16), ('with', 17), ('but', 18), ('time', 19), ('were', 20), ('this', 21), ('you', 22), ('on', 23), ('then', 24), ('his', 25), ('there', 26), ('he', 27), ('have', 28), ('they', 29), ('from', 30), ('one', 31), ('all', 32), ('not', 33), ('into', 34), ('upon', 35), ('little', 36), ('so', 37), ('is', 38), ('came', 39), ('by', 40), ('some', 41), ('be', 42), ('no', 43), ('could', 44), ('their', 45), ('said', 46), ('saw', 47), ('down', 48), ('them', 49), ('machine', 50), ('which', 51), ('very', 52), ('or', 53), ('an', 54), ('we', 55), ('now', 56), ('what', 57), ('been', 58), ('these', 59), ('like', 60), ('her', 61), ('out', 62), ('seemed', 63), ('up', 64), ('man', 65), ('about', 66), ('s', 67), ('its', 68), ('thing', 69), ('again', 70), ('traveller', 71), ('would', 72), ('more', 73), ('white', 74), ('our', 75), ('thought', 76), ('felt', 77), ('when', 78), ('over', 79), ('weena', 80), ('still', 81), ('world', 82), ('myself', 83), ('even', 84), ('must', 85), ('through', 86), ('if', 87),……]

Converting lines to indices

Convert each line of text into a list of numeric indices.

for i in range(5):
    print('text:', tokens[i])
    print('indices:', vocab[tokens[i]])

Output:

text: ['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
indices: [1, 19, 50, 40, 2183, 2184, 400]
text: ['i']
indices: [2]
text: ['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
indices: [1, 19, 71, 16, 37, 11, 115, 42, 680, 6, 586, 4, 108]
text: ['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
indices: [7, 1420, 5, 2185, 587, 6, 126, 25, 330, 127, 439, 3]
text: ['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
indices: [2186, 3, 25, 1044, 362, 113, 7, 1421, 3, 1045, 1]
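
The mapping also works in reverse through to_tokens; reusing the first few indices printed above:

print(vocab.to_tokens([1, 19, 50]))  # ['the', 'time', 'machine']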

Putting everything together

Package all of the functionality into the load_corpus_time_machine function, which returns corpus (the list of token indices) and vocab (the vocabulary of the time machine corpus).

def load_corpus_time_machine(max_tokens=-1):
    lines = read_time_machine()       # read the file
    tokens = tokenize(lines, 'word')  # split the text into words
    # print(tokens)
    vocab = Vocab(tokens)             # build the vocabulary
    # line is a list holding one line's words and token is a single word,
    # so corpus ends up holding the index of every word in the whole text
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab

corpus, vocab = load_corpus_time_machine()
# print(corpus)
# print(vocab.token_to_idx.items())
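
max_tokens simply truncates the flattened corpus. A small usage check (only the truncated length is asserted here, since the full corpus length depends on the tokenization):

corpus_small, _ = load_corpus_time_machine(max_tokens=10000)
print(len(corpus_small))  # 10000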

Related links

Text Preprocessing - documentation
bilibili