Imports

import collections
import os
import re

from d2l import torch as d2l

Reading the dataset

d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
                                '090b5e7e70c295757f55df93cb0a180b9691891a')

def read_time_machine():
    """Load the time machine dataset into a list of text lines."""
    with open(d2l.download('time_machine', cache_dir=os.path.join('..', 'pytorch', 'data')), 'r') as f:
        lines = f.readlines()  # lines is a list; each line of the file occupies one index
    # Turn every run of non-letter characters into a single space,
    # then strip leading/trailing spaces and lowercase everything
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]

lines = read_time_machine()

print(lines[:10])
print(f'# total number of text lines: {len(lines)}')

Output:

['the time machine by h g wells', '', '', '', '', 'i', '', '', 'the time traveller for so it will be convenient to speak of him', 'was expounding a recondite matter to us his grey eyes shone and']
# total number of text lines: 3221
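
To make the cleaning rule concrete, here is a minimal sketch on a made-up string (not a line from the dataset): each run of non-letter characters collapses into one space, and the result is then stripped and lowercased.

print(re.sub('[^A-Za-z]+', ' ', "It's 1895!").strip().lower())  # prints: it s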

Tokenization

tokenize takes the list of text lines (lines) as input; each element of the list is a text sequence (for example, one line of text).
Each text sequence is split into a list of tokens; a token is the basic unit of text.
Finally, it returns a list of token lists, where every token is a string.

def tokenize(lines, token='word'):
    """Split text lines into word or character tokens."""
    if token == 'word':
        return [line.split() for line in lines if line != ""]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('ERROR: unknown token type: ' + token)

tokens = tokenize(lines)
print(tokens[:10])

In fact, this example only ever uses whole words as tokens, so the token == 'char' branch is never taken (a short demo of both branches follows the output below). tokenize can therefore be simplified here to:

def tokenize(lines):
    return [line.split() for line in lines if line != ""]

Output:

[['the', 'time', 'machine', 'by', 'h', 'g', 'wells'], 
['i'],
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him'],
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and'],
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the'],
['fire', 'burned', 'brightly', 'and', 'the', 'soft', 'radiance', 'of', 'the', 'incandescent'],
['lights', 'in', 'the', 'lilies', 'of', 'silver', 'caught', 'the', 'bubbles', 'that', 'flashed', 'and'],
['passed', 'in', 'our', 'glasses', 'our', 'chairs', 'being', 'his', 'patents', 'embraced', 'and'],
['caressed', 'us', 'rather', 'than', 'submitted', 'to', 'be', 'sat', 'upon', 'and', 'there', 'was', 'that'],
['luxurious', 'after', 'dinner', 'atmosphere', 'when', 'thought', 'roams', 'gracefully']]
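
For comparison, here is a quick sketch of what the two branches of the original two-argument tokenize (before the simplification) produce on a made-up one-line input:

print(tokenize(['hello world'], token='word'))  # [['hello', 'world']]
print(tokenize(['hello world'], token='char'))  # [['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd']]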

Counting token frequencies

isinstance(A, B): tests whether A is an instance of type B (see the small demo after the output below).

def count_corpus(tokens):
    """Count token frequencies."""
    # tokens is either a 1-D list of tokens or a 2-D list of token lists
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # flatten the 2-D list into a single 1-D list of tokens
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)
# counter = count_corpus(tokens)  
# print(dict(counter))

Output:

{'the': 2261, 'time': 200, 'machine': 85, 'by': 103, 'h': 1, 'g': 1, 'wells': 9, 'i': 1267, 'traveller': 61, 'for': 221, 'so': 112, 'it': 437, 'will': 37, 'be': 93, 'convenient': 5, 'to': 695, 'speak': 6, 'of': 1155, 'him': 40, 'was': 552, 'expounding': 2, 'a': 816, 'recondite': 1, 'matter': 6, 'us': 35, 'his': 129, 'grey': 11, 'eyes': 35, 'shone': 8, 'and': 1245, 'twinkled': 1, 'usually': 3, 'pale': 10, 'face': 38, 'flushed': 2, 'animated': 3, 'fire': 30, 'burned': 6, 'brightly': 4, 'soft': 16, 'radiance': 1, 'incandescent': 1, 'lights': 1, 'in': 541, 'lilies': 1, 'silver': 6, 'caught': 10, 'bubbles': 1, 'that': 443, 'flashed': 4, 'passed': 13, 'our': 57, 'glasses': 1, 'chairs': 2, 'being': 14, 'patents': 1, 'embraced': 1, 'caressed': 2, 'rather': 18, 'than': 34, 'submitted': 1, 'sat': 22, 'upon': 113, 'there': 127, 'luxurious': 1, 'after': 37, 'dinner': 13, 'atmosphere': 2, 'when': 55, 'thought': 57, 'roams': 1, 'gracefully': 1,……}
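
A minimal illustration of the isinstance check and the flattening, on toy data rather than the corpus:

print(count_corpus([['a', 'b'], ['a']]))  # Counter({'a': 2, 'b': 1}); the 2-D input is flattened first
print(count_corpus(['a', 'b', 'a']))      # Counter({'a': 2, 'b': 1}); a 1-D input is counted directly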

Building the vocabulary

tokens: a nested list; each inner list represents one line and holds that line's words
min_freq: the minimum number of occurrences; words appearing fewer times are dropped
counter: the token-frequency statistics computed from the original text
_token_freqs: counter.items() sorted by frequency in descending order; note this is a list of (token, count) pairs, not a dict
idx_to_token: a list recording the tokens whose frequency meets the threshold
enumerate(): yields each item together with its index; here it maps every token in idx_to_token to its position (see the short sketch below)
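
A short sketch of how enumerate() builds token_to_idx from idx_to_token, using a toy list for illustration:

idx_to_token = ['<unk>', 'the', 'time']
token_to_idx = {token: idx for idx, token in enumerate(idx_to_token)}
print(token_to_idx)  # {'<unk>': 0, 'the': 1, 'time': 2}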

class Vocab:
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        counter = count_corpus(tokens)

        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}

        for token, freq in self._token_freqs:
            # The pairs are already sorted, so once a token's count drops below
            # min_freq, every later count is guaranteed to be smaller too
            if freq < min_freq:
                break
            # The token is frequent enough and not yet in token_to_idx
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):  # a single token rather than a sequence
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs
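
As a tiny sanity check of min_freq and the <unk> fallback, on made-up tokens rather than the corpus:

v = Vocab([['a', 'a', 'b']], min_freq=2)
print(v.idx_to_token)  # ['<unk>', 'a']; 'b' occurs only once and is dropped
print(v['b'])          # 0, the index of '<unk>'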

Below is a small modification I made based on my own idea:

class Vocab:
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        counter = count_corpus(tokens)

        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        self.idx_to_token = ['<unk>'] + reserved_tokens

        for token, freq in self._token_freqs:
            # The pairs are already sorted, so once a token's count drops below
            # min_freq, every later count is guaranteed to be smaller too
            if freq < min_freq:
                break
            # The token is frequent enough and not yet in idx_to_token
            if token not in self.idx_to_token:
                self.idx_to_token.append(token)

        # Build token_to_idx in one place, after idx_to_token is complete
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):  # a single token rather than a sequence
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs

Usage

Note: to inspect the contents of vocab, call the mapping's items() method and convert the result to a list, i.e. list(vocab.token_to_idx.items()).

vocab = Vocab(tokens)   
print(list(vocab.token_to_idx.items()))

Output:

[('<unk>', 0), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9), ('my', 10), ('it', 11), ('had', 12), ('me', 13), ('as', 14), ('at', 15), ('for', 16), ('with', 17), ('but', 18), ('time', 19), ('were', 20), ('this', 21), ('you', 22), ('on', 23), ('then', 24), ('his', 25), ('there', 26), ('he', 27), ('have', 28), ('they', 29), ('from', 30), ('one', 31), ('all', 32), ('not', 33), ('into', 34), ('upon', 35), ('little', 36), ('so', 37), ('is', 38), ('came', 39), ('by', 40), ('some', 41), ('be', 42), ('no', 43), ('could', 44), ('their', 45), ('said', 46), ('saw', 47), ('down', 48), ('them', 49), ('machine', 50), ('which', 51), ('very', 52), ('or', 53), ('an', 54), ('we', 55), ('now', 56), ('what', 57), ('been', 58), ('these', 59), ('like', 60), ('her', 61), ('out', 62), ('seemed', 63), ('up', 64), ('man', 65), ('about', 66), ('s', 67), ('its', 68), ('thing', 69), ('again', 70), ('traveller', 71), ('would', 72), ('more', 73), ('white', 74), ('our', 75), ('thought', 76), ('felt', 77), ('when', 78), ('over', 79), ('weena', 80), ('still', 81), ('world', 82), ('myself', 83), ('even', 84), ('must', 85), ('through', 86), ('if', 87),……]

Converting lines to indices

Convert each line of text into a list of numeric indices.

for i in range(5):
    print('text:', tokens[i])
    print('indices:', vocab[tokens[i]])

Output:

text: ['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
indices: [1, 19, 50, 40, 2183, 2184, 400]
text: ['i']
indices: [2]
text: ['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
indices: [1, 19, 71, 16, 37, 11, 115, 42, 680, 6, 586, 4, 108]
text: ['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
indices: [7, 1420, 5, 2185, 587, 6, 126, 25, 330, 127, 439, 3]
text: ['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
indices: [2186, 3, 25, 1044, 362, 113, 7, 1421, 3, 1045, 1]
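
The mapping also works in reverse through to_tokens; reusing the first few indices printed above:

print(vocab.to_tokens([1, 19, 50]))  # ['the', 'time', 'machine']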

Putting everything together

Package all of the functionality into the load_corpus_time_machine function, which returns corpus (the list of token indices) and vocab (the vocabulary of the time machine corpus).

def load_corpus_time_machine(max_tokens=-1):
    lines = read_time_machine()       # read the file
    tokens = tokenize(lines, 'word')  # split the text into words
    # print(tokens)
    vocab = Vocab(tokens)             # build the vocabulary
    # line is a list holding one line's words and token is a single word,
    # so corpus ends up holding the index of every word in the whole text
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab

corpus, vocab = load_corpus_time_machine()
# print(corpus)
# print(vocab.token_to_idx.items())
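
max_tokens simply truncates the flattened corpus. A small usage check (only the truncated length is asserted here, since the full corpus length depends on the tokenization):

corpus_small, _ = load_corpus_time_machine(max_tokens=10000)
print(len(corpus_small))  # 10000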

Related links

Text Preprocessing - documentation
bilibili