# Upper bound on vocabulary entries; passed as ``max_size`` to build_vocab
# by the driver code at the bottom of this file.
MAX_VOCAB_SIZE = 10000

# Special-token strings. They are not referenced anywhere in the visible
# code; presumably the unknown-word and padding markers -- TODO confirm
# against the code that consumes the vocabulary.
UNK = '<UNK>'
PAD = '<PAD>'

def build_vocab(file_name, tokenize, max_size, min_freq):
    """Build a token-to-index vocabulary from a tab-separated text file.

    Every non-blank line is stripped and split on tabs; only the first
    field is tokenized and counted. Tokens seen at least ``min_freq`` times
    are ranked by descending count (ties keep first-seen order), the first
    ``max_size`` of them are kept, and each is assigned its rank as index,
    starting at 0.

    Args:
        file_name: Path of the text file to read; it is opened as UTF-8.
        tokenize: Callable that receives the first tab-separated field of a
            line and returns an iterable of hashable tokens.
        max_size: Maximum number of tokens kept in the vocabulary.
        min_freq: Minimum occurrence count a token needs to be kept.

    Returns:
        A dict mapping each kept token to its index. The dict and the
        ranked ``(token, count)`` list are also printed to stdout.
    """
    from collections import Counter

    counts = Counter()
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            lin = line.strip()
            if not lin:
                continue
            content = lin.split('\t')[0]
            for word in tokenize(content):
                counts[word] += 1

    # The file is closed by now; ranking needs only the in-memory counts.
    frequent = [item for item in counts.items() if item[1] >= min_freq]
    # list.sort is stable, so equally frequent tokens keep first-seen order.
    frequent.sort(key=lambda item: item[1], reverse=True)
    vocab_list = frequent[:max_size]
    vocab_dic = {word: idx for idx, (word, _count) in enumerate(vocab_list)}

    print(vocab_dic)
    print(vocab_list)
    # Hand the mapping back so callers can use it instead of only seeing
    # the printed representation.
    return vocab_dic

# Driver: build and print the vocabulary for the training split.
file_name = '../text/THUCNews/data/train.txt'


def tokenize(x):
    """Return *x* with leading and trailing spaces removed."""
    # NOTE(review): the result is a str, so build_vocab's loop over it
    # yields single characters (interior spaces included) -- confirm that
    # character-level tokens are intended rather than x.split(' ').
    return x.strip(' ')


build_vocab(file_name, tokenize, max_size=MAX_VOCAB_SIZE, min_freq=1)

Logo

魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。

更多推荐