[Day12] 文本/词表示方式(三)-TFIDF实作

一. 序

这篇主要用 Python 实作 TF-IDF，并以它作为表示文本的一种方式。

二. 载入套件与文本

主要利用jieba断词

import jieba
import math

# Point jieba at the Traditional Chinese dictionary file
jieba.set_dictionary('dict.txt.big')

# Article summary from iThome; source: https://www.ithome.com.tw/news/146142
text_a = '从GPT-3衍生改良而来的Codex模型，能够将使用者的自然语言指令转换为程序码，OpenAI现在以私人测试的方式释出CodexAPI'
# Article summary from iThome; source: https://www.ithome.com.tw/news/145743
text_b = 'Blender2.0除了能即时搜寻网路资讯，脸书也为其打造新的神经模组，可根据之前使用者与它的聊天脉络来累积记忆'

断词与计算每个词出现的次数

# Segment both texts into token lists
texta_seg, textb_seg = jieba.lcut(text_a), jieba.lcut(text_b)

# Vocabulary: every distinct token seen in either text
unique_words = set(texta_seg) | set(textb_seg)

# One counter dict per text, both keyed by the full vocabulary and starting at 0
num_words_a = {word: 0 for word in unique_words}
num_words_b = {word: 0 for word in unique_words}

# Tally how often each token occurs in its own text
for counts, tokens in ((num_words_a, texta_seg), (num_words_b, textb_seg)):
    for word in tokens:
        counts[word] = counts[word] + 1

num_words_a 记录了词表(两篇文本所有词的联集)中每个词在 text_a 里出现的次数；没有出现在 text_a 的词，其次数为 0。

三. 实作TF与IDF的function

def get_TF_value(w_dict, text_seg_len):
    """Compute the term frequency (TF) of every word for one text.

    Args:
        w_dict: Mapping of word -> number of occurrences in the text.
        text_seg_len: Total number of tokens in the segmented text.

    Returns:
        A new dict mapping each word in ``w_dict`` to
        ``count / text_seg_len``. When ``text_seg_len`` is 0 (an empty
        text) every word maps to 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    if text_seg_len == 0:
        # An empty text has no tokens, so every term frequency is zero.
        return dict.fromkeys(w_dict, 0.0)

    # Convert once so the division is always floating point.
    total = float(text_seg_len)
    return {w: count / total for w, count in w_dict.items()}

def get_IDF_value(text_list, all_words):
    """Compute the inverse document frequency (IDF) of every vocabulary word.

    Args:
        text_list: List of per-text dicts mapping word -> occurrence count.
        all_words: Mapping whose keys define the vocabulary. Every word
            found in ``text_list`` must be one of these keys.

    Returns:
        A dict mapping each vocabulary word to
        ``log(number_of_texts / number_of_texts_containing_word)``.
        A word that appears in no text (including the case of an empty
        ``text_list``) maps to 0.0 instead of raising
        ``ZeroDivisionError``.

    Raises:
        KeyError: If a text contains a word that is not in ``all_words``.
    """
    num_texts = len(text_list)

    # Document frequency: in how many texts each word occurs at least once.
    doc_freq = dict.fromkeys(all_words.keys(), 0)
    for text in text_list:
        for w, val in text.items():
            # A positive count means the word occurs in this text.
            if val > 0:
                doc_freq[w] += 1

    idf_dict = {}
    for w, df in doc_freq.items():
        # IDF formula; words never seen carry no weight, and the guard
        # avoids dividing by zero.
        idf_dict[w] = math.log(num_texts / float(df)) if df > 0 else 0.0
    return idf_dict

三. 计算tfidf

# Term frequencies of each text, normalised by that text's own token count
tf_a = get_TF_value(num_words_a, len(texta_seg))
tf_b = get_TF_value(num_words_b, len(textb_seg))

# IDF over the two texts; num_words_a supplies the vocabulary keys
idf = get_IDF_value([num_words_a, num_words_b], num_words_a)

# TF-IDF weight of a word = its TF in the text multiplied by its IDF
tfidf_a = {word: tf * idf[word] for word, tf in tf_a.items()}
tfidf_b = {word: tf * idf[word] for word, tf in tf_b.items()}

tfidf_a的output如下:

{'能': 0.0,
 '来': 0.0,
 '即时': 0.0,
 '而来': 0.023104906018664842,
 '可': 0.0,
 '，': 0.0,
 '之前': 0.0,
 '模组': 0.0,
 '指令': 0.023104906018664842,
 '的': 0.0,
 '测试': 0.023104906018664842,
 '也': 0.0,
 '使用者': 0.0,
 '3': 0.023104906018664842,
 ...}

四. 用TFIDF表示句子/文本

# Vector for text a: its TF-IDF weights, in tfidf_a's iteration order
bow_a = list(tfidf_a.values())

print(bow_a)

bow_a最后的表示如下:

[0.0, 0.0, 0.0, 0.023104906018664842, 0.0, 0.0, 0.0, 0.0, 0.023104906018664842, 0.0, 0.023104906018664842, 0.0, 0.0, 0.023104906018664842, 0.0, 0.023104906018664842, 0.023104906018664842, 0.0, 0.023104906018664842, 0.0, 0.0, 0.023104906018664842, 0.023104906018664842, 0.0, 0.0, 0.0, 0.0, 0.023104906018664842, 0.0, 0.0, 0.023104906018664842, 0.023104906018664842, 0.023104906018664842, 0.023104906018664842, 0.0, 0.0, 0.023104906018664842, 0.023104906018664842, 0.023104906018664842, 0.0, 0.023104906018664842, 0.023104906018664842, 0.023104906018664842, 0.0, 0.023104906018664842, 0.023104906018664842, 0.0, 0.0, 0.0, 0.0, 0.023104906018664842]