日本語テキストをtf-idf+cos類似度で類似文章検索
はじめに
日本語文章の類似度検索をする機会があったのでここにまとめます。 やったこととしては日本語文章でのtf-idfベクトル化、cos類似度を使いました。 日本語文章にはlivedoor ニュースコーパスを使います。
実施手順
- 日本語文章の取得
- tf-idfベクトルの作成
- cos類似度の算出
- 類似度が最も高い文章の抽出
- 類似度の分布の出力
"""Find similar Japanese documents with tf-idf vectors and cosine similarity.

Corpus: the livedoor news corpus, expected under ./text/<category>/.
Pipeline: load documents -> tf-idf vectorize (nouns only, via MeCab) ->
pairwise cosine similarity -> best match per document -> score histogram.
"""
import glob
from collections import Counter

import MeCab
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Part-of-speech classes kept by the tokenizer (nouns only).
parts = ["名詞"]


def load_livedoor_news_corpus():
    """Load the livedoor news corpus from ./text/<category>/<category>*.txt.

    Returns:
        (docs, labels): list of document texts ("subject\\nbody") and the
        matching list of integer category ids (1-9).
    """
    category = {
        "dokujo-tsushin": 1,
        "it-life-hack": 2,
        "kaden-channel": 3,
        "livedoor-homme": 4,
        "movie-enter": 5,
        "peachy": 6,
        "smax": 7,
        "sports-watch": 8,
        "topic-news": 9,
    }
    docs = []
    labels = []
    for c_name, c_id in category.items():
        files = glob.glob("./text/{c_name}/{c_name}*.txt".format(c_name=c_name))
        for file in files:
            # Corpus file layout: line 0 = URL, 1 = datetime, 2 = subject,
            # remainder = article body.
            with open(file, "r", encoding="utf-8") as f:
                lines = f.read().splitlines()
                subject = lines[2]
                body = "\n".join(lines[3:])
                docs.append(subject + "\n" + body)
                labels.append(c_id)
    return docs, labels


def tokenize(text):
    """Split Japanese text into noun surface forms using MeCab.

    Args:
        text: raw Japanese string.
    Returns:
        List of surface forms whose first feature field is in ``parts``.
    """
    mecab_tagger = MeCab.Tagger("-Ochasen")
    node = mecab_tagger.parseToNode(text)
    words = []
    while node:
        if node.feature.split(",")[0] in parts:
            words.append(node.surface)
        node = node.next
    return words


def split_method(value, split_lists):
    """Return the 1-based index of the half-open bin [lower, upper) of
    ``split_lists`` that contains ``value``.

    Prints "ERROR NUM" and returns -1 when value falls outside every bin.
    """
    for i, (lower, upper) in enumerate(zip(split_lists[:-1], split_lists[1:])):
        if lower <= value < upper:
            return i + 1
    print("ERROR NUM")
    return -1


if __name__ == "__main__":
    # 1. Load the Japanese documents; keep only the it-life-hack category (id 2).
    docs, labels = load_livedoor_news_corpus()
    life_hack_doc = [d for (d, l) in zip(docs, labels) if l == 2]
    print("life_hack_doc has {} sentences:".format(len(life_hack_doc)))

    # 2. Build the tf-idf matrix (one row per document).
    vectorizer = TfidfVectorizer(tokenizer=tokenize, smooth_idf=False)
    docs_vec = vectorizer.fit_transform(life_hack_doc)

    # 3. Pairwise cosine similarity: one call yields the full square matrix
    # (replaces the original per-row loop over docs_vec).
    cos = cosine_similarity(docs_vec)
    cos = pd.DataFrame(cos, index=range(len(cos)), columns=range(len(cos)))

    # 4. For each document, find the most similar *other* document.
    def get_max_val(x):
        """Drop the self-similarity entry of row x and return the
        (column index, score) of the best remaining match."""
        max_series = x.drop(x.name).sort_values(ascending=False)
        col_index = max_series.head(1).keys()[0]
        max_val = max_series.head(1).values[0]
        return (col_index, max_val)

    max_val_index = cos.apply(lambda x: get_max_val(x))
    cos_result = pd.DataFrame()
    cos_result["ID_2"] = [index for index, _ in max_val_index]
    cos_result["score"] = [max_val for _, max_val in max_val_index]
    cos_result["text_1"] = [life_hack_doc[i] for i in cos.index]
    cos_result["text_2"] = [life_hack_doc[i] for i in cos["ID_2"]]
    # NOTE: removed a leftover debugger breakpoint (import ipdb; ipdb.set_trace()).

    # 5. Histogram of the similarity distribution.
    cos_1d = cos.values.flatten()
    cos_1d = [round(float(x), 3) for x in [format(x, '.4f') for x in cos_1d]]
    split_lists = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1]
    split_labels = [split_method(x, split_lists) for x in cos_1d]
    ans = {}
    for key, val in Counter(split_labels).items():
        if key == 11:
            # Bin 11 ([1, 1.1)) holds the diagonal self-similarity 1.0
            # entries: exactly one per document.
            ans[key] = cos.shape[0]
        else:
            # The similarity matrix is symmetric, so every off-diagonal
            # pair is counted twice; halve to count unique pairs.
            ans[key] = int(val / 2)
    print(ans)
参考
日本語文章の取得 * https://qiita.com/kotaroito/items/bd5f5760a45152281b54