import pandas as pd
import re

# Load the papers dataset
papers = pd.read_csv('Google_AI_published_research.csv')

# Remove punctuation from the combined title/abstract text
papers['title_abstract_processed'] = papers['title_abstract'].map(lambda x: re.sub(r'[,\.!?:-]', '', x))

# Convert the titles to lowercase
papers['title_abstract_processed'] = papers['title_abstract_processed'].map(lambda x: x.lower())

# Print the first few processed rows
papers['title_abstract_processed'].head()
papers.head(10)
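As a quick sanity check on the cleaning step, it can help to compare one raw entry against its processed counterpart. This is a minimal sketch; it only assumes the two columns used above exist in the DataFrame.

# Compare the raw and the cleaned text for the first paper
print(papers['title_abstract'].iloc[0][:120])
print(papers['title_abstract_processed'].iloc[0][:120])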
Next, the text is converted into numeric features with TfidfVectorizer, which also drops common English stop words (e.g. "I", "and", "or").
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features, filtering out English stop words
vectorizer = TfidfVectorizer(stop_words='english')
text = vectorizer.fit_transform(papers.title_abstract_processed)
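Before clustering, it is worth confirming what the vectorizer produced. A small sketch (get_feature_names_out assumes scikit-learn 1.0 or newer):

# The TF-IDF matrix has shape (n_documents, n_terms)
print(text.shape)
# A few of the vocabulary terms that were learned
print(vectorizer.get_feature_names_out()[:10])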
Let's cluster the data several times, trying 2 through 10 cluster centers (nine runs in total, matching the 3x3 grid of plots below).
import numpy as np
import matplotlib.pyplot as plt
import skfuzzy as fuzz

# Set up the loop and plot
fig1, axes1 = plt.subplots(3, 3, figsize=(8, 8))
# skfuzzy expects data of shape (n_features, n_samples), so transpose the TF-IDF matrix
alldata = text.toarray().T
fpcs = []

for ncenters, ax in enumerate(axes1.reshape(-1), 2):
    cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
        alldata, ncenters, 2, error=0.005, maxiter=1000, init=None)

    # Store fpc values for later
    fpcs.append(fpc)

    # Hard cluster assignment for each paper (highest membership)
    cluster_membership = np.argmax(u, axis=0)

    # Mark the center of each fuzzy cluster (first two TF-IDF dimensions only)
    for pt in cntr:
        ax.plot(pt[0], pt[1], 'rs')

    ax.set_title('Centers = {0}; FPC = {1:.2f}'.format(ncenters, fpc))
    ax.axis('off')

fig1.tight_layout()
The FPC values from the 2-to-10-center runs above are collected into the plot below.
fig2, ax2 = plt.subplots()
ax2.plot(np.r_[2:11], fpcs)
ax2.set_xlabel("Number of centers")
ax2.set_ylabel("Fuzzy partition coefficient")
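The fuzzy partition coefficient is closest to 1 when the partition is crispest, so a simple way to choose the final model is to refit with the center count that scored best. This is a hypothetical follow-up step, reusing the alldata matrix and cmeans call from above:

# FPC peaks at the best-separated partition; pick that center count
best_ncenters = int(np.argmax(fpcs)) + 2
cntr, u, _, _, _, _, fpc = fuzz.cluster.cmeans(
    alldata, best_ncenters, 2, error=0.005, maxiter=1000, init=None)

# Attach each paper's hard cluster label back onto the DataFrame
papers['cluster'] = np.argmax(u, axis=0)
papers['cluster'].value_counts()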