In [2]:
import pandas as pd
papers = pd.read_csv('Google_AI_published_research.csv')

# Load the regular expression library
import re
# Remove punctuation
papers['title_abstract_processed'] = papers['title_abstract'].map(lambda x: re.sub(r'[,\.!?:-]', '', x))
# Convert the text to lowercase
papers['title_abstract_processed'] = papers['title_abstract_processed'].map(lambda x: x.lower())
# Print the first rows of the processed column
papers['title_abstract_processed'].head()

papers.head(10)
Out[2]:
title_abstract title_abstract_processed
0 Evaluating similarity measures: a large-scale ... evaluating similarity measures a largescale st...
1 Web Search for a Planet: The Google Cluster Ar... web search for a planet the google cluster arc...
2 The Price of Performance: An Economic Case for... the price of performance an economic case for ...
3 The Google File System. We have designed and ... the google file system we have designed and i...
4 Interpreting the Data: Parallel Analysis with ... interpreting the data parallel analysis with s...
5 Query-Free News Search. Many daily activities... queryfree news search many daily activities p...
6 Searching the Web by Voice. Spoken queries ar... searching the web by voice spoken queries are...
7 Who Links to Whom: Mining Linkage between Web ... who links to whom mining linkage between web s...
8 PowerPoint: Shot with its own bullets. Imagin... powerpoint shot with its own bullets imagine ...
9 The Chubby lock service for loosely-coupled di... the chubby lock service for looselycoupled dis...

Next, we turn the text into numeric features with TfidfVectorizer, which also removes common English stop words (e.g. I, and, or).

In [3]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
text = vectorizer.fit_transform(papers.title_abstract_processed)
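As a quick sanity check (not in the original notebook), we can look at the shape of the TF-IDF matrix and a few of the extracted terms. This sketch assumes scikit-learn >= 1.0, where the vocabulary is exposed via get_feature_names_out.

# Inspect what the vectorizer produced
print(text.shape)  # (n_documents, n_terms)

terms = vectorizer.get_feature_names_out()
print(terms[:10])  # a few terms from the learned vocabulary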

Let's cluster the data several times, using 2 to 10 cluster centers.

In [11]:
import numpy as np
import matplotlib.pyplot as plt
import skfuzzy as fuzz


# Set up the loop and plot
fig1, axes1 = plt.subplots(3, 3, figsize=(8, 8))
# cmeans expects data of shape (n_features, n_samples), so transpose
alldata = text.toarray().T
fpcs = []

for ncenters, ax in enumerate(axes1.reshape(-1), 2):
    # m=2 is the fuzziness exponent; stop once the membership change < 0.005
    cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
        alldata, ncenters, 2, error=0.005, maxiter=1000, init=None)

    # Store fpc values for later
    fpcs.append(fpc)

    # Hard cluster assignment for each document (highest membership)
    cluster_membership = np.argmax(u, axis=0)
    # Mark each cluster center, projected onto the first two TF-IDF dimensions
    for pt in cntr:
        ax.plot(pt[0], pt[1], 'rs')

    ax.set_title('Centers = {0}; FPC = {1:.2f}'.format(ncenters, fpc))
    ax.axis('off')

fig1.tight_layout()
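For reference, the fuzzy partition coefficient (FPC) that cmeans returns can be recomputed directly from the membership matrix u: it is the mean of the squared memberships, and values near 1 indicate a crisp partition. This short check (not part of the original notebook) verifies the value for the last run in the loop.

# FPC = sum of squared memberships / N; 1.0 means a fully crisp partition
fpc_manual = (u ** 2).sum() / u.shape[1]
print(fpc_manual, fpc)  # should match the fpc returned by cmeans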

Plot the FPC values from the 2-10 cluster runs above in the figure below.

In [10]:
fig2, ax2 = plt.subplots()
ax2.plot(np.r_[2:11], fpcs)
ax2.set_xlabel("Number of centers")
ax2.set_ylabel("Fuzzy partition coefficient")
Out[10]:
Text(0, 0.5, 'Fuzzy partition coefficient')
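To act on the FPC curve, one option (a sketch, not from the original notebook) is to pick the number of centers with the highest FPC, rerun cmeans with that setting, and attach the resulting hard labels to the papers. The 'cluster' column here is a new column introduced only for illustration.

# Pick the center count with the highest FPC (offset by 2, the first value tried)
best_ncenters = np.argmax(fpcs) + 2

# Re-cluster with the chosen number of centers and label each paper
cntr, u, _, _, _, _, _ = fuzz.cluster.cmeans(
    alldata, best_ncenters, 2, error=0.005, maxiter=1000, init=None)
papers['cluster'] = np.argmax(u, axis=0)
papers[['title_abstract', 'cluster']].head()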