In [2]:
import pandas as pd
papers = pd.read_csv('Google_AI_published_research.csv')

# Load the regular expression library
import re
# Remove punctuation
papers['title_abstract_processed'] = papers['title_abstract'].map(lambda x: re.sub(r'[,\.!?:-]', '', x))
# Convert the text to lowercase
papers['title_abstract_processed'] = papers['title_abstract_processed'].map(lambda x: x.lower())
# Print the first rows of the processed column
papers['title_abstract_processed'].head()

papers.head(10)
Out[2]:
title_abstract title_abstract_processed
0 Evaluating similarity measures: a large-scale ... evaluating similarity measures a largescale st...
1 Web Search for a Planet: The Google Cluster Ar... web search for a planet the google cluster arc...
2 The Price of Performance: An Economic Case for... the price of performance an economic case for ...
3 The Google File System. We have designed and ... the google file system we have designed and i...
4 Interpreting the Data: Parallel Analysis with ... interpreting the data parallel analysis with s...
5 Query-Free News Search. Many daily activities... queryfree news search many daily activities p...
6 Searching the Web by Voice. Spoken queries ar... searching the web by voice spoken queries are...
7 Who Links to Whom: Mining Linkage between Web ... who links to whom mining linkage between web s...
8 PowerPoint: Shot with its own bullets. Imagin... powerpoint shot with its own bullets imagine ...
9 The Chubby lock service for loosely-coupled di... the chubby lock service for looselycoupled dis...

Next, we turn the text into numeric features with TfidfVectorizer, which also removes common English stop words (e.g. I, and, or).

In [3]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
text = vectorizer.fit_transform(papers.title_abstract_processed)
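As a quick sanity check (not in the original notebook), we can look at the shape of the TF-IDF matrix and a few of the extracted terms. This sketch assumes scikit-learn >= 1.0, where the vocabulary is exposed via get_feature_names_out.

# Inspect what the vectorizer produced
print(text.shape)  # (n_documents, n_terms)

terms = vectorizer.get_feature_names_out()
print(terms[:10])  # a few terms from the learned vocabulary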

Let's cluster the data several times, using 2 to 10 cluster centers.

In [11]:
import numpy as np
import matplotlib.pyplot as plt
import skfuzzy as fuzz


# Set up the loop and plot
fig1, axes1 = plt.subplots(3, 3, figsize=(8, 8))
# cmeans expects data of shape (n_features, n_samples), so transpose
alldata = text.toarray().T
fpcs = []

for ncenters, ax in enumerate(axes1.reshape(-1), 2):
    # m=2 is the fuzziness exponent; stop once the membership change < 0.005
    cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
        alldata, ncenters, 2, error=0.005, maxiter=1000, init=None)

    # Store fpc values for later
    fpcs.append(fpc)

    # Hard cluster assignment for each document (highest membership)
    cluster_membership = np.argmax(u, axis=0)
    # Mark each cluster center, projected onto the first two TF-IDF dimensions
    for pt in cntr:
        ax.plot(pt[0], pt[1], 'rs')

    ax.set_title('Centers = {0}; FPC = {1:.2f}'.format(ncenters, fpc))
    ax.axis('off')

fig1.tight_layout()
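For reference, the fuzzy partition coefficient (FPC) that cmeans returns can be recomputed directly from the membership matrix u: it is the mean of the squared memberships, and values near 1 indicate a crisp partition. This short check (not part of the original notebook) verifies the value for the last run in the loop.

# FPC = sum of squared memberships / N; 1.0 means a fully crisp partition
fpc_manual = (u ** 2).sum() / u.shape[1]
print(fpc_manual, fpc)  # should match the fpc returned by cmeans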

Plot the FPC values from the 2-10 cluster runs above in the figure below.

In [10]:
fig2, ax2 = plt.subplots()
ax2.plot(np.r_[2:11], fpcs)
ax2.set_xlabel("Number of centers")
ax2.set_ylabel("Fuzzy partition coefficient")
Out[10]:
Text(0, 0.5, 'Fuzzy partition coefficient')
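To act on the FPC curve, one option (a sketch, not from the original notebook) is to pick the number of centers with the highest FPC, rerun cmeans with that setting, and attach the resulting hard labels to the papers. The 'cluster' column here is a new column introduced only for illustration.

# Pick the center count with the highest FPC (offset by 2, the first value tried)
best_ncenters = np.argmax(fpcs) + 2

# Re-cluster with the chosen number of centers and label each paper
cntr, u, _, _, _, _, _ = fuzz.cluster.cmeans(
    alldata, best_ncenters, 2, error=0.005, maxiter=1000, init=None)
papers['cluster'] = np.argmax(u, axis=0)
papers[['title_abstract', 'cluster']].head()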