import pandas as pd
df = pd.read_csv('Google_AI_published_research.csv', encoding='utf-8')
df.head()
import re
# Strip HTML tags and character entities from every title/abstract.
# A vectorized .apply avoids the chained-assignment pitfall of writing to
# df['title_abstract'][i] inside a loop.
html_entities = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
df['title_abstract'] = df['title_abstract'].apply(lambda text: html_entities.sub('', text))
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
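# The NLTK stopword list ships as a separate corpus; the download below is a
# one-time step (skip it if the corpus is already installed locally).
import nltk
nltk.download('stopwords', quiet=True)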
stop = stopwords.words('english')
def cleaner_tokenizer(text, stopword=stop):
    """Lower-case, strip non-word characters and digits, split, and drop stopwords."""
    text = re.sub(r'[\W]+', ' ', text.lower())  # raw string avoids the invalid-escape warning
    text = re.sub(r'[0-9]+', '', text)
    text = text.split()
    text = [w for w in text if w not in stopword]
    return text
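# Quick sanity check of the tokenizer on a made-up sentence: it lower-cases,
# strips punctuation and digits, and drops English stopwords.
print(cleaner_tokenizer('Scaling Transformers to 100 languages!'))
# expected: ['scaling', 'transformers', 'languages']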
X = pd.DataFrame(df['title_abstract'].apply(cleaner_tokenizer))
X.head()
import nltk
from nltk.corpus import wordnet
def get_wordnet_pos(word):
    """Map the first letter of a word's Penn Treebank tag to a WordNet POS constant."""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)
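# get_wordnet_pos maps the first letter of the Penn Treebank tag from nltk.pos_tag
# to the WordNet constants the lemmatizer expects, defaulting to NOUN.
# The tagger and WordNet corpora are separate downloads (one-time step, skip if present):
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)   # required by the lemmatizer on newer NLTK releases
# e.g. get_wordnet_pos('running') usually returns wordnet.VERB ('running' is tagged VBG)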
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
X_lem = []
for row in X['title_abstract']:
    X_lem.append(' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in row]))
X_lem = pd.DataFrame(X_lem, columns=['Text_Lemmatized'])
X_lem.head()
porter = PorterStemmer()
X_stem = []
for row in X['title_abstract']:
    X_stem.append(' '.join([porter.stem(word) for word in row]))
X_stem = pd.DataFrame(X_stem, columns=['Text_Stemmed'])
X_stem.head()
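# Side-by-side look at the two normalizations on the first document (illustrative
# only; the exact strings depend on the CSV contents).
print('Lemmatized:', X_lem['Text_Lemmatized'][0][:120])
print('Stemmed:   ', X_stem['Text_Stemmed'][0][:120])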
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
count = CountVectorizer(stop_words='english',
max_features=10000)
lda = LatentDirichletAllocation(n_components=10, random_state=6, learning_method='batch')
X_lda = count.fit_transform(df['title_abstract'].values)
X_topics = lda.fit_transform(X_lda)
n_top_words = 5
top_words = []
feature_names = count.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
for idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (idx + 1))
    top_words.append([feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]])
    print(" ".join(top_words[idx]))
text_idx = []
for i in range(10):
    topic = X_topics[:, i].argsort()
    text_idx.append(topic[-1:-4:-1])
    print('Topic #%d:' % (i + 1), '(text #%d)' % topic[-1])
    print(df['title_abstract'][topic[-1]])    # truncate with [:300] for long abstracts
count = CountVectorizer(stop_words='english',
max_df=0.1,
max_features=10000)
lda = LatentDirichletAllocation(n_components=10, random_state=6, learning_method='batch')
X_lda = count.fit_transform(df['title_abstract'].values)
X_topics = lda.fit_transform(X_lda)
n_top_words = 5
top_words = []
feature_names = count.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
for idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (idx + 1))
    top_words.append([feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]])
    print(" ".join(top_words[idx]))
text_idx = []
for i in range(10):
    topic = X_topics[:, i].argsort()
    text_idx.append(topic[-1:-4:-1])
    print('Topic #%d:' % (i + 1), '(text #%d)' % topic[-1])
    print(df['title_abstract'][topic[-1]])    # truncate with [:300] for long abstracts
count = CountVectorizer(stop_words='english',
max_df=0.1,
max_features=10000)
lda = LatentDirichletAllocation(n_components=10, random_state=6, learning_method='batch')
X_lda = count.fit_transform(X_lem['Text_Lemmatized'])
X_topics = lda.fit_transform(X_lda)
n_top_words = 5
top_words_lem = []
feature_names = count.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
for idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (idx + 1))
    top_words_lem.append([feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]])
    print(" ".join(top_words_lem[idx]))
text_idx_lem = []
for i in range(10):
    topic = X_topics[:, i].argsort()
    text_idx_lem.append(topic[:-4:-1])
    print('Topic #%d:' % (i + 1), '(text #%d)' % topic[-1])
    print(df['title_abstract'][topic[-1]])    # truncate with [:300] for long abstracts
# Output format -- Topic: main topic of the text (related top words)
count = CountVectorizer(stop_words='english',
max_df=0.1,
max_features=10000)
lda = LatentDirichletAllocation(n_components=10, random_state=6, learning_method='batch')
X_lda = count.fit_transform(X_stem['Text_Stemmed'])
X_topics = lda.fit_transform(X_lda)
n_top_words = 5
top_words_stem = []
feature_names = count.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
for idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (idx + 1))
    top_words_stem.append([feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]])
    print(" ".join(top_words_stem[idx]))
text_idx_stem = []
for i in range(10):
    topic = X_topics[:, i].argsort()
    text_idx_stem.append(topic[-1:-4:-1])
    print('Topic #%d:' % (i + 1), '(text #%d)' % topic[-1])
    print(df['title_abstract'][topic[-1]])    # truncate with [:300] for long abstracts
match = []
print('Raw Text\n')
for n, idx in enumerate(text_idx):
    m = 0
    for i in idx:
        for w in X['title_abstract'][i]:
            if w in top_words[n]:
                m += 1
    print('  Topic %2d (%s): %d matches' % ((n+1), ', '.join(top_words[n]), m))
    match.append(m)
print('  Total matches:', sum(match))
match = []
print('\nLemmatized Text\n')
for n, idx in enumerate(text_idx_lem):
    m = 0
    for i in idx:
        for w in X_lem['Text_Lemmatized'][i].split():
            if w in top_words_lem[n]:
                m += 1
    print('  Topic %2d (%s): %d matches' % ((n+1), ', '.join(top_words_lem[n]), m))
    match.append(m)
print('  Total matches:', sum(match))
match = []
print('\nStemmed Text\n')
for n, idx in enumerate(text_idx_stem):
    m = 0
    for i in idx:
        for w in X_stem['Text_Stemmed'][i].split():
            if w in top_words_stem[n]:
                m += 1
    print('  Topic %2d (%s): %d matches' % ((n+1), ', '.join(top_words_stem[n]), m))
    match.append(m)
print('  Total matches:', sum(match))
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf1 = TfidfVectorizer(max_features=10000, max_df=0.1)
X_lem_vect = tfidf1.fit_transform(X_lem['Text_Lemmatized'])
tfidf2 = TfidfVectorizer(max_features=10000, max_df=0.1)
X_stem_vect = tfidf2.fit_transform(X_stem['Text_Stemmed'])
X_lem_vect = X_lem_vect.toarray()
X_stem_vect = X_stem_vect.toarray()
print('X Lemmatized:')
print(X_lem_vect)
print('X Stemmed:')
print(X_stem_vect)
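# Shape check: rows are documents, columns are the (at most) 10,000 terms kept by
# each vectorizer; the two vocabularies differ because they were fit on different text.
print('Lemmatized matrix:', X_lem_vect.shape)
print('Stemmed matrix:   ', X_stem_vect.shape)
print('Sample stemmed terms:', tfidf2.get_feature_names_out()[:10])  # get_feature_names() on scikit-learn < 1.0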
from sklearn.cluster import KMeans
import pyprind
pbar = pyprind.ProgBar(18)
distortions = []
for i in range(3, 21):
    km = KMeans(n_clusters=i,
                n_init=10,
                max_iter=500,
                tol=1e-04,
                random_state=0)   # n_jobs was removed from KMeans in scikit-learn 0.25
    km.fit(X_stem_vect.T)         # transpose: cluster the stemmed terms, not the documents
    distortions.append(km.inertia_)
    pbar.update()
print('# of Clusters | Distortion')
print('--------------|-----------')
for i, d in enumerate(distortions):
    print('      %-2d      | %.2f' % (i+3, d))
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(range(3,21), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.xlim(3,20)
plt.grid()
plt.tight_layout()
plt.show()
from matplotlib import cm
from sklearn.metrics import silhouette_samples
import numpy as np
km = KMeans(n_clusters=8, n_init=10, max_iter=1000, tol=1e-04, random_state=0)
y_km = km.fit_predict(X_stem_vect.T)
cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X_stem_vect.T, y_km, metric='euclidean')
feature_stem = tfidf2.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
for n in range(8):
    print('Cluster #%2d | ' % (n+1), end='')
    val = []
    index = []
    for i, c in enumerate(y_km):
        if c == n:
            val.append(silhouette_vals[i])
            index.append(i)
    sort = np.argsort(-np.array(val))
    print('Avg Silhouette: %6.3f | Top Words: ' % np.average(val), end='')
    for j in range(5):
        if j < len(index):
            print(feature_stem[index[sort[j]]], end=' ')
    print('')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(float(i) / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, edgecolor='none', color=color)
    yticks.append((y_ax_lower + y_ax_upper) / 2.)
    y_ax_lower += len(c_silhouette_vals)
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color='red')
plt.yticks(yticks, cluster_labels +1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.tight_layout()
plt.show()
import skfuzzy as fuzz
fpcs = []
print('# of Clusters | FPC ')
print('--------------|-----------')
for ncenters in range(3, 11):
    cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
        X_stem_vect, ncenters, 2, error=0.0001, maxiter=1000, init=None)
    fpcs.append(fpc)
    print('      %2d      |   %.3f' % (ncenters, fpc))
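# The fuzzy partition coefficient lies between 0 and 1, with values near 1 indicating
# crisper assignments, so the strongest candidate is simply the maximum (convenience
# check, not part of the original notebook).
print('Best # of centers by FPC: %d' % (np.argmax(fpcs) + 3))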
fig2, ax2 = plt.subplots()
ax2.plot(np.r_[3:11], fpcs)
ax2.set_xlabel("Number of centers")
ax2.set_ylabel("Fuzzy partition coefficient")
cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
X_stem_vect, 3, 2, error=0.005, maxiter=1000, init=None)
cluster_membership = np.argmax(u, axis=0)
print(u, u.shape)
print(cluster_membership, len(cluster_membership))
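# Note: skfuzzy's cmeans treats columns as samples, so with X_stem_vect laid out as
# documents x terms the "samples" being clustered here are the stemmed terms; u has
# one row per cluster and one column per term, and argmax over axis 0 gives each
# term's hardest cluster assignment.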
for i, c in enumerate(cluster_membership):
    if c == 1:
        print(feature_stem[i], end=' ')
print(X_stem_vect.T.shape)
print(u.shape)
print(u.T)
from sklearn.metrics import silhouette_samples
silhouette_vals = silhouette_samples(X_stem_vect.T, (u.T).argmax(axis = 1), metric='euclidean')
feature_stem = tfidf2.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
for n in range(3):
    print('Cluster #%2d | ' % (n+1), end='')
    val = []
    index = []
    for i, c in enumerate(cluster_membership):
        if c == n:
            val.append(silhouette_vals[i])
            index.append(i)
    sort = np.argsort(-np.array(val))
    print('Top Words: ', end='')
    for j in range(5):
        if j < len(index):
            print(feature_stem[index[sort[j]]], end=' ')
    print('')
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
%matplotlib inline
import matplotlib.pyplot as plt
import pyprind
pbar = pyprind.ProgBar(8)
sil_val = []
for n in range(3, 11):
    ac = AgglomerativeClustering(n_clusters=n,
                                 metric='euclidean',   # 'affinity=' on scikit-learn < 1.2
                                 linkage='complete')
    y_ac = ac.fit_predict(X_stem_vect.T)
    sil_val.append(silhouette_score(X_stem_vect.T, y_ac, metric='euclidean'))
    pbar.update()
plt.plot(range(3,11), sil_val, marker='s')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Value')
plt.xlim(3,10)
plt.grid()
plt.tight_layout()
plt.show()
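# Convenience read-off of the silhouette curve above: the peak picks the candidate
# cluster count (higher average silhouette = better separated clusters).
print('Best n_clusters by silhouette: %d' % (np.argmax(sil_val) + 3))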
ac = AgglomerativeClustering(n_clusters=4,
                             metric='euclidean',   # 'affinity=' on scikit-learn < 1.2
                             linkage='complete')
labels = ac.fit_predict(X_stem_vect.T)
print('Cluster labels %s' % labels[0:10])
from matplotlib import cm
from sklearn.metrics import silhouette_samples
cluster_labels = np.unique(labels)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X_stem_vect.T, labels, metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[labels == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(float(i) / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, edgecolor='none', color=color)
    yticks.append((y_ax_lower + y_ax_upper) / 2.)
    y_ax_lower += len(c_silhouette_vals)
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color='red')
plt.yticks(yticks, cluster_labels +1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.tight_layout()
plt.show()
silhouette_vals = silhouette_samples(X_stem_vect.T, labels, metric='euclidean')
feature_stem = tfidf2.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
for n in range(4):
    print('Cluster #%2d | ' % (n+1), end='')
    val = []
    index = []
    for i, c in enumerate(labels):
        if c == n:
            val.append(silhouette_vals[i])
            index.append(i)
    sort = np.argsort(-np.array(val))
    print('Top Words: ', end='')
    for j in range(5):
        if j < len(index):
            print(feature_stem[index[sort[j]]], end=' ')
    print('')
import pyprind
pbar = pyprind.ProgBar(18)
distortions = []
for i in range(3, 21):
    km = KMeans(n_clusters=i,
                n_init=10,
                max_iter=500,
                tol=1e-04,
                random_state=0)   # n_jobs was removed from KMeans in scikit-learn 0.25
    km.fit(X_stem_vect)           # no transpose this time: cluster the documents themselves
    distortions.append(km.inertia_)
    pbar.update()
print('# of Clusters | Distortion')
print('--------------|-----------')
for i, d in enumerate(distortions):
    print('      %-2d      | %.2f' % (i+3, d))
plt.plot(range(3,21), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.xlim(3,20)
plt.grid()
plt.tight_layout()
plt.show()
km = KMeans(n_clusters=5, n_init=10, max_iter=1000, tol=1e-04, random_state=0)
y_km = km.fit_predict(X_stem_vect)
cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X_stem_vect, y_km, metric='euclidean')
for n in range(5):
    print('Cluster #%2d | ' % (n+1), end='')
    val = []
    index = []
    for i, c in enumerate(y_km):
        if c == n:
            val.append(silhouette_vals[i])
            index.append(i)
    sort = np.argsort(-np.array(val))
    print('Avg Silhouette: %6.3f | Topics: ' % np.average(val), end='')
    for j in range(3):
        print(df['title_abstract'][index[sort[j]]].split('.')[0], end='. ')
    print('')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(float(i) / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, edgecolor='none', color=color)
    yticks.append((y_ax_lower + y_ax_upper) / 2.)
    y_ax_lower += len(c_silhouette_vals)
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color='red')
plt.yticks(yticks, cluster_labels +1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.tight_layout()
plt.show()
import pyprind
pbar = pyprind.ProgBar(8)
sil_val = []
for n in range(3, 11):
    ac = AgglomerativeClustering(n_clusters=n,
                                 metric='euclidean',   # 'affinity=' on scikit-learn < 1.2
                                 linkage='complete')
    y_ac = ac.fit_predict(X_stem_vect)
    sil_val.append(silhouette_score(X_stem_vect, y_ac, metric='euclidean'))
    pbar.update()
plt.plot(range(3,11), sil_val, marker='s')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Value')
plt.xlim(3,10)
plt.grid()
plt.tight_layout()
plt.show()
ac = AgglomerativeClustering(n_clusters=4,
                             metric='euclidean',   # 'affinity=' on scikit-learn < 1.2
                             linkage='complete')
labels = ac.fit_predict(X_stem_vect)
print('Cluster labels %s' % labels[0:10])
from matplotlib import cm
from sklearn.metrics import silhouette_samples
cluster_labels = np.unique(labels)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X_stem_vect, labels, metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[labels == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(float(i) / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, edgecolor='none', color=color)
    yticks.append((y_ax_lower + y_ax_upper) / 2.)
    y_ax_lower += len(c_silhouette_vals)
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color='red')
plt.yticks(yticks, cluster_labels +1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.tight_layout()
plt.show()
silhouette_vals = silhouette_samples(X_stem_vect, labels, metric='euclidean')
for n in range(4):
    print('Cluster #%2d | ' % (n+1), end='')
    val = []
    index = []
    for i, c in enumerate(labels):
        if c == n:
            val.append(silhouette_vals[i])
            index.append(i)
    sort = np.argsort(-np.array(val))
    print('Article: ', end='')
    for j in range(3):
        print(df['title_abstract'][index[sort[j]]].split('.')[0], end='. ')
    print('')