First, preprocess the CSV file: remove punctuation and convert the English text to lowercase. The title_abstract_processed column below shows the result after processing.
import pandas as pd
# Read the data into papers
papers = pd.read_csv('Google_AI_published_research.csv')
# Print the first rows
papers.head()
# Load the regular expression library
import re
# Remove punctuation
papers['title_abstract_processed'] = papers['title_abstract'].map(lambda x: re.sub('[,\.!?:]', '', x))
# Convert the text to lowercase
papers['title_abstract_processed'] = papers['title_abstract_processed'].map(lambda x: x.lower())
# Print the first rows of the processed column
papers['title_abstract_processed'].head()
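The same cleaning can also be done with pandas' vectorized string methods instead of map; this is just an equivalent sketch, assuming the same title_abstract column as above:
# Equivalent preprocessing with vectorized string methods (sketch, same result as above)
papers['title_abstract_processed'] = (
    papers['title_abstract']
    .str.replace('[,\.!?:]', '', regex=True)  # strip the same punctuation characters
    .str.lower()                               # lowercase the text
)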
Next, turn all the data into a word cloud based on word frequency for easy inspection. As the figure below shows, model, data, and using are all very frequent words.
# Import the WordCloud class from the wordcloud library
from wordcloud import WordCloud
# Join the processed titles and abstracts into one long string
long_string = ','.join(list(papers['title_abstract_processed'].values))
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
# Generate the word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()
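If you want to keep the image outside the notebook, the word cloud object can also be written straight to disk; a minimal sketch, where the output filename is just an example:
# Save the generated word cloud as a PNG (hypothetical output path)
wordcloud.to_file('wordcloud_title_abstract.png')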
Count how many times each word appears; the figure below can be compared against the word cloud.
# CountVectorizer is imported from sklearn.feature_extraction.text further below
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
# %matplotlib inline
# Helper function: plot the 10 most common words
def plot_10_most_common_words(count_data, count_vectorizer):
    words = count_vectorizer.get_feature_names()
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts += t.toarray()[0]
    count_dict = zip(words, total_counts)
    count_dict = sorted(count_dict, key=lambda x: x[1], reverse=True)[0:10]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words))
    plt.figure(2, figsize=(15, 15 / 1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    sns.barplot(x_pos, counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()
from sklearn.feature_extraction.text import CountVectorizer
# Initialize the count vectorizer with English stop words
count_vectorizer = CountVectorizer(stop_words='english')
# Fit and transform the processed titles and abstracts
count_data = count_vectorizer.fit_transform(papers['title_abstract_processed'])
# Visualize the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)
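The same counts can also be printed as plain numbers to cross-check the bar chart; a small sketch reusing count_data and count_vectorizer from above:
# Sum the term counts across all documents and print the top 10 (sketch)
word_totals = np.asarray(count_data.sum(axis=0)).ravel()
top10 = sorted(zip(count_vectorizer.get_feature_names(), word_totals),
               key=lambda x: x[1], reverse=True)[:10]
for word, total in top10:
    print(word, int(total))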
Set the number of features and the number of topics to use for LDA.
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english',
                        max_df=.1,
                        max_features=5000)
X = count.fit_transform(papers['title_abstract_processed'].values)
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10,
                                random_state=123,
                                learning_method='batch')
X_topics = lda.fit_transform(X)
print("(topics, features) =", lda.components_.shape)
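X_topics holds the per-document topic distribution, so you can also check which topic dominates each paper; a small sketch, assuming the fitted model above:
# Dominant topic for each document and how many papers fall into each topic (sketch)
dominant_topic = X_topics.argmax(axis=1)
print(np.bincount(dominant_topic, minlength=10))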
Below are the 10 topics obtained without the max_features=5000 restriction, fitted on the earlier count_data.
# Suppress deprecation warnings
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
# Load the LDA model from sklearn.decomposition
from sklearn.decomposition import LatentDirichletAllocation as LDA
# Helper function: print the top words of each topic
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
# Tweak the two parameters below
number_topics = 10
number_words = 10
# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)
# Print the topics found by the LDA model
print("topics:")
print_topics(lda, count_vectorizer, number_words)
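Once the model is fitted, each paper can be tagged with its most probable topic; a minimal sketch, assuming the lda and count_data objects above (the new 'topic' column name is just an example):
# Assign each paper its most likely topic (sketch; 'topic' is a hypothetical column name)
doc_topic = lda.transform(count_data)          # document-topic distribution
papers['topic'] = doc_topic.argmax(axis=1)     # index of the most probable topic
print(papers[['title_abstract_processed', 'topic']].head())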