Text analysis snippets
Python data science coding reference from investigate.ai
Reading in files
Reading in one file
Reading in one file, nice and easy
content = open("filename.txt").read()
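If you'd like the file closed right away and the encoding explicit, a slightly safer variant:

# The context manager closes the file automatically;
# an explicit encoding avoids platform-dependent defaults
with open("filename.txt", encoding="utf-8") as f:
    content = f.read()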
Reading in multiple files
This will give you a dataframe with two columns - one with the filename, the other with the contents of the file. It also uses glob to pattern-match: this will read in all filenames that end in .txt in the current folder.
import glob
import pandas as pd
filenames = glob.glob("*.txt")
contents = [open(filename).read() for filename in filenames]
df = pd.DataFrame({
    'filename': filenames,
    'content': contents
})
df.head()
Topic modeling
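Every snippet in this section assumes matrix is a document-term matrix. A minimal sketch building one from the df above with scikit-learn's TfidfVectorizer (for LDA you'd typically swap in CountVectorizer, since LDA expects raw counts):

from sklearn.feature_extraction.text import TfidfVectorizer

# Convert each document into a row of tf-idf term weights
vectorizer = TfidfVectorizer(stop_words='english')
matrix = vectorizer.fit_transform(df['content'])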
NMF (non-negative matrix factorization) with sklearn
from sklearn.decomposition import NMF
model = NMF(n_components=5)
model.fit(matrix)
LDA with sklearn
from sklearn.decomposition import LatentDirichletAllocation
model = LatentDirichletAllocation(n_components=5,
                                  learning_decay=0.7,
                                  learning_method='online')
model.fit(matrix)
LSA/LSI with sklearn
from sklearn.decomposition import TruncatedSVD
model = TruncatedSVD(n_components=5)
model.fit(matrix)
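TruncatedSVD also reports how much of the variance each component captures, which can help you pick n_components:

# Fraction of the variance each LSA component explains
print(model.explained_variance_ratio_)
print(model.explained_variance_ratio_.sum())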
Find best options for LDA
%%time
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
# Options to try with our LDA
# Beware it will try *all* of the combinations, so it'll take ages
search_params = {
    'n_components': [5, 7, 10, 15],
    'learning_decay': [.5, .7, .9]
}
# Set up LDA with the options we'll keep static
model = LatentDirichletAllocation(learning_method='online')
# Try all of the options
gridsearch = GridSearchCV(model, param_grid=search_params, cv=5, n_jobs=-1, verbose=1)
gridsearch.fit(matrix)
# What did we find?
print("Best Model's Params: ", gridsearch.best_params_)
print("Best Log Likelihood Score: ", gridsearch.best_score_)
Topic terms for topic models
n_words = 5
# get_feature_names() was removed in scikit-learn 1.2; on older
# versions use vectorizer.get_feature_names() instead
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(model.components_):
    message = "Topic #%d: " % topic_idx
    message += " ".join([feature_names[i]
                         for i in topic.argsort()[:-n_words - 1:-1]])
    print(message)
    print()
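To see which topic each document leans toward, a quick sketch (assuming the df of documents from earlier):

# Each row of doc_topics is one document's distribution across topics
doc_topics = model.transform(matrix)
df['topic'] = doc_topics.argmax(axis=1)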
Clustering
K-means
from sklearn.cluster import KMeans
km = KMeans(n_clusters=5)
km.fit(matrix)
df['prediction'] = km.predict(matrix)
Top cluster terms
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
top_ten_words = [terms[ind] for ind in order_centroids[i, :5]]
print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))