Using topic modeling to categorize legislation
As we reproduced USA Today's piece on model legislation, we used text similarity to narrow down the field of potentially similar bills. An alternative approach could be to use keywords, or to assign topics to each bill, and only search within those in the same category.
Topic modeling is one way of assigning categories to your texts.
import pandas as pd
from sqlalchemy import create_engine
# Connect to the local PostgreSQL "legislation" database; the engine is
# configured with the AUTOCOMMIT isolation level.
engine = create_engine(
    'postgresql://localhost:5432/legislation',
    isolation_level="AUTOCOMMIT",
)

# Load a random sample of bills instead of the whole table: TABLESAMPLE
# BERNOULLI (20) keeps each row with roughly 20% probability, and the WHERE
# clause then drops sampled bills that have no text.
df = pd.read_sql_query(
    sql="""
SELECT *
FROM bills
TABLESAMPLE BERNOULLI (20)
WHERE content is not null
""",
    con=engine,
)

# Notebook-style inspection: (rows, columns) of the sample, then its first rows.
df.shape
df.head()
%%time
from sklearn.feature_extraction.text import CountVectorizer
import Stemmer
# English stemmer from the `Stemmer` package; StemmedCountVectorizer reads
# this module-level object when it stems tokens.
stemmer = Stemmer.Stemmer("en")

# Analyzer of a default CountVectorizer. The StemmedCountVectorizer class
# builds its own analyzer inside build_analyzer and does not read this name.
analyzer = CountVectorizer().build_analyzer()
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose tokens are stemmed before they are counted.

    The document is first split into tokens by the regular CountVectorizer
    analyzer (as configured through the constructor arguments); the tokens
    are then passed through the module-level ``stemmer``.
    """

    def build_analyzer(self):
        """Return a callable that maps a raw document to its stemmed tokens."""
        # Zero-argument super() starts the lookup at CountVectorizer itself.
        # super(CountVectorizer, self) would start *after* CountVectorizer in
        # the MRO and skip any build_analyzer defined on that class.
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Tokenize with the stock analyzer, then stem the whole token list.
            return stemmer.stemWords(base_analyzer(doc))

        return stemmed_analyzer
# Cap the vocabulary at the 10,000 most frequent stems and ignore any stem
# that appears in more than 25% of the documents.
vectorizer = StemmedCountVectorizer(
    max_df=0.25,
    max_features=10000,
)

# Document-term count matrix built from the first 10,000 sampled bills.
matrix = vectorizer.fit_transform(df["content"].head(10000))
%%time
from sklearn.model_selection import GridSearchCV
# This cell uses LatentDirichletAllocation before the later cell that imports
# it, so import it here; otherwise running the notebook top to bottom raises
# NameError on the estimator construction below.
from sklearn.decomposition import LatentDirichletAllocation

# Hyperparameter grid: number of topics x learning decay (4 * 3 = 12 candidates)
search_params = {'n_components': [5, 10, 20, 30], 'learning_decay': [.5, .7, .9]}

# Base estimator handed to the grid search
lda = LatentDirichletAllocation(learning_method='online')

# 5-fold cross-validated search over every combination; n_jobs=-1 uses all
# available CPU cores and verbose=1 prints progress.
gridsearch = GridSearchCV(lda, param_grid=search_params, cv=5, n_jobs=-1, verbose=1)

# Run the search on the document-term matrix
gridsearch.fit(matrix)

# Best parameter combination found
print("Best Model's Params: ", gridsearch.best_params_)

# Mean cross-validated score of the best combination
print("Best Log Likelihood Score: ", gridsearch.best_score_)
%%time
from sklearn.decomposition import LatentDirichletAllocation
# Topic model with 30 topics, trained with the 'online' learning method.
lda = LatentDirichletAllocation(
    learning_method='online',
    n_components=30,
)

# Fit on the document-term matrix; the returned document-topic matrix is left
# as the cell's last expression so the notebook displays it.
lda.fit_transform(matrix)
# Report on the grid search results. The fitted GridSearchCV object is named
# `gridsearch`; the name `model` is never defined in this notebook, so using
# it here would raise NameError.

# Estimator refit with the best parameter combination
best_lda_model = gridsearch.best_estimator_

# Model Parameters
print("Best Model's Params: ", gridsearch.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", gridsearch.best_score_)

# Perplexity of the best model on the document-term matrix
print("Model Perplexity: ", best_lda_model.perplexity(matrix))
# Print the highest-weighted words of every topic in the fitted `lda` model.
n_words = 5

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the new name and fall back for old versions.
try:
    feature_names = vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = vectorizer.get_feature_names()

for topic_idx, topic in enumerate(lda.components_):
    # argsort is ascending, so this reversed slice yields the indices of the
    # n_words largest weights, largest first.
    top_word_ids = topic.argsort()[:-n_words - 1:-1]
    top_words = " ".join(feature_names[i] for i in top_word_ids)
    print("Topic #%d: %s" % (topic_idx, top_words))
print()