Using topic modeling to categorize legislation

As we reproduced USA Today's piece on model legislation, we used text similarity to narrow our search down to similar bills. An alternative approach could be to use keywords, or to assign topics to each bill, and only search within those in the same category.

Topic modeling is one way of assigning categories to your texts.

import pandas as pd
from sqlalchemy import create_engine

# Connection to the local Postgres "legislation" database.
engine = create_engine(
    'postgresql://localhost:5432/legislation',
    isolation_level="AUTOCOMMIT",
)

# TABLESAMPLE BERNOULLI (20) asks Postgres for a sample of the bills table
# instead of every row; rows without extracted text are filtered out.
df = pd.read_sql_query(
    sql="""
        SELECT *
        FROM bills
        TABLESAMPLE BERNOULLI (20)
        WHERE content is not null
""",
    con=engine,
)
df.shape
(202882, 15)
# Peek at the first rows of the sample to check which columns came back.
df.head()
id bill_id code bill_number title description state session filename status status_date url error content processed_at
0 17 288308 JRH012 JRH012 Joint Resolution Urging Congress To Retain Fed... Joint Resolution Urging Congress To Retain Fed... VT 2011-2012 Session bill_data/VT/2011-2012_Regular_Session/bill/JR... 3 2011-03-08 http://www.leg.state.vt.us/docs/2012/resolutn/... None Microsoft Word - BillTemp.doc\n\n\nState of Ve... 2019-11-18 01:05:48.973909+00:00
1 13 373755 SCR032 SCR032 Senate Concurrent Resolution Honoring The Publ... Senate Concurrent Resolution Honoring The Publ... VT 2011-2012 Session bill_data/VT/2011-2012_Regular_Session/bill/SC... 4 2012-01-27 http://www.leg.state.vt.us/docs/2012/Acts/ACTR... None Microsoft Word - GENERAL-#276218-v1-Act_No__R-... 2019-11-17 22:04:56.879037+00:00
2 6 250811 H0157 H0157 An Act Relating To Restrictions On Tanning Beds An Act Relating To Restrictions On Tanning Beds VT 2011-2012 Session bill_data/VT/2011-2012_Regular_Session/bill/H0... 3 2012-04-24 http://www.leg.state.vt.us/docs/2012/Acts/ACT0... None Microsoft Word - GENERAL-#280144-v1-Act_No__97... 2019-11-17 22:05:05.226667+00:00
3 8 410679 HCR285 HCR285 House Concurrent Resolution Commemorating The ... House Concurrent Resolution Commemorating The ... VT 2011-2012 Session bill_data/VT/2011-2012_Regular_Session/bill/HC... 4 2012-03-02 http://www.leg.state.vt.us/docs/2012/resolutn/... None Microsoft Word - BillTemp.doc\n\n\nState of Ve... 2019-11-17 22:22:21.848827+00:00
4 15 427012 HCR401 HCR401 House Concurrent Resolution Congratulating Sco... House Concurrent Resolution Congratulating Sco... VT 2011-2012 Session bill_data/VT/2011-2012_Regular_Session/bill/HC... 4 2012-05-05 http://www.leg.state.vt.us/docs/2012/Acts/ACTR... None Microsoft Word - GENERAL-#281388-v1-Act_No__R-... 2019-11-17 23:09:30.982677+00:00
%%time

from sklearn.feature_extraction.text import CountVectorizer
import Stemmer

# English stemmer shared by the StemmedCountVectorizer analyzer.
stemmer = Stemmer.Stemmer("en")

# Analyzer of a plain, default-configured CountVectorizer.
# NOTE(review): no visible code reads this module-level name; the subclass
# builds its own analyzer inside build_analyzer().
analyzer = CountVectorizer().build_analyzer()

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces.

    Documents are first split by the regular CountVectorizer analyzer; the
    resulting tokens are then run through the module-level ``stemmer``.
    """

    def build_analyzer(self):
        """Return a callable that maps a document to its stemmed tokens."""
        # Zero-argument super() starts the lookup at this subclass's parent,
        # so the analyzer being wrapped is always CountVectorizer's own
        # (naming CountVectorizer in super() would start the search past it).
        analyze = super().build_analyzer()
        return lambda doc: stemmer.stemWords(analyze(doc))

# max_df and max_features bound the vocabulary size (see the CountVectorizer
# documentation for the exact selection rules).
vectorizer = StemmedCountVectorizer(
    max_df=0.25,
    max_features=10000,
)

# Only the first 10,000 sampled bills are vectorized to keep the run time down.
matrix = vectorizer.fit_transform(df["content"].head(10000))
CPU times: user 47.4 s, sys: 2.35 s, total: 49.7 s
Wall time: 54.5 s
%%time
# LatentDirichletAllocation is imported here so this cell also runs when the
# notebook is executed top to bottom (its other import sits in a later cell).
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

# Define Search Param
# 4 topic counts x 3 learning decays = 12 candidate models.
search_params = {'n_components': [5, 10, 20, 30], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation(learning_method='online')

# Init Grid Search Class
# cv=5 fits every candidate five times; n_jobs=-1 uses all available cores.
gridsearch = GridSearchCV(lda, param_grid=search_params, cv=5, n_jobs=-1, verbose=1)

# Do the Grid Search
# With no `scoring` given, candidates are ranked by the estimator's own score().
gridsearch.fit(matrix)
Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: user 4min 57s, sys: 31 s, total: 5min 28s
Wall time: 1h 14min 25s
GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='online',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                                 total_samples=1000000.0,
                                                 verbose=0),
             iid='warn', n_jobs=-1,
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [5, 10, 20, 30]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)
# Report the winning hyperparameters and the cross-validated score they earned.
for label, value in (
    ("Best Model's Params: ", gridsearch.best_params_),
    ("Best Log Likelihood Score: ", gridsearch.best_score_),
):
    print(label, value)
Best Model's Params:  {'learning_decay': 0.7, 'n_components': 20}
Best Log Likelihood Score:  -13428302.519729739
 
%%time
from sklearn.decomposition import LatentDirichletAllocation

# Fit a standalone 30-topic model with online learning. This rebinds `lda`,
# which the grid-search cell used for its base estimator.
lda = LatentDirichletAllocation(
    learning_method='online',
    n_components=30,
)

# Fits the model and displays the document-by-topic weight matrix.
lda.fit_transform(matrix)
CPU times: user 6min 13s, sys: 38.8 s, total: 6min 52s
Wall time: 4min 29s
array([[3.05810398e-05, 3.05810398e-05, 3.05810398e-05, ...,
        3.05810398e-05, 4.39805830e-03, 7.39428549e-02],
       [2.03416311e-02, 1.97238659e-04, 1.97238659e-04, ...,
        1.08501653e-02, 6.65828721e-01, 1.97238659e-04],
       [1.28205128e-04, 1.28205128e-04, 1.96927761e-02, ...,
        1.28205128e-04, 1.28205128e-04, 1.28205128e-04],
       ...,
       [4.01248331e-02, 4.63734847e-02, 2.68168410e-05, ...,
        2.68168410e-05, 2.68168410e-05, 2.68168410e-05],
       [6.28930818e-05, 6.28930818e-05, 6.28930818e-05, ...,
        6.28930818e-05, 6.28930818e-05, 3.99125002e-03],
       [3.67107195e-05, 6.62362324e-03, 1.45994652e-02, ...,
        3.67107195e-05, 3.67107195e-05, 3.67107195e-05]])
# Best Model
# The fitted grid search in this notebook is bound to `gridsearch`; with
# refit=True it exposes the winning model as best_estimator_.
best_lda_model = gridsearch.best_estimator_

# Model Parameters
print("Best Model's Params: ", gridsearch.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", gridsearch.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(matrix))
Best Model's Params:  {'learning_decay': 0.7, 'n_components': 5}
Best Log Likelihood Score:  -1296987.2198103345
Model Perplexity:  32484.16220204515
# Number of top-weighted terms to show per topic.
n_words = 5

# Newer scikit-learn releases expose the vocabulary via
# get_feature_names_out() and no longer ship get_feature_names(); older
# releases only have the latter. Use whichever this installation provides.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

for topic_idx, topic in enumerate(lda.components_):
    # argsort() is ascending, so this reversed slice yields the indices of
    # the n_words largest weights, biggest first.
    top_indices = topic.argsort()[:-n_words - 1:-1]
    message = "Topic #%d: " % topic_idx
    message += " ".join([feature_names[i] for i in top_indices])
    print(message)
print()
Topic #0: emerg fire medic injuri safeti
Topic #1: assist paragraph grant elig plan
Topic #2: 2019 fiscal head expens 2020
Topic #3: mississippi dollar 00 hundr thousand
Topic #4: kentucki research lrc calendar caucus
Topic #5: me photo 01 veto 06
Topic #6: land township rang water right
Topic #7: text begin statut schedul joint
Topic #8: il ls po ith frm
Topic #9: credit tax incom retir subdivis
Topic #10: minnesota subdivis commission subd bond
Topic #11: 32 50 35 38 39
Topic #12: elect vote candid ballot voter
Topic #13: your pleas lis call could
Topic #14: tax properti district citi municip
Topic #15: child parent famili care home
Topic #16: sale product manufactur retail food
Topic #17: health care licens medic insur
Topic #18: block substanc salt group cas
Topic #19: fals true pdf 00000 adob
Topic #20: project energi facil water resourc
Topic #21: facil need health bed certif
Topic #22: offens convict crimin violat offend
Topic #23: board employe appoint director district
Topic #24: utah mr le 2010 gov
Topic #25: file record action notic attorney
Topic #26: student district teacher charter enrol
Topic #27: vehicl licens motor fee highway
Topic #28: wherea resolut his alabama resolv
Topic #29: properti licens equip facil purchas