Downloading bill data from LegiScan#
There is a website called LegiScan. From their about page:
LegiScan launched to support the release of the national LegiScan data service, providing the nation's first impartial real-time legislative tracking service designed for both public citizens and government affairs professionals across all sectors in organizations large and small. Utilizing the LegiScan API, having nearly 20 years of development maturity, allows us to provide monitoring of every bill in the 50 states and Congress. Giving our users and clients a central and uniform interface with the ability to easily track a wide array of legislative information. Paired with one of the country's most powerful national full bill text legislative search engines.
We're going to use their API to download data on over a million different pieces of legislation in the US.
Imports#
import zipfile
import base64
import io
import glob
import time
import json
import os
import requests
import mimetypes
pylegiscan#
To talk to LegiScan's API, we're borrowing some code from pylegiscan. Since it isn't a package you can install with pip, it wound up being easier for distribution to just cut and paste it here.
# Taken from https://github.com/poliquin/pylegiscan/blob/master/pylegiscan/legiscan.py

import os
import json
import requests
from urllib.parse import urlencode
from urllib.parse import quote_plus

# current aggregate status of bill
BILL_STATUS = {1: "Introduced",
               2: "Engrossed",
               3: "Enrolled",
               4: "Passed",
               5: "Vetoed",
               6: "Failed/Dead"}

# significant steps in bill progress.
BILL_PROGRESS = {1: "Introduced",
                 2: "Engrossed",
                 3: "Enrolled",
                 4: "Passed",
                 5: "Vetoed",
                 6: "Failed/Dead",
                 7: "Veto Override",
                 8: "Chapter/Act/Statute",
                 9: "Committee Referral",
                 10: "Committee Report Pass",
                 11: "Committee Report DNP"}

"""
Interact with LegiScan API.
"""

# a helpful list of valid legiscan state abbreviations (no Puerto Rico)
STATES = ['ak', 'al', 'ar', 'az', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga',
          'hi', 'ia', 'id', 'il', 'in', 'ks', 'ky', 'la', 'ma', 'md', 'me',
          'mi', 'mn', 'mo', 'ms', 'mt', 'nc', 'nd', 'ne', 'nh', 'nj', 'nm',
          'nv', 'ny', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx',
          'ut', 'va', 'vt', 'wa', 'wi', 'wv', 'wy']


class LegiScanError(Exception):
    pass


class LegiScan(object):
    BASE_URL = 'http://api.legiscan.com/?key={0}&op={1}&{2}'

    def __init__(self, apikey=None):
        """LegiScan API. State parameters should always be passed as
        USPS abbreviations. Bill numbers and abbreviations are case
        insensitive. Register for API at http://legiscan.com/legiscan
        """
        # see if API key available as environment variable
        if apikey is None:
            apikey = os.environ['LEGISCAN_API_KEY']
        self.key = apikey.strip()

    def _url(self, operation, params=None):
        """Build a URL for querying the API."""
        if not isinstance(params, str) and params is not None:
            params = urlencode(params)
        elif params is None:
            params = ''
        return self.BASE_URL.format(self.key, operation, params)

    def _get(self, url):
        """Get and parse JSON from API for a url."""
        req = requests.get(url)
        if not req.ok:
            raise LegiScanError('Request returned {0}: {1}'
                                .format(req.status_code, url))
        data = json.loads(req.content)
        if data['status'] == "ERROR":
            raise LegiScanError(data['alert']['message'])
        return data

    def get_session_list(self, state):
        """Get list of available sessions for a state."""
        url = self._url('getSessionList', {'state': state})
        data = self._get(url)
        return data['sessions']

    def get_dataset_list(self, state=None, year=None):
        """Get list of available datasets, with optional state and year filtering.
        """
        if state is not None:
            url = self._url('getDatasetList', {'state': state})
        elif year is not None:
            url = self._url('getDatasetList', {'year': year})
        else:
            url = self._url('getDatasetList')
        data = self._get(url)
        # return a list of the available datasets
        return data['datasetlist']
    def get_dataset(self, id, access_key):
        """Get a single session dataset, given its id and access key
        (both of which come from get_dataset_list).
        """
        url = self._url('getDataset', {'id': id, 'access_key': access_key})
        data = self._get(url)
        # return the dataset, which includes a base64-encoded zip archive
        return data['dataset']
    def get_master_list(self, state=None, session_id=None):
        """Get list of bills for the current session in a state or for
        a given session identifier.
        """
        if state is not None:
            url = self._url('getMasterList', {'state': state})
        elif session_id is not None:
            url = self._url('getMasterList', {'id': session_id})
        else:
            raise ValueError('Must specify session identifier or state.')
        data = self._get(url)
        # return a list of the bills
        return [data['masterlist'][i] for i in data['masterlist']]

    def get_bill(self, bill_id=None, state=None, bill_number=None):
        """Get primary bill detail information including sponsors, committee
        references, full history, bill text, and roll call information.

        This function expects either a bill identifier or a state and bill
        number combination. The bill identifier is preferred, and required
        for fetching bills from prior sessions.
        """
        if bill_id is not None:
            url = self._url('getBill', {'id': bill_id})
        elif state is not None and bill_number is not None:
            url = self._url('getBill', {'state': state, 'bill': bill_number})
        else:
            raise ValueError('Must specify bill_id or state and bill_number.')
        return self._get(url)['bill']

    def get_bill_text(self, doc_id):
        """Get bill text, including date, draft revision information, and
        MIME type. Bill text is base64 encoded to allow for PDF and Word
        data transfers.
        """
        url = self._url('getBillText', {'id': doc_id})
        return self._get(url)['text']

    def get_amendment(self, amendment_id):
        """Get amendment text including date, adoption status, MIME type, and
        title/description information. The amendment text is base64 encoded
        to allow for PDF and Word data transfer.
        """
        url = self._url('getAmendment', {'id': amendment_id})
        return self._get(url)['amendment']

    def get_supplement(self, supplement_id):
        """Get supplement text including type of supplement, date, MIME type
        and text/description information. Supplement text is base64 encoded
        to allow for PDF and Word data transfer.
        """
        url = self._url('getSupplement', {'id': supplement_id})
        return self._get(url)['supplement']

    def get_roll_call(self, roll_call_id):
        """Roll call detail for individual votes and summary information."""
        data = self._get(self._url('getRollcall', {'id': roll_call_id}))
        return data['roll_call']

    def get_sponsor(self, people_id):
        """Sponsor information including name, role, and a followthemoney.org
        person identifier.
        """
        url = self._url('getSponsor', {'id': people_id})
        return self._get(url)['person']

    def search(self, state, bill_number=None, query=None, year=2, page=1):
        """Get a page of results for a search against the LegiScan full text
        engine; returns a paginated result set.

        Specify a bill number or a query string. Year can be an exact year
        or a number between 1 and 4, inclusive. These integers have the
        following meanings:
            1 = all years
            2 = current year, the default
            3 = recent years
            4 = prior years
        Page is the result set page number to return.
        """
        if bill_number is not None:
            params = {'state': state, 'bill': bill_number}
        elif query is not None:
            params = {'state': state, 'query': query,
                      'year': year, 'page': page}
        else:
            raise ValueError('Must specify bill_number or query')
        data = self._get(self._url('search', params))['searchresult']
        # return a summary of the search and the results as a dictionary
        summary = data.pop('summary')
        results = {'summary': summary, 'results': [data[i] for i in data]}
        return results

    def __str__(self):
        return '<LegiScan API {0}>'.format(self.key)

    def __repr__(self):
        return str(self)
Connect to LegiScan#
Using pylegiscan, you just pass your API key to LegiScan and you're good to go. I set up an environment variable for mine, but you can also just paste yours at OR_PUT_YOUR_API_KEY_HERE.
api_key = os.environ.get('LEGISCAN_API_KEY', 'OR_PUT_YOUR_API_KEY_HERE')
legis = LegiScan(api_key)
If you wanted to search for bills based on state or text, that's easy to do.
bills = legis.search(state='tx', query='abortion')
bills['summary'] # how many results did we get?
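One thing to know: search only hands back one page of results at a time. If you wanted every match, you could loop through the pages using the counts in the summary. This is just a sketch, and it assumes the summary includes a page_total field, so take a look at your actual summary before relying on it.
# loop through every page of search results (page_total is an assumption)
all_results = []
for page in range(1, int(bills['summary']['page_total']) + 1):
    result = legis.search(state='tx', query='abortion', page=page)
    all_results.extend(result['results'])
len(all_results)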
You can also get single bills, one at a time, as long as you know their ID in the LegiScan database.
legis.get_bill('1256258')
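The dictionary that comes back is big: sponsors, committees, history, votes, and a texts list describing each published version of the bill. If you wanted the document IDs and links to the states' own copies, you could pull them out like this. Treat doc_id as an assumed field name; state_link is the same field the bulk data uses later on.
bill = legis.get_bill('1256258')
# each entry in bill['texts'] describes one version of the bill's text
for text in bill['texts']:
    print(text.get('doc_id'), text.get('state_link'))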
Grabbing bills one at a time won't scale to a million of them, though. LegiScan also publishes bulk datasets, one zipped archive per legislative session. First we ask for the list of available datasets, then we request a specific one using its session ID and access key.
datasets = legis.get_dataset_list()
dataset = legis.get_dataset(datasets[20]['session_id'], datasets[20]['access_key'])
dataset.keys()
They come in a really weird format, though: a base64-encoded zip file. So first we need to convert the base64 zipfile into a normal file, then unzip it!
z_bytes = base64.b64decode(dataset['zip'])
z = zipfile.ZipFile(io.BytesIO(z_bytes))
z.extractall("./sample-data")
It creates a lot lot lot lot lot of .json files. For example, let's take a look at a sample of what we just extracted.
import glob
filenames = glob.glob("./sample-data/*/*/bill/*", recursive=True)
filenames[:15]
Each file has all sorts of information about the bill, but none of the text of the bill itself! You can see for yourself:
import json
json_data = json.load(open("./sample-data/AK/2017-2018_30th_Legislature/bill/SCR10.json"))
json_data
You can download the bill text if you have the ID, but... for some reason we don't do this. I'm going to be honest: I don't remember why. Maybe it's because they're older versions? They're incomplete? I truly have forgotten.
doc = legis.get_bill_text('2015157')
contents = base64.b64decode(doc['doc'])
with open("filename.html", "wb") as file:
file.write(contents)
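Saving everything as .html is a little optimistic, since the docstring says bill text can also come back as PDF or Word. If you wanted to pick the file extension automatically, something like this might work. It assumes the response includes a mime field, which is worth double-checking against what you actually get back.
# re-using `doc` and `contents` from above; 'mime' is an assumed field name
extension = mimetypes.guess_extension(doc.get('mime', 'text/html')) or '.html'
with open("bill_text" + extension, "wb") as file:
    file.write(contents)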
What we're going to need is the URL to the published version.
json_data['bill']['texts'][-1]
We're going to need the URL to the published version from every single one of those JSON files.
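Specifically, the published URL is the state_link field of the last entry in texts. For the sample file above:
json_data['bill']['texts'][-1]['state_link']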
Download and extract all of the datasets from LegiScan#
datasets = legis.get_dataset_list()
len(datasets)
Downloading and extracting all 583 is going to take a while, so we'll use a progress bar from tqdm to keep track of where we're at.
import tqdm
total = len(datasets)
for dataset in tqdm.tqdm_notebook(datasets):
    session_id = dataset['session_id']
    access_key = dataset['access_key']
    details = legis.get_dataset(session_id, access_key)
    z_bytes = base64.b64decode(details['zip'])
    z = zipfile.ZipFile(io.BytesIO(z_bytes))
    z.extractall("./bill_data")
Converting the many JSON files to a single CSV file#
The data isn't doing us much good sitting around as a zillion JSON files, so we'll convert them into a single CSV file with the pieces of information we're interested in. The main ones are:
- State
- Bill title
- Bill URL
filenames = glob.glob("bill_data/*/*/bill/*.json")
len(filenames)
filenames[:5]
If we want to process over a million rows, it's going to take a while! To speed things up we're going to turn to swifter, a package that can parallelize work on pandas dataframes. It's pretty easy to use:
without swifter:
df = pd.Series(filenames).apply(process_json)
with swifter:
df = pd.Series(filenames).swifter.apply(process_json)
And it does all the hard work for you! You just use it and hope for the best.
import json
import os
import swifter
import pandas as pd
def process_json(filename):
    with open(filename) as file:
        bill_data = {}
        # We need to do a little string replacing so the "0000-00-00"
        # placeholder dates come through as nulls instead of bad dates
        json_str = file.read().replace('"0000-00-00"', 'null')
        content = json.loads(json_str)['bill']
        bill_data['bill_id'] = content['bill_id']
        bill_data['code'] = os.path.splitext(os.path.basename(filename))[0]
        bill_data['bill_number'] = content['bill_number']
        bill_data['title'] = content['title']
        bill_data['description'] = content['description']
        bill_data['state'] = content['state']
        bill_data['session'] = content['session']['session_name']
        bill_data['filename'] = filename
        bill_data['status'] = content['status']
        bill_data['status_date'] = content['status_date']
        try:
            bill_data['url'] = content['texts'][-1]['state_link']
        except:
            # some bills don't have any text versions yet, so no URL
            pass
    return pd.Series(bill_data)
df = pd.Series(filenames).swifter.apply(process_json)
df.head()
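Because of that try/except in process_json, bills that don't have any text versions yet end up with a missing url. It's worth checking how many rows that affects (a quick sanity check, not part of the original notebook):
df['url'].isna().sum()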
And now we'll save it to prepare for the next step: inserting it into a database.
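One small precaution before saving: to_csv will fail if the data/ folder doesn't exist yet, so it doesn't hurt to create it first (this line isn't in the original notebook).
os.makedirs("data", exist_ok=True)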
df.to_csv("data/bills-with-urls.csv", index=False)