Tracking down car information using VINs#
While our dataset supposedly includes the make, model and year of the cars, human-entered data can easily be full of typos and other flaws. By using a car's unique VIN identifier, though, we can use a government database to easily track down a car's make, model and year.
import pandas as pd
import requests
import os
import time
# You can't use make/model because it's really dirty,
# so we take the VIN then use that to look up make/model/year
# we'll keep them around to test, though!
vehicles = pd.read_csv("combined-vehicle-data.csv", usecols=['VIN_NO', 'VEH_MAKE', 'VEH_MODEL', 'VEH_YEAR'])
vehicles.head(2)
all_vins = vehicles.VIN_NO.dropna().unique()
len(all_vins)
I tried a LOT of libraries that weren't this API and they were all impossible trash
Trying an API from the gov't#
API_URL = 'https://vpic.nhtsa.dot.gov/api/vehicles/DecodeVINValuesBatch/'
def fetch_vin_data(vin_list, dryrun=False):
delimited_vin_list = ';'.join(vin_list)
post_fields = { 'format': 'json', 'data': delimited_vin_list }
if dryrun:
print("Querying for", delimited_vin_list)
return pd.DataFrame([])
else:
response = requests.post(API_URL, data=post_fields)
return pd.DataFrame(response.json()['Results'])
#fetch_vin_data(['JNKCV51E63M013580', '5TFUY5F1XBX167340', '2HGFG4A59FH702545'])
fetch_vin_data(all_vins[:3])
numbers_per_batch = 1000
start = 70000
total = len(all_vins)
# Try again with 250 a lot failed
offset = start
error_offsets = []
while offset <= (start + total):
try:
filename = f"vin_cache/{start}-{offset}-{total}-{numbers_per_batch}.csv"
if not os.path.exists(filename):
time.sleep(1)
print(f"\rQuerying {offset - start} / {total- start} - batch of {numbers_per_batch} ", end='')
current = all_vins[offset:offset+numbers_per_batch]
result = fetch_vin_data(current)
result.to_csv(filename, index=False)
else:
print(f"\rSkipping {offset - start} / {total - start} - batch of {numbers_per_batch} ", end='')
except:
pass
offset = offset + numbers_per_batch
vin_complete.head()