Tracking down car information using VINs#
While our dataset supposedly includes the make, model and year of the cars, human-entered data can easily be full of typos and other flaws. By using a car's unique VIN identifier, though, we can use a government database to easily track down a car's make, model and year.
import pandas as pd
import requests
import os
import time
# You can't use make/model because it's really dirty, 
# so we take the VIN then use that to look up make/model/year
# we'll keep them around to test, though!
vehicles = pd.read_csv("combined-vehicle-data.csv", usecols=['VIN_NO', 'VEH_MAKE', 'VEH_MODEL', 'VEH_YEAR'])
vehicles.head(2)
all_vins = vehicles.VIN_NO.dropna().unique()
len(all_vins)
I tried a LOT of libraries that weren't this API and they were all impossible trash
Trying an API from the gov't#
API_URL = 'https://vpic.nhtsa.dot.gov/api/vehicles/DecodeVINValuesBatch/'
def fetch_vin_data(vin_list, dryrun=False):    
    delimited_vin_list = ';'.join(vin_list)
    post_fields = { 'format': 'json', 'data': delimited_vin_list }
    if dryrun:
        print("Querying for", delimited_vin_list)
        return pd.DataFrame([])
    else:
        response = requests.post(API_URL, data=post_fields)
        return pd.DataFrame(response.json()['Results'])
#fetch_vin_data(['JNKCV51E63M013580', '5TFUY5F1XBX167340', '2HGFG4A59FH702545'])
fetch_vin_data(all_vins[:3])
numbers_per_batch = 1000
start = 70000
total = len(all_vins)
# Try again with 250 a lot failed
offset = start
error_offsets = []
while offset <= (start + total):
    try:
        filename = f"vin_cache/{start}-{offset}-{total}-{numbers_per_batch}.csv"
        if not os.path.exists(filename):
            time.sleep(1)
            print(f"\rQuerying {offset - start} / {total- start} - batch of {numbers_per_batch} ", end='')
            current = all_vins[offset:offset+numbers_per_batch]
            result = fetch_vin_data(current)
            result.to_csv(filename, index=False)
        else:
            print(f"\rSkipping {offset - start} / {total - start} - batch of {numbers_per_batch} ", end='')
    except:
        pass
    offset = offset + numbers_per_batch
vin_complete.head()