Tracking down car information using VINs#

While our dataset supposedly includes the make, model and year of the cars, human-entered data can easily be full of typos and other flaws. By using a car's unique VIN identifier, though, we can use a government database to easily track down a car's make, model and year.

import pandas as pd
import requests
import os
import time
# You can't use make/model because it's really dirty, 
# so we take the VIN then use that to look up make/model/year
# we'll keep them around to test, though!
vehicles = pd.read_csv("combined-vehicle-data.csv", usecols=['VIN_NO', 'VEH_MAKE', 'VEH_MODEL', 'VEH_YEAR'])
vehicles.head(2)
VEH_MAKE VEH_MODEL VEH_YEAR VIN_NO
0 CHEVY TAHOE 2005.0 1GNEK13Q2J285593
1 INFI 4S 2003.0 JNKCV51E63M013580
all_vins = vehicles.VIN_NO.dropna().unique()
len(all_vins)
641121

I tried a LOT of libraries that weren't this API and they were all impossible trash

Trying an API from the gov't#

API_URL = 'https://vpic.nhtsa.dot.gov/api/vehicles/DecodeVINValuesBatch/'

def fetch_vin_data(vin_list, dryrun=False):    
    delimited_vin_list = ';'.join(vin_list)
    post_fields = { 'format': 'json', 'data': delimited_vin_list }
    if dryrun:
        print("Querying for", delimited_vin_list)
        return pd.DataFrame([])
    else:
        response = requests.post(API_URL, data=post_fields)
        return pd.DataFrame(response.json()['Results'])
#fetch_vin_data(['JNKCV51E63M013580', '5TFUY5F1XBX167340', '2HGFG4A59FH702545'])
fetch_vin_data(all_vins[:3])
ABS ActiveSafetySysNote AdaptiveCruiseControl AdaptiveDrivingBeam AdaptiveHeadlights AdditionalErrorText AirBagLocCurtain AirBagLocFront AirBagLocKnee AirBagLocSeatCushion ... VIN ValveTrainDesign VehicleType WheelBaseLong WheelBaseShort WheelBaseType WheelSizeFront WheelSizeRear Wheels Windows
0 In the Possible values section, the Numeric va... ... 1GNEK13Q2J285593 MULTIPURPOSE PASSENGER VEHICLE (MPV)
1 1st Row (Driver & Passenger) 1st Row (Driver & Passenger) ... JNKCV51E63M013580 PASSENGER CAR
2 All Rows 1st Row (Driver & Passenger) 1st Row (Driver & Passenger) ... 5TFUY5F1XBX167340 TRUCK Long

3 rows × 146 columns

numbers_per_batch = 1000
start = 70000
total = len(all_vins)

# Try again with 250 a lot failed

offset = start
error_offsets = []
while offset <= (start + total):
    try:
        filename = f"vin_cache/{start}-{offset}-{total}-{numbers_per_batch}.csv"
        if not os.path.exists(filename):
            time.sleep(1)
            print(f"\rQuerying {offset - start} / {total- start} - batch of {numbers_per_batch} ", end='')
            current = all_vins[offset:offset+numbers_per_batch]

            result = fetch_vin_data(current)
            result.to_csv(filename, index=False)
        else:
            print(f"\rSkipping {offset - start} / {total - start} - batch of {numbers_per_batch} ", end='')
    except:
        pass
    offset = offset + numbers_per_batch
Skipping 328000 / 571121 - batch of 1000 Querying 641000 / 571121 - batch of 1000 
vin_complete.head()
ABS ActiveSafetySysNote AdaptiveCruiseControl AdaptiveDrivingBeam AdaptiveHeadlights AdditionalErrorText AirBagLocCurtain AirBagLocFront AirBagLocKnee AirBagLocSeatCushion ... VIN ValveTrainDesign VehicleType WheelBaseLong WheelBaseShort WheelBaseType WheelSizeFront WheelSizeRear Wheels Windows
0 In the Possible values section, the Numeric va... ... 1GNEK13Q2J285593 MULTIPURPOSE PASSENGER VEHICLE (MPV)
1 1st Row (Driver & Passenger) 1st Row (Driver & Passenger) ... JNKCV51E63M013580 PASSENGER CAR
2 All Rows 1st Row (Driver & Passenger) 1st Row (Driver & Passenger) ... 5TFUY5F1XBX167340 TRUCK Long
3 1st & 2nd Rows 1st Row (Driver & Passenger) ... 2HGFG4A59FH702545 Dual Overhead Cam (DOHC) PASSENGER CAR
4 1st & 2nd Rows 1st Row (Driver & Passenger) ... 1HGCM66313A037175 Single Overhead Cam (SOHC) PASSENGER CAR

5 rows × 146 columns