Combining many, many Excel files on car crash data#

Armed with several years of quarterly reports from Maryland's Department of Transportation, we'll need to wrangle this series of files with multiple tabs into a coherent set of CSVs.

import glob
import pandas as pd
#!pip install xlrd
filenames = glob.glob("data/Crash_*.xlsx")
filenames
['data/Crash_Qtr02_2018.xlsx',
 'data/Crash_Qtr04_2015.xlsx',
 'data/Crash_Qtr01_2016.xlsx',
 'data/Crash_Qtr02_2015.xlsx',
 'data/Crash_Qtr01_2017.xlsx',
 'data/Crash_Qtr03_2015.xlsx',
 'data/Crash_Qtr03_2016.xlsx',
 'data/Crash_Qtr03_2017.xlsx',
 'data/Crash_Qtr04_2017.xlsx',
 'data/Crash_Qtr01_2018.xlsx',
 'data/Crash_Qtr02_2016.xlsx',
 'data/Crash_Qtr01_2015.xlsx',
 'data/Crash_Qtr02_2017.xlsx',
 'data/Crash_Qtr04_2016.xlsx']
excels = [pd.ExcelFile(filename) for filename in filenames]
excels
[<pandas.io.excel.ExcelFile at 0x11013b208>,
 <pandas.io.excel.ExcelFile at 0x105eb89e8>,
 <pandas.io.excel.ExcelFile at 0x11013b390>,
 <pandas.io.excel.ExcelFile at 0x11b1c2c88>,
 <pandas.io.excel.ExcelFile at 0x1137cb828>,
 <pandas.io.excel.ExcelFile at 0x1235a06a0>,
 <pandas.io.excel.ExcelFile at 0x11029a048>,
 <pandas.io.excel.ExcelFile at 0x11013b358>,
 <pandas.io.excel.ExcelFile at 0x130279eb8>,
 <pandas.io.excel.ExcelFile at 0x1412c9da0>,
 <pandas.io.excel.ExcelFile at 0x1412c9f98>,
 <pandas.io.excel.ExcelFile at 0x119615160>,
 <pandas.io.excel.ExcelFile at 0x1531a1438>,
 <pandas.io.excel.ExcelFile at 0x169c51fd0>]
crash_frames = []
vehicle_frames = []
person_frames = []
for excel in excels:
    # Stack on those vehicles
    vehicle_frames.append(excel.parse('VEHICLE'))
    person_frames.append(excel.parse('PERSON'))
    
    # Some years it's ACCIDENT and some its' CRASH
    sheet_name = "CRASH" if 'CRASH' in excel.sheet_names else "ACCIDENT"
    crash_frames.append(excel.parse(sheet_name))
people = pd.concat(person_frames, ignore_index=True, sort=True)
crashes = pd.concat(crash_frames, ignore_index=True, sort=True)
vehicles = pd.concat(vehicle_frames, ignore_index=True, sort=True)
vehicles.shape
(741906, 30)
vehicles.head()
AREA_DAMAGED_CODE1 AREA_DAMAGED_CODE2 AREA_DAMAGED_CODE3 AREA_DAMAGED_CODE_IMP1 AREA_DAMAGED_CODE_MAIN BODY_TYPE_CODE COMMERCIAL_FLAG CONTI_DIRECTION_CODE CV_BODY_TYPE_CODE DAMAGE_CODE ... PARKED_FLAG REPORT_NO SPEED_LIMIT TOWED_AWAY_FLAG TOWED_VEHICLE_CONFIG_CODE VEHICLE_ID VEH_MAKE VEH_MODEL VEH_YEAR VIN_NO
0 8.0 9.0 10.0 10.0 10.0 23.08 N E NaN 5 ... N ADJ487004H 30 Y 0 000238fd-44fa-4cd5-8eb7-41ab30500bec CHEVY TAHOE 2005.0 1GNEK13Q2J285593
1 12.0 NaN NaN 12.0 12.0 2.00 N E NaN 4 ... N MCP2487000M 40 Y 0 00038116-1bf9-48cc-b317-4f4375d14b60 INFI 4S 2003.0 JNKCV51E63M013580
2 6.0 7.0 NaN 6.0 6.0 20.00 N N NaN 3 ... N CB5190006B 55 N 0 0003b659-2785-4868-8877-0b786a284827 TOYT TK 2011.0 5TFUY5F1XBX167340
3 8.0 NaN NaN 8.0 8.0 2.00 N W NaN 2 ... N ADJ4590035 5 N 0 00050484-d08f-4b6e-bc7e-9ec270e94660 HONDA CIVIC 2015.0 2HGFG4A59FH702545
4 12.0 NaN NaN 12.0 12.0 2.00 N N NaN 4 ... N ADJ849000Z 10 N 0 00057af4-d848-4cee-b854-707f57581f4e HONDA ACCORD 2003.0 1HGCM66313A037175

5 rows × 30 columns

vehicles.columns
Index(['AREA_DAMAGED_CODE1', 'AREA_DAMAGED_CODE2', 'AREA_DAMAGED_CODE3',
       'AREA_DAMAGED_CODE_IMP1', 'AREA_DAMAGED_CODE_MAIN', 'BODY_TYPE_CODE',
       'COMMERCIAL_FLAG', 'CONTI_DIRECTION_CODE', 'CV_BODY_TYPE_CODE',
       'DAMAGE_CODE', 'DRIVERLESS_FLAG', 'FIRE_FLAG', 'GOING_DIRECTION_CODE',
       'GVW_CODE', 'HARM_EVENT_CODE', 'HAZMAT_SPILL_FLAG', 'HIT_AND_RUN_FLAG',
       'HZM_NUM', 'MOVEMENT_CODE', 'NUM_AXLES', 'PARKED_FLAG', 'REPORT_NO',
       'SPEED_LIMIT', 'TOWED_AWAY_FLAG', 'TOWED_VEHICLE_CONFIG_CODE',
       'VEHICLE_ID', 'VEH_MAKE', 'VEH_MODEL', 'VEH_YEAR', 'VIN_NO'],
      dtype='object')
people.head()
AIRBAG_DEPLOYED ALCOHOL_TESTTYPE_CODE ALCOHOL_TEST_CODE BAC_CODE CDL_FLAG CLASS CONDITION_CODE DATE_OF_BIRTH DRUG_TESTRESULT_CODE DRUG_TEST_CODE ... PED_LOCATION_CODE PED_OBEY_CODE PED_TYPE_CODE PED_VISIBLE_CODE PERSON_ID PERSON_TYPE REPORT_NO SAF_EQUIP_CODE SEX_CODE VEHICLE_ID
0 1.0 NaN 0.0 NaN N C 0.0 1952-04-20 00:00:00 NaN 0.0 ... NaN NaN NaN NaN 48dd00ee-e033-47e7-ad1e-0b734020301b D AB4284000S 13.0 F eb6aadb8-dacb-4744-a1a7-ab812c96f27f
1 1.0 NaN 0.0 NaN N NaN 0.0 1985-05-28 00:00:00 NaN 0.0 ... NaN NaN NaN NaN 166296bd-ffd3-4c16-aa74-4f4bf4139d8d D AB4313000X 13.0 F b463eb20-2f01-4200-9d6f-b18888ce2593
2 1.0 NaN 0.0 NaN N C 0.0 1960-10-04 00:00:00 NaN 0.0 ... NaN NaN NaN NaN f3b2743f-fbc3-4345-9419-56a0ca29102c D AB4313000X 13.0 F 3c8629d0-d524-47c1-bfbc-b18e07f3087e
3 1.0 NaN 0.0 NaN N B 0.0 1971-05-28 00:00:00 NaN 0.0 ... NaN NaN NaN NaN 5bbe589b-a2db-4dbb-be9b-17a800d69a08 D AB4313000Y 0.0 F c4628cdb-f295-4a24-8a4b-653741ac6ae7
4 1.0 NaN 0.0 NaN N D 0.0 1955-04-23 00:00:00 NaN 0.0 ... NaN NaN NaN NaN b914136f-5ecd-46bb-94ec-ff5d4136c3eb D AB4669001F 13.0 F cdda1580-fd79-4358-8819-c2250f494591

5 rows × 28 columns

people.columns
Index(['AIRBAG_DEPLOYED', 'ALCOHOL_TESTTYPE_CODE', 'ALCOHOL_TEST_CODE',
       'BAC_CODE', 'CDL_FLAG', 'CLASS', 'CONDITION_CODE', 'DATE_OF_BIRTH',
       'DRUG_TESTRESULT_CODE', 'DRUG_TEST_CODE', 'EJECT_CODE',
       'EMS_UNIT_LABEL', 'EQUIP_PROB_CODE', 'FAULT_FLAG', 'INJ_SEVER_CODE',
       'LICENSE_STATE_CODE', 'MOVEMENT_CODE', 'OCC_SEAT_POS_CODE',
       'PED_LOCATION_CODE', 'PED_OBEY_CODE', 'PED_TYPE_CODE',
       'PED_VISIBLE_CODE', 'PERSON_ID', 'PERSON_TYPE', 'REPORT_NO',
       'SAF_EQUIP_CODE', 'SEX_CODE', 'VEHICLE_ID'],
      dtype='object')
 
vehicles.to_csv("combined-vehicle-data.csv", index=False)
crashes.to_csv("combined-crash-data.csv", index=False)
people.to_csv("combined-person-data.csv", index=False)