Combining many, many Excel files on car crash data#
Armed with several years of quarterly reports from Maryland's Department of Transportation, we'll need to wrangle this series of files with multiple tabs into a coherent set of CSVs.
import glob
import pandas as pd
#!pip install xlrd
filenames = glob.glob("data/Crash_*.xlsx")
filenames
excels = [pd.ExcelFile(filename) for filename in filenames]
excels
crash_frames = []
vehicle_frames = []
person_frames = []
for excel in excels:
# Stack on those vehicles
vehicle_frames.append(excel.parse('VEHICLE'))
person_frames.append(excel.parse('PERSON'))
# Some years it's ACCIDENT and some its' CRASH
sheet_name = "CRASH" if 'CRASH' in excel.sheet_names else "ACCIDENT"
crash_frames.append(excel.parse(sheet_name))
people = pd.concat(person_frames, ignore_index=True, sort=True)
crashes = pd.concat(crash_frames, ignore_index=True, sort=True)
vehicles = pd.concat(vehicle_frames, ignore_index=True, sort=True)
vehicles.shape
vehicles.head()
vehicles.columns
people.head()
people.columns
vehicles.to_csv("combined-vehicle-data.csv", index=False)
crashes.to_csv("combined-crash-data.csv", index=False)
people.to_csv("combined-person-data.csv", index=False)