-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathvehicle_registration_data_processor.py
221 lines (184 loc) · 11 KB
/
vehicle_registration_data_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 8 11:26:00 2023
@author: xiaodanxu
"""
import os
from pandas import read_csv
import pandas as pd
import geopandas as gps
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
os.chdir('/Users/xiaodanxu/Documents/SynthFirm.nosync')
########## input and scenario definition ###########
medium_duty_class = [3, 4, 5, 6] # ref: https://afdc.energy.gov/data/10380
heavy_duty_class = [7, 8]
plt.style.use('ggplot')
sns.set(font_scale=1.2) # larger font
min_size_lookup = {'0-2':1, '3-5':3, '6-10':6,
'11-50':11, '51-100':51, '101-1000':101,
'>1000':1001}
input_data_file = 'CA_MDHDbybizID.csv' # define vehicle input file
selected_state = 'CA'
output_dir = 'inputs_SF/fleet/'
registration_data = read_csv('PrivateData/registration/' + input_data_file)
def fleet_composition_processor(data):
size_interval = [-1, 2, 5, 10, 50, 100, 1000, data['totcount'].max()]
interval_name = ['0-2', '3-5', '6-10', '11-50', '51-100', '101-1000', '>1000']
data.loc[:, 'size_group'] = pd.cut(data['totcount'],
bins = size_interval, right = True,
labels = interval_name)
# private_truck_by_firm.head(5)
fleet_by_size = data.groupby('size_group').agg({'business_name': 'count',
'totcount': ['sum','std'],
'DIESEL Vocational HDV': 'sum',
'DIESEL Tractor HDV': 'sum',
'DIESEL Vocational MDV': 'sum',
'GAS Vocational MDV': 'sum'})
fleet_by_size = fleet_by_size.reset_index()
fleet_by_size.columns = ['fleet_size', 'total_carriers', 'total_trucks', 'total_truck_std',
'DIESEL Vocational HDV', 'DIESEL Tractor HDV',
'DIESEL Vocational MDV', 'GAS Vocational MDV']
# print(truck_count_by_size.head(5))
fleet_by_size.loc[:, 'min_size'] = fleet_by_size.loc[:, 'fleet_size'].map(min_size_lookup)
fleet_by_size.loc[:, 'avg_truck_per_carrier'] = fleet_by_size.loc[:, 'total_trucks'] / fleet_by_size.loc[:, 'total_carriers']
# truck_count_by_size.loc[:, 'avg_sut_per_carrier'] = truck_count_by_size.loc[:, 'total_single_trucks'] / truck_count_by_size.loc[:, 'total_carriers']
# truck_count_by_size.loc[:, 'avg_ct_per_carrier'] = truck_count_by_size.loc[:, 'total_combination_trucks'] / truck_count_by_size.loc[:, 'total_carriers']
fleet_by_size.loc[:, 'fraction_of_carrier'] = fleet_by_size.loc[:, 'total_carriers'] / fleet_by_size.loc[:, 'total_carriers'].sum()
fleet_by_size.loc[:, 'DIESEL Vocational HDV'] = fleet_by_size.loc[:, 'DIESEL Vocational HDV'] / fleet_by_size.loc[:, 'total_trucks']
fleet_by_size.loc[:, 'DIESEL Tractor HDV'] = fleet_by_size.loc[:, 'DIESEL Tractor HDV'] / fleet_by_size.loc[:, 'total_trucks']
fleet_by_size.loc[:, 'DIESEL Vocational MDV'] = fleet_by_size.loc[:, 'DIESEL Vocational MDV'] / fleet_by_size.loc[:, 'total_trucks']
fleet_by_size.loc[:, 'GAS Vocational MDV'] = fleet_by_size.loc[:, 'GAS Vocational MDV'] / fleet_by_size.loc[:, 'total_trucks']
return(fleet_by_size)
# <codecell>
# check inscope fleet
registration_data_by_carrier = pd.pivot_table(registration_data,
index = 'carrier_type',
columns = 'vin_gvw',
values = 'totcount',
aggfunc = 'sum')
# registration_data.groupby('carrier_type')['totcount'].sum()
registration_data_by_carrier.plot(kind = 'bar', stacked = True)
plt.show()
registration_data_by_carrier.loc[:, 'Total'] = registration_data_by_carrier.sum(axis = 1)
registration_data_ind = registration_data.loc[registration_data['carrier_type'] == 'INDIVIDUAL']
# individual_fleet_size = \
# registration_data_ind.groupby('business_name')[['totcount']].sum()
registration_data_ind['totcount'].hist(bins = 30)
# <codecell>
##### vehicle type assignment and selection #########
registration_data.loc[:, 'MDV'] = registration_data.loc[:, 'totcount'] * (registration_data.loc[:, 'vin_gvw'].isin(medium_duty_class)) + \
0 * (registration_data.loc[:, 'vin_gvw'].isin(heavy_duty_class))
registration_data.loc[:, 'HDV'] = 0 * (registration_data.loc[:, 'vin_gvw'].isin(medium_duty_class)) + \
registration_data.loc[:, 'totcount'] * (registration_data.loc[:, 'vin_gvw'].isin(heavy_duty_class))
print(registration_data.totcount.sum())
print(registration_data.fuel.unique())
registration_data.loc[:, 'vehicle_class'] = 'HDV'
registration_data.loc[registration_data['vin_gvw'].isin(medium_duty_class), 'vehicle_class'] = 'MDV'
truck_type_lookup = {'STRAIGHT TRUCK': 'Vocational',
'TRACTOR TRUCK': 'Tractor',
'CAB CHASSIS': 'Vocational',
'CUTAWAY': 'Vocational',
'INCOMPLETE (STRIP CHASSIS)': 'Vocational',
'STEP VAN': 'Vocational',
'VAN CARGO': 'Vocational',
'GLIDERS': 'Tractor',
'PICKUP': 'Vocational'}
# assign and filter body type
registration_data.loc[:, 'body type'] = registration_data.loc[:, 'vehicle_type'].map(truck_type_lookup)
registration_data = registration_data.dropna(subset = ['body type'])
hdv_fuels = ['DIESEL']
mdv_fuels = ['DIESEL', 'GAS']
fuel_filter_hdt = (registration_data.loc[:, 'vin_gvw'].isin(heavy_duty_class) ) & \
(registration_data.loc[:, 'fuel'].isin(hdv_fuels))
fuel_filter_mdt = (registration_data.loc[:, 'vin_gvw'].isin(medium_duty_class) ) & \
(registration_data.loc[:, 'fuel'].isin(mdv_fuels))
registration_data = registration_data.loc[fuel_filter_hdt | fuel_filter_mdt]
print(registration_data.totcount.sum())
#create join vehicle type
registration_data.loc[:, 'vehicle type'] = registration_data.loc[:, 'fuel'] + ' ' + \
registration_data.loc[:, 'body type'] + ' ' + registration_data.loc[:, 'vehicle_class']
# filter by fuel
# <codecell>
##### private truck fleet distribution #########
# private truck analysis
list_of_vehicle_type = ['DIESEL Vocational HDV', 'DIESEL Tractor HDV',
'DIESEL Vocational MDV', 'GAS Vocational MDV'] # select type of trucks
private_carriers = registration_data.loc[registration_data['carrier_type'] == 'PRIVATE']
private_truck_by_firm = pd.pivot_table(private_carriers, index = 'business_name',
columns = 'vehicle type',
values = 'totcount', aggfunc = np.sum)
private_truck_by_firm = private_truck_by_firm.reset_index()
private_truck_by_firm = private_truck_by_firm.fillna(0)
private_truck_by_firm.loc[:, 'totcount'] = private_truck_by_firm.loc[:, list_of_vehicle_type].sum(axis = 1)
private_truck_by_firm.head(5)
print('total private carriers ', len(private_truck_by_firm))
print('total private trucks ', private_truck_by_firm['totcount'].sum())
print('max private trucks in a firm', private_truck_by_firm['totcount'].max())
print('avg. private trucks in a firm', private_truck_by_firm['totcount'].mean())
print(private_truck_by_firm[list_of_vehicle_type].sum())
private_truck_by_firm['totcount'].hist(bins = 2000)
plt.xlim([0, 100])
plt.xlabel('truck count per firm')
plt.ylabel('count of firms')
plt.title('private truck count distribution')
plt.show()
private_truck_count_by_size = fleet_composition_processor(private_truck_by_firm)
private_truck_count_by_size['total_truck_std'] = \
private_truck_count_by_size['total_truck_std'].fillna(1)
private_truck_count_by_size.to_csv(output_dir + selected_state + '_private_fleet_size_distribution.csv',
index = False, sep = ',')
# <codecell>
##### for-hire truck fleet distribution #########
for_hire_carriers = registration_data.loc[registration_data['carrier_type'] == 'FOR HIRE']
for_hire_truck_by_firm = pd.pivot_table(for_hire_carriers, index = 'business_name',
columns = 'vehicle type',
values = 'totcount', aggfunc = np.sum)
for_hire_truck_by_firm = for_hire_truck_by_firm.reset_index()
for_hire_truck_by_firm = for_hire_truck_by_firm.fillna(0)
for_hire_truck_by_firm.loc[:, 'totcount'] = for_hire_truck_by_firm.loc[:, list_of_vehicle_type].sum(axis = 1)
print('total for-hire carriers ', len(for_hire_truck_by_firm))
print('total for-hire trucks ', for_hire_truck_by_firm['totcount'].sum())
print('max for-hire trucks in a firm', for_hire_truck_by_firm['totcount'].max())
print('avg. for-hire trucks in a firm', for_hire_truck_by_firm['totcount'].mean())
for_hire_truck_by_firm['totcount'].hist(bins = 100)
plt.xlim([0, 500])
plt.xlabel('truck count per firm')
plt.ylabel('count of firms')
plt.title('for-hire truck count distribution')
plt.show()
for_hire_truck_count_by_size = fleet_composition_processor(for_hire_truck_by_firm)
for_hire_truck_count_by_size['total_truck_std'] = \
for_hire_truck_count_by_size['total_truck_std'].fillna(1)
for_hire_truck_count_by_size.to_csv(output_dir + selected_state + '_for_hire_fleet_size_distribution.csv',
index = False, sep = ',')
# <codecell>
##### for-lease truck fleet distribution #########
# for-leasing truck analysis
for_lease_carriers = registration_data.loc[registration_data['carrier_type'].isin(['FINANCE LEASE', \
'FULL SERVICE LEASE', \
'MANUFACTURER SPONSORED LEASE'])]
for_lease_truck_by_firm = pd.pivot_table(for_lease_carriers, index = 'business_name',
columns = 'vehicle type',
values = 'totcount', aggfunc = np.sum)
for_lease_truck_by_firm = for_lease_truck_by_firm.reset_index()
for_lease_truck_by_firm = for_lease_truck_by_firm.fillna(0)
for_lease_truck_by_firm.loc[:, 'totcount'] = for_lease_truck_by_firm.loc[:, list_of_vehicle_type].sum(axis = 1)
# truck_count[int_vars]
print('total for-lease carriers ', len(for_lease_truck_by_firm))
print('total for-lease trucks ', for_lease_truck_by_firm['totcount'].sum())
print('max for-lease trucks in a firm', for_lease_truck_by_firm['totcount'].max())
print('avg. for-lease trucks in a firm', for_lease_truck_by_firm['totcount'].mean())
for_lease_truck_by_firm['totcount'].hist(bins = 100)
plt.xlim([0, 500])
plt.xlabel('truck count per firm')
plt.ylabel('count of firms')
plt.title('for-lease truck count distribution')
plt.show()
for_lease_truck_count_by_size = fleet_composition_processor(for_lease_truck_by_firm)
for_lease_truck_count_by_size['total_truck_std'] = \
for_lease_truck_count_by_size['total_truck_std'].fillna(1)
for_lease_truck_count_by_size.to_csv(output_dir + selected_state + '_for_lease_fleet_size_distribution.csv',
index = False, sep = ',')