Download the Data

In [7]:
# Mount Google Drive so the dataset CSV stored on Drive is readable
# from this Colab runtime (prompts for an OAuth authorization code).
from google.colab import drive
drive.mount('/content/drive')
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive

Imports

In [0]:
# Core analysis stack: pandas/numpy for wrangling, matplotlib/seaborn for plots.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  # statistical visualization layer on top of matplotlib
import warnings
# NOTE(review): blanket-silencing every warning also hides deprecation and
# chained-assignment warnings; consider filtering specific categories instead.
warnings.filterwarnings('ignore')

Fast EDA

In [0]:
# Load the raw Chicago congestion estimates (2018–current) from the mounted Drive.
# The redundant `import pandas as pd` was removed — pandas is already imported
# in the imports cell; scattered imports hide the notebook's dependencies.
# NOTE(review): the absolute Colab path is hardcoded — consider a DATA_DIR constant.
data = pd.read_csv('/content/drive/My Drive/Chicago_Traffic_Tracker_-_Historical_Congestion_Estimates_by_Region_-_2018-Current(1).csv')
In [50]:
data.head()
Out[50]:
TIME REGION_ID SPEED REGION BUS_COUNT NUM_READS HOUR DAY_OF_WEEK MONTH DESCRIPTION RECORD_ID WEST EAST SOUTH NORTH NW_LOCATION SE_LOCATION
0 04/29/2018 09:40:16 PM 15 27.89 Bridgeport-McKinley-Lower West 19 312 21 1 4 Pershing to Roosevel. Western to Stewart 15-201804300240 -87.685372 -87.636322 41.822792 41.866129 POINT (-87.685372 41.866129) POINT (-87.636322 41.822792)
1 04/29/2018 09:40:16 PM 18 27.89 South West Side 25 492 21 1 4 71st to Pershing. Cicero to Western 18-201804300240 -87.747456 -87.683730 41.764066 41.822792 POINT (-87.747456 41.822792) POINT (-87.68373 41.764066)
2 04/29/2018 09:40:16 PM 21 25.91 Hyde Park-Kenwood-Woodlawn 23 399 21 1 4 71st to Pershing. Cottage Grove to Lake Shore 21-201804300240 -87.606334 -87.566260 41.764066 41.822792 POINT (-87.606334 41.822792) POINT (-87.56626 41.764066)
3 04/29/2018 09:40:16 PM 22 29.59 Ashburn 17 254 21 1 4 91st to 71st. Cicero to Damen 22-201804300240 -87.747456 -87.672980 41.728472 41.764066 POINT (-87.747456 41.764066) POINT (-87.67298 41.728472)
4 04/29/2018 09:40:16 PM 23 23.86 Auburn Gresham-Chatham 41 585 21 1 4 91st to 71st. Damen to Cottage Grove 23-201804300240 -87.672980 -87.606334 41.728472 41.764066 POINT (-87.67298 41.764066) POINT (-87.606334 41.728472)

Data description

  • TIME : Timestamp of the record

  • REGION_ID : Unique arbitrary number to represent each region

  • SPEED : Estimated congestion level. Although expressed in miles per hour, this value is more a reflection of the congestion level in the region than it is indicative of the average raw speed vehicles are travelling within the region.

  • REGION : Name of the region.

  • BUS_COUNT : The number of buses used to estimate traffic.

  • NUM_READS : Number of GPS probes received (or used) for estimating the speed for that segment.

  • WEST: Approximate longitude of the west edge of the region.

  • EAST: Approximate longitude of the east edge of the region.

  • SOUTH : Approximate latitude of the south edge of the region.

  • NORTH : Approximate latitude of the north edge of the region.

  • NW_LOCATION : The location corresponding to the intersection of NORTH and WEST in a format that allows for creation of maps and other geographic operations on this data portal.

  • SE_LOCATION : The location corresponding to the intersection of SOUTH and EAST in a format that allows for creation of maps and other geographic operations on this data portal.

Feature engineering

In [0]:
data["TIME"]=pd.to_datetime(data["TIME"], format="%m/%d/%Y %I:%M:%S %p")
In [0]:
data["dayofweek"]=data["TIME"].dt.dayofweek
In [0]:
# Split the timestamp into calendar parts used as the hourly grouping key below.
data['DAY'] = data['TIME'].dt.day
# NOTE(review): this overwrites the MONTH column loaded from the CSV —
# presumably the values match, but verify against the raw data.
data['MONTH'] = data['TIME'].dt.month
data['YEAR'] = data['TIME'].dt.year
In [0]:
# Build the list of region names ordered by REGION_ID (1..29).
# Single pass over the frame instead of the original 29 full-frame scans:
# drop_duplicates keeps the first occurrence per REGION_ID, which is exactly
# what `data[data.REGION_ID == i].REGION.unique()[0]` returned.
region_by_id = data.drop_duplicates('REGION_ID').set_index('REGION_ID')['REGION']
list_REGION = [region_by_id[i] for i in range(1, 30)]
In [0]:
data = data.groupby(['REGION','REGION_ID','MONTH','DAY','YEAR','HOUR','NORTH','WEST','EAST', 'SOUTH','DAY_OF_WEEK'])[['SPEED','BUS_COUNT','NUM_READS']].agg('mean').reset_index()
In [0]:
# All records are hourly aggregates, so the minutes field is a constant ":00".
data['MINUTE'] = '00'
# Re-assemble a proper timestamp from the split calendar columns.
stamp_cols = ['YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE']
stamp_strings = data[stamp_cols].astype(str).agg('-'.join, axis=1)
data['Time'] = pd.to_datetime(stamp_strings, format='%Y-%m-%d-%H-%M')
In [0]:
# Region centroid: midpoint of the bounding-box edges (for map plotting).
data['CENTER_LAT'] = data['NORTH'] * 0.5 + 0.5 * data['SOUTH']
data['CENTER_LON'] = data['EAST'] * 0.5 + 0.5 * data['WEST']
# Human-readable timestamp label for plot axes.
# Fix: the original used "%l" (space-padded 12-hour hour), a glibc extension
# outside Python's portable strftime directive set — it fails or misrenders on
# non-Linux platforms. "%I" is the portable equivalent (zero-padded hour).
data['Time'] = data.Time.dt.strftime("%a, %d %b, %Y at %I:%M %p")

Graphics

In [58]:
# Mean SPEED per region (seaborn's default estimator is the mean).
# Fix: positional `sns.barplot(x, y)` was deprecated in seaborn 0.12 and
# removed in 0.14 — data must be passed as keyword arguments.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x=data['SPEED'], y=data['REGION'], ax=ax)
ax.set_title("Mean speed per region")
ax.set_xlabel("Mean speed (mph)")
ax.set_ylabel("Region")
plt.show()
Out[58]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6d9b807cc0>
In [59]:
# Median SPEED per region — the median is robust to skew/outliers, so compare
# it with the mean plot above.
# Import kept for compatibility with later cells that reference `median`.
from numpy import median
# Fix: seaborn 0.12+ requires x/y as keyword arguments (positional form removed
# in 0.14). `np.median` is used directly since numpy is already imported.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x=data['SPEED'], y=data['REGION'], estimator=np.median, ax=ax)
ax.set_title("Median speed per region")
ax.set_xlabel("Median speed (mph)")
ax.set_ylabel("Region")
plt.show()
Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6d9c373f98>
In [60]:
data2=data[data["HOUR"].isin([8,9,15,16,17])]
plt.figure(figsize=(20,10))
plt.title("Median speed during rush hours")
sns.barplot(data2.SPEED,data2.REGION,estimator=median)
Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6d9e2b7f28>
In [61]:
# Hourly speed in the Chicago Loop over one week (Apr 8-14, 2019).
# Fixes: removed `from datetime import datetime` (unused in this cell — Time is
# already a formatted string here), and used the explicit `ax` interface
# consistently instead of mixing it with the pyplot state machine.
loop = data[data["REGION"].isin(["Chicago Loop"])]
loop = loop.drop_duplicates("Time", keep='first')
fig, ax = plt.subplots(1, 1, figsize=(15, 15))
sub_set = loop[(loop['YEAR'] == 2019) & (loop['MONTH'] == 4)
               & (8 <= loop['DAY']) & (loop['DAY'] <= 14)].copy()
timelist = sub_set["Time"]
ax.plot(sub_set.Time, sub_set.SPEED)
ax.set_title("Speed during a week")
ax.set_ylabel("Miles per hour")
# Label every 8th hourly point to keep the categorical x-axis readable.
ax.set_xticks(timelist[::8])
ax.tick_params(axis='x', rotation=90)
plt.show()
Out[61]:
(array([  0.,   8.,  16.,  24.,  32.,  40.,  48.,  56.,  64.,  72.,  80.,
         88.,  96., 104., 112., 120., 128., 136., 144., 152., 160.]),
 <a list of 21 Text major ticklabel objects>)
In [0]: