diff --git a/viz_scripts/Ebike Usage Trends.ipynb b/viz_scripts/Ebike Usage Trends.ipynb new file mode 100644 index 00000000..54e8ee36 --- /dev/null +++ b/viz_scripts/Ebike Usage Trends.ipynb @@ -0,0 +1,2646 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "collaborative-class", + "metadata": {}, + "source": [ + "Kacie's Scripts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "danish-modern", + "metadata": {}, + "outputs": [], + "source": [ + "year = None\n", + "month = None\n", + "program = \"prepilot\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "square-green", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "!pip install seaborn\n", + "import seaborn as sns\n", + "from collections import defaultdict\n", + "import datetime\n", + "\n", + "sns.set_style(\"whitegrid\")\n", + "sns.set()\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "double-subscriber", + "metadata": {}, + "outputs": [], + "source": [ + "import scaffolding \n", + "from plots import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "manufactured-cradle", + "metadata": {}, + "outputs": [], + "source": [ + "# Loading mapping dictionaries from mapping_dictionaries notebook\n", + "%store -r dic_ei\n", + "%store -r dic_re\n", + "%store -r dic_pur\n", + "\n", + "# convert a dictionary to a defaultdict\n", + "dic_pur = defaultdict(lambda: 'Other',dic_pur)\n", + "dic_re = defaultdict(lambda: 'Other',dic_re)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "differential-dealer", + "metadata": {}, + "outputs": [], + "source": [ + "tq = scaffolding.get_time_query(year, month)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sorted-consensus", + "metadata": {}, + "outputs": [], + "source": [ + "participant_ct_df = scaffolding.load_all_participant_trips(program, tq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "whole-destination", + "metadata": {}, + "outputs": [], + "source": [ + "labeled_ct = scaffolding.filter_labeled_trips(participant_ct_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cognitive-transformation", + "metadata": {}, + "outputs": [], + "source": [ + "expanded_ct = scaffolding.expand_userinputs(labeled_ct)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hispanic-effort", + "metadata": {}, + "outputs": [], + "source": [ + "expanded_ct.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "expensive-catch", + "metadata": {}, + "outputs": [], + "source": [ + "expanded_ct.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "breathing-screen", + "metadata": {}, + "outputs": [], + "source": [ + "expanded_ct = scaffolding.data_quality_check(expanded_ct)\n", + "expanded_ct.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "loaded-blind", + "metadata": {}, + "outputs": [], + "source": [ + "## Mapping new labels with dictionaries\n", + "expanded_ct['Trip_purpose']= expanded_ct['purpose_confirm'].map(dic_pur)\n", + "expanded_ct['Mode_confirm']= expanded_ct['mode_confirm'].map(dic_re)\n", + "expanded_ct['Replaced_mode']= expanded_ct['replaced_mode'].map(dic_re)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "former-ottawa", + "metadata": {}, + "outputs": [], + "source": [ + "dic_pur" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "exterior-sitting", + "metadata": {}, + "outputs": [], + "source": [ + "assert len(expanded_ct[(expanded_ct['Mode_confirm'] == 'Pilot ebike') & (expanded_ct[\"Replaced_mode\"] == \"Pilot ebike\")]) == 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "trying-flash", + "metadata": {}, + "outputs": [], + "source": [ + "# Energy Impact Calculation\n", + "scaffolding.unit_conversions(expanded_ct)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "following-eleven", + "metadata": {}, + "outputs": [], + "source": [ + "file_suffix = scaffolding.get_file_suffix(year, month, program)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "athletic-funeral", + "metadata": {}, + "outputs": [], + "source": [ + "file_suffix = scaffolding.get_file_suffix(year, month, program)\n", + "quality_text = scaffolding.get_quality_text(participant_ct_df, expanded_ct)" + ] + }, + { + "cell_type": "markdown", + "id": "derived-accuracy", + "metadata": {}, + "source": [ + "Demographics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "opening-stupid", + "metadata": {}, + "outputs": [], + "source": [ + "#All Project Participants - includes stage users and participant users\n", + "#Participant users - excludes stage users" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "weird-macintosh", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "#All the demographic survey entries\n", + "demog = pd.read_csv(\"Can Do Colorado eBike Program - en.csv\")\n", + "#demog.columns\n", + "demog.columns = ['Timestamp','user_id','YearOfBirth','Gender','DriversLicense','Student','HighestDegree','Work','MoreThanOneJob','FullPartTime','Job','TranspToJobLastWeek','DailyCommuteMinsToJob','ChangeStartTime','CanWFH','DaysPerWeekWFT','DescribesYouBest','OwnRentResidence','HomeType','Income','PplInHome','ChildrenUnder18','NumOfDrivers','NumVehicles','AltTranspt','MedCond','HowLongMedCond']\n", + "demog\n", + "\n", + "es_users = pd.read_csv(\"Can Do Colorado eBike Program - es.csv\")\n", + "\n", + "# this translates the spanish df to the english df\n", + "\n", + "es_users.columns = demog.columns # translate cols to english\n", + "pd.set_option('display.max_columns', None)\n", + "# the rest of this is a little clunky, but it's good for the plots I make (so far)\n", + "es_users = es_users.replace(\"Menos de $24,999\", \"Less than $24,999\")\n", + "es_users = es_users.replace(\"Femenino.\", \"Woman\")\n", + "es_users = es_users.replace(\"Masculino.\", \"Man\")\n", + "es_users = es_users.replace(\"Si\", \"Yes\")\n", + "es_users = es_users.replace(\"Bicicleta\", \"Bicycle\")\n", + "es_users = es_users.replace(\"Teletrabajo\", \"Telecommute\")\n", + "es_users = es_users.replace(\"Caminar\", \"Walk\")\n", + "es_users = es_users.replace(\"Graduado de secundaria o GED.\", \"High school graduate or GED\")\n", + "en_es_users = demog.append(es_users) #df of both english and spanish\n", + "en_es_users" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cooked-addiction", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "#df of all project partcipants, including only the last survey entry, no duplicates\n", + "#First survey entry 5-23-2021\n", + "#Last survey entry on 5-10-2022\n", + "pd.set_option('display.max_rows', 10)\n", + "demog = en_es_users\n", + "demog1 = demog.sort_values(by = 'Timestamp')\n", + "demogRecent = demog1.drop_duplicates(subset='user_id',keep = 'last')\n", + "demogRecent=demogRecent.reset_index()\n", + "demogRecent" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c77ce8b", + "metadata": {}, + "outputs": [], + "source": [ + "#Dataframe that includes all survey entries that are more than 2 weeks apart\n", + "demogSep = demog.drop([247,123,254,262,30,256,33,55,70,114,239,251,237,238,206,21,219,23,94,95,121,106,91,63,59,280,283,13,182,56,233,90,60,103,153,176,145,135,27,101,132,217,5,6,1,29])\n", + "#demogSep.user_id.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "flexible-spyware", + "metadata": {}, + "outputs": [], + "source": [ + "#Less than $24,999\n", + "#$25,000-$49,999\n", + "#$50,000-$99,999\n", + "#$100,000 -$149,999\n", + "#$150,000-$199,999\n", + "#$200,000 or more\n", + "#Prefer not to say\n", + "def returnIncomeDistribution(demog,title):\n", + " lessThan24=0\n", + " i25to49=0\n", + " i50to99=0\n", + " i100to149=0\n", + " i150to199=0\n", + " i200orMore=0\n", + " NA=0\n", + " \n", + " for i in range(len(demog)):\n", + " if str(demog.Income.iloc[i]).__contains__('24,999'):\n", + " lessThan24+=1\n", + " elif str(demog.Income.iloc[i]).__contains__('25,000'):\n", + " i25to49+=1\n", + " elif str(demog.Income.iloc[i]).__contains__('100,000'):\n", + " i100to149+=1\n", + " elif str(demog.Income.iloc[i]).__contains__('150,000'):\n", + " i150to199+=1\n", + " elif str(demog.Income.iloc[i]).__contains__('50,000'):\n", + " i50to99+=1\n", + " elif str(demog.Income.iloc[i]).__contains__('200,000'):\n", + " i200orMore+=1\n", + " elif str(demog.Income.iloc[i]).__contains__('Prefer'):\n", + " NA+=1 \n", + "\n", + " values = [lessThan24,i25to49,i50to99,i100to149,i150to199,i200orMore,NA]\n", + " income = ['Less than $24,999','\\$25,000-\\$49,999','\\$50,000-\\$99,999','\\$100,000-\\$149,999','\\$150,000-\\$199,999','$200,000 or more','Prefer not to say']\n", + " colours = dict(zip(values, plt.cm.tab20.colors[:len(values)]))\n", + " def func(pct, values):\n", + " total = sum(values)\n", + " absolute = int(round(pct*total/100.0))\n", + " return \"{:.1f}%\\n({:d})\".format(pct, absolute) if pct > 3 else''\n", + " fig = plt.figure(figsize =(10, 10))\n", + " plt.pie(values, labels = income, colors=[colours[key] for key in values], pctdistance=.75, autopct=lambda pct: func(pct, values),textprops={'size': 16})\n", + " plt.title(title, fontsize=25)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f789770b", + "metadata": {}, + "outputs": [], + "source": [ + "def AgeDistribution(demog,title):\n", + " demog['Age']=0\n", + " for i in range(len(demog)):\n", + " #print('hi',i)\n", + " if int(demog.iloc[i].YearOfBirth) > 22 and int(demog.iloc[i].YearOfBirth) < 1000:\n", + " demog.at[i,'Age']=2022-(int(demog.iloc[i].YearOfBirth)+1900)\n", + " else:\n", + " #print('false')\n", + " demog.at[i,'Age']=2022-int(demog.YearOfBirth.iloc[i])\n", + " age = [demog.Age]\n", + " plt.hist(age, bins = len(demog.Age.unique()))\n", + " plt.title(title)\n", + " plt.xlabel('Age')\n", + " plt.ylabel('Number of Participants')\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "natural-preparation", + "metadata": {}, + "outputs": [], + "source": [ + "#def AgeDistribution(demog):\n", + "# demog=demog.assign(Age=2022-demogRecent.YearOfBirth)\n", + "# demog1=demog.reset_index()\n", + "# for i in range(len(demog1)):\n", + "# if demog1.loc[i].YearOfBirth > 22 and demog1.loc[i].YearOfBirth < 1000:\n", + "# demog1.at[i,'Age']=2022-(demog1.loc[i].YearOfBirth+1900)\n", + "# age = [demog1.Age]\n", + "# plt.hist(age, bins = 13)\n", + "# plt.title('Age Distribution of Participants')\n", + "# plt.xlabel('Age')\n", + "# plt.ylabel('Number of Participants')\n", + "# plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dependent-water", + "metadata": {}, + "outputs": [], + "source": [ + "#Remade graph for Main Mode of Tranportation\n", + "def MainTransport(mainmode,title):\n", + " labels = mainmode.value_counts().index # the labels in the pie chart legend\n", + " sizes = mainmode.value_counts()/len(mainmode)*100\n", + " vehNum = np.array([0],dtype=np.float_)\n", + " for i in range(len(labels)):\n", + " if labels[i] == 'Car' or labels[i] == 'SUV' or labels[i] == 'Van':\n", + " vehNum+=sizes[i]\n", + " elif labels[i] == \"Buss\":\n", + " sizes['Public or commuter bus']+=sizes[i]\n", + " veh= pd.Index(['Motor Vehicle (Car, SUV, Van)'])\n", + " labels = labels.append(veh)\n", + " labels=labels.drop('Buss')\n", + " labels=labels.drop('Car')\n", + " labels=labels.drop('Van')\n", + " labels=labels.drop('SUV')\n", + " sizes=sizes.drop('Buss')\n", + " sizes=sizes.drop('Car')\n", + " sizes=sizes.drop('Van')\n", + " sizes=sizes.drop('SUV')\n", + " sizes['Motor Vehicle (Car, SUV, Van)']=vehNum[0]\n", + " sizes=sizes.reindex(['Motor Vehicle (Car, SUV, Van)','Bicycle', 'Public or commuter bus', 'Walk', 'Pickup truck',\n", + " 'Work from home', 'Taxi/Limo (including services like Uber or Lyft)',\n", + " 'Walk, bike, bus, uber or lyft.', 'Prefer not to say', 'Skateboard',\n", + " 'Car and E-Bike', 'Shared vehicle'])\n", + " labels=labels.reindex(['Motor Vehicle (Car, SUV, Van)','Bicycle', 'Public or commuter bus', 'Walk', 'Pickup truck',\n", + " 'Work from home', 'Taxi/Limo (including services like Uber or Lyft)',\n", + " 'Walk, bike, bus, uber or lyft.', 'Prefer not to say', 'Skateboard',\n", + " 'Car and E-Bike', 'Shared vehicle'])\n", + " labels=labels[0]\n", + " labelz = [f'{l}, {s:0.1f}%' for l, s in zip(labels, sizes)]\n", + " labels1=['Motor Vehicle (Car, SUV, Van)','Bicycle', 'Public or commuter bus', 'Walk', 'Pickup truck',\n", + " 'Work from home', 'Taxi/Limo (including services like Uber or Lyft)',\n", + " '', '', '','', '']\n", + " colours = dict(zip(sizes, plt.cm.tab20.colors[:len(sizes)]))\n", + " def func(pct, sizes):\n", + " total = sum(sizes)\n", + " absolute = int(round(pct*total/100.0))\n", + " return \"{:.1f}%\\n({:d})\".format(pct, absolute) if pct > 4 else''\n", + "\n", + " fig = plt.figure(figsize =(10, 10))\n", + " plt.pie(sizes, labels = labels1, colors=[colours[key] for key in sizes], pctdistance=.75, autopct=lambda pct: func(pct, sizes),textprops={'size': 16})\n", + " plt.title(title, fontsize=25)\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left', labels = labelz)\n", + " plt.show()\n", + "\n", + "MainTransport(participantDemog.TranspToJobLastWeek,'Main Mode of Transportation for Participant Users')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8944d254", + "metadata": {}, + "outputs": [], + "source": [ + "def genderDistrb(Gender,title):\n", + " counts = Gender.value_counts()\n", + " labels = Gender.value_counts().index # the labels in the pie chart legend\n", + " sizes = Gender.value_counts()/len(Gender)*100 # the percentages to display\n", + " labelz = [f'{l}, {s:0.1f}%' for l, s in zip(labels, sizes)]\n", + " colours = dict(zip(counts, plt.cm.tab20.colors[:len(counts)]))\n", + " def func(pct, counts):\n", + " total = sum(counts)\n", + " absolute = int(round(pct*total/100.0))\n", + " return \"{:.1f}%\\n({:d})\".format(pct, absolute) if pct > 4 else''\n", + " fig = plt.figure(figsize =(10, 10))\n", + " plt.pie(counts, labels = labels, colors=[colours[key] for key in counts], pctdistance=.75, autopct=lambda pct: func(pct, counts),textprops={'size': 16})\n", + " plt.title(title, fontsize=25)\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left', labels = labelz)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97ffce17", + "metadata": {}, + "outputs": [], + "source": [ + "#Original unedited function (see above)\n", + "def MainTransport(mainmode,title):\n", + " mainmode.value_counts().plot(title = title, kind = \"pie\", figsize = (5,5),\n", + " labels = None, )\n", + "\n", + " labels = mainmode.value_counts().index # the labels in the pie chart legend\n", + " sizes = mainmode.value_counts()/len(mainmode)*100 # the percentages to display\n", + " labels = [f'{l}, {s:0.1f}%' for l, s in zip(labels, sizes)]\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left', labels = labels)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2dfeee4", + "metadata": {}, + "outputs": [], + "source": [ + "#demo is the demographic df\n", + "def haveCar(demo):\n", + " no=0\n", + " yes=0\n", + " demo['HasCar']=\"\"\n", + " for i in range(len(demo)):\n", + " if len(str(demo.iloc[i].NumVehicles)) == 1:\n", + " if int(demo.iloc[i].NumVehicles) == 0:\n", + " demo.at[i,'HasCar'] = \"Does not have a car\"\n", + " no+=1\n", + " elif int(demo.iloc[i].NumVehicles) != 0:\n", + " demo.at[i,'HasCar'] = \"Has a car\"\n", + " yes+=1\n", + " elif len(str(demo.iloc[i].NumVehicles)) == 2:\n", + " demo.at[i,'HasCar'] = \"Has a car\"\n", + " yes+=1\n", + " values=[yes,no]\n", + " have=[\"Has a car\",\"Doesn't have a car\"]\n", + " colours = dict(zip(values, plt.cm.tab20.colors[:len(values)]))\n", + " def func(pct, values):\n", + " total = sum(values)\n", + " absolute = int(round(pct*total/100.0))\n", + " return \"{:.1f}%\\n({:d})\".format(pct, absolute) if pct > 4 else''\n", + " fig = plt.figure(figsize =(10, 10))\n", + " plt.pie(values, labels = have, colors=[colours[key] for key in values], pctdistance=.75, autopct=lambda pct: func(pct, values),textprops={'size': 16})\n", + " plt.title('Distribution Of Car Owners', fontsize=25)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "simplified-board", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "def student(demo):\n", + " no=0\n", + " yes=0\n", + " for i in range(len(demo)):\n", + " if demo['Student'][i].__contains__('Yes'):\n", + " yes+=1\n", + " else:\n", + " no+=1\n", + "\n", + " values=[yes,no]\n", + " student=[\"Is a student\",\"Is not a student\"]\n", + " colours = dict(zip(values, plt.cm.tab20.colors[:len(values)]))\n", + " def func(pct, values):\n", + " total = sum(values)\n", + " absolute = int(round(pct*total/100.0))\n", + " return \"{:.1f}%\\n({:d})\".format(pct, absolute) if pct > 4 else''\n", + " fig = plt.figure(figsize =(10, 10))\n", + " plt.pie(values, labels = student, colors=[colours[key] for key in values], pctdistance=.75, autopct=lambda pct: func(pct, values),textprops={'size': 16})\n", + " plt.title('Distribution Of Students and Non-Students', fontsize=25)\n", + " plt.show()\n", + " \n", + "student(participantDemog)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b12fa41b", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "#Demographics of all project participants\n", + "returnIncomeDistribution(demogRecent,'Income Distribution of All Project Participants')\n", + "AgeDistribution(demogRecent,'Age Distribution of All Project Participants')\n", + "genderDistrb(demogRecent.Gender,'Gender Distribution of All Participants')\n", + "MainTransport(demogRecent.TranspToJobLastWeek,'Main Mode of Transportation for All Participants')\n", + "haveCar(demogRecent)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6e96368", + "metadata": {}, + "outputs": [], + "source": [ + "#maps user_email to user_id\n", + "import emission.core.get_database as edb \n", + "\n", + "#list(edb.get_uuid_db().find({}, {\"user_email\":1, \"_id\":0}))\n", + "list(edb.get_uuid_db().find({}, {\"user_email\":1, \"uuid\": 1, \"_id\":0}))\n", + "token = pd.DataFrame(list(edb.get_uuid_db().find({}, {\"user_email\":1, \"uuid\": 1, \"_id\":0})))\n", + "token = token.rename(columns={\"uuid\":\"user_id\"})\n", + "for i in range(len(token)):\n", + " token.user_id.iloc[i] = str(token.user_id.iloc[i]).replace(\"-\",\"\")\n", + "\n", + "token" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "helpful-coast", + "metadata": {}, + "outputs": [], + "source": [ + "df = demogRecent.merge(token,left_on=['user_id'],right_on=['user_id'],how='left')\n", + "df = df[df['user_email'].notnull()]\n", + "dfPar = df[df.user_email.str.startswith(('vail_','cc_','cc_ ','sc_','fc_','pc_','4c_'))]\n", + "dfPar = dfPar.reset_index()\n", + "\n", + "participantDemog = dfPar" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "faced-adelaide", + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_rows', 10)\n", + "#participantDemog\n", + "#66,76:86,117:128,165:141,185\n", + "up=participantDemog.iloc[[66,86,128,141]]\n", + "#up\n", + "print(participantDemog['Age'].min(),',',participantDemog['Age'].max(),',',participantDemog['Age'].mean(),',',participantDemog['Age'].median())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "943c2683", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "#Participant user demographics \n", + "df = demogRecent.merge(token,left_on=['user_id'],right_on=['user_id'],how='left')\n", + "df = df[df['user_email'].notnull()]\n", + "dfPar = df[df.user_email.str.startswith(('vail_','cc_','cc_ ','sc_','fc_','pc_','4c_'))]\n", + "dfPar = dfPar.reset_index()\n", + "\n", + "participantDemog = dfPar\n", + "participantDemog\n", + "returnIncomeDistribution(participantDemog,'Income Distribution of Participant Users')\n", + "AgeDistribution(participantDemog,'Age Distribution of Participant Users')\n", + "genderDistrb(participantDemog.Gender,'Gender Distribution of Participant Users')\n", + "MainTransport(participantDemog.TranspToJobLastWeek,'Main Mode of Transportation for Participant Users')\n", + "haveCar(participantDemog)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9bf70d3f", + "metadata": {}, + "outputs": [], + "source": [ + "#Stage user demographics\n", + "df = demogRecent.merge(token,left_on=['user_id'],right_on=['user_id'],how='left')\n", + "df = df[df['user_email'].notnull()]\n", + "dfStage = df[df.user_email.str.startswith('stage')]\n", + "dfStage = dfStage.reset_index()\n", + "stageDemog = dfStage\n", + "\n", + "returnIncomeDistribution(stageDemog,'Income Distribution of Stage Users')\n", + "AgeDistribution(stageDemog,'Age Distribution of Stage Users')\n", + "genderDistrb(stageDemog.Gender,'Gender Distribution of Stage Users')\n", + "MainTransport(stageDemog.TranspToJobLastWeek,'Main Mode of Tranportation for Stage Users')\n", + "haveCar(stageDemog)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5595f72", + "metadata": {}, + "outputs": [], + "source": [ + "#tells me if a uuid exists in a df (for testing purposes)\n", + "def exists(user_id, df):\n", + " for i in range(len(df)):\n", + " if df.iloc[i].user_id == user_id:\n", + " print(\"True\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5a4a488", + "metadata": {}, + "outputs": [], + "source": [ + "#NOT BEING USED\n", + "#Max's script\n", + "# takes list of user id's and returns the overlap in en_usr and es_usr\n", + "def usr_overlap(user_id):\n", + " df_mode_users = demogRecent[demogRecent.iloc[:,1].isin(user_id)]\n", + " return df_mode_users\n", + "\n", + "# create df of all users from a certain program\n", + "def df_from_program(program):\n", + " df = all_users.loc[all_users['user_email'].str.contains(program)]\n", + " prog_users = df['uuid'].astype(str)\n", + " prog_users = [i.replace('-','') for i in prog_users] # remove all dashes from strings\n", + " return usr_overlap(prog_users)\n", + "\n", + "all_users = pd.DataFrame(list(edb.get_uuid_db().find({}, {\"user_email\":1, \"uuid\": 1, \"_id\": 0})))\n", + "#stageDemog = df_from_program('stage_')\n", + "\n", + "# create a df from all users except from the stage program\n", + "all_str_but_stage = 'cc_|sc_|vail_|fc_|pc_|4c_'\n", + "all_usr_but_stage = all_users.loc[all_users['user_email'].str.contains(all_str_but_stage)]\n", + "\n", + "prog_users = all_usr_but_stage['uuid'].astype(str)\n", + "prog_users = [i.replace('-','') for i in prog_users] # remove all dashes from strings\n", + "#participantDemog = usr_overlap(prog_users)\n", + "\n", + "def age_plots(df, title):\n", + " ages = 2022- df['YearOfBirth']\n", + " ages = ages[0 < ages < 140] # todo: maybe revisit this way of catching erroneous ages\n", + " plt.hist(ages, bins = len(ages.unique()))\n", + " plt.title(title)\n", + " plt.ylabel('Frequency')\n", + " plt.xlabel('Age (years)')\n", + " plt.xlim()\n", + " plt.show()\n", + "\n", + "#age_plots(demogRecent, 'All Users Demographic Age Distribution')\n", + "\n", + "#age_plots(stageDemog, 'Stage Demographic Age Distribution')\n", + "\n", + "#age_plots(participantDemog, 'Participant Demographic Age Distribution')" + ] + }, + { + "cell_type": "markdown", + "id": "italian-corpus", + "metadata": { + "scrolled": true + }, + "source": [ + "E-bike usage data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "suited-norman", + "metadata": {}, + "outputs": [], + "source": [ + "#RETURN DF OF ONLY PARTICIPANT USERS IN EXPANDED_CT\n", + "for i in range(len(expanded_ct)):\n", + " expanded_ct.user_id.iloc[i] = str(expanded_ct.user_id.iloc[i]).replace(\"-\",\"\")\n", + "df = expanded_ct.merge(token,left_on=['user_id'],right_on=['user_id'],how='left')\n", + "#df = df[df['user_email'].notnull()]\n", + "dfPar = df[df.user_email.str.startswith(('vail_','cc_ ','sc_','fc_','pc_','4c_'))]\n", + "dfPar = dfPar.reset_index()\n", + "\n", + "participant = dfPar\n", + "#participant" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ambient-waterproof", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "#returns every users proportion of ebike trips by month with proportion max - min less than .1\n", + "\n", + "def get_ebike_percent_byMonth(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfUsers = trips.user_id.value_counts().nunique()\n", + " numOfUsers={}\n", + " user={}\n", + " for i in range(len(df)): #len(df) should be 17\n", + " numOfUsers['num'+str(i)]=df[i].user_id.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('user_id')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'December 2020':[],'January 2021':[],'February 2021':[],'March 2021':[],'April 2021':[],'May 2021':[],'June 2021':[],'July 2021':[],'August 2021':[],'September 2021':[],'October 2021':[],'November 2021':[],'December 2021':[],'January 2022':[],'February 2022':[],'March 2022':[],'April 2022':[],'May 2022':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfUsers['num'+str(i)]): #iterates through users per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " #return percent\n", + " #timeline = timeline.append()\n", + " if user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2020:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'December 2020']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'January 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'February 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'March 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'April 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'May 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==6 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'June 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==7 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'July 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==8 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'August 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==9 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'September 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==10 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'October 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==11 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'November 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'December 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'January 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'February 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'March 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'April 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'May 2022']=percent\n", + " #keeps rows whose max - min < .1\n", + " for i in timeline.index:\n", + " if timeline.loc[i].max() - timeline.loc[i].min() > .1:\n", + " timeline.drop([i],inplace=True)\n", + " timeline.loc[len(timeline.index)]=[pd.to_datetime('2020-12'),pd.to_datetime('2021-01'),pd.to_datetime('2021-02'),pd.to_datetime('2021-03'),pd.to_datetime('2021-04'),pd.to_datetime('2021-05'),pd.to_datetime('2021-06'),pd.to_datetime('2021-07'),pd.to_datetime('2021-08'),pd.to_datetime('2021-09'),pd.to_datetime('2021-10'),pd.to_datetime('2021-11'),pd.to_datetime('2021-12'),pd.to_datetime('2022-01'),pd.to_datetime('2022-02'),pd.to_datetime('2022-03'),pd.to_datetime('2022-04'),pd.to_datetime('2022-05')]\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " for i in range(len(timeline)-1):\n", + " sns.lineplot(ax=ax,x=timeline.loc[timeline.index[-1]],y=timeline.iloc[i],data=timeline,marker='o')\n", + " plt.show\n", + "\n", + "get_ebike_percent_byMonth(participant)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "unavailable-creature", + "metadata": {}, + "outputs": [], + "source": [ + "#returns every users proportion of ebike trips by month\n", + "#difference b/w max and min > .1, with max in 1st half, but not good visualization bc still hard to see\n", + "df = expanded_ct.merge(participantDemog,left_on=['user_id'],right_on=['user_id'],how='left')\n", + "import statistics\n", + "def get_ebike_percent_byMonth(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfUsers = trips.user_id.value_counts().nunique()\n", + " numOfUsers={}\n", + " user={}\n", + " for i in range(len(df)): #len(df) should be 17\n", + " numOfUsers['num'+str(i)]=df[i].user_id.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('user_id')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'December 2020':[],'January 2021':[],'February 2021':[],'March 2021':[],'April 2021':[],'May 2021':[],'June 2021':[],'July 2021':[],'August 2021':[],'September 2021':[],'October 2021':[],'November 2021':[],'December 2021':[],'January 2022':[],'February 2022':[],'March 2022':[],'April 2022':[],'May 2022':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfUsers['num'+str(i)]): #iterates through users per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " #return percent\n", + " #timeline = timeline.append()\n", + " if user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2020:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'December 2020']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'January 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'February 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'March 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'April 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'May 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==6 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'June 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==7 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'July 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==8 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'August 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==9 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'September 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==10 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'October 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==11 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'November 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'December 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'January 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'February 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'March 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'April 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'May 2022']=percent\n", + " #keeps rows whose max - min < .1\n", + " #return timeline.index\n", + " #return timeline.loc['87cb88529b614c0e91ec07a33145659b'][0]==None\n", + " #return statistics.mean(timeline.iloc[0]!=None)\n", + " for i in timeline.index:\n", + " #return timeline.loc[i][:int(len(timeline.loc[i])/2)]==0\n", + " max = timeline.loc[i].max()\n", + " #return statistics.mean(timeline.loc[i])\n", + " if timeline.loc[i].max() - timeline.loc[i].min() < .1:\n", + " timeline.drop([i],inplace=True)\n", + " else:\n", + " k = int(len(timeline.loc[i])/2)\n", + " #print(i,':',k)\n", + " #print(type(timeline.loc[i][k]))\n", + " first=[]\n", + " for j in range(k):\n", + " #return timeline.loc[i][0]==None\n", + " #return timeline.loc[i][j]==None\n", + " if timeline.loc[i][j] == None:\n", + " if int(timeline.loc[i][j]) != max:\n", + " timeline.drop([i],inplace=True)\n", + " timeline.loc[len(timeline.index)]=[pd.to_datetime('2020-12'),pd.to_datetime('2021-01'),pd.to_datetime('2021-02'),pd.to_datetime('2021-03'),pd.to_datetime('2021-04'),pd.to_datetime('2021-05'),pd.to_datetime('2021-06'),pd.to_datetime('2021-07'),pd.to_datetime('2021-08'),pd.to_datetime('2021-09'),pd.to_datetime('2021-10'),pd.to_datetime('2021-11'),pd.to_datetime('2021-12'),pd.to_datetime('2022-01'),pd.to_datetime('2022-02'),pd.to_datetime('2022-03'),pd.to_datetime('2022-04'),pd.to_datetime('2022-05')]\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " for i in range(len(timeline)-1):\n", + " sns.lineplot(ax=ax,x=timeline.iloc[-1],y=timeline.iloc[i],data=timeline,marker='o')\n", + " plt.show\n", + " \n", + " #for i in range(len(timeline1)-1):\n", + " # sns.lineplot(ax=ax,x=timeline1.loc[timeline1.index[-1]],y=timeline1.iloc[i],data=timeline,marker='o')\n", + " # plt.show\n", + "\n", + "get_ebike_percent_byMonth(participant)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ethical-liabilities", + "metadata": {}, + "outputs": [], + "source": [ + "#returns every users proportion of ebike trips by month\n", + "#ALL PROJECT PARTCIPANTS\n", + "def get_ebike_percent_byMonth(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfUsers = trips.user_id.value_counts().nunique()\n", + " numOfUsers={}\n", + " user={}\n", + " for i in range(len(df)): #len(df) should be 17\n", + " numOfUsers['num'+str(i)]=df[i].user_id.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('user_id')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'December 2020':[],'January 2021':[],'February 2021':[],'March 2021':[],'April 2021':[],'May 2021':[],'June 2021':[],'July 2021':[],'August 2021':[],'September 2021':[],'October 2021':[],'November 2021':[],'December 2021':[],'January 2022':[],'February 2022':[],'March 2022':[],'April 2022':[],'May 2022':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfUsers['num'+str(i)]): #iterates through users per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " #return percent\n", + " #timeline = timeline.append()\n", + " if user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2020:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'December 2020']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'January 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'February 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'March 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'April 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'May 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==6 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'June 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==7 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'July 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==8 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'August 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==9 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'September 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==10 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'October 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==11 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'November 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'December 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'January 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'February 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'March 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'April 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'May 2022']=percent\n", + " timeline.loc[len(timeline.index)]=[pd.to_datetime('2020-12'),pd.to_datetime('2021-01'),pd.to_datetime('2021-02'),pd.to_datetime('2021-03'),pd.to_datetime('2021-04'),pd.to_datetime('2021-05'),pd.to_datetime('2021-06'),pd.to_datetime('2021-07'),pd.to_datetime('2021-08'),pd.to_datetime('2021-09'),pd.to_datetime('2021-10'),pd.to_datetime('2021-11'),pd.to_datetime('2021-12'),pd.to_datetime('2022-01'),pd.to_datetime('2022-02'),pd.to_datetime('2022-03'),pd.to_datetime('2022-04'),pd.to_datetime('2022-05')]\n", + " #return user['month'+str(7)][2]\n", + " #pd.set_option('display.max_rows', None)\n", + " #return timeline\n", + " #return timeline.iloc[1,0]\n", + " #return countOfUsers\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " for i in range(len(timeline)-1):\n", + " sns.lineplot(ax=ax,x=timeline.loc[timeline.index[-1]],y=timeline.iloc[i],data=timeline,marker='o')\n", + " plt.show\n", + "\n", + "get_ebike_percent_byMonth(expanded_ct)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "laden-voltage", + "metadata": {}, + "outputs": [], + "source": [ + "#returns every users proportion of ebike trips by month\n", + "#PARTICIPANT USERS ONLY\n", + "\n", + "def get_ebike_percent_byMonth(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfUsers = trips.user_id.value_counts().nunique()\n", + " numOfUsers={}\n", + " user={}\n", + " for i in range(len(df)): #len(df) should be 17\n", + " numOfUsers['num'+str(i)]=df[i].user_id.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('user_id')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'December 2020':[],'January 2021':[],'February 2021':[],'March 2021':[],'April 2021':[],'May 2021':[],'June 2021':[],'July 2021':[],'August 2021':[],'September 2021':[],'October 2021':[],'November 2021':[],'December 2021':[],'January 2022':[],'February 2022':[],'March 2022':[],'April 2022':[],'May 2022':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfUsers['num'+str(i)]): #iterates through users per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " #return percent\n", + " #timeline = timeline.append()\n", + " if user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2020:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'December 2020']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'January 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'February 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'March 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'April 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'May 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==6 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'June 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==7 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'July 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==8 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'August 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==9 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'September 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==10 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'October 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==11 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'November 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'December 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'January 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'February 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'March 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'April 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].user_id.iloc[0]),'May 2022']=percent\n", + " timeline.loc[len(timeline.index)]=[pd.to_datetime('2020-12'),pd.to_datetime('2021-01'),pd.to_datetime('2021-02'),pd.to_datetime('2021-03'),pd.to_datetime('2021-04'),pd.to_datetime('2021-05'),pd.to_datetime('2021-06'),pd.to_datetime('2021-07'),pd.to_datetime('2021-08'),pd.to_datetime('2021-09'),pd.to_datetime('2021-10'),pd.to_datetime('2021-11'),pd.to_datetime('2021-12'),pd.to_datetime('2022-01'),pd.to_datetime('2022-02'),pd.to_datetime('2022-03'),pd.to_datetime('2022-04'),pd.to_datetime('2022-05')]\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " for i in range(len(timeline)-1):\n", + " sns.lineplot(ax=ax,x=timeline.loc[timeline.index[-1]],y=timeline.iloc[i],data=timeline,marker='o')\n", + " plt.show\n", + "\n", + "get_ebike_percent_byMonth(participant)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "incident-commitment", + "metadata": {}, + "outputs": [], + "source": [ + "#returns the AVERAGE of proportion of ebike trips by month for all users\n", + "\n", + "def get_ebike_percent_byMonth(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfUsers = trips.user_id.value_counts().nunique()\n", + " numOfUsers={}\n", + " user={}\n", + " for i in range(len(df)): #len(df) should be 17\n", + " numOfUsers['num'+str(i)]=df[i].user_id.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('user_id')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'user_id':[],'Date':[],'percent':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfUsers['num'+str(i)]): #iterates through users per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " timeline=timeline.append({'user_id': user['month'+str(i)][j].user_id.iloc[0],'Date': pd.to_datetime(str(user['month'+str(i)][j].iloc[0].start_local_dt_year)+'-'+str(user['month'+str(i)][j].iloc[0].start_local_dt_month)),'percent':percent}, ignore_index=True)\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " #for i in range(len(timeline)-1):\n", + " sns.lineplot(x=timeline.Date,y=timeline.percent,data=timeline,marker='o')\n", + " plt.show\n", + "\n", + "get_ebike_percent_byMonth(participant)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "humanitarian-caution", + "metadata": {}, + "outputs": [], + "source": [ + "#Returns boxplots of monthly usage by income\n", + "def get_ebike_percent_byIncome_boxplot(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfUsers = trips.user_id.value_counts().nunique()\n", + " numOfUsers={}\n", + " user={}\n", + " for i in range(len(df)): #len(df) should be 17\n", + " numOfUsers['num'+str(i)]=df[i].user_id.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('user_id')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'user_id':[],'Date':[],'percent':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfUsers['num'+str(i)]): #iterates through users per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " timeline=timeline.append({'User': user['month'+str(i)][j].start_local_dt_month.iloc[0],'Date': pd.to_datetime(str(user['month'+str(i)][j].iloc[0].start_local_dt_year)+'-'+str(user['month'+str(i)][j].iloc[0].start_local_dt_month)),'Proportion':percent}, ignore_index=True)\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " #plt.xticks(rotation=90)\n", + " ax.set_xticklabels(timeline['Date'].dt.strftime('%Y-%m-%d'))\n", + " #for i in range(len(timeline)-1):\n", + " sns.boxplot(x = timeline.Date.dt.strftime('%Y-%m'), y = timeline.Proportion, hue=timeline.User, data=timeline,ax=ax)\n", + " plt.legend([])\n", + " plt.show\n", + "\n", + "get_ebike_percent_byIncome_boxplot(participant)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "federal-applicant", + "metadata": {}, + "outputs": [], + "source": [ + "#makes the df that has only participant users with demographic info\n", + "for i in range(len(expanded_ct)):\n", + " expanded_ct.user_id.iloc[i] = str(expanded_ct.user_id.iloc[i]).replace(\"-\",\"\")\n", + "expanded_ct\n", + "\n", + "df = expanded_ct.merge(participantDemog,left_on=['user_id'],right_on=['user_id'],how='left')\n", + "#19-24,25-34,35-44,45-54,55+\n", + "df['Age_group']=''\n", + "for i in range(len(df)):\n", + " if df.Age.iloc[i] >= 19 and df.Age.iloc[i] <= 24:\n", + " df.at[i,'Age_group'] = '19-24'\n", + " if df.Age.iloc[i] >= 25 and df.Age.iloc[i] <= 34:\n", + " df.at[i,'Age_group'] = '25-34'\n", + " if df.Age.iloc[i] >= 35 and df.Age.iloc[i] <= 44:\n", + " df.at[i,'Age_group'] = '35-44'\n", + " if df.Age.iloc[i] >= 45 and df.Age.iloc[i] <= 54:\n", + " df.at[i,'Age_group'] = '45-54'\n", + " if df.Age.iloc[i] >= 55:\n", + " df.at[i,'Age_group'] = '55+'\n", + "demo = df[df['Timestamp'].notnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "coated-cooler", + "metadata": {}, + "outputs": [], + "source": [ + "#returns every users proportion of ebike trips by month by age group\n", + "def Monthly_ebike_propBy_AgeGroup(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfAgeGroups = trips.Age_group.value_counts().nunique()\n", + " numOfGroups={}\n", + " user={}\n", + " for i in range(len(df)):\n", + " numOfGroups['num'+str(i)]=df[i].Age_group.value_counts().nunique() #number of age groups in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('Age_group')] #f['u'+str(i)] is the month, the indexs are the age group \n", + " timeline = pd.DataFrame({'December 2020':[],'January 2021':[],'February 2021':[],'March 2021':[],'April 2021':[],'May 2021':[],'June 2021':[],'July 2021':[],'August 2021':[],'September 2021':[],'October 2021':[],'November 2021':[],'December 2021':[],'January 2022':[],'February 2022':[],'March 2022':[],'April 2022':[],'May 2022':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfGroups['num'+str(i)]): #iterates through age groups per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " #return percent\n", + " #timeline = timeline.append()\n", + " if user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2020:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'December 2020']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'January 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'February 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'March 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'April 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'May 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==6 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'June 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==7 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'July 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==8 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'August 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==9 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'September 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==10 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'October 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==11 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'November 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'December 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'January 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'February 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'March 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'April 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].Age_group.iloc[0]),'May 2022']=percent\n", + " timeline.loc[len(timeline.index)]=[pd.to_datetime('2020-12'),pd.to_datetime('2021-01'),pd.to_datetime('2021-02'),pd.to_datetime('2021-03'),pd.to_datetime('2021-04'),pd.to_datetime('2021-05'),pd.to_datetime('2021-06'),pd.to_datetime('2021-07'),pd.to_datetime('2021-08'),pd.to_datetime('2021-09'),pd.to_datetime('2021-10'),pd.to_datetime('2021-11'),pd.to_datetime('2021-12'),pd.to_datetime('2022-01'),pd.to_datetime('2022-02'),pd.to_datetime('2022-03'),pd.to_datetime('2022-04'),pd.to_datetime('2022-05')]\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time by Age Group')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " \n", + " for i in range(len(timeline)-1):\n", + " sns.lineplot(ax=ax,x=timeline.loc[timeline.index[-1]],y=timeline.iloc[i],data=timeline,marker='o')\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left', labels = timeline.index)\n", + " plt.show\n", + "\n", + "Monthly_ebike_propBy_AgeGroup(demo)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "basic-bottom", + "metadata": {}, + "outputs": [], + "source": [ + "#returns every users proportion of ebike trips by month with confidence interval\n", + "def ebike_month_age_conf(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfUsers = trips.user_id.value_counts().nunique()\n", + " numOfUsers={}\n", + " user={}\n", + " for i in range(len(df)): #len(df) should be 17\n", + " numOfUsers['num'+str(i)]=df[i].user_id.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('user_id')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'user_id':[],'Date':[],'percent':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfUsers['num'+str(i)]): #iterates through users per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " timeline=timeline.append({'Age_Group': user['month'+str(i)][j].Age_group.iloc[0],'Date': pd.to_datetime(str(user['month'+str(i)][j].iloc[0].start_local_dt_year)+'-'+str(user['month'+str(i)][j].iloc[0].start_local_dt_month)),'percent':percent}, ignore_index=True)\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time by Age Group')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " #for i in range(len(timeline)-1):\n", + " sns.lineplot(x=timeline.Date,y=timeline.percent,hue=timeline.Age_Group,data=timeline,marker='o')\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left')\n", + " plt.show\n", + "\n", + "ebike_month_age_conf(demo)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nutritional-creativity", + "metadata": {}, + "outputs": [], + "source": [ + "def ebike_month_age_boxplot(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfUsers = trips.user_id.value_counts().nunique()\n", + " numOfUsers={}\n", + " user={}\n", + " for i in range(len(df)):\n", + " numOfUsers['num'+str(i)]=df[i].user_id.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('user_id')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'user_id':[],'Date':[],'percent':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfUsers['num'+str(i)]): #iterates through users per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " timeline=timeline.append({'Age_Group': user['month'+str(i)][j].Age_group.iloc[0],'Date': pd.to_datetime(str(user['month'+str(i)][j].iloc[0].start_local_dt_year)+'-'+str(user['month'+str(i)][j].iloc[0].start_local_dt_month)),'Proportion':percent}, ignore_index=True)\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time by Age Group')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " ax.set_xticklabels(timeline['Date'].dt.strftime('%Y-%m-%d'))\n", + " sns.boxplot(x = timeline.Date.dt.strftime('%Y-%m'), y = timeline.Proportion, hue=timeline.Age_Group, data=timeline,ax=ax)\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left')\n", + " plt.show\n", + "\n", + "ebike_month_age_boxplot(demo)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "grand-industry", + "metadata": {}, + "outputs": [], + "source": [ + "def ebike_month_gender(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfGender = trips.Gender.value_counts().nunique()\n", + " numOfGroups={}\n", + " user={}\n", + " for i in range(len(df)):\n", + " numOfGroups['num'+str(i)]=df[i].Gender.value_counts().nunique() #number of age groups in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('Gender')] #f['u'+str(i)] is the month, the indexs are the age group \n", + " timeline = pd.DataFrame({'December 2020':[],'January 2021':[],'February 2021':[],'March 2021':[],'April 2021':[],'May 2021':[],'June 2021':[],'July 2021':[],'August 2021':[],'September 2021':[],'October 2021':[],'November 2021':[],'December 2021':[],'January 2022':[],'February 2022':[],'March 2022':[],'April 2022':[],'May 2022':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfGroups['num'+str(i)]): #iterates through age groups per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " #return percent\n", + " #timeline = timeline.append()\n", + " if user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2020:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'December 2020']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'January 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'February 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'March 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'April 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'May 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==6 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'June 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==7 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'July 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==8 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'August 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==9 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'September 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==10 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'October 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==11 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'November 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'December 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'January 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'February 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'March 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'April 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].Gender.iloc[0]),'May 2022']=percent\n", + " timeline.loc[len(timeline.index)]=[pd.to_datetime('2020-12'),pd.to_datetime('2021-01'),pd.to_datetime('2021-02'),pd.to_datetime('2021-03'),pd.to_datetime('2021-04'),pd.to_datetime('2021-05'),pd.to_datetime('2021-06'),pd.to_datetime('2021-07'),pd.to_datetime('2021-08'),pd.to_datetime('2021-09'),pd.to_datetime('2021-10'),pd.to_datetime('2021-11'),pd.to_datetime('2021-12'),pd.to_datetime('2022-01'),pd.to_datetime('2022-02'),pd.to_datetime('2022-03'),pd.to_datetime('2022-04'),pd.to_datetime('2022-05')]\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time by Gender')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " \n", + " for i in range(len(timeline)-1):\n", + " sns.lineplot(ax=ax,x=timeline.loc[timeline.index[-1]],y=timeline.iloc[i],data=timeline,marker='o')\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left', labels = timeline.index)\n", + " plt.show\n", + "\n", + "ebike_month_gender(demo)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "rotary-assurance", + "metadata": {}, + "outputs": [], + "source": [ + "def ebike_month_gender_conf(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfUsers = trips.user_id.value_counts().nunique()\n", + " numOfUsers={}\n", + " user={}\n", + " for i in range(len(df)): #len(df) should be 17\n", + " numOfUsers['num'+str(i)]=df[i].user_id.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('user_id')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'user_id':[],'Date':[],'percent':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfUsers['num'+str(i)]): #iterates through users per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " timeline=timeline.append({'gender': user['month'+str(i)][j].Gender.iloc[0],'Date': pd.to_datetime(str(user['month'+str(i)][j].iloc[0].start_local_dt_year)+'-'+str(user['month'+str(i)][j].iloc[0].start_local_dt_month)),'percent':percent}, ignore_index=True)\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time by Gender')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " #for i in range(len(timeline)-1):\n", + " sns.lineplot(x=timeline.Date,y=timeline.percent,hue=timeline.gender,data=timeline,marker='o')\n", + " plt.show\n", + "\n", + "ebike_month_gender_conf(demo)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "choice-overall", + "metadata": {}, + "outputs": [], + "source": [ + "demo1 = demo[demo['Timestamp'].notnull()]\n", + "\n", + "def ebike_month_gender_boxplot(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfUsers = trips.user_id.value_counts().nunique()\n", + " numOfUsers={}\n", + " user={}\n", + " for i in range(len(df)):\n", + " numOfUsers['num'+str(i)]=df[i].user_id.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('user_id')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'user_id':[],'Date':[],'percent':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfUsers['num'+str(i)]): #iterates through users per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " timeline=timeline.append({'Gender': user['month'+str(i)][j].Gender.iloc[0],'Date': pd.to_datetime(str(user['month'+str(i)][j].iloc[0].start_local_dt_year)+'-'+str(user['month'+str(i)][j].iloc[0].start_local_dt_month)),'Proportion':percent}, ignore_index=True)\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time by Gender')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " ax.set_xticklabels(timeline['Date'].dt.strftime('%Y-%m-%d'))\n", + " sns.boxplot(x = timeline.Date.dt.strftime('%Y-%m'), y = timeline.Proportion, hue=timeline.Gender, data=timeline,ax=ax)\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left')\n", + " plt.show\n", + "\n", + "ebike_month_gender_boxplot(demo1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "commercial-pittsburgh", + "metadata": {}, + "outputs": [], + "source": [ + "#returns every users proportion of ebike trips by month by income\n", + "\n", + "def Monthly_ebike_propBy_Income(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfIncome = trips.Income.value_counts().nunique()\n", + " numOfGroups={}\n", + " user={}\n", + " for i in range(len(df)):\n", + " numOfGroups['num'+str(i)]=df[i].Income.value_counts().nunique()\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('Income')] #f['u'+str(i)] is the month, the indexs are the age group \n", + " timeline = pd.DataFrame({'December 2020':[],'January 2021':[],'February 2021':[],'March 2021':[],'April 2021':[],'May 2021':[],'June 2021':[],'July 2021':[],'August 2021':[],'September 2021':[],'October 2021':[],'November 2021':[],'December 2021':[],'January 2022':[],'February 2022':[],'March 2022':[],'April 2022':[],'May 2022':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfGroups['num'+str(i)]):\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " #return percent\n", + " #timeline = timeline.append()\n", + " if user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2020:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'December 2020']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'January 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'February 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'March 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'April 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'May 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==6 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'June 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==7 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'July 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==8 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'August 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==9 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'September 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==10 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'October 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==11 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'November 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'December 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'January 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'February 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'March 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'April 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].Income.iloc[0]),'May 2022']=percent\n", + " timeline.loc[len(timeline.index)]=[pd.to_datetime('2020-12'),pd.to_datetime('2021-01'),pd.to_datetime('2021-02'),pd.to_datetime('2021-03'),pd.to_datetime('2021-04'),pd.to_datetime('2021-05'),pd.to_datetime('2021-06'),pd.to_datetime('2021-07'),pd.to_datetime('2021-08'),pd.to_datetime('2021-09'),pd.to_datetime('2021-10'),pd.to_datetime('2021-11'),pd.to_datetime('2021-12'),pd.to_datetime('2022-01'),pd.to_datetime('2022-02'),pd.to_datetime('2022-03'),pd.to_datetime('2022-04'),pd.to_datetime('2022-05')]\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time by Income')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " \n", + " for i in range(len(timeline)-1):\n", + " sns.lineplot(ax=ax,x=timeline.loc[timeline.index[-1]],y=timeline.iloc[i],data=timeline,marker='o')\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left', labels = timeline.index)\n", + " plt.show\n", + "\n", + "Monthly_ebike_propBy_Income(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "swiss-poland", + "metadata": {}, + "outputs": [], + "source": [ + "def get_ebike_percent_byIncome_conf(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfUsers = trips.user_id.value_counts().nunique()\n", + " numOfUsers={}\n", + " user={}\n", + " for i in range(len(df)): #len(df) should be 17\n", + " numOfUsers['num'+str(i)]=df[i].user_id.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('user_id')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'user_id':[],'Date':[],'percent':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfUsers['num'+str(i)]): #iterates through users per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " timeline=timeline.append({'Income': user['month'+str(i)][j].Income.iloc[0],'Date': pd.to_datetime(str(user['month'+str(i)][j].iloc[0].start_local_dt_year)+'-'+str(user['month'+str(i)][j].iloc[0].start_local_dt_month)),'Proportion':percent}, ignore_index=True)\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time by Income')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + "\n", + " sns.lineplot(x = timeline.Date.dt.strftime('%Y-%m'), y = timeline.Proportion, hue=timeline.Income, data=timeline,ax=ax,marker='o')\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left')\n", + " #plt.legend(labels=[\"Less than $24,000\",\"$25,000-$49,000\",\"$50,000-$99,999\",\"$150,000-$199,999\",\"Prefer not to say\"])\n", + " plt.show\n", + "\n", + "get_ebike_percent_byIncome_conf(demo)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "indie-candy", + "metadata": {}, + "outputs": [], + "source": [ + "#Returns boxplots of monthly usage by income\n", + "def get_ebike_percent_byIncome_boxplot(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfUsers = trips.user_id.value_counts().nunique()\n", + " numOfUsers={}\n", + " user={}\n", + " for i in range(len(df)): #len(df) should be 17\n", + " numOfUsers['num'+str(i)]=df[i].user_id.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('user_id')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'user_id':[],'Date':[],'percent':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfUsers['num'+str(i)]): #iterates through users per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " timeline=timeline.append({'Income': user['month'+str(i)][j].Income.iloc[0],'Date': pd.to_datetime(str(user['month'+str(i)][j].iloc[0].start_local_dt_year)+'-'+str(user['month'+str(i)][j].iloc[0].start_local_dt_month)),'Proportion':percent}, ignore_index=True)\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time by Income')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " #plt.xticks(rotation=90)\n", + " ax.set_xticklabels(timeline['Date'].dt.strftime('%Y-%m-%d'))\n", + " #for i in range(len(timeline)-1):\n", + " sns.boxplot(x = timeline.Date.dt.strftime('%Y-%m'), y = timeline.Proportion, hue=timeline.Income, data=timeline,ax=ax)\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left')\n", + " plt.show\n", + "\n", + "get_ebike_percent_byIncome_boxplot(demo)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "earlier-crest", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "#Monthy ebike prop by income excluding higher income\n", + "demo1 = demo\n", + "demo1=demo1.reset_index(drop=True)\n", + "demo1\n", + "#demo1.drop(demo1[demo1['Income'] == 'Prefer not to say'].index, inplace = True)\n", + "for i in range(0,len(demo1)-1):\n", + " if demo1['Income'].loc[i].__contains__('100,000') or demo1['Income'].loc[i].__contains__('150,000') or demo1['Income'].loc[i].__contains__('200,000'):\n", + " demo1.drop([i],inplace=True)\n", + "Monthly_ebike_propBy_Income(demo1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "stupid-response", + "metadata": {}, + "outputs": [], + "source": [ + "#returns trip purpose proportion of ebike trips by month excluding all NaN and 0 rows\n", + "import math\n", + "def ebike_month_purpose(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfPurposes = trips.purpose_confirm.value_counts().nunique()\n", + " numOfPurposes={}\n", + " user={}\n", + " for i in range(len(df)): #len(df) should be 17\n", + " numOfPurposes['num'+str(i)]=df[i].purpose_confirm.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('purpose_confirm')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'December 2020':[],'January 2021':[],'February 2021':[],'March 2021':[],'April 2021':[],'May 2021':[],'June 2021':[],'July 2021':[],'August 2021':[],'September 2021':[],'October 2021':[],'November 2021':[],'December 2021':[],'January 2022':[],'February 2022':[],'March 2022':[],'April 2022':[],'May 2022':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfPurposes['num'+str(i)]): #iterates through users per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " #else:\n", + " # percent = 0\n", + " #return percent\n", + " #timeline = timeline.append()\n", + " if user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2020:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'December 2020']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'January 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'February 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'March 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'April 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'May 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==6 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'June 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==7 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'July 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==8 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'August 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==9 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'September 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==10 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'October 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==11 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'November 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'December 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'January 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'February 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'March 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'April 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].purpose_confirm.iloc[0]),'May 2022']=percent\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time Based on Trip Purpose')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " #timeline.reset_index(inplace=True)\n", + " # timeline1=timeline.drop([9])\n", + " for i in timeline.index:\n", + " count=0\n", + " for j in range(17):\n", + " if math.isnan(timeline.loc[i][j]) or timeline.loc[i][j]==0:\n", + " count+=1\n", + " if count == 17:\n", + " timeline.drop([i],inplace=True)\n", + " timeline.loc[len(timeline.index)]=[pd.to_datetime('2020-12'),pd.to_datetime('2021-01'),pd.to_datetime('2021-02'),pd.to_datetime('2021-03'),pd.to_datetime('2021-04'),pd.to_datetime('2021-05'),pd.to_datetime('2021-06'),pd.to_datetime('2021-07'),pd.to_datetime('2021-08'),pd.to_datetime('2021-09'),pd.to_datetime('2021-10'),pd.to_datetime('2021-11'),pd.to_datetime('2021-12'),pd.to_datetime('2022-01'),pd.to_datetime('2022-02'),pd.to_datetime('2022-03'),pd.to_datetime('2022-04'),pd.to_datetime('2022-05')]\n", + " #return timeline\n", + " #timeline=timeline.iloc[[0,1,3,5,7,8,15,16,21,65]]\n", + " #return timeline\n", + " for i in range(len(timeline)-1):\n", + " sns.lineplot(ax=ax,x=timeline.loc[timeline.index[-1]],y=timeline.iloc[i],data=timeline,marker='o')\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left', labels = timeline.index)\n", + " plt.show\n", + "\n", + "ebike_month_purpose(participant)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "pressing-papua", + "metadata": {}, + "outputs": [], + "source": [ + "#ebike proportion usage by student status\n", + "df=demo\n", + "#df = df[df['Timestamp'].notnull()]\n", + "df = df[df['Student'].notnull()]\n", + "df=df.reset_index(drop=True)\n", + "\n", + "df['student']=''\n", + "for i in range(len(df)):\n", + " if df.Student.iloc[i].__contains__('Yes'):\n", + " df.at[i,'student'] = 'Student'\n", + " elif df.Student.iloc[i].__contains__('Not a student'):\n", + " df.at[i,'student'] = 'Non-Student'\n", + " else:\n", + " df.at[i,'student'] = 'Non-Student'\n", + "df = df[df['Timestamp'].notnull()]\n", + "df = df[df['student']!='']\n", + "df = df[df['student'].notnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "popular-accident", + "metadata": {}, + "outputs": [], + "source": [ + "def ebike_month_student(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfStudent = trips.student.value_counts().nunique()\n", + " numOfGroups={}\n", + " user={}\n", + " for i in range(len(df)):\n", + " numOfGroups['num'+str(i)]=df[i].student.value_counts().nunique()\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('student')] #f['u'+str(i)] is the month, the indexs are the age group \n", + " timeline = pd.DataFrame({'December 2020':[],'January 2021':[],'February 2021':[],'March 2021':[],'April 2021':[],'May 2021':[],'June 2021':[],'July 2021':[],'August 2021':[],'September 2021':[],'October 2021':[],'November 2021':[],'December 2021':[],'January 2022':[],'February 2022':[],'March 2022':[],'April 2022':[],'May 2022':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfGroups['num'+str(i)]):\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " #return percent\n", + " #timeline = timeline.append()\n", + " if user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2020:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'December 2020']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'January 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'February 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'March 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'April 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'May 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==6 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'June 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==7 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'July 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==8 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'August 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==9 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'September 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==10 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'October 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==11 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'November 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'December 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'January 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'February 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'March 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'April 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].student.iloc[0]),'May 2022']=percent\n", + " timeline.loc[len(timeline.index)]=[pd.to_datetime('2020-12'),pd.to_datetime('2021-01'),pd.to_datetime('2021-02'),pd.to_datetime('2021-03'),pd.to_datetime('2021-04'),pd.to_datetime('2021-05'),pd.to_datetime('2021-06'),pd.to_datetime('2021-07'),pd.to_datetime('2021-08'),pd.to_datetime('2021-09'),pd.to_datetime('2021-10'),pd.to_datetime('2021-11'),pd.to_datetime('2021-12'),pd.to_datetime('2022-01'),pd.to_datetime('2022-02'),pd.to_datetime('2022-03'),pd.to_datetime('2022-04'),pd.to_datetime('2022-05')]\n", + " #return user['month'+str(6)][0]\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time by Student Status')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " for i in range(len(timeline)-1):\n", + " sns.lineplot(ax=ax,x=timeline.loc[timeline.index[-1]],y=timeline.iloc[i],data=timeline,marker='o')\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left', labels = timeline.index)\n", + " plt.show\n", + " \n", + "ebike_month_student(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "running-tampa", + "metadata": {}, + "outputs": [], + "source": [ + "#pd.set_option('display.max_rows', None)\n", + "def ebike_month_student_conf(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfUsers = trips.user_id.value_counts().nunique()\n", + " numOfUsers={}\n", + " user={}\n", + " for i in range(len(df)): #len(df) should be 17\n", + " numOfUsers['num'+str(i)]=df[i].user_id.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('user_id')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'user_id':[],'Date':[],'percent':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfUsers['num'+str(i)]): #iterates through users per month\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " timeline=timeline.append({'Student': user['month'+str(i)][j].student.iloc[0],'Date': pd.to_datetime(str(int(user['month'+str(i)][j].iloc[0].start_local_dt_year))+'-'+str(int(user['month'+str(i)][j].iloc[0].start_local_dt_month))),'Proportion':percent}, ignore_index=True)\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time by Student Status')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " sns.lineplot(x = timeline.Date.dt.strftime('%Y-%m'), y = timeline.Proportion, hue=timeline.Student, data=timeline,ax=ax,marker='o')\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left')\n", + " plt.show\n", + "\n", + "ebike_month_student_conf(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "skilled-conspiracy", + "metadata": {}, + "outputs": [], + "source": [ + "#Returns boxplots of monthly usage by student status\n", + "def get_ebike_percent_student_boxplot(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfUsers = trips.user_id.value_counts().nunique()\n", + " numOfUsers={}\n", + " user={}\n", + " for i in range(len(df)): #len(df) should be 17\n", + " numOfUsers['num'+str(i)]=df[i].user_id.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('user_id')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'user_id':[],'Date':[],'percent':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfUsers['num'+str(i)]): #iterates through users per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " timeline=timeline.append({'Student': user['month'+str(i)][j].student.iloc[0],'Date': pd.to_datetime(str(int(user['month'+str(i)][j].iloc[0].start_local_dt_year))+'-'+str(int(user['month'+str(i)][j].iloc[0].start_local_dt_month))),'Proportion':percent}, ignore_index=True)\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time by Student Status')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " sns.boxplot(x = timeline.Date.dt.strftime('%Y-%m'), y = timeline.Proportion, hue=timeline.Student, data=timeline,ax=ax)\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left')\n", + " plt.show\n", + "\n", + "get_ebike_percent_student_boxplot(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "italian-square", + "metadata": {}, + "outputs": [], + "source": [ + "#Car df\n", + "df = expanded_ct.merge(participantDemog,left_on=['user_id'],right_on=['user_id'],how='left')\n", + "df = df[df['HasCar']!='']\n", + "df = df[df['HasCar'].notnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "inclusive-brazil", + "metadata": {}, + "outputs": [], + "source": [ + "#ebike usage over time by car ownership status\n", + "def ebike_month_car(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfCar = trips.HasCar.value_counts().nunique()\n", + " numOfGroups={}\n", + " user={}\n", + " for i in range(len(df)):\n", + " numOfGroups['num'+str(i)]=df[i].HasCar.value_counts().nunique()\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('HasCar')] #f['u'+str(i)] is the month, the indexs are the age group \n", + " timeline = pd.DataFrame({'December 2020':[],'January 2021':[],'February 2021':[],'March 2021':[],'April 2021':[],'May 2021':[],'June 2021':[],'July 2021':[],'August 2021':[],'September 2021':[],'October 2021':[],'November 2021':[],'December 2021':[],'January 2022':[],'February 2022':[],'March 2022':[],'April 2022':[],'May 2022':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfGroups['num'+str(i)]):\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " #return percent\n", + " #timeline = timeline.append()\n", + " if user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2020:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'December 2020']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'January 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'February 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'March 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'April 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'May 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==6 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'June 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==7 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'July 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==8 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'August 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==9 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'September 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==10 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'October 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==11 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'November 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==12 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2021:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'December 2021']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==1 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'January 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==2 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'February 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==3 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'March 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==4 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'April 2022']=percent\n", + " elif user['month'+str(i)][j].start_local_dt_month.iloc[0]==5 and user['month'+str(i)][j].start_local_dt_year.iloc[0]==2022:\n", + " timeline.at[str(user['month'+str(i)][j].HasCar.iloc[0]),'May 2022']=percent\n", + " #return user['month'+str(i)][0]\n", + " timeline.loc[len(timeline.index)]=[pd.to_datetime('2020-12'),pd.to_datetime('2021-01'),pd.to_datetime('2021-02'),pd.to_datetime('2021-03'),pd.to_datetime('2021-04'),pd.to_datetime('2021-05'),pd.to_datetime('2021-06'),pd.to_datetime('2021-07'),pd.to_datetime('2021-08'),pd.to_datetime('2021-09'),pd.to_datetime('2021-10'),pd.to_datetime('2021-11'),pd.to_datetime('2021-12'),pd.to_datetime('2022-01'),pd.to_datetime('2022-02'),pd.to_datetime('2022-03'),pd.to_datetime('2022-04'),pd.to_datetime('2022-05')]\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time by Car Ownership Status')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " for i in range(len(timeline)-1):\n", + " sns.lineplot(ax=ax,x=timeline.loc[timeline.index[-1]],y=timeline.iloc[i],data=timeline,marker='o')\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left', labels = timeline.index)\n", + " plt.show\n", + " \n", + "ebike_month_car(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "attached-graphic", + "metadata": {}, + "outputs": [], + "source": [ + "#pd.set_option('display.max_rows', None)\n", + "#Something wrong here but idk how to fix. In 2022-02, there are 4 unique users who don't have a car, 3 of which's trips are only ebikes, and 1 user who didn't use ebike at all. This 1 user is not counted in the graph\n", + "def ebike_month_car_conf(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfUsers = trips.user_id.value_counts().nunique()\n", + " numOfUsers={}\n", + " user={}\n", + " for i in range(len(df)): #len(df) should be 17\n", + " numOfUsers['num'+str(i)]=df[i].user_id.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('user_id')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'user_id':[],'Date':[],'percent':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfUsers['num'+str(i)]): #iterates through users per month\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " timeline=timeline.append({'Car': user['month'+str(i)][j].HasCar.iloc[0],'Date': pd.to_datetime(str(int(user['month'+str(i)][j].iloc[0].start_local_dt_year))+'-'+str(int(user['month'+str(i)][j].iloc[0].start_local_dt_month))),'Proportion':percent}, ignore_index=True)\n", + " #for i in range(56):\n", + " # if user['month'+str(8)][i].iloc[0].HasCar.__contains__('Has a car') == False:\n", + " # print(i)\n", + " #return timeline[hi & hi2]\n", + " #filter=user['month'+str(8)][\"HasCar\"].isin([\"Does not have a car\"])\n", + " #return sum(user['month'+str(8)][48].mode_confirm.value_counts(dropna=True))\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time by Car Ownership Status')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " sns.lineplot(x = timeline.Date.dt.strftime('%Y-%m'), y = timeline.Proportion, hue=timeline.Car, data=timeline,ax=ax,marker='o')\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left')\n", + " plt.show\n", + "\n", + "ebike_month_car_conf(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "entitled-renaissance", + "metadata": {}, + "outputs": [], + "source": [ + "def get_ebike_percent_byCar_boxplot(trips):\n", + " df = [rows for _, rows in trips.groupby(['start_local_dt_year','start_local_dt_month'])] #spilts dataframe into month-year\n", + " count = trips.start_local_dt_month.value_counts().nunique() #count is the number of unique months in the dataset\n", + " countOfUsers = trips.user_id.value_counts().nunique()\n", + " numOfUsers={}\n", + " user={}\n", + " for i in range(len(df)): #len(df) should be 17\n", + " numOfUsers['num'+str(i)]=df[i].user_id.value_counts().nunique() #number of unique users in each month\n", + " for i in range(0, len(df)):\n", + " user['month'+str(i)] = [rows for _, rows in df[i].groupby('user_id')] #f['u'+str(i)] is the month, the indexs are the user \n", + " timeline = pd.DataFrame({'user_id':[],'Date':[],'percent':[]})\n", + " for i in range(len(df)): #iterates through month | these for loops calculate percent of ebike usage per month\n", + " for j in range(numOfUsers['num'+str(i)]): #iterates through users per month\n", + " #print(i,j)\n", + " if user['month'+str(i)][j].mode_confirm.value_counts().__contains__('pilot_ebike'):\n", + " ebike =user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " else:\n", + " ebike = 0\n", + " #ebike = user['month'+str(i)][j].mode_confirm.value_counts()['pilot_ebike']\n", + " if sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True)) != 0:\n", + " sumx = sum(user['month'+str(i)][j].mode_confirm.value_counts(dropna=True))\n", + " percent = ebike/sumx\n", + " else:\n", + " percent = 0\n", + " timeline=timeline.append({'Car': user['month'+str(i)][j].HasCar.iloc[0],'Date': pd.to_datetime(str(int(user['month'+str(i)][j].iloc[0].start_local_dt_year))+'-'+str(int(user['month'+str(i)][j].iloc[0].start_local_dt_month))),'Proportion':percent}, ignore_index=True)\n", + " # \n", + " #return type(timeline['Income'].iloc[0])\n", + " #return user['month'+str(7)][2]\n", + " #pd.set_option('display.max_rows', None)\n", + " #return timeline.loc[timeline.index[-1]]\n", + " #return timeline[0:125].to_string(index=False,header=False)\n", + " #return timeline\n", + " fig, ax = plt.subplots(figsize=(16,4))\n", + " plt.title('Proportion of E-Bike Usage Over Time by Car Ownership Status')\n", + " plt.xlabel(\"Date\")\n", + " plt.ylabel(\"Proportion of e-bike trips\")\n", + " #plt.xticks(rotation=90)\n", + " ax.set_xticklabels(timeline['Date'].dt.strftime('%Y-%m-%d'))\n", + " #for i in range(len(timeline)-1):\n", + " sns.boxplot(x = timeline.Date.dt.strftime('%Y-%m'), y = timeline.Proportion, hue=timeline.Car, data=timeline,ax=ax)\n", + " plt.legend(bbox_to_anchor=(1, 1), loc='upper left')\n", + " #plt.legend(labels=[\"Less than $24,000\",\"$25,000-$49,000\",\"$50,000-$99,999\",\"$150,000-$199,999\",\"Prefer not to say\"])\n", + " plt.show\n", + "\n", + "get_ebike_percent_byCar_boxplot(df)" + ] + }, + { + "cell_type": "markdown", + "id": "social-token", + "metadata": {}, + "source": [ + "# Travel Patterns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "controversial-planner", + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(len(participant_ct_df)):\n", + " participant_ct_df.user_id.iloc[i] = str(participant_ct_df.user_id.iloc[i]).replace(\"-\",\"\")\n", + "df = participant_ct_df.merge(token,left_on=['user_id'],right_on=['user_id'],how='left')\n", + "df = df[df['user_email'].notnull()]\n", + "stage = df[df.user_email.str.startswith(('stage'))]\n", + "#dfPar = dfPar.reset_index()\n", + "\n", + "cond = df['user_email'].isin(stage['user_email'])\n", + "df.drop(df[cond].index, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "external-sally", + "metadata": {}, + "outputs": [], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sorted-confusion", + "metadata": {}, + "outputs": [], + "source": [ + "labeled_ct = scaffolding.filter_labeled_trips(df)\n", + "expanded_ct_Par = scaffolding.expand_userinputs(labeled_ct)\n", + "expanded_ct_Par = scaffolding.data_quality_check(expanded_ct_Par)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "occupied-superintendent", + "metadata": {}, + "outputs": [], + "source": [ + "expanded_ct_Par = scaffolding.data_quality_check(expanded_ct_Par)\n", + "expanded_ct_Par.shape\n", + "\n", + "expanded_ct_Par['Trip_purpose']= expanded_ct_Par['purpose_confirm'].map(dic_pur)\n", + "expanded_ct_Par['Mode_confirm']= expanded_ct_Par['mode_confirm'].map(dic_re)\n", + "expanded_ct_Par['Replaced_mode']= expanded_ct_Par['replaced_mode'].map(dic_re)\n", + "\n", + "scaffolding.unit_conversions(expanded_ct_Par)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "descending-wings", + "metadata": {}, + "outputs": [], + "source": [ + "file_suffix = scaffolding.get_file_suffix(year, month, program)\n", + "quality_text = scaffolding.get_quality_text(df,expanded_ct_Par)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adult-device", + "metadata": {}, + "outputs": [], + "source": [ + "#Mode share for PARTICIPANT USERS ONLY\n", + "labels_mc = expanded_ct_Par['Mode_confirm'].value_counts(dropna=True).keys().tolist()\n", + "values_mc = expanded_ct_Par['Mode_confirm'].value_counts(dropna=True).tolist()\n", + "plot_title= \"Number of trips for each mode (selected by users)\\n%s\" % quality_text\n", + "file_name= 'ntrips_mode_confirm%s.png' % file_suffix\n", + "pie_chart_mode(plot_title,labels_mc,values_mc,file_name)\n", + "print(expanded_ct_Par['Mode_confirm'].value_counts(dropna=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "designing-wildlife", + "metadata": {}, + "outputs": [], + "source": [ + "labels_mc = expanded_ct_Par.query(\"Trip_purpose == 'Work'\").Mode_confirm.value_counts(dropna=True).keys().tolist()\n", + "values_mc = expanded_ct_Par.query(\"Trip_purpose == 'Work'\").Mode_confirm.value_counts(dropna=True).tolist()\n", + "plot_title= \"Number of commute trips for each mode (selected by users)\\n%s\" % quality_text\n", + "file_name= 'ntrips_commute_mode_confirm%s.png' % file_suffix\n", + "pie_chart_mode(plot_title,labels_mc,values_mc,file_name)\n", + "print(expanded_ct_Par.query(\"Trip_purpose == 'Work'\").Mode_confirm.value_counts(dropna=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "corresponding-shame", + "metadata": {}, + "outputs": [], + "source": [ + "expanded_ct_Par.loc[expanded_ct_Par['Replaced_mode'] == 'Pilot ebike'].Mode_confirm.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "subjective-agent", + "metadata": {}, + "outputs": [], + "source": [ + "expanded_ct_Par.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "irish-toddler", + "metadata": {}, + "outputs": [], + "source": [ + "labels_d10 = expanded_ct_Par.loc[(expanded_ct_Par['distance_miles'] <= 10)].Mode_confirm.value_counts(dropna=True).keys().tolist()\n", + "values_d10 = expanded_ct_Par.loc[(expanded_ct_Par['distance_miles'] <= 10)].Mode_confirm.value_counts(dropna=True).tolist()\n", + "plot_title=\"Mode confirmations for trips under 10 Miles\\n%s\" % quality_text\n", + "file_name ='ntrips_under10miles_mode_confirm%s.png' % file_suffix\n", + "pie_chart_mode(plot_title,labels_d10,values_d10,file_name)\n", + "print(expanded_ct_Par.loc[(expanded_ct_Par['distance_miles'] <= 10)].Mode_confirm.value_counts(dropna=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "rubber-induction", + "metadata": {}, + "outputs": [], + "source": [ + "miles = expanded_ct_Par.groupby('Mode_confirm').agg({'distance_miles': ['sum', 'count' , 'mean']})\n", + "miles.columns = ['Total (miles)', 'Count', 'Average (miles)']\n", + "miles = miles.reset_index()\n", + "miles =miles.sort_values(by=['Total (miles)'], ascending=False)\n", + "\n", + "#data\n", + "miles_dict = dict(zip(miles['Mode_confirm'], miles['Total (miles)']))\n", + "\n", + "labels_m = []\n", + "values_m = []\n", + "\n", + "for x, y in miles_dict.items():\n", + " labels_m.append(x)\n", + " values_m.append(y)\n", + "\n", + "plot_title=\"Miles for each mode (selected by users)\\n%s\" % quality_text\n", + "file_name ='miles_mode_confirm%s.png' % file_suffix\n", + "pie_chart_mode(plot_title,labels_m,values_m,file_name)\n", + "print(miles)" + ] + }, + { + "cell_type": "markdown", + "id": "athletic-speaker", + "metadata": {}, + "source": [ + "Energy Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dietary-bookmark", + "metadata": {}, + "outputs": [], + "source": [ + "# Loading mapping dictionaries from mapping_dictionaries notebook\n", + "\n", + "%store -r df_EI\n", + "%store -r dic_re\n", + "%store -r dic_pur\n", + "%store -r dic_fuel\n", + "\n", + "# convert a dictionary to a defaultdict\n", + "dic_pur = defaultdict(lambda: 'Other',dic_pur)\n", + "dic_re = defaultdict(lambda: 'Other',dic_re)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "empty-accent", + "metadata": {}, + "outputs": [], + "source": [ + "expanded_ct_Par['Mode_confirm_fuel']= expanded_ct_Par['Mode_confirm'].map(dic_fuel)\n", + "expanded_ct_Par['Replaced_mode_fuel']= expanded_ct_Par['Replaced_mode'].map(dic_fuel)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "associate-disney", + "metadata": {}, + "outputs": [], + "source": [ + "expanded_ct_Par = scaffolding.energy_intensity(expanded_ct_Par, df_EI, 'distance','Replaced_mode', 'Mode_confirm')\n", + "expanded_ct_Par = scaffolding.energy_impact_kWH(expanded_ct_Par, 'distance_miles','Replaced_mode', 'Mode_confirm')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "proved-snapshot", + "metadata": {}, + "outputs": [], + "source": [ + "data=expanded_ct_Par.loc[(participant['distance_miles'] <= 40)].sort_values(by=['Energy_Impact(kWH)'], ascending=False) \n", + "x='Energy_Impact(kWH)'\n", + "y='distance_miles'\n", + "legend ='Mode_confirm'\n", + "plot_title=\"Sketch of Energy Impact (kWH) by Travel Mode Selected\\n%s\" % quality_text\n", + "file_name ='sketch_distance_energy_impact%s.png' % file_suffix\n", + "distancevsenergy(data,x,y,legend,plot_title,file_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "square-surveillance", + "metadata": {}, + "outputs": [], + "source": [ + "#eirp : energy impact replaced_mode\n", + "eirc=expanded_ct_Par.groupby('Replaced_mode').agg({'Energy_Impact(kWH)': ['sum', 'mean']},)\n", + "eirc.columns = ['Sketch of Total Energy_Impact(kWH)', 'Sketch of Average Energy_Impact(kWH)']\n", + "eirc = eirc.reset_index()\n", + "eirc = eirc.sort_values(by=['Sketch of Total Energy_Impact(kWH)'], ascending=False)\n", + "eirc['boolean'] = eirc['Sketch of Total Energy_Impact(kWH)'] > 0\n", + "\n", + "#eimc : energy impact mode_confirm\n", + "eimc=expanded_ct_Par.groupby('Mode_confirm').agg({'Energy_Impact(kWH)': ['sum', 'mean']},)\n", + "eimc.columns = ['Sketch of Total Energy_Impact(kWH)', 'Sketch of Average Energy_Impact(kWH)']\n", + "eimc = eimc.reset_index()\n", + "eimc = eimc.sort_values(by=['Sketch of Total Energy_Impact(kWH)'], ascending=False)\n", + "\n", + "\n", + "subset1 = eirc [['Replaced_mode', 'Sketch of Total Energy_Impact(kWH)']].copy()\n", + "subset1.rename(columns = {'Replaced_mode':'Transport Mode','Sketch of Total Energy_Impact(kWH)':'Replaced_Mode' }, inplace=True)\n", + "\n", + "subset2 = eimc [['Mode_confirm', 'Sketch of Total Energy_Impact(kWH)']].copy()\n", + "subset2.rename(columns = {'Mode_confirm':'Transport Mode','Sketch of Total Energy_Impact(kWH)':'Mode_Confirm' }, inplace=True)\n", + "\n", + "df_plot = pd.merge(subset1, subset2, on=\"Transport Mode\")\n", + "df = pd.melt(df_plot , id_vars=['Transport Mode'], value_vars=['Replaced_Mode','Mode_Confirm'], var_name='selection')\n", + "df.rename(columns = {'value':'Energy Impact (kWH)'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "verbal-compatibility", + "metadata": {}, + "outputs": [], + "source": [ + "#Sketch of Energy Impact by E-Bike Trips\n", + "data_eb = expanded_ct_Par.query(\"Mode_confirm == 'Pilot ebike'\")\n", + "# ebei : ebike energy impact\n", + "ebei=data_eb.groupby('Replaced_mode').agg({'Energy_Impact(kWH)': ['sum', 'mean']},)\n", + "ebei.columns = ['Sketch of Total Energy_Impact(kWH)', 'Sketch of Average Energy_Impact(kWH)']\n", + "ebei= ebei.reset_index()\n", + "ebei = ebei.sort_values(by=['Sketch of Total Energy_Impact(kWH)'], ascending=False)\n", + "ebei['boolean'] = ebei['Sketch of Total Energy_Impact(kWH)'] > 0\n", + "net_energy_saved = round(sum(ebei['Sketch of Total Energy_Impact(kWH)']), 2)\n", + "\n", + "x = ebei['Sketch of Total Energy_Impact(kWH)']\n", + "y = ebei['Replaced_mode']\n", + "color =ebei['boolean']\n", + "\n", + "plot_title=\"Sketch of Energy Impact of E-Bike trips\\n Contribution by replaced mode towards a total of %s (kWH)\\n %s\" % (net_energy_saved, quality_text)\n", + "file_name ='sketch_energy_impact_ebike%s.png' % file_suffix\n", + "energy_impact(x,y,color,plot_title,file_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "civic-mineral", + "metadata": {}, + "outputs": [], + "source": [ + "expanded_ct_Par = scaffolding.CO2_impact_lb(expanded_ct_Par,'distance_miles','Replaced_mode', 'Mode_confirm')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hindu-preservation", + "metadata": {}, + "outputs": [], + "source": [ + "data_eb = expanded_ct_Par.query(\"Mode_confirm == 'Pilot ebike'\")\n", + "\n", + "ebco2=data_eb.groupby('Replaced_mode').agg({'CO2_Impact(lb)': ['sum', 'mean']},)\n", + "ebco2.columns = ['total_lb_CO2_emissions', 'average_lb_CO2_emission']\n", + "ebco2 = ebco2.reset_index()\n", + "ebco2 = ebco2.sort_values(by=['total_lb_CO2_emissions'], ascending=False)\n", + "ebco2['boolean'] = ebco2['total_lb_CO2_emissions'] > 0\n", + "net_CO2_emissions = round(sum(ebco2['total_lb_CO2_emissions']), 2)\n", + "\n", + "x = ebco2['total_lb_CO2_emissions']\n", + "y = ebco2['Replaced_mode']\n", + "color = ebco2['boolean']\n", + "\n", + "plot_title=\"Sketch of Total Pounds of CO2 Emissions of E-Bike trips\\n Contribution by replaced mode towards a total of %s (lb CO2 Emissions )\\n %s\" % (net_CO2_emissions, quality_text)\n", + "file_name ='sketch_CO2impact_ebike%s.png' % file_suffix\n", + "CO2_impact(x,y,color,plot_title,file_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "approved-sculpture", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}