From ad0c8c33e5cd0eb2132e786feb790fc98e2a5ee0 Mon Sep 17 00:00:00 2001
From: Nikolay Ilyin <irishwithaxe@gmail.com>
Date: Thu, 31 Aug 2023 10:16:48 +0300
Subject: [PATCH 1/5] using named parameter

---
 .../scala/beam/agentsim/agents/modalbehaviors/ChoosesMode.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/scala/beam/agentsim/agents/modalbehaviors/ChoosesMode.scala b/src/main/scala/beam/agentsim/agents/modalbehaviors/ChoosesMode.scala
index 4579aef3a6b..ec44923851b 100755
--- a/src/main/scala/beam/agentsim/agents/modalbehaviors/ChoosesMode.scala
+++ b/src/main/scala/beam/agentsim/agents/modalbehaviors/ChoosesMode.scala
@@ -1012,7 +1012,7 @@ trait ChoosesMode {
       beamServices.geo.wgs2Utm(legs.head.travelPath.startPoint.loc),
       legs.head.startTime,
       beamServices.geo.wgs2Utm(legs.last.travelPath.endPoint.loc),
-      wheelchairUser,
+      withWheelchair = wheelchairUser,
       requestTime = _currentTick,
       requester = self,
       rideHailServiceSubscription = attributes.rideHailServiceSubscription,

From 030bb08256ed25b363621bd8ab5fe71feef82b62 Mon Sep 17 00:00:00 2001
From: Nikolay Ilyin <irishwithaxe@gmail.com>
Date: Mon, 4 Sep 2023 16:56:43 +0300
Subject: [PATCH 2/5] boto3 added

---
 docker/jupyter-enhanced/Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker/jupyter-enhanced/Dockerfile b/docker/jupyter-enhanced/Dockerfile
index 79cf3455264..12a07b71f81 100644
--- a/docker/jupyter-enhanced/Dockerfile
+++ b/docker/jupyter-enhanced/Dockerfile
@@ -2,4 +2,5 @@ FROM jupyter/scipy-notebook
 
 RUN pip install geopandas pandas pygeos boto s3fs shapely gcsfs
 RUN pip install p2j pyproj pyrosm
-RUN pip install pyshp
\ No newline at end of file
+RUN pip install pyshp
+RUN pip install boto3
\ No newline at end of file

From 307db4f9795ef6684e0061c25dfdcf83ee0a9e97 Mon Sep 17 00:00:00 2001
From: Nikolay Ilyin <irishwithaxe@gmail.com>
Date: Tue, 5 Sep 2023 16:09:18 +0300
Subject: [PATCH 3/5] speeds analysis for physsim network

---
 .../physsim_network_example.ipynb             | 158 ++++++++++++++----
 .../map_analysis/physsim_network_example.py   | 140 +++++++++++++---
 2 files changed, 244 insertions(+), 54 deletions(-)

diff --git a/jupyter/map_analysis/physsim_network_example.ipynb b/jupyter/map_analysis/physsim_network_example.ipynb
index 804bb6e3dc3..1818c6cec29 100644
--- a/jupyter/map_analysis/physsim_network_example.ipynb
+++ b/jupyter/map_analysis/physsim_network_example.ipynb
@@ -21,7 +21,32 @@
     "\n",
     "pd.set_option('display.max_rows', 500)\n",
     "pd.set_option('display.max_columns', 500)\n",
-    "pd.set_option('display.width', 1000)"
+    "pd.set_option('display.width', 1000)\n",
+    "pd.set_option('max_colwidth', None)\n",
+    "\n",
+    "\n",
+    "unspecified_link_type_name = 'unspecified'\n",
+    "\n",
+    "def get_direction(row):\n",
+    "    d_x = 'N'\n",
+    "    d_y = 'E'\n",
+    "    if row['fromLocationX'] > row['toLocationX']:\n",
+    "        d_x = 'S'\n",
+    "    if row['fromLocationY'] > row['toLocationY']:\n",
+    "        d_y = 'W'\n",
+    "    return d_x + d_y\n",
+    "\n",
+    "def read_network(file_path):\n",
+    "    df = pd.read_csv(file_path)\n",
+    "\n",
+    "    df['direction'] = df.apply(get_direction, axis=1)\n",
+    "    df['freespeed_mph'] = df['linkFreeSpeed'] * 2.237\n",
+    "    df['attributeOrigType'].fillna(value=unspecified_link_type_name, inplace=True)\n",
+    "\n",
+    "    df[\"attributeOrigId\"] = df['attributeOrigId'].astype(str)\n",
+    "    df[\"attributeOrigId\"] = pd.to_numeric(df[\"attributeOrigId\"], errors='coerce').fillna(0).astype('Int64')\n",
+    "    \n",
+    "    return df"
    ]
   },
   {
@@ -35,25 +60,22 @@
    "source": [
     "# reading network.csv.gz file from simulation output folder\n",
     "\n",
-    "dtype = { 'attributeOrigId': str }\n",
-    "out_network = pd.read_csv(\"../beam_root/output/sf-light/sf-light-0.5k__2023-06-15_16-11-49_obh/network.csv.gz\", dtype=dtype)\n",
-    "out_network[\"str_id\"] = out_network['attributeOrigId'].astype(str)\n",
-    "out_network[\"id\"] = pd.to_numeric(out_network[\"str_id\"], errors='coerce').fillna(0).astype('Int64')\n",
-    "\n",
-    "# out_network.drop(columns=['str_id', 'attributeOrigId'], inplace=True)\n",
-    "out_network.head(3)"
+    "network = read_network(\"../local_files/sfbay-baseline-20230815-30pct-convergence_beam_year-2018-iteration--1_network.csv.gz\")\n",
+    "network.head(2)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1ca82ecd-48b5-46af-8815-d44b22cb1e56",
+   "id": "4857212d-1c76-4348-93a0-176d99127603",
    "metadata": {},
    "outputs": [],
    "source": [
     "# converting CRS of output network to lat-lon CRS epsg:4326 and creating a geometry for new geopandas data frame\n",
     "\n",
-    "crs_to = CRS.from_epsg(4326) # the lat lon CRS\n",
+    "new_crs = 4326\n",
+    "\n",
+    "crs_to = CRS.from_epsg(new_crs) # the lat lon CRS\n",
     "crs_from = CRS.from_epsg(26910) # sf crs\n",
     "transformer = Transformer.from_crs(crs_from, crs_to)\n",
     "\n",
@@ -63,22 +85,79 @@
     "    mls = MultiLineString([[[from_y, from_x], [to_y, to_x]]])\n",
     "    return mls\n",
     "    \n",
-    "out_geometry = out_network.apply(out_row_to_geometry, axis=1)\n",
-    "out_geometry.head(2)"
+    "geometry = network.apply(out_row_to_geometry, axis=1)\n",
+    "geometry.head(2)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "e37337a3-3f21-48e1-9b0f-73a273188294",
+   "id": "89cc417c-8d01-419a-a065-b70190c594ff",
    "metadata": {},
    "outputs": [],
    "source": [
     "# creating geopandas data frame\n",
     "\n",
-    "out_network = gpd.GeoDataFrame(out_network, crs='epsg:4326', geometry=out_geometry)\n",
-    "# out_network.drop(columns=['fromLocationX', 'fromLocationY', 'toLocationX', 'toLocationY'], inplace=True)\n",
-    "out_network.head(2)"
+    "geo_df = gpd.GeoDataFrame(network, crs=f'epsg:{new_crs}', geometry=geometry)\n",
+    "display(geo_df.head(2))\n",
+    "\n",
+    "# saving GeoDataFrame as shape file\n",
+    "# geo_df.to_file(\"../local_files/sfbay-baseline-20230815-30pct-convergence_beam_year-2018-iteration--1_network.shp\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c477309e-d081-4071-bccf-401449afa99a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## how many of each road type are there with speed less than threshold\n",
+    "\n",
+    "speed_field = 'freespeed_mph'\n",
+    "threshold = 20\n",
+    "moreX_name = f\"more than {threshold} mph\"\n",
+    "lessX_name = f\"less than {threshold} mph\"\n",
+    "\n",
+    "grouped_df = network.groupby('attributeOrigType')[[speed_field]].agg(\n",
+    "    less_than_X=(speed_field, lambda gr:gr[gr < threshold].count()),\n",
+    "    more_than_X=(speed_field, lambda gr:gr[gr >= threshold].count())\n",
+    ")\n",
+    "\n",
+    "grouped_df.rename({'less_than_X':lessX_name, 'more_than_X':moreX_name}, axis='columns', inplace=True)\n",
+    "\n",
+    "ax = grouped_df.plot(kind='bar', stacked=False, rot=0, figsize=(20,4))\n",
+    "ax.set_xlabel(\"\")\n",
+    "\n",
+    "# plt.savefig('../local_files/vta-beam-network/link_speed_graph.png')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53ae2b66-1dd4-4fd1-8b6a-7992590f91f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## description of speeds for each road type\n",
+    "\n",
+    "dfs = []\n",
+    "\n",
+    "road_types = network['attributeOrigType'].unique()\n",
+    "for road_type in road_types:\n",
+    "    filtered_network = network[network['attributeOrigType'] == road_type]\n",
+    "        \n",
+    "    df = filtered_network[['freespeed_mph']].describe()\n",
+    "    df.rename(columns={'freespeed_mph':f'{road_type}'}, inplace=True)\n",
+    "    dfs.append(df.transpose())\n",
+    "    \n",
+    "speed_df = pd.concat(dfs)\n",
+    "\n",
+    "speed_description = grouped_df.merge(speed_df[['mean']], left_index=True, right_index=True, how='outer')\n",
+    "speed_description.sort_values('mean', ascending=False, inplace=True)\n",
+    "speed_description.rename(columns={'mean':'mean speed in mph'}, inplace=True)\n",
+    "\n",
+    "speed_description.style.set_properties(**{'width': '150px'})"
    ]
   },
   {
@@ -90,19 +169,16 @@
    "source": [
     "# filtering and plotting\n",
     "\n",
-    "fig, axs = plt.subplots(1, 3, figsize=(13, 5), dpi=100, subplot_kw={'aspect': 1})\n",
-    "fig.tight_layout(pad=5.0)\n",
+    "fig, ax = plt.subplots(1, 1, figsize=(20, 20), dpi=300, subplot_kw={'aspect': 1})\n",
     "\n",
-    "filtered_1 = out_network[(out_network['id'] > 4 * 10e7)]\n",
-    "filtered_2 = out_network[out_network['id'] == 0]\n",
-    "print(f\"There are {len(filtered_1)} and {len(filtered_2)} lines in filtered network.\")\n",
+    "filtered_1 = out_network[(out_network['attributeOrigType'] == unspecified_link_type_name)]\n",
+    "filtered_2 = out_network[(out_network['attributeOrigType'] != unspecified_link_type_name)]\n",
     "\n",
-    "filtered_1.plot(ax=axs[0])\n",
-    "filtered_2.plot(ax=axs[1])\n",
-    "filtered_1['linkLength'].hist(ax=axs[2], bins=20, color='green', alpha=0.4, label='filtered')\n",
-    "filtered_2['linkLength'].hist(ax=axs[2], bins=20, color='blue', alpha=0.4, label='with OSM ID 0')\n",
+    "filtered_1.plot(ax=ax, label=f'unspecified ({len(filtered_1)} links)', color='red', lw=0.6)\n",
+    "filtered_2.plot(ax=ax, label=f'the rest of ({len(filtered_2)} links)', color='blue', lw=0.2)\n",
     "\n",
-    "axs[2].legend()"
+    "ax.legend()\n",
+    "plt.savefig('../local_files/vta-beam-network/specified_vs_unspecified_links.png')"
    ]
   },
   {
@@ -112,9 +188,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# save the network as shape file\n",
+    "# filtering and plotting\n",
+    "\n",
+    "fig, ax = plt.subplots(1, 1, figsize=(20, 20), dpi=300, subplot_kw={'aspect': 1})\n",
+    "\n",
+    "filtered_1 = out_network[(out_network['attributeOrigType'] != unspecified_link_type_name) & (out_network['freespeed_mph'] < 20)]\n",
+    "filtered_2 = out_network[(out_network['attributeOrigType'] != unspecified_link_type_name) & (out_network['freespeed_mph'] >= 20)]\n",
     "\n",
-    "out_network.to_file(\"output_network.shp\")"
+    "filtered_1.plot(ax=ax, label=f'specified link types and free speed < 20 mph ({len(filtered_1)} links)', color='red', lw=2.6)\n",
+    "filtered_2.plot(ax=ax, label=f'specified link types and free speed > 20 mph ({len(filtered_2)} links)', color='blue', lw=0.2)\n",
+    "\n",
+    "ax.legend()\n",
+    "plt.savefig('../local_files/vta-beam-network/specified_links_slow_vs_fast.png')"
    ]
   },
   {
@@ -123,7 +208,22 @@
    "id": "209d667b-fc15-47f3-a413-144e5112c0a8",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# filtering and plotting\n",
+    "\n",
+    "link_type = 'secondary'\n",
+    "\n",
+    "fig, ax = plt.subplots(1, 1, figsize=(20, 20), dpi=300, subplot_kw={'aspect': 1})\n",
+    "\n",
+    "filtered_1 = out_network[(out_network['attributeOrigType'] == link_type) & (out_network['freespeed_mph'] < 20)]\n",
+    "filtered_2 = out_network[(out_network['attributeOrigType'] == link_type) & (out_network['freespeed_mph'] >= 20)]\n",
+    "\n",
+    "filtered_1.plot(ax=ax, label=f'{link_type} and free speed < 20 mph ({len(filtered_1)} links)', color='red', lw=2.6)\n",
+    "filtered_2.plot(ax=ax, label=f'{link_type} and free speed > 20 mph ({len(filtered_2)} links)', color='blue', lw=0.2)\n",
+    "\n",
+    "ax.legend()\n",
+    "# plt.savefig('../local_files/vta-beam-network/specified_links_slow_vs_fast.png')"
+   ]
   }
  ],
  "metadata": {
diff --git a/jupyter/map_analysis/physsim_network_example.py b/jupyter/map_analysis/physsim_network_example.py
index e9026c16fc2..6e407a9ad27 100644
--- a/jupyter/map_analysis/physsim_network_example.py
+++ b/jupyter/map_analysis/physsim_network_example.py
@@ -19,6 +19,31 @@
 pd.set_option('display.max_rows', 500)
 pd.set_option('display.max_columns', 500)
 pd.set_option('display.width', 1000)
+pd.set_option('max_colwidth', None)
+
+
+unspecified_link_type_name = 'unspecified'
+
+def get_direction(row):
+    d_x = 'N'
+    d_y = 'E'
+    if row['fromLocationX'] > row['toLocationX']:
+        d_x = 'S'
+    if row['fromLocationY'] > row['toLocationY']:
+        d_y = 'W'
+    return d_x + d_y
+
+def read_network(file_path):
+    df = pd.read_csv(file_path)
+
+    df['direction'] = df.apply(get_direction, axis=1)
+    df['freespeed_mph'] = df['linkFreeSpeed'] * 2.237
+    df['attributeOrigType'].fillna(value=unspecified_link_type_name, inplace=True)
+
+    df["attributeOrigId"] = df['attributeOrigId'].astype(str)
+    df["attributeOrigId"] = pd.to_numeric(df["attributeOrigId"], errors='coerce').fillna(0).astype('Int64')
+    
+    return df
 
 
 # In[ ]:
@@ -26,13 +51,8 @@
 
 # reading network.csv.gz file from simulation output folder
 
-dtype = { 'attributeOrigId': str }
-out_network = pd.read_csv("../beam_root/output/sf-light/sf-light-0.5k__2023-06-15_16-11-49_obh/network.csv.gz", dtype=dtype)
-out_network["str_id"] = out_network['attributeOrigId'].astype(str)
-out_network["id"] = pd.to_numeric(out_network["str_id"], errors='coerce').fillna(0).astype('Int64')
-
-# out_network.drop(columns=['str_id', 'attributeOrigId'], inplace=True)
-out_network.head(3)
+network = read_network("../local_files/sfbay-baseline-20230815-30pct-convergence_beam_year-2018-iteration--1_network.csv.gz")
+network.head(2)
 
 
 # In[ ]:
@@ -40,7 +60,9 @@
 
 # converting CRS of output network to lat-lon CRS epsg:4326 and creating a geometry for new geopandas data frame
 
-crs_to = CRS.from_epsg(4326) # the lat lon CRS
+new_crs = 4326
+
+crs_to = CRS.from_epsg(new_crs) # the lat lon CRS
 crs_from = CRS.from_epsg(26910) # sf crs
 transformer = Transformer.from_crs(crs_from, crs_to)
 
@@ -50,8 +72,8 @@ def out_row_to_geometry(df_row):
     mls = MultiLineString([[[from_y, from_x], [to_y, to_x]]])
     return mls
     
-out_geometry = out_network.apply(out_row_to_geometry, axis=1)
-out_geometry.head(2)
+geometry = network.apply(out_row_to_geometry, axis=1)
+geometry.head(2)
 
 
 # In[ ]:
@@ -59,9 +81,58 @@ def out_row_to_geometry(df_row):
 
 # creating geopandas data frame
 
-out_network = gpd.GeoDataFrame(out_network, crs='epsg:4326', geometry=out_geometry)
-# out_network.drop(columns=['fromLocationX', 'fromLocationY', 'toLocationX', 'toLocationY'], inplace=True)
-out_network.head(2)
+geo_df = gpd.GeoDataFrame(network, crs=f'epsg:{new_crs}', geometry=geometry)
+display(geo_df.head(2))
+
+# saving GeoDataFrame as shape file
+# geo_df.to_file("../local_files/sfbay-baseline-20230815-30pct-convergence_beam_year-2018-iteration--1_network.shp")
+
+
+# In[ ]:
+
+
+## how many of each road type are there with speed less than threshold
+
+speed_field = 'freespeed_mph'
+threshold = 20
+moreX_name = f"more than {threshold} mph"
+lessX_name = f"less than {threshold} mph"
+
+grouped_df = network.groupby('attributeOrigType')[[speed_field]].agg(
+    less_than_X=(speed_field, lambda gr:gr[gr < threshold].count()),
+    more_than_X=(speed_field, lambda gr:gr[gr >= threshold].count())
+)
+
+grouped_df.rename({'less_than_X':lessX_name, 'more_than_X':moreX_name}, axis='columns', inplace=True)
+
+ax = grouped_df.plot(kind='bar', stacked=False, rot=0, figsize=(20,4))
+ax.set_xlabel("")
+
+# plt.savefig('../local_files/vta-beam-network/link_speed_graph.png')
+
+
+# In[ ]:
+
+
+## description of speeds for each road type
+
+dfs = []
+
+road_types = network['attributeOrigType'].unique()
+for road_type in road_types:
+    filtered_network = network[network['attributeOrigType'] == road_type]
+        
+    df = filtered_network[['freespeed_mph']].describe()
+    df.rename(columns={'freespeed_mph':f'{road_type}'}, inplace=True)
+    dfs.append(df.transpose())
+    
+speed_df = pd.concat(dfs)
+
+speed_description = grouped_df.merge(speed_df[['mean']], left_index=True, right_index=True, how='outer')
+speed_description.sort_values('mean', ascending=False, inplace=True)
+speed_description.rename(columns={'mean':'mean speed in mph'}, inplace=True)
+
+speed_description.style.set_properties(**{'width': '150px'})
 
 
 # In[ ]:
@@ -69,31 +140,50 @@ def out_row_to_geometry(df_row):
 
 # filtering and plotting
 
-fig, axs = plt.subplots(1, 3, figsize=(13, 5), dpi=100, subplot_kw={'aspect': 1})
-fig.tight_layout(pad=5.0)
+fig, ax = plt.subplots(1, 1, figsize=(20, 20), dpi=300, subplot_kw={'aspect': 1})
 
-filtered_1 = out_network[(out_network['id'] > 4 * 10e7)]
-filtered_2 = out_network[out_network['id'] == 0]
-print(f"There are {len(filtered_1)} and {len(filtered_2)} lines in filtered network.")
+filtered_1 = out_network[(out_network['attributeOrigType'] == unspecified_link_type_name)]
+filtered_2 = out_network[(out_network['attributeOrigType'] != unspecified_link_type_name)]
 
-filtered_1.plot(ax=axs[0])
-filtered_2.plot(ax=axs[1])
-filtered_1['linkLength'].hist(ax=axs[2], bins=20, color='green', alpha=0.4, label='filtered')
-filtered_2['linkLength'].hist(ax=axs[2], bins=20, color='blue', alpha=0.4, label='with OSM ID 0')
+filtered_1.plot(ax=ax, label=f'unspecified ({len(filtered_1)} links)', color='red', lw=0.6)
+filtered_2.plot(ax=ax, label=f'the rest of ({len(filtered_2)} links)', color='blue', lw=0.2)
 
-axs[2].legend()
+ax.legend()
+plt.savefig('../local_files/vta-beam-network/specified_vs_unspecified_links.png')
 
 
 # In[ ]:
 
 
-# save the network as shape file
+# filtering and plotting
+
+fig, ax = plt.subplots(1, 1, figsize=(20, 20), dpi=300, subplot_kw={'aspect': 1})
+
+filtered_1 = out_network[(out_network['attributeOrigType'] != unspecified_link_type_name) & (out_network['freespeed_mph'] < 20)]
+filtered_2 = out_network[(out_network['attributeOrigType'] != unspecified_link_type_name) & (out_network['freespeed_mph'] >= 20)]
 
-out_network.to_file("output_network.shp")
+filtered_1.plot(ax=ax, label=f'specified link types and free speed < 20 mph ({len(filtered_1)} links)', color='red', lw=2.6)
+filtered_2.plot(ax=ax, label=f'specified link types and free speed > 20 mph ({len(filtered_2)} links)', color='blue', lw=0.2)
+
+ax.legend()
+plt.savefig('../local_files/vta-beam-network/specified_links_slow_vs_fast.png')
 
 
 # In[ ]:
 
 
+# filtering and plotting
+
+link_type = 'secondary'
+
+fig, ax = plt.subplots(1, 1, figsize=(20, 20), dpi=300, subplot_kw={'aspect': 1})
+
+filtered_1 = out_network[(out_network['attributeOrigType'] == link_type) & (out_network['freespeed_mph'] < 20)]
+filtered_2 = out_network[(out_network['attributeOrigType'] == link_type) & (out_network['freespeed_mph'] >= 20)]
+
+filtered_1.plot(ax=ax, label=f'{link_type} and free speed < 20 mph ({len(filtered_1)} links)', color='red', lw=2.6)
+filtered_2.plot(ax=ax, label=f'{link_type} and free speed > 20 mph ({len(filtered_2)} links)', color='blue', lw=0.2)
 
+ax.legend()
+# plt.savefig('../local_files/vta-beam-network/specified_links_slow_vs_fast.png')
 

From 6cf902f0686b0859ecab0fe05f4519e01e70b910 Mon Sep 17 00:00:00 2001
From: Nikolay Ilyin <irishwithaxe@gmail.com>
Date: Tue, 5 Sep 2023 18:45:06 +0300
Subject: [PATCH 4/5] analysis of linkstats and network files as a whole

---
 jupyter/map_analysis/network_linkstats.ipynb | 435 +++++++++++++++----
 jupyter/map_analysis/network_linkstats.py    | 387 ++++++++++++++---
 2 files changed, 654 insertions(+), 168 deletions(-)

diff --git a/jupyter/map_analysis/network_linkstats.ipynb b/jupyter/map_analysis/network_linkstats.ipynb
index e5152bcf6c0..3d712272d0d 100644
--- a/jupyter/map_analysis/network_linkstats.ipynb
+++ b/jupyter/map_analysis/network_linkstats.ipynb
@@ -10,12 +10,103 @@
     "import pandas as pd\n",
     "import numpy as np\n",
     "import matplotlib.pyplot as plt\n",
-    "import math\n",
+    "import geopandas as gpd\n",
+    "\n",
+    "from pyproj import CRS, Transformer\n",
+    "from geopandas import GeoDataFrame\n",
+    "from shapely.geometry import Point\n",
+    "from shapely.geometry.multilinestring import MultiLineString\n",
+    "from IPython.display import clear_output\n",
     "\n",
     "pd.set_option('display.max_rows', 500)\n",
     "pd.set_option('display.max_columns', 500)\n",
     "pd.set_option('display.width', 1000)\n",
-    "pd.set_option('max_colwidth', None)"
+    "pd.set_option('max_colwidth', None)\n",
+    "\n",
+    "\n",
+    "unspecified_link_type_name = 'unspecified'\n",
+    "\n",
+    "\n",
+    "def get_direction(row):\n",
+    "    d_x = 'North'\n",
+    "    d_y = 'East'\n",
+    "    if row['fromLocationX'] > row['toLocationX']:\n",
+    "        d_x = 'South'\n",
+    "    if row['fromLocationY'] > row['toLocationY']:\n",
+    "        d_y = 'West'\n",
+    "    return f'{d_x} - {d_y}'\n",
+    "\n",
+    "\n",
+    "def read_network(file_path):\n",
+    "    df = pd.read_csv(file_path)\n",
+    "\n",
+    "    df['attributeOrigType'].fillna(value=unspecified_link_type_name, inplace=True)\n",
+    "\n",
+    "    df[\"attributeOrigId\"] = df['attributeOrigId'].astype(str)\n",
+    "    df[\"attributeOrigId\"] = pd.to_numeric(df[\"attributeOrigId\"], errors='coerce').fillna(0).astype('Int64')\n",
+    "    df['direction'] = df.apply(get_direction, axis=1)\n",
+    "    \n",
+    "    return df\n",
+    "\n",
+    "\n",
+    "def read_linkstats(file_path):\n",
+    "    df = pd.read_csv(file_path)\n",
+    "\n",
+    "    df['speed_ms'] = df['length'] / df['traveltime']\n",
+    "    df['speed_mph'] = df['speed_ms'] * 2.237\n",
+    "    df['freespeed_mph'] = df['freespeed'] * 2.237\n",
+    "\n",
+    "    df['speed_delta_mph'] = df['freespeed_mph'] - df['speed_mph']\n",
+    "    \n",
+    "    return df\n",
+    "\n",
+    "\n",
+    "# sf-bay area CRS is 26910\n",
+    "def read_geo_network(network_path, network_crs):\n",
+    "    n_df = read_network(network_path)\n",
+    "    \n",
+    "    crs_to_id = 4326 # the LAT LON CRS\n",
+    "    crs_to = CRS.from_epsg(crs_to_id)\n",
+    "    crs_from = CRS.from_epsg(network_crs)\n",
+    "    transformer = Transformer.from_crs(crs_from, crs_to)\n",
+    "\n",
+    "    def out_row_to_geometry(df_row):\n",
+    "        (from_x, from_y) = transformer.transform(df_row['fromLocationX'], df_row['fromLocationY'])\n",
+    "        (to_x, to_y) = transformer.transform(df_row['toLocationX'], df_row['toLocationY'])\n",
+    "        mls = MultiLineString([[[from_y, from_x], [to_y, to_x]]])\n",
+    "        return mls\n",
+    "\n",
+    "    geometry = n_df.apply(out_row_to_geometry, axis=1)\n",
+    "    geo_df = gpd.GeoDataFrame(n_df, crs=f'epsg:{crs_to_id}', geometry=geometry)\n",
+    "    return geo_df\n",
+    "    \n",
+    "\n",
+    "def read_full_network(linkstats_path, network_path):\n",
+    "    n_df = read_network(network_path)\n",
+    "    l_df = read_linkstats(linkstats_path)\n",
+    "    \n",
+    "    # simple sanity check\n",
+    "    links_ids_1 = set(l_df['link'].unique())\n",
+    "    links_ids_2 = set(n_df['linkId'].unique())\n",
+    "    if (len(links_ids_1 - links_ids_2) != 0):\n",
+    "        print(f\"SOMETHING IS WRONG!! Links are not the same from both files!\")\n",
+    "\n",
+    "    full_df = l_df.merge(n_df, left_on='link', right_on='linkId', how='outer')\n",
+    "    return full_df\n",
+    "\n",
+    "\n",
+    "## to do sanity check after merging linkstats and network\n",
+    "## to ensure that the linkstats was produced from the network\n",
+    "def do_sanity_check(full_df):\n",
+    "\n",
+    "    def row_sanity_check(row):\n",
+    "        return row['to'] == row['toNodeId'] \\\n",
+    "           and row['from'] == row['fromNodeId'] \\\n",
+    "           and row['freespeed'] == row['linkFreeSpeed']\n",
+    "\n",
+    "    return full_df.apply(lambda r: 'equal' if row_sanity_check(r) else 'NOT equal', axis=1).value_counts()\n",
+    "\n",
+    "'init complete'"
    ]
   },
   {
@@ -25,168 +116,286 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "## reading files\n",
+    "## reading network file\n",
+    "# network = read_network(\"../local_files/vta-beam-network/network.csv.gz\")\n",
+    "# print(f\"there are {len(network)} records\")\n",
+    "# display(network.head(2))\n",
     "\n",
-    "network = pd.read_csv(\"../local_files/network.1.csv.gz\")\n",
-    "network['attributeOrigType'].fillna(value='UNKNOWN', inplace=True)\n",
+    "## reading linkstats file\n",
+    "# linkstats = read_linkstats(\"../local_files/vta-beam-network/1.0.linkstats.csv.gz\")\n",
+    "# print(f\"there are {len(linkstats)} records\")\n",
+    "# display(linkstats.head(2))\n",
     "\n",
-    "linkstats = pd.read_csv(\"../local_files/linkstats.1.csv.gz\")\n",
+    "# reading full network\n",
+    "full_network = read_full_network(\n",
+    "    linkstats_path = \"../local_files/vta-beam-network/1.0.linkstats.csv.gz\", \n",
+    "    network_path = \"../local_files/vta-beam-network/network.csv.gz\",\n",
+    ")\n",
     "\n",
-    "print(network.shape, linkstats.shape)\n",
-    "# network['attributeOrigType'].value_counts()"
+    "display(full_network.head(2))"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4a1443c5-9aef-407b-a133-153b133046f8",
+   "id": "651a3e08-e221-41ef-910a-f867c86af2aa",
    "metadata": {},
    "outputs": [],
    "source": [
-    "network.head(2)"
+    "def avg_speed_vol(data):\n",
+    "    if data['volume'].sum() > 0:\n",
+    "        return np.average(data['speed_mph'], weights=data['volume'])\n",
+    "    return np.average(data['speed_mph'])\n",
+    "\n",
+    "def total_volume(data):\n",
+    "    return np.sum(data['volume'])\n",
+    "\n",
+    "def average_freespeed(data):\n",
+    "    return np.average(data['freespeed_mph'])\n",
+    "\n",
+    "def speeds_analysis(data):\n",
+    "    return pd.Series({\n",
+    "        \"average weighted speed\" : avg_speed_vol(data),\n",
+    "        \"total volume\": total_volume(data),\n",
+    "        \"average freespeed\": average_freespeed(data),\n",
+    "        \"5 percentile of speed\": np.percentile(data['speed_mph'], 5),\n",
+    "        \"95 percentile of speed\": np.percentile(data['speed_mph'], 95),\n",
+    "    })\n",
+    "\n",
+    "\n",
+    "df = full_network #.head(100000)\n",
+    "df_g = df.groupby(['hour','attributeOrigType']).apply(speeds_analysis).reset_index()\n",
+    "df_g['total volume weighted'] = df_g['total volume'] / df_g['total volume'].max()\n",
+    "\n",
+    "road_types_df = df_g.groupby('attributeOrigType')['total volume'].sum().reset_index().sort_values('total volume', ascending=False)\n",
+    "road_types_ordered_by_volume = list(road_types_df['attributeOrigType'])\n",
+    "\n",
+    "hours = df_g['hour'].unique()\n",
+    "\n",
+    "plt.figure(figsize=(15,7))\n",
+    "plt.ylim(0, 70)\n",
+    "\n",
+    "N_road_types = 6\n",
+    "\n",
+    "for road_type in road_types_ordered_by_volume[:6]:\n",
+    "    df_g_f = df_g[df_g['attributeOrigType'] == road_type]\n",
+    "    avg_speed = df_g_f['average weighted speed']\n",
+    "    \n",
+    "    size_multiplier =  df_g_f['total volume weighted'].sum() * 20\n",
+    "    size = df_g_f['total volume weighted'] * size_multiplier\n",
+    "    \n",
+    "    plt.scatter(x=hours, y=avg_speed, s=size, label=f\"avg speed mph [{road_type}]\", alpha=0.6)\n",
+    "    plt.plot(hours, avg_speed, alpha=0.2)\n",
+    "\n",
+    "plt.legend()\n",
+    "plt.title(f\"Average speed for road types weighted by volume (top {N_road_types} road types by volume)\")\n",
+    "plt.show()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "fab21b08-a08c-4982-869b-e62fee36f3e1",
+   "id": "82d6d8f1-96fa-4721-961c-240546fe05e5",
    "metadata": {},
    "outputs": [],
    "source": [
-    "linkstats.head(2)"
+    "def avg_speed_vol(data):\n",
+    "    if data['volume'].sum() > 0:\n",
+    "        return np.average(data['speed_mph'], weights=data['volume'])\n",
+    "    return np.average(data['speed_mph'])\n",
+    "\n",
+    "def total_volume(data):\n",
+    "    return np.sum(data['volume'])\n",
+    "\n",
+    "def average_freespeed(data):\n",
+    "    return np.average(data['freespeed_mph'])\n",
+    "\n",
+    "def speeds_analysis(data):\n",
+    "    return pd.Series({\n",
+    "        \"average weighted speed\" : avg_speed_vol(data),\n",
+    "        \"total volume\": total_volume(data),\n",
+    "        \"average freespeed\": average_freespeed(data),\n",
+    "        \"5 percentile of speed\": np.percentile(data['speed_mph'], 5),\n",
+    "        \"95 percentile of speed\": np.percentile(data['speed_mph'], 95),\n",
+    "    })\n",
+    "\n",
+    "\n",
+    "links_with_most_volume = set(full_network[full_network['volume'] > np.percentile(full_network['volume'], 99.5)]['link'].unique())\n",
+    "df = full_network[full_network['link'].isin(links_with_most_volume)]\n",
+    "\n",
+    "df_g = df.groupby(['hour','attributeOrigType']).apply(speeds_analysis).reset_index()\n",
+    "df_g['total volume weighted'] = df_g['total volume'] / df_g['total volume'].max()\n",
+    "\n",
+    "road_types_df = df_g.groupby('attributeOrigType')['total volume'].sum().reset_index().sort_values('total volume', ascending=False)\n",
+    "# display(road_types_df)\n",
+    "\n",
+    "road_types_ordered_by_volume = list(road_types_df['attributeOrigType'])\n",
+    "\n",
+    "hours = df_g['hour'].unique()\n",
+    "\n",
+    "plt.figure(figsize=(15,4))\n",
+    "plt.ylim(0, 70)\n",
+    "\n",
+    "N_road_types = 3\n",
+    "\n",
+    "for road_type in road_types_ordered_by_volume[:N_road_types]:\n",
+    "    df_g_f = df_g[df_g['attributeOrigType'] == road_type]\n",
+    "    avg_speed = df_g_f['average weighted speed']\n",
+    "    min_speed = df_g_f[\"5 percentile of speed\"]\n",
+    "    \n",
+    "    size_multiplier =  df_g_f['total volume weighted'].sum() * 20\n",
+    "    size = df_g_f['total volume weighted'] * size_multiplier\n",
+    "    \n",
+    "    scatterplot = plt.scatter(x=hours, y=avg_speed, s=size, label=f\"avg speed mph [{road_type}, volume {df_g_f['total volume'].sum()}]\", alpha=0.4)\n",
+    "    col = scatterplot.get_facecolors()[0].tolist()\n",
+    "    plt.plot(hours, avg_speed, alpha=0.2, c=col)\n",
+    "    plt.plot(hours, min_speed, alpha=0.2, label=f\"5 percentile for speed mph [{road_type}]\", c=col)\n",
+    "\n",
+    "plt.legend()\n",
+    "plt.title(f\"Average speed for top 0.5% links by volume, weighted by volume (top {N_road_types} road types by volume)\")\n",
+    "plt.show()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1f42656e-aba7-495f-a029-480a27164316",
+   "id": "cca66295-b323-4498-8f91-cd7929bbfd6f",
    "metadata": {},
    "outputs": [],
    "source": [
-    "## merging together based on link Id \n",
-    "## assuming files are for the same network and has the same number of links\n",
-    "\n",
-    "full_df = linkstats.merge(network, left_on='link', right_on='linkId', how='outer')\n",
-    "\n",
-    "full_df.head(2)"
+    "# sf-bay area CRS is 26910\n",
+    "geo_network = read_geo_network(\n",
+    "    network_path = \"../local_files/vta-beam-network/network.csv.gz\", \n",
+    "    network_crs=26910\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2b161ac7-73da-4524-a792-9950d9a914dd",
+   "id": "40c32b34-160b-4757-9861-1d45eaf829f8",
    "metadata": {},
    "outputs": [],
    "source": [
-    "## doing sanity check\n",
+    "# filtering and plotting\n",
+    "\n",
+    "links_with_most_volume = set(full_network[full_network['volume'] > np.percentile(full_network['volume'], 99.5)]['link'].unique())\n",
+    "\n",
+    "fig, ax = plt.subplots(1, 1, figsize=(20, 20), dpi=300, subplot_kw={'aspect': 1})\n",
     "\n",
-    "def row_sanity_check(row):\n",
-    "    return row['to'] == row['toNodeId'] \\\n",
-    "       and row['from'] == row['fromNodeId'] \\\n",
-    "       and row['freespeed'] == row['linkFreeSpeed']\n",
+    "filtered_1 = geo_network[(geo_network['linkId'].isin(links_with_most_volume))]\n",
+    "filtered_2 = geo_network[(~geo_network['linkId'].isin(links_with_most_volume))]\n",
     "\n",
-    "print(f\"the shape of df is {full_df.shape}\")\n",
-    "full_df.apply(lambda r: 'equal' if row_sanity_check(r) else 'NOT equal', axis=1).value_counts()"
+    "filtered_1.plot(ax=ax, label=f'({len(filtered_1)} links) with 0.5% top volume', color='red', lw=1.2)\n",
+    "filtered_2.plot(ax=ax, label=f'the rest of ({len(filtered_2)} links)', color='blue', lw=0.2)\n",
+    "\n",
+    "ax.legend()\n",
+    "\n",
+    "# base_file_name = '../local_files/vta-beam-network/links_with_most_volume'\n",
+    "# network[(network['linkId'].isin(slow_links_ids))].to_csv(base_file_name + \".csv.gz\")\n",
+    "# filtered_1.to_csv(base_file_name + \".geo.csv.gz\")\n",
+    "# filtered_1.to_file(base_file_name + \".shp\")\n",
+    "# plt.savefig(base_file_name + \".png\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "0f3ce36d-4e27-444d-addf-d53b1ed3a116",
+   "id": "8c3a9e75-d599-488e-acd8-9a9cc6269e7d",
    "metadata": {},
    "outputs": [],
    "source": [
-    "## calculating speed, free speed, speed in km/h, speed in mph\n",
+    "## how many of each road type are there with speed less than threshold\n",
     "\n",
-    "full_df['speed'] = full_df['length'] / full_df['traveltime']\n",
+    "speed_field = 'speed_mph'\n",
+    "threshold = 20\n",
+    "moreX_name = f\"more than {threshold} mph\"\n",
+    "lessX_name = f\"less than {threshold} mph\"\n",
     "\n",
-    "full_df['freespeed_km_h'] = full_df['freespeed'] * 3.6\n",
-    "full_df['speed_km_h'] = full_df['speed'] * 3.6\n",
+    "grouped_df = full_df.groupby('attributeOrigType')[[speed_field]].agg(\n",
+    "    less_than_X=(speed_field, lambda gr:gr[gr < threshold].count()),\n",
+    "    more_than_X=(speed_field, lambda gr:gr[gr >= threshold].count())\n",
+    ")\n",
+    "\n",
+    "grouped_df.rename({'less_than_X':lessX_name, 'more_than_X':moreX_name}, axis='columns', inplace=True)\n",
     "\n",
-    "full_df['freespeed_mph'] = full_df['freespeed'] * 2.237\n",
-    "full_df['speed_mph'] = full_df['speed'] * 2.237\n",
+    "ax = grouped_df.plot(kind='bar', stacked=False, rot=0, figsize=(20,4))\n",
+    "ax.set_xlabel(\"\")\n",
     "\n",
-    "full_df.head(2)"
+    "plt.savefig('../local_files/vta-beam-network/link_speed_graph.png')"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "6877eef2-4d44-4dc8-9b4c-ab3f74037d57",
+   "id": "e13ef8bd-5b13-483c-8b9e-74dcc524e6c9",
    "metadata": {},
    "outputs": [],
    "source": [
-    "## speed distribution\n",
+    "# an example of analysis of speed of selected links subset\n",
+    "\n",
+    "east_bound_left = set([59118,80745]) # NE\n",
+    "west_bound_left = set([59119,80744]) # SW\n",
+    "\n",
+    "east_bound_right = set([20062,34374]) # NE\n",
+    "west_bound_right = set([20063,34375]) # SW\n",
     "\n",
-    "max_speed = int(full_df['speed_km_h'].max())\n",
-    "full_df['speed_km_h'].hist(bins=max_speed, figsize=(15,2))\n",
-    "print(f'max speed and number of buckets is {max_speed}')"
+    "display(network[network['linkId'].isin(east_bound_left)][['linkId','fromLocationX','fromLocationY','toLocationX','toLocationY','direction']])\n",
+    "display(network[network['linkId'].isin(west_bound_left)][['linkId','fromLocationX','fromLocationY','toLocationX','toLocationY','direction']])\n",
+    "display(network[network['linkId'].isin(east_bound_right)][['linkId','fromLocationX','fromLocationY','toLocationX','toLocationY','direction']])\n",
+    "display(network[network['linkId'].isin(west_bound_right)][['linkId','fromLocationX','fromLocationY','toLocationX','toLocationY','direction']])"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "037b8214-2bcb-4c14-8e76-050cc4250635",
+   "id": "add2697e-ab23-4fea-aac6-596c32f1f1e9",
    "metadata": {},
    "outputs": [],
    "source": [
-    "## description of speeds for each road type\n",
+    "# an example of analysis of speed of selected links subset\n",
+    "# here are two bridges analysis\n",
     "\n",
-    "# dfs = []\n",
-    "dfs_free = []\n",
+    "left_road = set([59118,80745,59119,80744])\n",
+    "right_road = set([20062,34374,20063,34375]) \n",
     "\n",
-    "road_types = network['attributeOrigType'].unique()\n",
-    "for road_type in road_types:\n",
-    "    filtered_network = full_df[full_df['attributeOrigType'] == road_type]\n",
-    "    \n",
-    "    # df = filtered_network[['speed']].describe()\n",
-    "    # df.rename(columns={'speed':f'{road_type} speed'}, inplace=True)\n",
-    "    # dfs.append(df.transpose())\n",
+    "fig,axs = plt.subplots(1,2,figsize=(18,4))\n",
+    "fig.subplots_adjust(wspace=0.1, hspace=0.4)\n",
+    "\n",
+    "def plot_aggregated_speed(df, set_of_ids, label, axis, save_to_csv=False):\n",
+    "    df_filtered = df[(df['link'].isin(set_of_ids))]\n",
+    "    directions = df_filtered['direction'].unique()\n",
     "    \n",
-    "    df = filtered_network[['freespeed']].describe()\n",
-    "    df.rename(columns={'freespeed':f'{road_type} free speed'}, inplace=True)\n",
-    "    dfs_free.append(df.transpose())\n",
+    "    function_name = 'mean'\n",
+    "    for direction in sorted(directions):\n",
+    "        df_filtered_direction = df_filtered[df_filtered['direction'] == direction]\n",
+    "        df_grouped = df_filtered_direction.groupby('hour')[['speed_mph']].agg(function_name).reset_index()\n",
+    "        df_grouped.plot(x='hour', y='speed_mph', marker='o', linewidth=4, linestyle='-', label=f'{function_name} speed  {label}  [{direction}]', ax=axis, alpha=0.5)\n",
     "    \n",
-    "# speed_df = pd.concat(dfs)\n",
-    "# display(speed_df)\n",
+    "    if save_to_csv:\n",
+    "        df_filtered.to_csv(f'../local_files/vta-beam-network/linkstats_1_filtered_links_{label}.csv'.replace(' ', '.'))\n",
     "\n",
-    "free_speed_df = pd.concat(dfs_free)\n",
-    "display(free_speed_df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8c3a9e75-d599-488e-acd8-9a9cc6269e7d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "## how many of each road type are there with speed less than threshold\n",
-    "\n",
-    "grouped_df = full_df.groupby('attributeOrigType')[['linkFreeSpeed']].agg(\n",
-    "    less_than_20=('linkFreeSpeed', lambda gr:gr[gr < 20].count()),\n",
-    "    more_than_20=('linkFreeSpeed', lambda gr:gr[gr >= 20].count())\n",
-    ")\n",
+    "    \n",
+    "plot_aggregated_speed(full_network, left_road, 'left road', axs[0])\n",
+    "plot_aggregated_speed(full_network, right_road, 'right road', axs[1])\n",
     "\n",
-    "grouped_df.rename({'less_than_20':\"less than 20\", 'more_than_20':\"more than 20\"}, axis='columns', inplace=True)\n",
+    "fig.suptitle(\"Full population simulation (linkstats #1 analysis)\")\n",
     "\n",
-    "ax = grouped_df.plot(kind='bar', stacked=True, rot=20, figsize=(20,4))\n",
+    "for ax in axs:\n",
+    "    ax.set_ylim(bottom=0)\n",
+    "    ax.legend(loc='lower right')\n",
+    "    ax.set_ylabel('speed mph')\n",
     "\n",
-    "# if numbers are required on top of bars:\n",
-    "#\n",
-    "# for (container, pdd, color) in zip(ax.containers, [0,10], ['blue', 'orange']):\n",
-    "#     ax.bar_label(container, padding=pdd, color=color)"
+    "# plt.savefig(f'../local_files/vta-beam-network/linkstats_1_filtered_links.png')"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "acebf0fc-cc25-4e83-8576-b1ed78b30a33",
+   "id": "31837f31-0b03-4289-a7c8-6003c99a2a68",
    "metadata": {},
    "outputs": [],
-   "source": [
-    "grouped_df.sort_values('more than 20')"
-   ]
+   "source": []
   },
   {
    "cell_type": "code",
@@ -206,11 +415,11 @@
     "\n",
     "linkId_to_values = {}\n",
     "\n",
-    "links_with_speed_less_than_threshold = set(linkstats2[linkstats2['freespeed'] < threshold_ms]['link'].unique())\n",
+    "links_with_speed_less_than_threshold = set(full_df[full_df['freespeed'] < threshold_ms]['link'].unique())\n",
     "print(f\"there are {len(links_with_speed_less_than_threshold)} links with free speed less than {threshold_ms} meters per second.\")\n",
     "\n",
     "selected_columns = ['linkId','linkCapacity','linkFreeSpeed','linkLength','numberOfLanes','attributeOrigType']\n",
-    "df = network2[network2['linkId'].isin(links_with_speed_less_than_threshold)][selected_columns]\n",
+    "df = full_df[full_df['linkId'].isin(links_with_speed_less_than_threshold)][selected_columns]\n",
     "df.rename(columns={'linkId':'link_id', \n",
     "                   'linkCapacity':'capacity', \n",
     "                   'linkFreeSpeed':'free_speed', \n",
@@ -245,7 +454,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "ba2cdeeb-20c5-4d55-914d-ebc4a0ec9b21",
+   "id": "2c85d25d-92a6-452e-bb5b-304036089f38",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5242850b-04d1-4e36-8e2c-bc178e92c8a3",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -253,7 +470,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "cc09b33c-5a14-41f4-b3fe-72eb5fbc28c7",
+   "id": "266491e3-52db-4542-8a11-fa3181b9b379",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -261,25 +478,51 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9c8906ff-8e9c-418d-813c-2138fdd7ccfc",
+   "id": "1675d562-dcfe-48dd-b4c9-2a1541cbe6ed",
    "metadata": {},
    "outputs": [],
    "source": [
-    "dd = {\n",
-    "    'l1' : [1,2,3] * 10,\n",
-    "    'l2' : list(range(12,12 + 15)) * 2\n",
-    "}\n",
+    "# group by and apply example\n",
     "\n",
-    "df = pd.DataFrame.from_dict(dd)\n",
-    "display(df.head())\n",
+    "import pandas as pd\n",
+    "\n",
+    "df = pd.DataFrame({'x':[2, 3, -10, -10],\n",
+    "                   'y':[10, 13, 20, 30],\n",
+    "                   'id':['a', 'a', 'b', 'b']})\n",
+    "\n",
+    "def mindist(data):\n",
+    "     return min(data['y'] - data['x'])\n",
+    "\n",
+    "def maxdist(data):\n",
+    "    return max(data['y'] - data['x'])\n",
+    "\n",
+    "def fun(data):\n",
+    "    return pd.Series({\"maxdist\":maxdist(data),\n",
+    "                      \"mindist\":mindist(data)})\n",
     "\n",
-    "df.groupby('l1')[['l2']].agg(l3=('l2', lambda gr: gr[gr >= 16].count()), l4=('l2', lambda gr: gr[gr < 16].count()) )"
+    "df.groupby('id').apply(fun)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "df98f6c0-d777-45b5-9172-faf0410d2ac6",
+   "id": "3ea18a05-39a3-41ca-a884-30eabd465c97",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac8f73db-054d-4013-89e3-980095946bac",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "98c3b0d3-3b73-47f1-a640-353fe563f7d0",
    "metadata": {},
    "outputs": [],
    "source": []
diff --git a/jupyter/map_analysis/network_linkstats.py b/jupyter/map_analysis/network_linkstats.py
index 8a709fcaafe..a02b1a26f30 100644
--- a/jupyter/map_analysis/network_linkstats.py
+++ b/jupyter/map_analysis/network_linkstats.py
@@ -7,7 +7,13 @@
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
-import math
+import geopandas as gpd
+
+from pyproj import CRS, Transformer
+from geopandas import GeoDataFrame
+from shapely.geometry import Point
+from shapely.geometry.multilinestring import MultiLineString
+from IPython.display import clear_output
 
 pd.set_option('display.max_rows', 500)
 pd.set_option('display.max_columns', 500)
@@ -15,108 +21,258 @@
 pd.set_option('max_colwidth', None)
 
 
-# In[ ]:
+unspecified_link_type_name = 'unspecified'
 
 
-## reading files
+def get_direction(row):
+    d_x = 'North'
+    d_y = 'East'
+    if row['fromLocationX'] > row['toLocationX']:
+        d_x = 'South'
+    if row['fromLocationY'] > row['toLocationY']:
+        d_y = 'West'
+    return f'{d_x} - {d_y}'
 
-network = pd.read_csv("../local_files/network.1.csv.gz")
-network['attributeOrigType'].fillna(value='UNKNOWN', inplace=True)
 
-linkstats = pd.read_csv("../local_files/linkstats.1.csv.gz")
+def read_network(file_path):
+    df = pd.read_csv(file_path)
 
-print(network.shape, linkstats.shape)
-# network['attributeOrigType'].value_counts()
+    df['attributeOrigType'].fillna(value=unspecified_link_type_name, inplace=True)
 
+    df["attributeOrigId"] = df['attributeOrigId'].astype(str)
+    df["attributeOrigId"] = pd.to_numeric(df["attributeOrigId"], errors='coerce').fillna(0).astype('Int64')
+    df['direction'] = df.apply(get_direction, axis=1)
+    
+    return df
 
-# In[ ]:
 
+def read_linkstats(file_path):
+    df = pd.read_csv(file_path)
 
-network.head(2)
+    df['speed_ms'] = df['length'] / df['traveltime']
+    df['speed_mph'] = df['speed_ms'] * 2.237
+    df['freespeed_mph'] = df['freespeed'] * 2.237
 
+    df['speed_delta_mph'] = df['freespeed_mph'] - df['speed_mph']
+    
+    return df
 
-# In[ ]:
 
+# sf-bay area CRS is 26910
+def read_geo_network(network_path, network_crs):
+    n_df = read_network(network_path)
+    
+    crs_to_id = 4326 # the LAT LON CRS
+    crs_to = CRS.from_epsg(crs_to_id)
+    crs_from = CRS.from_epsg(network_crs)
+    transformer = Transformer.from_crs(crs_from, crs_to)
+
+    def out_row_to_geometry(df_row):
+        (from_x, from_y) = transformer.transform(df_row['fromLocationX'], df_row['fromLocationY'])
+        (to_x, to_y) = transformer.transform(df_row['toLocationX'], df_row['toLocationY'])
+        mls = MultiLineString([[[from_y, from_x], [to_y, to_x]]])
+        return mls
+
+    geometry = n_df.apply(out_row_to_geometry, axis=1)
+    geo_df = gpd.GeoDataFrame(n_df, crs=f'epsg:{crs_to_id}', geometry=geometry)
+    return geo_df
+    
 
-linkstats.head(2)
+def read_full_network(linkstats_path, network_path):
+    n_df = read_network(network_path)
+    l_df = read_linkstats(linkstats_path)
+    
+    # simple sanity check
+    links_ids_1 = set(l_df['link'].unique())
+    links_ids_2 = set(n_df['linkId'].unique())
+    if (len(links_ids_1 - links_ids_2) != 0):
+        print(f"SOMETHING IS WRONG!! Links are not the same from both files!")
 
+    full_df = l_df.merge(n_df, left_on='link', right_on='linkId', how='outer')
+    return full_df
 
-# In[ ]:
 
+## to do sanity check after merging linkstats and network
+## to ensure that the linkstats was produced from the network
+def do_sanity_check(full_df):
 
-## merging together based on link Id 
-## assuming files are for the same network and has the same number of links
+    def row_sanity_check(row):
+        return row['to'] == row['toNodeId'] \
+           and row['from'] == row['fromNodeId'] \
+           and row['freespeed'] == row['linkFreeSpeed']
 
-full_df = linkstats.merge(network, left_on='link', right_on='linkId', how='outer')
+    return full_df.apply(lambda r: 'equal' if row_sanity_check(r) else 'NOT equal', axis=1).value_counts()
 
-full_df.head(2)
+'init complete'
 
 
 # In[ ]:
 
 
-## doing sanity check
+## reading network file
+# network = read_network("../local_files/vta-beam-network/network.csv.gz")
+# print(f"there are {len(network)} records")
+# display(network.head(2))
+
+## reading linkstats file
+# linkstats = read_linkstats("../local_files/vta-beam-network/1.0.linkstats.csv.gz")
+# print(f"there are {len(linkstats)} records")
+# display(linkstats.head(2))
 
-def row_sanity_check(row):
-    return row['to'] == row['toNodeId'] \
-       and row['from'] == row['fromNodeId'] \
-       and row['freespeed'] == row['linkFreeSpeed']
+# reading full network
+full_network = read_full_network(
+    linkstats_path = "../local_files/vta-beam-network/1.0.linkstats.csv.gz", 
+    network_path = "../local_files/vta-beam-network/network.csv.gz",
+)
 
-print(f"the shape of df is {full_df.shape}")
-full_df.apply(lambda r: 'equal' if row_sanity_check(r) else 'NOT equal', axis=1).value_counts()
+display(full_network.head(2))
 
 
 # In[ ]:
 
 
-## calculating speed, free speed, speed in km/h, speed in mph
+def avg_speed_vol(data):
+    if data['volume'].sum() > 0:
+        return np.average(data['speed_mph'], weights=data['volume'])
+    return np.average(data['speed_mph'])
 
-full_df['speed'] = full_df['length'] / full_df['traveltime']
+def total_volume(data):
+    return np.sum(data['volume'])
 
-full_df['freespeed_km_h'] = full_df['freespeed'] * 3.6
-full_df['speed_km_h'] = full_df['speed'] * 3.6
+def average_freespeed(data):
+    return np.average(data['freespeed_mph'])
 
-full_df['freespeed_mph'] = full_df['freespeed'] * 2.237
-full_df['speed_mph'] = full_df['speed'] * 2.237
+def speeds_analysis(data):
+    return pd.Series({
+        "average weighted speed" : avg_speed_vol(data),
+        "total volume": total_volume(data),
+        "average freespeed": average_freespeed(data),
+        "5 percentile of speed": np.percentile(data['speed_mph'], 5),
+        "95 percentile of speed": np.percentile(data['speed_mph'], 95),
+    })
 
-full_df.head(2)
 
+df = full_network #.head(100000)
+df_g = df.groupby(['hour','attributeOrigType']).apply(speeds_analysis).reset_index()
+df_g['total volume weighted'] = df_g['total volume'] / df_g['total volume'].max()
 
-# In[ ]:
+road_types_df = df_g.groupby('attributeOrigType')['total volume'].sum().reset_index().sort_values('total volume', ascending=False)
+road_types_ordered_by_volume = list(road_types_df['attributeOrigType'])
+
+hours = df_g['hour'].unique()
+
+plt.figure(figsize=(15,7))
+plt.ylim(0, 70)
 
+N_road_types = 6
 
-## speed distribution
+for road_type in road_types_ordered_by_volume[:6]:
+    df_g_f = df_g[df_g['attributeOrigType'] == road_type]
+    avg_speed = df_g_f['average weighted speed']
+    
+    size_multiplier =  df_g_f['total volume weighted'].sum() * 20
+    size = df_g_f['total volume weighted'] * size_multiplier
+    
+    plt.scatter(x=hours, y=avg_speed, s=size, label=f"avg speed mph [{road_type}]", alpha=0.6)
+    plt.plot(hours, avg_speed, alpha=0.2)
 
-max_speed = int(full_df['speed_km_h'].max())
-full_df['speed_km_h'].hist(bins=max_speed, figsize=(15,2))
-print(f'max speed and number of buckets is {max_speed}')
+plt.legend()
+plt.title(f"Average speed for road types weighted by volume (top {N_road_types} road types by volume)")
+plt.show()
 
 
 # In[ ]:
 
 
-## description of speeds for each road type
+def avg_speed_vol(data):
+    if data['volume'].sum() > 0:
+        return np.average(data['speed_mph'], weights=data['volume'])
+    return np.average(data['speed_mph'])
 
-# dfs = []
-dfs_free = []
+def total_volume(data):
+    return np.sum(data['volume'])
 
-road_types = network['attributeOrigType'].unique()
-for road_type in road_types:
-    filtered_network = full_df[full_df['attributeOrigType'] == road_type]
-    
-    # df = filtered_network[['speed']].describe()
-    # df.rename(columns={'speed':f'{road_type} speed'}, inplace=True)
-    # dfs.append(df.transpose())
+def average_freespeed(data):
+    return np.average(data['freespeed_mph'])
+
+def speeds_analysis(data):
+    return pd.Series({
+        "average weighted speed" : avg_speed_vol(data),
+        "total volume": total_volume(data),
+        "average freespeed": average_freespeed(data),
+        "5 percentile of speed": np.percentile(data['speed_mph'], 5),
+        "95 percentile of speed": np.percentile(data['speed_mph'], 95),
+    })
+
+
+links_with_most_volume = set(full_network[full_network['volume'] > np.percentile(full_network['volume'], 99.5)]['link'].unique())
+df = full_network[full_network['link'].isin(links_with_most_volume)]
+
+df_g = df.groupby(['hour','attributeOrigType']).apply(speeds_analysis).reset_index()
+df_g['total volume weighted'] = df_g['total volume'] / df_g['total volume'].max()
+
+road_types_df = df_g.groupby('attributeOrigType')['total volume'].sum().reset_index().sort_values('total volume', ascending=False)
+# display(road_types_df)
+
+road_types_ordered_by_volume = list(road_types_df['attributeOrigType'])
+
+hours = df_g['hour'].unique()
+
+plt.figure(figsize=(15,4))
+plt.ylim(0, 70)
+
+N_road_types = 3
+
+for road_type in road_types_ordered_by_volume[:N_road_types]:
+    df_g_f = df_g[df_g['attributeOrigType'] == road_type]
+    avg_speed = df_g_f['average weighted speed']
+    min_speed = df_g_f["5 percentile of speed"]
     
-    df = filtered_network[['freespeed']].describe()
-    df.rename(columns={'freespeed':f'{road_type} free speed'}, inplace=True)
-    dfs_free.append(df.transpose())
+    size_multiplier =  df_g_f['total volume weighted'].sum() * 20
+    size = df_g_f['total volume weighted'] * size_multiplier
     
-# speed_df = pd.concat(dfs)
-# display(speed_df)
+    scatterplot = plt.scatter(x=hours, y=avg_speed, s=size, label=f"avg speed mph [{road_type}, volume {df_g_f['total volume'].sum()}]", alpha=0.4)
+    col = scatterplot.get_facecolors()[0].tolist()
+    plt.plot(hours, avg_speed, alpha=0.2, c=col)
+    plt.plot(hours, min_speed, alpha=0.2, label=f"5 percentile for speed mph [{road_type}]", c=col)
+
+plt.legend()
+plt.title(f"Average speed for top 0.5% links by volume, weighted by volume (top {N_road_types} road types by volume)")
+plt.show()
+
 
-free_speed_df = pd.concat(dfs_free)
-display(free_speed_df)
+# In[ ]:
+
+
+# sf-bay area CRS is 26910
+geo_network = read_geo_network(
+    network_path = "../local_files/vta-beam-network/network.csv.gz", 
+    network_crs=26910
+)
+
+
+# In[ ]:
+
+
+# filtering and plotting
+
+links_with_most_volume = set(full_network[full_network['volume'] > np.percentile(full_network['volume'], 99.5)]['link'].unique())
+
+fig, ax = plt.subplots(1, 1, figsize=(20, 20), dpi=300, subplot_kw={'aspect': 1})
+
+filtered_1 = geo_network[(geo_network['linkId'].isin(links_with_most_volume))]
+filtered_2 = geo_network[(~geo_network['linkId'].isin(links_with_most_volume))]
+
+filtered_1.plot(ax=ax, label=f'({len(filtered_1)} links) with 0.5% top volume', color='red', lw=1.2)
+filtered_2.plot(ax=ax, label=f'the rest of ({len(filtered_2)} links)', color='blue', lw=0.2)
+
+ax.legend()
+
+# base_file_name = '../local_files/vta-beam-network/links_with_most_volume'
+# network[(network['linkId'].isin(slow_links_ids))].to_csv(base_file_name + ".csv.gz")
+# filtered_1.to_csv(base_file_name + ".geo.csv.gz")
+# filtered_1.to_file(base_file_name + ".shp")
+# plt.savefig(base_file_name + ".png")
 
 
 # In[ ]:
@@ -124,25 +280,84 @@ def row_sanity_check(row):
 
 ## how many of each road type are there with speed less than threshold
 
-grouped_df = full_df.groupby('attributeOrigType')[['linkFreeSpeed']].agg(
-    less_than_20=('linkFreeSpeed', lambda gr:gr[gr < 20].count()),
-    more_than_20=('linkFreeSpeed', lambda gr:gr[gr >= 20].count())
+speed_field = 'speed_mph'
+threshold = 20
+moreX_name = f"more than {threshold} mph"
+lessX_name = f"less than {threshold} mph"
+
+grouped_df = full_df.groupby('attributeOrigType')[[speed_field]].agg(
+    less_than_X=(speed_field, lambda gr:gr[gr < threshold].count()),
+    more_than_X=(speed_field, lambda gr:gr[gr >= threshold].count())
 )
 
-grouped_df.rename({'less_than_20':"less than 20", 'more_than_20':"more than 20"}, axis='columns', inplace=True)
+grouped_df.rename({'less_than_X':lessX_name, 'more_than_X':moreX_name}, axis='columns', inplace=True)
+
+ax = grouped_df.plot(kind='bar', stacked=False, rot=0, figsize=(20,4))
+ax.set_xlabel("")
 
-ax = grouped_df.plot(kind='bar', stacked=True, rot=20, figsize=(20,4))
+plt.savefig('../local_files/vta-beam-network/link_speed_graph.png')
 
-# if numbers are required on top of bars:
-#
-# for (container, pdd, color) in zip(ax.containers, [0,10], ['blue', 'orange']):
-#     ax.bar_label(container, padding=pdd, color=color)
+
+# In[ ]:
+
+
+# an example of analysis of speed of selected links subset
+
+east_bound_left = set([59118,80745]) # NE
+west_bound_left = set([59119,80744]) # SW
+
+east_bound_right = set([20062,34374]) # NE
+west_bound_right = set([20063,34375]) # SW
+
+display(network[network['linkId'].isin(east_bound_left)][['linkId','fromLocationX','fromLocationY','toLocationX','toLocationY','direction']])
+display(network[network['linkId'].isin(west_bound_left)][['linkId','fromLocationX','fromLocationY','toLocationX','toLocationY','direction']])
+display(network[network['linkId'].isin(east_bound_right)][['linkId','fromLocationX','fromLocationY','toLocationX','toLocationY','direction']])
+display(network[network['linkId'].isin(west_bound_right)][['linkId','fromLocationX','fromLocationY','toLocationX','toLocationY','direction']])
+
+
+# In[ ]:
+
+
+# an example of analysis of speed of selected links subset
+# here are two bridges analysis
+
+left_road = set([59118,80745,59119,80744])
+right_road = set([20062,34374,20063,34375]) 
+
+fig,axs = plt.subplots(1,2,figsize=(18,4))
+fig.subplots_adjust(wspace=0.1, hspace=0.4)
+
+def plot_aggregated_speed(df, set_of_ids, label, axis, save_to_csv=False):
+    df_filtered = df[(df['link'].isin(set_of_ids))]
+    directions = df_filtered['direction'].unique()
+    
+    function_name = 'mean'
+    for direction in sorted(directions):
+        df_filtered_direction = df_filtered[df_filtered['direction'] == direction]
+        df_grouped = df_filtered_direction.groupby('hour')[['speed_mph']].agg(function_name).reset_index()
+        df_grouped.plot(x='hour', y='speed_mph', marker='o', linewidth=4, linestyle='-', label=f'{function_name} speed  {label}  [{direction}]', ax=axis, alpha=0.5)
+    
+    if save_to_csv:
+        df_filtered.to_csv(f'../local_files/vta-beam-network/linkstats_1_filtered_links_{label}.csv'.replace(' ', '.'))
+
+    
+plot_aggregated_speed(full_network, left_road, 'left road', axs[0])
+plot_aggregated_speed(full_network, right_road, 'right road', axs[1])
+
+fig.suptitle("Full population simulation (linkstats #1 analysis)")
+
+for ax in axs:
+    ax.set_ylim(bottom=0)
+    ax.legend(loc='lower right')
+    ax.set_ylabel('speed mph')
+
+# plt.savefig(f'../local_files/vta-beam-network/linkstats_1_filtered_links.png')
 
 
 # In[ ]:
 
 
-grouped_df.sort_values('more than 20')
+
 
 
 # In[ ]:
@@ -159,11 +374,11 @@ def row_sanity_check(row):
 
 linkId_to_values = {}
 
-links_with_speed_less_than_threshold = set(linkstats2[linkstats2['freespeed'] < threshold_ms]['link'].unique())
+links_with_speed_less_than_threshold = set(full_df[full_df['freespeed'] < threshold_ms]['link'].unique())
 print(f"there are {len(links_with_speed_less_than_threshold)} links with free speed less than {threshold_ms} meters per second.")
 
 selected_columns = ['linkId','linkCapacity','linkFreeSpeed','linkLength','numberOfLanes','attributeOrigType']
-df = network2[network2['linkId'].isin(links_with_speed_less_than_threshold)][selected_columns]
+df = full_df[full_df['linkId'].isin(links_with_speed_less_than_threshold)][selected_columns]
 df.rename(columns={'linkId':'link_id', 
                    'linkCapacity':'capacity', 
                    'linkFreeSpeed':'free_speed', 
@@ -210,15 +425,43 @@ def get_mean_speed(row):
 # In[ ]:
 
 
-dd = {
-    'l1' : [1,2,3] * 10,
-    'l2' : list(range(12,12 + 15)) * 2
-}
 
-df = pd.DataFrame.from_dict(dd)
-display(df.head())
 
-df.groupby('l1')[['l2']].agg(l3=('l2', lambda gr: gr[gr >= 16].count()), l4=('l2', lambda gr: gr[gr < 16].count()) )
+
+# In[ ]:
+
+
+# group by and apply example
+
+import pandas as pd
+
+df = pd.DataFrame({'x':[2, 3, -10, -10],
+                   'y':[10, 13, 20, 30],
+                   'id':['a', 'a', 'b', 'b']})
+
+def mindist(data):
+     return min(data['y'] - data['x'])
+
+def maxdist(data):
+    return max(data['y'] - data['x'])
+
+def fun(data):
+    return pd.Series({"maxdist":maxdist(data),
+                      "mindist":mindist(data)})
+
+df.groupby('id').apply(fun)
+
+
+# In[ ]:
+
+
+
+
+
+# In[ ]:
+
+
+
 
 
 # In[ ]:

From 5dd5bf411d3c0e77d396d01bc6db134a790e6bfd Mon Sep 17 00:00:00 2001
From: Nikolay Ilyin <irishwithaxe@gmail.com>
Date: Tue, 5 Sep 2023 18:46:37 +0300
Subject: [PATCH 5/5] new line added

---
 docker/jupyter-enhanced/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/jupyter-enhanced/Dockerfile b/docker/jupyter-enhanced/Dockerfile
index 12a07b71f81..15039f1d483 100644
--- a/docker/jupyter-enhanced/Dockerfile
+++ b/docker/jupyter-enhanced/Dockerfile
@@ -3,4 +3,4 @@ FROM jupyter/scipy-notebook
 RUN pip install geopandas pandas pygeos boto s3fs shapely gcsfs
 RUN pip install p2j pyproj pyrosm
 RUN pip install pyshp
-RUN pip install boto3
\ No newline at end of file
+RUN pip install boto3