cycle-count-data/paris.qmd at main · NERDSITU/cycle-count-data · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
---
format: gfm
execute:
  eval: false
---

French cycle count datasets are available from https://opendata.paris.fr/explore/dataset/comptage-velo-historique-donnees-compteurs/information/

<!-- https://opendata.paris.fr/api/datasets/1.0/comptage-velo-historique-donnees-compteurs/attachments/2023_comptage_velo_donnees_compteurs_zip/ -->
<!-- https://opendata.paris.fr/api/datasets/1.0/comptage-velo-historique-donnees-compteurs/attachments/2023_comptage_velo_donnees_sites_comptage_zip/ -->

There are two main zip files for each year:

- `comptage_velo_donnees_compteurs.zip` contains the count data
- `comptage_velo_donnees_sites_comptage.zip` contains the location data

Let's import some data for 2023 using DuckDB, a high performance in-memory SQL database.

```{r}
#| eval: true
library(duckdb)
library(tidyverse)
```

```{r}
u = "https://opendata.paris.fr/api/datasets/1.0/comptage-velo-historique-donnees-compteurs/attachments/2023_comptage_velo_donnees_compteurs_zip/"
f = basename(u)
download.file(u, paste0(f, ".zip"))
dir.create(f)
unzip(paste0(f, ".zip"), exdir = f)
list.files(f)
f_csv = list.files(f, pattern = "csv", full.names = TRUE)
system(paste("head", f_csv))
con = DBI::dbConnect(duckdb::duckdb(), ":memory:")
counts_paris_2023 = arrow::read_delim_arrow(f_csv, delim = ";")
names(counts_paris_2023)

#  [1] "Identifiant du compteur"
#  [2] "Nom du compteur"
#  [3] "Identifiant du site de comptage"
#  [4] "Nom du site de comptage"
#  [5] "Comptage horaire"
#  [6] "Date et heure de comptage"
#  [7] "Date d'installation du site de comptage"
#  [8] "Lien vers photo du site de comptage"
#  [9] "Coordonnées géographiques"
# [10] "Identifiant technique compteur"
# [11] "ID Photos"
# [12] "test_lien_vers_photos_du_site_de_comptage_"
# [13] "id_photo_1"
# [14] "url_sites"
# [15] "type_dimage"
# [16] "mois_annee_comptage"
```

Let's keep only the most important columns and rename them to English:

```{r}
counts_paris_2023 = counts_paris_2023 |>
  select(
    counter_id = `Identifiant du compteur`,
    # counter_name = `Nom du compteur`,
    # site_id = `Identifiant du site de comptage`,
    # site_name = `Nom du site de comptage`,
    hourly_count = `Comptage horaire`,
    count_datetime = `Date et heure de comptage`,
    # site_install_date = `Date d'installation du site de comptage`,
    # lat_long = `Coordonnées géographiques`,
    # tech_counter_id = `Identifiant technique compteur`,
    # photos_id = `ID Photos`,
    # photos_url = `url_sites`,
    # image_type = `type_dimage`,
    # month_year = `mois_annee_comptage`
  )
dplyr::glimpse(counts_paris_2023)
```

Let's create basic tables showing the counts per hour, day-of-the-week, and per month:

```{r}
counts_hourly = counts_paris_2023 |>
  mutate(hour = lubridate::hour(count_datetime)) |>
  group_by(hour) |>
  summarise(total_count = sum(hourly_count))
```

The 'rush hour' peaks can be seen from the 05:00-06:00 to 09:00-10:00 and 16:00-17:00 to 20:00-21:00 bins:

```{r}
counts_hourly |>
  slice(c(6:10, 16:21)) |>
  knitr::kable()
```

The busiest days of the week are shown in the table below.

```{r}
counts_dow = counts_paris_2023 |>
  mutate(dow = lubridate::wday(count_datetime, label = TRUE)) |>
  group_by(dow) |>
  summarise(total_count = sum(hourly_count))
counts_dow |>
  knitr::kable()
```

Let's read in all count datasets:

<!-- https://opendata.paris.fr/api/datasets/1.0/comptage-velo-historique-donnees-compteurs/attachments/2018_comptage_velo_donnees_compteurs_csv_zip/ -->


```{r}
years = 2018:2023
year = 2022
data_list = lapply(years, function(year) {
  u = paste0("https://opendata.paris.fr/api/datasets/1.0/comptage-velo-historique-donnees-compteurs/attachments/", year, "_comptage_velo_donnees_compteurs_csv_zip/")
  # u2 = "https://opendata.paris.fr/api/datasets/1.0/comptage-velo-historique-donnees-compteurs/attachments/2018_comptage_velo_donnees_compteurs_csv_zip/"
  # waldo::compare(u, u2)
  # If year is 2022 or later, remove "_csv" from the URL
  if (year >= 2022) {
    u = gsub("_csv", "", u)
  }
  f = basename(u)
  download.file(u, paste0(f, ".zip"))
  dir.create(f)
  unzip(paste0(f, ".zip"), exdir = f)
  f_csv = list.files(f, pattern = "csv", full.names = TRUE)
  counts = arrow::read_delim_arrow(f_csv, delim = ";")
  counts = counts |>
    select(
      counter_id = `Identifiant du compteur`,
      hourly_count = `Comptage horaire`,
      count_datetime = `Date et heure de comptage`
    ) |>
    mutate(year = year)
  counts
})
data_full = bind_rows(data_list)
# write parquet file
arrow::write_parquet(data_full, "counts_paris.parquet")
```

Let's plot the growth in cycling based on simple average count values each month:

```{r}
#| eval: true
data_full = arrow::read_parquet("counts_paris.parquet")
data_full |>
  mutate(year_month = lubridate::floor_date(count_datetime, "month")) |>
  summarise(average_count = mean(hourly_count), .by = year_month) |>
  ggplot(aes(x = year_month, y = average_count)) +
  geom_line() +
  labs(title = "Monthly cycling counts in Paris")
```