-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmain_notebook.Rmd
217 lines (153 loc) · 5.91 KB
/
main_notebook.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
---
title: "Patstat to Postgre SQL Database"
output: html_notebook
---
# Setup
```{r}
# Not sure if necessary... was an issue with a version conflict... try new version in the future again...
# devtools::install_version("RPostgres", version = "1.2.1", repos = "http://cran.us.r-project.org")
### general options
Sys.setenv(LANG = "en")
options("scipen" = 100, "digits" = 4) # override R's tendency to use scientific notation
### Clean workspace
rm(list=ls())
graphics.off()
### setWD (maybe needs installation of rstudioapi package)
# require(rstudioapi); setwd(dirname(rstudioapi::getActiveDocumentContext()$path)); getwd()
### Load packages (maybe need to be installed first)
# Standard
library(tidyverse) # General DS toolkit
library(magrittr) # For advanced piping
# Databases
library(DBI) # GEneral R database interface
library(RPostgres) # PostgreSQL interface driver
library(dbplyr) # for dplyr with databases
```
```{r}
# Outsourced in this script
source("R/functions.R")
```
# Preamble
## Define variables
First, we set up some variables, so the script runs on its own.
* The first thing we have to do is to enter the information for connecting with the Postgre SQL database we want to populate with the PATSTAT data
```{r}
# Database access
var_dbname = "INSERT_YOUR_DB"
var_host = "INSERT_YOUR_HOST_ID"
var_port = 5432 # OR other
var_user = "INSERT_OUR_USER_NAME"
var_password = "INSERT_YOUR_PASSWORD"
```
* Next, we need to define the path (relative to our current working directory), where the downloaded PATSTAT files can be found.
* This files all have to be in the same folder.
* Therefore, unzip the `zip` files downloaded from PATSTAT twice until the results are `.csv` files (eg. `tls201_part01.csv`), and copy them all in the same folder.
* Attention: The `readr` package can also diectly read `zip` files, but it will truncate files >4gb (as all unzip options in R), therefore we have to manually do this.
```{r}
# PATSTAT folder
var_folder = "YORUDRIVE/YOURFOLDER"
```
... or load a scipt where you define the variables
```{r}
# LOad script with variables
source("../variables.R")
```
## Connect to database
We now establish a connection to the database.
```{r}
# set up connection to existing PostgreSQL database, just plug in own details
con <- dbConnect(drv = RPostgres::Postgres(),
dbname = var_dbname,
host = var_host,
port = var_port,
user = var_user,
password = var_password,
sslmode = 'require'
)
```
## Load adittional data:
```{r}
# Overview of PATSTAT tables
table_names <- read_csv2("data/PATSTAT_table_description.csv")
```
```{r}
file_list <- tibble(tbl_id = NA,
tbl_part = list.files(path = var_folder, pattern = "tls.*csv"),
tbl_path = paste(var_folder, tbl_part, sep = '/')) %>%
mutate(tbl_part = gsub("\\.csv", "", .$tbl_part),
tbl_id = gsub("\\_.*", "", .$tbl_part) ) %>%
left_join(table_names, by = c('tbl_id' = 'table_id') ) %>%
mutate(tbl_name = paste(tbl_id, name, sep = '_'))
```
# Define the tables in Postgres
```{r}
con %>% dbListTables() %>% sort()
```
```{r}
# # To delete all tables
# rem <- dbListTables(con); rem <- rem[rem != 'users']; for(i in 1:length(rem )){dbRemoveTable(con, rem[i])}; rm(rem)
# # To delete specific tables
# dbRemoveTable(con, "tls902_ipc_nace2")
```
The following outsourced
```{r}
# # To create all tables. If you only want to create a single table, select that one
# source("R/script_create_tables.R")
```
# Populate the Potgres with PATSTAT data
##
```{r}
# Select or disselect certain rows
include_number <- 1:nrow(file_list)
include_name <- c('tls902') # Indicate by name the tables to include, if you want to exclude some by name
exclude_name <- c() # Indicate by name the tables to exclude, if you want to exclude some by name
```
```{r}
# subset the table
tbl_select <- file_list[include_number, ] %>% filter((tbl_id %in% include_name) & !(tbl_id %in% exclude_name))
rm(include_name, exclude_name, include_number)
```
## Execute
```{r}
for(i in 1:nrow(tbl_select) ) {
patstat_import_postgre(con = con,
tbl_path = tbl_select[i,] %>% pull(tbl_path),
tbl_name = tbl_select[i,] %>% pull(tbl_name),
append = TRUE,
overwrite = FALSE
) }
```
<!---
Errors to solve
!!! "tls206_person" ERROR: null value in column "person_id" violates not-null constraint
!!! "tls214_npl_publn" ERROR: null value in column "npl_publn_id" violates not-null constraint DETAIL: Failing row contains (null, null, null, ... (Happens at the end)
#
!!! "tls231_inpadoc_legal_event" ERROR: duplicate key value violates unique constraint "tls230_appln_techn_field_pkey"
--->
```{r}
tbl(con, "tls206_person") %>% head(100)
```
### Count rows
# TODO
##############################################################################
# Indexing
##############################################################################
# Outsourced in this script: You however might want to run it one by one
# Warning: This takes some hours...
source("000_create_index.R")
##############################################################################
# Create Foreign Key
##############################################################################
# Outsourced in this script: You however might want to run it one by one
source("000_create_foreign_key.R")
##############################################################################
# Clustering
##############################################################################
# Outsourced in this script: You however might want to run it one by one
# source("000_clustering.R")
# Note: Not done yet...
##############################################################################
# Disconnect
##############################################################################
dbDisconnect(con)
dbUnloadDriver(drv)