ht_prs_preg_final.Rmd

---
title: "BP PRS and pregnancy complications"
editor_options:
  chunk_output_type: console
output:
  html_document:
    number_sections: true
    code_folding: hide
    toc: true
    toc_float: 
        collapsed: false
        smooth_scroll: false
    theme: flatly
    highlight: haddock
    df_print: paged
---


```{r setup}

# Chunk defaults: echo the code, suppress warnings and messages in the report.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE
)

```


<details><summary>**Libraries**</summary>


```{r reinstall packages with newer version, eval=F}

# Unload the attached versions before installing the newer local tarballs.
for (attached_pkg in c("package:pROC", "package:forestplot")) {
  detach(attached_pkg, unload = TRUE, character.only = TRUE)
}

# Local tarballs, installed one by one in this order so that dependencies
# come before the packages that need them.
# Not installed at the moment:
#   "packages/PBSmodelling_2.68.8.tar.gz"
#   "packages/PredictABEL_1.2-4.tar.gz"
local_tarballs <- c(
  "packages/pROC_1.18.0.tar.gz",
  "packages/forestplot_2.0.1.tar.gz",
  # survcomp and its dependencies
  "packages/SuppDists_1.1-9.7.tar.gz",
  "packages/survivalROC_1.0.3.tar.gz",
  "packages/rmeta_3.0.tar.gz",
  "packages/bootstrap_2019.6.tar.gz",
  "packages/survcomp_1.46.0.tar.gz",
  # DescTools and its dependencies
  "packages/rootSolve_1.8.2.3.tar.gz",
  "packages/lmom_2.8.tar.gz",
  "packages/Exact_3.1.tar.gz",
  "packages/gld_2.6.4.tar.gz",
  "packages/expm_0.999-6.tar.gz",
  "packages/DescTools_0.99.44.tar.gz"
)

for (tarball in local_tarballs) {
  install.packages(tarball, repos = NULL)
}


```


```{r libraries, class.source = 'fold-show'}

library(tidyverse)
library(survival)   # survival analysis
library(data.table) # fread() function
library(gridExtra)  # plots to grid
library(survminer)  # ggbased visualization and extra diagnostics
library(visdat)     # visualization of tibble and na's
library(tableone)   # Characteristics table
library(compareC)   # Compare c-index
library(Hmisc)
library(PredictABEL) #NRI and IDI
library(pROC)
library(kableExtra) # Pretty tables
library(forestplot)
library(DescTools)  #Pseudo R2
library(survcomp)   #hr.comp2

source('functions_article.R')

fg_pheno_path <- "/finngen/library-red/finngen_R9/phenotype_1.0/data"
data_path <- "data"
fig_path <- "figs5"

```

</details>
<br>


# Data
<br>


## Variables
<br>


**Endpoints**


* O15_GESTAT_HYPERT, gestational hypertension
* O15_PREECLAMPS, pre-eclampsia


**Polygenic risk scores**

* Female specific SBP polygenic risk score
* Preeclampsia risk score

<br>

## Download & preprocess data
<br>


**Preselect phenotype columns**

The columns of interest are preselected from the endpoint file on a 'rather big machine', because the original phenotype file is very large.

```{r, eval=F}


# Names of the phenotype columns to keep (one name per line, no header).
pheno_cols <- fread("fg_pheno_cols.txt", header = FALSE) %>%
  pull(V1)

# The endpoint file is very large, so let fread() parse only the wanted
# columns instead of loading everything and dropping columns afterwards.
# The select(all_of()) step is kept on purpose: it fixes the column order to
# that of `pheno_cols` and still fails loudly if a requested column is absent.
pheno_tmp <- fread(
  str_glue("{fg_pheno_path}/finngen_R9_endpoint_1.0.txt.gz"),
  select = pheno_cols
) %>%
  select(all_of(pheno_cols))

fwrite(pheno_tmp, 'data/finngen_R9_preg.tsv.gz', sep = '\t')

```


<br>

**Load and combine the data**

```{r load & combine}

# All PRS scores were computed with the FinnGen PRS-CS pipeline.

# Polygenic risk scores: keep only the sample id and the averaged score,
# renamed on the fly.
prs_sbp <- fread("data/ukb_ega_prs/ukb.sbp.female.gwas.tsv.sscore") %>%
  select(FINNGENID = IID, SBP_SCORE = SCORE1_AVG)

prs_prec <- fread("data/ukb_ega_prs/ega2.pe.gwas.tsv.sscore") %>%
  select(FINNGENID = IID, PREC_SCORE = SCORE1_AVG)

# Endpoints and covariates (the preselected phenotype columns).
phenotypes <- fread(str_glue("{data_path}/finngen_R9_preg.tsv.gz"), sep = "\t")
  # Open question: should NAs be removed from the disease variables? They are
  # introduced for the purpose of genetic analysis.
  #mutate(across(c(-FINNGENID, -ends_with("_AGE"), -ends_with("_YEAR")),
  #              ~if_else(is.na(.), 0L, .)))

# Analysis covariates, restricted to the columns listed in fg_cov_cols.txt.
covs_cols <- fread("fg_cov_cols.txt", header = FALSE)[["V1"]]
covs_file <- "/finngen/library-red/finngen_R9/analysis_covariates/R9_COV_V1.FID.txt.gz"
covs_in <- fread(covs_file) %>%
  rename(FINNGENID = IID) %>%
  select(all_of(covs_cols))

# Birth register (mothers): keep births from 1969 onwards and number each
# mother's rows.
birth_reg <- fread("/finngen/library-red/finngen_R9/birth_and_dvv_register_1.0/data/finngen_R9_birth_dvv_register_mothers.txt.gz") %>%
  select(MOTHER_FINNGENID, MOTHER_AGE, BIRTH_YEAR, PARITY, DURATION_WEEKS, NRO_FETUSES, IVF, ICSI) %>%
  rename(FINNGENID = MOTHER_FINNGENID) %>%
  filter(BIRTH_YEAR >= 1969) %>%
  # Sort explicitly: CHILD_COUNT is a running number, so without this it
  # would depend on the physical row order of the register file.
  # NOTE(review): assumes BIRTH_YEAR / MOTHER_AGE order births correctly;
  # a no-op if the register is already sorted this way.
  arrange(FINNGENID, BIRTH_YEAR, MOTHER_AGE) %>%
  group_by(FINNGENID) %>%
    mutate(CHILD_COUNT = row_number()) %>%   # Number of the child since 1969
  ungroup()                                  # (PARITY would describe the actual first baby)

### Definition of the first birth from the birth register.
# After the CHILD_COUNT == 1 filter there is exactly one row per mother, so a
# plain transmute() replaces the former group_by() + summarise(min(...)),
# which was a no-op on single-row groups.
first_birth <- birth_reg %>%
  filter(CHILD_COUNT == 1) %>%
  transmute(
    FINNGENID,
    FIRST = 1L,
    FIRST_AGE = MOTHER_AGE,
    FIRST_YEAR = BIRTH_YEAR,
    FIRST_DURATION = DURATION_WEEKS
  )

# Assemble the analysis table: the phenotype file is the backbone and every
# other source is left-joined on FINNGENID.
df <- phenotypes %>%
  left_join(first_birth, by = "FINNGENID") %>%
  # Individuals without a register row get FIRST = 0
  mutate(FIRST = coalesce(FIRST, 0L)) %>%
  left_join(prs_sbp, by = "FINNGENID") %>%
  left_join(prs_prec, by = "FINNGENID") %>%
  left_join(covs_in, by = "FINNGENID") %>%
  as_tibble() %>%
  mutate(across(all_of(c("batch", "cohort")), as.factor))

# Number of individuals:
c("Number of individuals:", nrow(df))


```

<br>

**Males are removed**. Also endpoint names are simplified.

```{r preprocess}

names_ep <- list("GESTAT_HYPERT", "PREECLAMPS")

#glimpse(df)
#vis_dat(sample_n(df, 10000))

# Keep rows with SEX_IMPUTED == 1 (women) and strip the chapter prefixes from
# the endpoint names. The prefixes are removed one after another, exactly as
# before, so the resulting column names are unchanged.
df.f <- df %>%
  filter(SEX_IMPUTED == 1) %>%  # Only women
  rename_with(~ str_remove(.x, "O15_"), contains("O15_")) %>%
  rename_with(~ str_remove(.x, "XV_"), contains("XV_")) %>%
  rename_with(~ str_remove(.x, "I9_"), contains("I9_")) %>%
  rename_with(~ str_remove(.x, "E4_"), contains("E4_")) %>%
  rename_with(~ str_remove(.x, "N14_"), contains("N14_")) %>%
  rename_with(~ str_remove(.x, "D3_"), contains("D3_")) %>%
  # Include gestational diabetes in diabetes.
  # pmin() is the vectorised form of the former rowwise() + min(), which ran
  # an R-level loop over every row of the cohort:
  #  * AGE:  NA if either age is NA (same as min(a, b) before)
  #  * YEAR: the earlier year, or the only non-missing one, or NA if both
  #          are missing (same as the former case_when())
  mutate(
    DIABETES_ALL = if_else(DIABETES == 1 | GEST_DIABETES == 1, 1L, 0L),
    DIABETES_ALL_AGE = pmin(DIABETES_AGE, GEST_DIABETES_AGE),
    DIABETES_ALL_YEAR = pmin(DIABETES_YEAR, GEST_DIABETES_YEAR, na.rm = TRUE)
  )


#vis_dat(sample_n(df.f, 10000))

# Number of individuals:
c("Number of women:", dim(df.f)[1])

```

<br>

We will **end the followup at the age of 55 years**. 

```{r, followup until 55v}


# Censor follow-up at 55 years: FU_END_AGE and the PREGNANCY_BIRTH variables.
# The event flag is reset before the age is capped, so it is decided on the
# uncapped age.
df.f <- df.f %>%
  mutate(
    FU_END_AGE = if_else(FU_END_AGE > 55, 55, FU_END_AGE),
    PREGNANCY_BIRTH = if_else(PREGNANCY_BIRTH_AGE > 55, 0L, PREGNANCY_BIRTH),
    PREGNANCY_BIRTH_AGE = if_else(PREGNANCY_BIRTH_AGE > 55, 55, PREGNANCY_BIRTH_AGE)
  )

# Same treatment for every endpoint in names_ep: events after 55 become
# non-events and the endpoint age is capped at 55.
for (ep in unlist(names_ep)) {
  age_col <- paste0(ep, "_AGE")
  df.f <- df.f %>%
    mutate(
      !!ep := if_else(.data[[age_col]] > 55, 0L, .data[[ep]]),
      !!age_col := if_else(.data[[age_col]] > 55, 55, .data[[age_col]])
    )
}


```

<br>

**Sex-specific Polygenic risk score** is **scaled** (variable SBP_SCALED) and **categorized** (variable SBP_CAT) using values from remaining women.


```{r scaled & categorical}

# Quantile cut points and the matching labels of the PRS categories.
my_probs <- c(0, 0.025, 0.2, 0.8, 0.975, 1)
my_labels <- c("<2.5%", "2.5-20%", "20-80%", "80-97.5%", ">97.5%")

# Standardise both scores and bin them by their quantiles among the women
# in df.f. The scores come from left joins, so a woman without a score has
# NA; na.rm = TRUE keeps quantile() from aborting on such rows (they simply
# get NA as category). With no missing scores the result is unchanged.
df.f <- df.f %>%
  mutate(
    SBP_SCALED = scale(SBP_SCORE)[, 1],
    PREC_SCALED = scale(PREC_SCORE)[, 1],
    SBP_CAT = cut(SBP_SCORE,
                  breaks = quantile(SBP_SCORE, probs = my_probs, na.rm = TRUE),
                  labels = my_labels, include.lowest = TRUE),
    PREC_CAT = cut(PREC_SCORE,
                   breaks = quantile(PREC_SCORE, probs = my_probs, na.rm = TRUE),
                   labels = my_labels, include.lowest = TRUE)
  )

```

<br>

**Only women with pregnancy** are included in the analysis.

```{r only pregnancy}
# Restrict to women with a first birth recorded (FIRST == 1).
df.p <- filter(df.f, FIRST == 1)

```

<br>


## Create clinical covariates & DIFF-variable

<br>

**New age variable 'INDEX_AGE'**

For cases, 'INDEX_AGE' is the age at the first preeclampsia, and for controls it is the age at the first pregnancy. Covariate events before preeclampsia are defined (*_PRE). In addition, the variable 'IS_FIRST' defines whether the first preeclampsia occurred in the first pregnancy, and 'IS_MULTIP' whether it was a multiple pregnancy.


There was no column for 'D3_OTHTHROMBOPHILIA'.


```{r}

# Outcomes for which a per-outcome data set is built.
# (Short alternative: list("GESTAT_HYPERT", "PREECLAMPS", "ECLAMPSIA"))
names_outcomes <- as.list(c(
  "GESTAT_HYPERT", "PREECLAMPS", "ECLAMPSIA", "PRE_OR_ECLAMPSIA",
  "ABORT_SPONTAN", "PLAC_PREMAT_SEPAR", "PRETERM",
  "LABOUR_INTRAPART_HAEMORRH", "LABOUR_FETAL_STRESS", "LABOUR_UMBILICAL",
  "POSTPART_HEAMORRH", "MEMBR_PREMAT_RUPT", "AMNIOT_OTHER"
))

# Clinical covariates used in the models
# (a longer variant also had "CHD_PRE", "PAD_PRE", "STR_PRE").
clin_covs <- c(
  "INDEX_AGE", "IS_MULTIP", "IS_IVF",
  "HYPTENS_PRE", "OBESITY_PRE", "DIABETES_ALL_PRE", "RENFAIL_PRE"
)
clin_covs_age <- c(
  "HYPTENS_AGE", "OBESITY_AGE", "DIABETES_ALL_AGE", "RENFAIL_AGE",
  "CHD_AGE", "PAD_AGE", "STR_AGE"
)
gen_covs <- c("batch", "BL_YEAR", paste0("PC", 1:10))
clin_covs_prelim <- c("HYPTENS", "OBESITY", "DIABETES_ALL", "RENFAIL")

# One (initially empty) slot per outcome, named after the outcome.
list.df.p <- setNames(
  vector(mode = "list", length = length(names_outcomes)),
  unlist(names_outcomes)
)

# Default outcome, handy when stepping through the loop body by hand.
outcome <- "GESTAT_HYPERT"
#outcome <- "PREECLAMPS"

 # Build one analysis data set per outcome and store it in list.df.p[[outcome]].
 # For each woman an INDEX_AGE is chosen (age at the outcome for cases, age at
 # first birth for controls), the birth-register row closest to it is picked,
 # and the clinical covariates are restricted to events before INDEX_AGE.
 for (outcome in unlist(names_outcomes))
{
 
  # One row per (woman, register birth). PREG_START_AGE is the mother's age at
  # the birth minus the pregnancy duration in years; when the duration is
  # missing, 0.75 years is subtracted instead.
  # TMP is not referenced again in this chunk.
  tmp <- df.p %>%
     left_join(birth_reg, by="FINNGENID") %>% 
     select(FINNGENID, MOTHER_AGE, DURATION_WEEKS, PARITY, CHILD_COUNT, NRO_FETUSES, IVF, ICSI, contains(outcome), contains("FIRST")) %>%
     mutate(PREG_START_AGE = if_else(!is.na(DURATION_WEEKS), MOTHER_AGE - DURATION_WEEKS/52, MOTHER_AGE - 0.75), TMP = DURATION_WEEKS/52)
  
  # Cases: INDEX_AGE is the age at the outcome.
  tmp_cases <- tmp %>%
     filter(get(outcome) == 1) %>%     
     mutate(INDEX_AGE = get(str_glue("{outcome}_AGE")),  # This line differs for cases and controls!
            DIFF_WEEKS = (INDEX_AGE - PREG_START_AGE)*52, 
            DIFF_TMP = DIFF_WEEKS) %>%  # Helper variable used only to rank rows for 'slice'
   # Set DIFF_WEEKS to NA unless the event falls between the start of this
   # pregnancy and one month (1/12 year) after the birth.
   mutate(DIFF_WEEKS = if_else((INDEX_AGE >= PREG_START_AGE) & (INDEX_AGE <= MOTHER_AGE + 1/12), DIFF_WEEKS, NA_real_)) %>%
   # Ranking trick: rows not linked to a pregnancy get a penalty of 100 weeks,
   # so rows linked to a pregnancy are used primarily and other rows only
   # secondarily.
   mutate(DIFF_TMP = if_else(is.na(DIFF_WEEKS), abs(DIFF_TMP)+100, abs(DIFF_TMP)) )  %>%
   #filter(INDEX_AGE >= PREG_START_AGE, INDEX_AGE <= MOTHER_AGE + 1/12) %>% #Exclude if not same pregnancy
   group_by(FINNGENID) %>% # Select the row with the smallest time difference
     slice(which.min(DIFF_TMP)) %>%
   ungroup() 
    
   # Controls: INDEX_AGE is the age at the first birth.
   tmp_controls <- tmp %>%
     filter(get(outcome) == 0) %>%     
     mutate(INDEX_AGE = FIRST_AGE,  # This line differs for cases and controls!
             DIFF_WEEKS = (INDEX_AGE - PREG_START_AGE)*52) %>%
     # Exclude rows that do not belong to the same pregnancy
     filter(INDEX_AGE >= PREG_START_AGE, INDEX_AGE <= MOTHER_AGE + 1/12) %>%  # Only the 'current pregnancy' is left.
     group_by(FINNGENID) %>%   # Select the row with the smallest time difference
       slice(which.min(abs(DIFF_WEEKS))) %>%
     ungroup()

   # Pregnancy-level flags taken from the selected register row.
   # IS_IVF treats a missing IVF/ICSI value as 0; IS_MULTIP and IS_FIRST stay
   # NA when their source column is NA.
   tmp_br <- bind_rows(tmp_cases, tmp_controls) %>%
     arrange(FINNGENID) %>%
     mutate(IS_MULTIP = if_else(NRO_FETUSES >=2, 1L, 0L),
            IS_IVF = if_else(IVF==1 | ICSI==1, 1L, 0L),
            IS_IVF = if_else(is.na(IS_IVF), 0L, IS_IVF),  # Remove NAs.
            IS_FIRST = if_else(PARITY ==1, 1L, 0L)) %>%
     select(FINNGENID, PREG_START_AGE, INDEX_AGE, DIFF_WEEKS, IS_MULTIP, IS_IVF, IS_FIRST)
     
   # Up to this point INDEX_AGE has been defined for everyone and the counts match.
   
  # New variables
  df.p2 <-df.p %>%
    left_join(tmp_br, by = "FINNGENID") %>%  
    mutate( #AGE_SCALED = scale(INDEX_AGE),
           "{outcome}_NA_AGE" := if_else(get(outcome) == 1, get(str_glue("{outcome}_AGE")), NA_real_) )  %>%   # Age at event for cases only (NA otherwise), for the summary

    # Keep a disease covariate only if it occurred before the index age:
    # creates <COV>_PRE, which is the covariate value when <COV>_AGE < INDEX_AGE
    # and 0 otherwise.
     mutate(across(all_of(clin_covs_prelim), 
                list(PRE=~if_else(get(str_glue("{cur_column()}_AGE")) < INDEX_AGE, ., 0L))))  %>%

    select(FINNGENID, contains("PREG"), contains("FIRST"), INDEX_AGE, contains("DIFF"), 
           contains(outcome), ends_with("SCALED"), ends_with("_CAT"), 
           all_of(clin_covs), all_of(clin_covs_age), all_of(gen_covs)) %>%
    #select(FINNGENID, GESTAT_HYPERT, INDEX_AGE, contains("GESTAT_HYPERT")) %>% 
    # Women without a row in tmp_br (e.g. outcome is NA, or a control with no
    # register row matching the first birth) have no INDEX_AGE and are dropped.
    filter(!is.na(INDEX_AGE)) 
  
   list.df.p[[outcome]] = df.p2
   
  
  #df.p2 %>% 
  #  mutate_if(is.integer, as.factor) %>% 
  #  mutate_at(vars(contains('_PRE'), IS_MULTIP, IS_IVF), as.factor) %>%
  #  summary()
  
}


```

<br>


## Characteristics

**Percentage of pregnant women** 

and **average age** for the first pregnancy.

```{r pregnancy & age}

# Share of women with a recorded first birth (all women in df.f) ...
tmp1 <- summarise(
  df.f,
  N_fem = n(),
  N_preg = sum(FIRST),
  preg_perc = mean(FIRST) * 100
)

# ... and mean / SD of the age at first birth (women in df.p).
tmp2 <- summarise(
  df.p,
  preg_age = mean(FIRST_AGE),
  preg_age_sd = sd(FIRST_AGE)
)

preg_summary <- cbind(tmp1, tmp2)

knitr::kable(preg_summary, digits = 2) %>%
  kable_styling(full_width = FALSE, position = "left") %>%
  kable_classic(position = "left") %>%
  row_spec(0, bold = TRUE)


```

<br>

**Correlation, scores:**

```{r}

# Pearson correlation between the two standardised scores.
with(df.p, cor(SBP_SCALED, PREC_SCALED))

```

<br>


**Characteristics**
```{r}

cont_vars <- c("INDEX_AGE")
bin_vars <- c("IS_MULTIP", "IS_IVF", "HYPTENS_PRE", "OBESITY_PRE", "DIABETES_ALL_PRE", "RENFAIL_PRE")
names_outcomes <- list("GESTAT_HYPERT", "PREECLAMPS")

# One characteristics table per outcome: N, continuous and binary variables.
char_tables <- lapply(names_outcomes, function(outcome) {
  outcome_data <- list.df.p[[outcome]]
  bind_rows(
    n_to_chartable(outcome, outcome_data),
    contvar_to_chartable(cont_vars, outcome, outcome_data),
    binvar_to_chartable(bin_vars, outcome, outcome_data)
  )
})
names(char_tables) <- names_outcomes

# Put the tables side by side, drop the repeated label column, and replace
# every cell that starts with a single digit 0-4 followed by a space by "<5".
do.call(cbind, char_tables) %>%
  select(-PREECLAMPS.Characteristics) %>%
  mutate(across(everything(), ~ str_replace(.x, "^[0-4] .*", "<5"))) %>%
  knitr::kable() %>%
  kable_classic() %>%
  row_spec(0, bold = TRUE)

```


# Survival analysis

<br>


## PH assumption: log log curve {.tabset}

<br>

**Model by survfit**

```{r km}

# Fit one Kaplan-Meier model per endpoint in names_ep, stratified by the
# PRS category column `strata`.
#
# The formula object and the endpoint name are spliced into the call with
# bquote(), so the call stored in each fit is self-contained, e.g.
#   survfit(formula = Surv(PREECLAMPS_AGE, PREECLAMPS) ~ SBP_CAT,
#           data = list.df.p[["PREECLAMPS"]])
# Previously the stored call referred to the lambda-local `ep`. Code that
# re-evaluates the call (the plotting chunks need a global `ep` for this
# reason) therefore resolved formula and data against whatever the global
# `ep` happened to be, i.e. the first endpoint for every fit.
km_by_endpoint <- function(strata) {
  lapply(names_ep, function(ep) {
    km_formula <- as.formula(str_glue("Surv({ep}_AGE, {ep}) ~ {strata}"))
    eval(bquote(survfit(.(km_formula), data = list.df.p[[.(ep)]])))
  }) %>% setNames(names_ep)
}

kms <- km_by_endpoint("SBP_CAT")

kms.prec <- km_by_endpoint("PREC_CAT")

```

<br>

### KM: SBP PRS

```{r km plot, results="asis"}


 # A global `ep` has to exist before plotting: oddly, execution crashes
 # without this initialisation.
 # NOTE(review): presumably because the stored survfit call refers to `ep` - confirm.
 ep <- names_ep[[1]]

 # One KM panel per endpoint, arranged side by side and written to disk.
 km_panel_sbp <- function(ep) {
   my_ggkmplot(kms[[ep]], str_glue("{ep}"), my_labels, xlim = c(15, 50))
 }
 plot_list <- lapply(names_ep, km_panel_sbp)
 plots <- arrange_ggsurvplots(
   plot_list,
   nrow = 1, ncol = 2,
   title = "Kaplan Meier curves",
   print = FALSE
 )
 ggsave(
   filename = str_glue("{fig_path}/km.prs.preg.png"),
   plot = plots, height = 4, width = 10, dpi = 150
 )

```

![KM curves](figs5/km.prs.preg.png)


### KM: Pre-eclampsia PRS

```{r kmplot prec,results="asis"}


 # A global `ep` has to exist before plotting: oddly, execution crashes
 # without this initialisation.
 # NOTE(review): presumably because the stored survfit call refers to `ep` - confirm.
 ep <- names_ep[[1]]

 # One KM panel per endpoint (pre-eclampsia PRS strata), saved as one figure.
 km_panel_prec <- function(ep) {
   my_ggkmplot(kms.prec[[ep]], str_glue("{ep}"), my_labels, xlim = c(15, 50))
 }
 plot_list.prec <- lapply(names_ep, km_panel_prec)
 plots.prec <- arrange_ggsurvplots(
   plot_list.prec,
   nrow = 1, ncol = 2,
   title = "Kaplan Meier curves",
   print = FALSE
 )
 ggsave(
   filename = str_glue("{fig_path}/km.prec.prs.preg.png"),
   plot = plots.prec, height = 4, width = 10, dpi = 150
 )

```

![KM curves](figs5/km.prec.prs.preg.png)


### log log: SBP PRS

```{r log log, results="asis"}

 # A global `ep` has to exist before plotting: oddly, execution crashes
 # without this initialisation.
 # NOTE(review): presumably because the stored survfit call refers to `ep` - confirm.
 ep <- names_ep[[1]]

 # Same panels as the KM plot, drawn on the complementary log-log scale.
 loglog_panel_sbp <- function(ep) {
   my_ggkmplot(kms[[ep]], str_glue("{ep}"), my_labels, xlim = c(15, 50), fun = "cloglog")
 }
 plot_list <- lapply(names_ep, loglog_panel_sbp)
 plots <- arrange_ggsurvplots(
   plot_list,
   nrow = 1, ncol = 2,
   title = "log log curves",
   print = FALSE
 )
 ggsave(
   filename = str_glue("{fig_path}/log.prs.preg.png"),
   plot = plots, height = 4, width = 10, dpi = 150
 )

  
```

![log log curves](figs5/log.prs.preg.png)


### log log: Pre-eclampsia PRS


```{r log log prec, fig.width=5, fig.height=7, results="asis"}

 # A global `ep` has to exist before plotting: oddly, execution crashes
 # without this initialisation.
 # NOTE(review): presumably because the stored survfit call refers to `ep` - confirm.
 ep <- names_ep[[1]]

 # Complementary log-log panels for the pre-eclampsia PRS strata.
 loglog_panel_prec <- function(ep) {
   my_ggkmplot(kms.prec[[ep]], str_glue("{ep}"), my_labels, xlim = c(15, 50), fun = "cloglog")
 }
 plot_list.prec <- lapply(names_ep, loglog_panel_prec)
 plots.prec <- arrange_ggsurvplots(
   plot_list.prec,
   nrow = 1, ncol = 2,
   title = "log log curves",
   print = FALSE
 )
 ggsave(
   filename = str_glue("{fig_path}/log.prec.prs.preg.png"),
   plot = plots.prec, height = 4, width = 10, dpi = 150
 )
  
```

![log log curves](figs5/log.prec.prs.preg.png)


## {-}

The log-log curves are not straight lines, probably because the probability of pregnancy changes over time. However, for most curves the distance between the curves stays roughly constant over time.

<br>

## Cox model: Continuous score {.tabset}

<br>

We will use covariate adjusted Cox models to study associations between *continuous* SBP and preeclampsia PRS's and different endpoints. 

Covariates:

* Batch
* Data collection year
* Genetic PC1-PC10

<br>

### SBP PRS

```{r cox cs sbp prs, warnings=F}

# Genetic covariates and their "a + b + ..." form for use in formulas.
gen_covs <- c("batch", "BL_YEAR", paste0("PC", 1:10))
gen_covs_f <- paste(gen_covs, collapse = " + ")

# One Cox model per endpoint: continuous (scaled) SBP PRS plus the genetic
# covariates. The coxph() call itself is left untouched because it is what
# gets printed (and re-evaluated) later.
cxs.cs <- setNames(
  lapply(names_ep, function(ep) {
    my_formula <- str_glue("Surv({ep}_AGE, {ep}) ~ SBP_SCALED + {gen_covs_f}")
    coxph(as.formula(my_formula), data=list.df.p[[ep]])
  }),
  names_ep
)

# Hazard-ratio table restricted to the SBP terms.
table.cxs.cs.sbp <- my_cxlist_to_hrtable2(
  cxs.cs, list.df.p,
  ep_title = "Endpoint", select = "SBP_"
)

knitr::kable(table.cxs.cs.sbp, digits = 2) %>%
  kable_classic() %>%
  row_spec(0, bold = TRUE)
  

```


### Preeclampsia PRS

```{r cox cs prec prs, warnings=F}

# One Cox model per endpoint: continuous (scaled) pre-eclampsia PRS plus the
# genetic covariates. The coxph() call itself is left untouched because it is
# what gets printed (and re-evaluated) later.
cxs.cs.prec <- setNames(
  lapply(names_ep, function(ep) {
    my_formula <- str_glue("Surv({ep}_AGE, {ep}) ~ PREC_SCALED + {gen_covs_f}")
    coxph(as.formula(my_formula), data=list.df.p[[ep]])
  }),
  names_ep
)

# Hazard-ratio table restricted to the pre-eclampsia PRS terms.
table.cxs.cs.prec <- my_cxlist_to_hrtable2(
  cxs.cs.prec, list.df.p,
  ep_title = "Endpoint", select = "PREC_"
)

knitr::kable(table.cxs.cs.prec, digits = 2) %>%
  kable_classic() %>%
  row_spec(0, bold = TRUE)
  

```


### Significance: SBP PRS vs. preeclampsia PRS


**GTH**

```{r}

# Compare the two per-SD hazard ratios for gestational hypertension:
# SBP PRS model (1) against pre-eclampsia PRS model (2).
cx1 <- summary(cxs.cs[["GESTAT_HYPERT"]])
cx2 <- summary(cxs.cs.prec[["GESTAT_HYPERT"]])

# Score vectors from the endpoint's data set
x1 <- list.df.p[["GESTAT_HYPERT"]][["SBP_SCALED"]]
x2 <- list.df.p[["GESTAT_HYPERT"]][["PREC_SCALED"]]

# Log hazard ratios and their standard errors
beta1 <- cx1[["coefficients"]]["SBP_SCALED", "coef"]
se1   <- cx1[["coefficients"]]["SBP_SCALED", "se(coef)"]
beta2 <- cx2[["coefficients"]]["PREC_SCALED", "coef"]
se2   <- cx2[["coefficients"]]["PREC_SCALED", "se(coef)"]

# Sample size as reported by the second model
n <- cx2[["n"]]

hr.comp2(x1, beta1, se1, x2, beta2, se2, n)

```


**Preeclampsia**

```{r}

# Compare the two per-SD hazard ratios for pre-eclampsia:
# SBP PRS model (1) against pre-eclampsia PRS model (2).
cx1 <- summary(cxs.cs[["PREECLAMPS"]])
cx2 <- summary(cxs.cs.prec[["PREECLAMPS"]])

# Score vectors from the endpoint's data set
x1 <- list.df.p[["PREECLAMPS"]][["SBP_SCALED"]]
x2 <- list.df.p[["PREECLAMPS"]][["PREC_SCALED"]]

# Log hazard ratios and their standard errors
beta1 <- cx1[["coefficients"]]["SBP_SCALED", "coef"]
se1   <- cx1[["coefficients"]]["SBP_SCALED", "se(coef)"]
beta2 <- cx2[["coefficients"]]["PREC_SCALED", "coef"]
se2   <- cx2[["coefficients"]]["PREC_SCALED", "se(coef)"]

# Sample size as reported by the second model
n <- cx2[["n"]]

hr.comp2(x1, beta1, se1, x2, beta2, se2, n)

```


### Forest plots

```{r fig.width=8, fig.height=4}

# Endpoints shown in the forest plot, their display names and row order.
eps_to_forest <- c("PREECLAMPS", "GESTAT_HYPERT")
pretty_names_to_forest <- c(PREECLAMPS = "Preeclampsia", ECLAMPSIA = "Eclampsia", GESTAT_HYPERT = "Gestational\nHypertension")
order_forest <- c(PREECLAMPS = 1, ECLAMPSIA = 2, GESTAT_HYPERT = 3)

# Stack the two HR tables and split the formatted "HR (lower-upper)" string
# back into three numeric columns.
table_to_forest <-
  bind_rows(table.cxs.cs.sbp %>% mutate(prs = "BP PRS"),
            table.cxs.cs.prec %>% mutate(prs = "Preeclampsia PRS")) %>%
  filter(Endpoint %in% eps_to_forest) %>%
  # "a (b-c)" -> "a-b-c", which separate() then splits on "-"
  mutate(`HR (95% CI)` = str_replace(`HR (95% CI)`, "^(.+) .(.+-.+).$", "\\1-\\2")) %>%
  separate("HR (95% CI)", c("mean", "lower", "upper"), sep = "-") %>%
  mutate(nr = order_forest[Endpoint]) %>%
  mutate(Endpoint = pretty_names_to_forest[Endpoint]) %>%
  # Blank label column, used as the first text column of the plot
  mutate(empty = "                             ") %>%
  arrange(nr) %>%
  select(empty, Endpoint, mean, lower, upper, prs) %>%
  mutate(across(c(mean, lower, upper), as.double))


# One group per PRS; both groups are drawn in the same rows.
forest <- table_to_forest %>%
  group_by(prs) %>%
  forestplot(labeltext = c(empty, Endpoint),
             graph.pos = 2,
             clip = c(0.8, 1.5),
             zero = 1,
             xlog = FALSE,
             boxsize = 0.1,
             lty.ci = 1,
             lwd.ci = 1.8,
             ci.vertices = FALSE,
             col = fpColors(box = c("#F8766D", "#00BFC4"),
                            line = c("#F8766D", "#00BFC4")),
             legend_args = fpLegend(pos = list(x = -0.45, y = 0.86)),
             txt_gp = fpTxtGp(legend = gpar(cex = 1),
                              ticks = gpar(cex = 1),
                              label = gpar(cex = 1.1)),
             xticks = c(1, 1.2, 1.4),
             fn.ci_norm = list(list(fpDrawNormalCI, fpDrawCircleCI),
                               list(fpDrawNormalCI, fpDrawCircleCI))
  )

# Alternative colours: c("#CA0020", "#0571B0")

# Export to PDF. print() is called explicitly: a bare `forest` only draws
# through top-level autoprinting, so the file would be empty if this code
# were run via source() or from inside a function.
pdf(file = str_glue("{fig_path}/graph_abs.pdf"), height = 2.5, width = 6)
print(forest)
dev.off()

# Show the plot in the report as well.
forest

```


### {-}

<details><summary>**All coefficients**</summary>


```{r all coefs}

# Print every fitted model once, under its endpoint name.
# lapply(x, print) showed each model twice: once through print() and once
# more when the list returned by lapply() was auto-printed.
print(cxs.cs)
print(cxs.cs.prec)

```


</details>
<br>

<details><summary>**Proportional hazard assumption**</summary>

Schoenfeld residuals against the transformed time:


```{r shoenfeld}

# cox.zph() that yields NULL instead of stopping when the test fails.
zph_or_null <- function(fit) {
  tryCatch(cox.zph(fit), error = function(e) NULL)
}

# SBP PRS models
setNames(
  lapply(names_ep, function(ep) zph_or_null(cxs.cs[[ep]])),
  names_ep
)

# Pre-eclampsia PRS models
setNames(
  lapply(names_ep, function(ep) zph_or_null(cxs.cs.prec[[ep]])),
  names_ep
)

```

</details>
<br>


## Cox model: Categorized score

<br>

We will use covariate adjusted Cox models to study associations between SBP PRS score *categories* and different endpoints. Endpoints with significant results from previous model are selected.

Covariates:

* Batch
* Data collection year
* Genetic PC1-PC10

<br>

### HR table {.tabset}

#### SBP PRS

```{r cat sbp prs hr table, warnings=F}

names_ep <- list("GESTAT_HYPERT", "PREECLAMPS")
pretty_names_ep <- list(GESTAT_HYPERT = "Gestational Hypertension", PREECLAMPS = "Preeclampsia")


# Per-endpoint data with the middle band ("20-80%") as reference level of
# both PRS category factors.
list.dfr.p.nona <- lapply(names_ep, function(ep) {
  mutate(
    list.df.p[[ep]],
    SBP_CAT = relevel(SBP_CAT, ref = "20-80%"),
    PREC_CAT = relevel(PREC_CAT, ref = "20-80%")
  )
})
names(list.dfr.p.nona) <- names_ep

# Cox model per endpoint: SBP PRS category plus the genetic covariates.
# The coxph() call is kept as is, since it is re-evaluated by later chunks.
cxs <- setNames(
  lapply(names_ep, function(ep) {
    my_formula <- str_glue("Surv({ep}_AGE, {ep}) ~ SBP_CAT + {gen_covs_f}")
    coxph(as.formula(my_formula), data=list.dfr.p.nona[[ep]])
  }),
  names_ep
)

lapply(cxs, royston)

# Rows of the HR table for one endpoint: case/control counts per PRS
# category joined with the category coefficients, preceded by an empty row
# that later carries the endpoint heading.
sbp_cat_rows <- function(ep) {
  cat_name <- "SBP_CAT"

  tmp <- my_extr_coef(cxs[[ep]], select = cat_name, title = ep) %>%
    mutate(names = str_remove(names, cat_name))

  extract_ns2(ep, cat_name, list.df.p[[ep]]) %>%
    left_join(tmp, by = c("quantile" = "names")) %>%
    my_tidy_table() %>%
    select(quantile, cases, controls, est, pval) %>%
    add_row(quantile = "", cases = NA, controls = NA, est = "", pval = NA, .before = 1)
}

bind_rows(lapply(names_ep, sbp_cat_rows)) %>%
  rename(PRS = quantile, `HR (95% CI)` = est, `P-value` = pval, Cases = cases, Controls = controls) %>%
  # Everything as text, with missing cells shown as empty strings
  mutate(across(everything(), ~ replace_na(as.character(.x), ""))) %>%
  knitr::kable(digits = 2) %>%
  kable_classic() %>%
  pack_rows(pretty_names_ep[[1]], 1, 6) %>%
  pack_rows(pretty_names_ep[[2]], 7, 12) %>%
  # Further endpoints would occupy rows 13-18 and 19-24.
  row_spec(0, bold = TRUE)


```  


#### Preeclampsia PRS

```{r cat preec prs hr table, warnings=F}

# Cox model per endpoint: pre-eclampsia PRS category plus the genetic
# covariates. The coxph() call is kept as is, since it is re-evaluated by
# later chunks.
cxs.prec <- setNames(
  lapply(names_ep, function(ep) {
    my_formula <- str_glue("Surv({ep}_AGE, {ep}) ~ PREC_CAT + {gen_covs_f}")
    coxph(as.formula(my_formula), data=list.dfr.p.nona[[ep]])
  }),
  names_ep
)

lapply(cxs.prec, royston)

# Rows of the HR table for one endpoint: case/control counts per PRS
# category joined with the category coefficients, preceded by an empty row
# that later carries the endpoint heading.
prec_cat_rows <- function(ep) {
  cat_name <- "PREC_CAT"

  tmp <- my_extr_coef(cxs.prec[[ep]], select = cat_name, title = ep) %>%
    mutate(names = str_remove(names, cat_name))

  extract_ns2(ep, cat_name, list.df.p[[ep]]) %>%
    left_join(tmp, by = c("quantile" = "names")) %>%
    my_tidy_table() %>%
    select(quantile, cases, controls, est, pval) %>%
    add_row(quantile = "", cases = NA, controls = NA, est = "", pval = NA, .before = 1)
}

bind_rows(lapply(names_ep, prec_cat_rows)) %>%
  rename(PRS = quantile, `HR (95% CI)` = est, `P-value` = pval, Cases = cases, Controls = controls) %>%
  # Everything as text, with missing cells shown as empty strings
  mutate(across(everything(), ~ replace_na(as.character(.x), ""))) %>%
  knitr::kable(digits = 2) %>%
  kable_classic() %>%
  pack_rows(pretty_names_ep[[1]], 1, 6) %>%
  pack_rows(pretty_names_ep[[2]], 7, 12) %>%
  # Further endpoints would occupy rows 13-18 and 19-24.
  row_spec(0, bold = TRUE)

```  

### {-}


<br>

### Cox-plots

<br>

<details><summary>Choosing covariate values for the plot</summary>

We used *mean* values for all numeric variables. For 'batch', we use the level with the *lowest significance*. Let us look at the least significant coefficients in a couple of example cases:

```{r select covar values for cox, eval=T}

# Coefficient table of a fitted Cox model sorted by p-value; the last 30
# rows, i.e. the terms with the largest p-values, are returned.
least_significant_terms <- function(cx) {
  summary(cx)$coefficients %>%
    as.data.frame() %>%
    mutate(name = row.names(.)) %>%
    rename(pval = "Pr(>|z|)") %>%
    select("name", "coef", "exp(coef)", "se(coef)", "pval") %>%
    arrange(pval) %>%
    tail(30)
}

cx.test <- cxs$GESTAT_HYPERT
least_significant_terms(cx.test)

cx.test <- cxs$PREECLAMPS
least_significant_terms(cx.test)

```


</details>
<br>

**SBP PRS**

```{r fig.show = 'hide'}

# Prediction grid: one row per SBP PRS category, with the covariate values
# returned by my_expand_covs() for the chosen batch.
exp.tmp.covs <- my_expand_covs(df.p, batch = "AxiomGT1_b77_V5")
exp.tmp.vars <- expand.grid(SBP_CAT = levels(df.p[["SBP_CAT"]]))
exp.cxs <- bind_cols(exp.tmp.vars, exp.tmp.covs)


# Model-based survival curves for every row of the grid, per endpoint.
fit.cxs <- setNames(
  lapply(names_ep, function(ep){
    survfit(cxs[[ep]], newdata = exp.cxs)
  }),
  names_ep
)


# Panels are built one by one rather than in a loop, because each of them
# needs its own settings. `ep` is set globally before each panel, as before.

ep <- "GESTAT_HYPERT"
p1 <- my_ggcoxplot(
  fit.cxs[[ep]], exp.cxs,
  title = pretty_names_ep[[ep]], labels = my_labels,
  ylim = c(0, 0.13), conf.int = FALSE, legend = "none",
  xlab = "", ylab = "Cumulative incidence"
)[["plot"]]

ep <- "PREECLAMPS"
p2 <- my_ggcoxplot(
  fit.cxs[[ep]], exp.cxs,
  title = pretty_names_ep[[ep]], labels = my_labels,
  ylim = c(0, 0.13), conf.int = FALSE, legend = "none",
  xlab = "Years", ylab = "Cumulative incidence"
)[["plot"]]


# Left column of the combined figure.
plot_htprs <- grid.arrange(
  p1, p2,
  ncol = 1,
  top = text_grob("\n           BP PRS\n", size = 15, face = "bold", lineheight = 0.5)
)

```

<br>

**Preeclampsia PRS**


```{r cox preeclampsia prs, fig.show = 'hide'}

# Prediction grid: one row per pre-eclampsia PRS category, combined with the
# covariate values in exp.tmp.covs.
exp.tmp.vars.prec <- expand.grid(PREC_CAT = levels(df.p[["PREC_CAT"]]))
exp.cxs.prec <- bind_cols(exp.tmp.vars.prec, exp.tmp.covs)


# Model-based survival curves for every row of the grid, per endpoint.
fit.cxs.prec <- setNames(
  lapply(names_ep, function(ep){
    survfit(cxs.prec[[ep]], newdata = exp.cxs.prec)
  }),
  names_ep
)


# Panels are built one by one rather than in a loop, because each of them
# needs its own settings. `ep` is set globally before each panel, as before.

# Upper panel: this one carries the legend, placed inside the plot area
# with its entries in reversed order.
ep <- "GESTAT_HYPERT"
p1.p <- my_ggcoxplot(
  fit.cxs.prec[[ep]], exp.cxs.prec,
  title = "", labels = my_labels,
  ylim = c(0, 0.13), conf.int = FALSE, legend = "none",
  xlab = "", ylab = "", legend.title = "PRS"
)[["plot"]] +
  theme(legend.position = c(0.2,0.62)) +
  guides(col = guide_legend(reverse = TRUE))
  # Alternative tried: scale_linetype_manual(guide = guide_legend(reverse = TRUE))

ep <- "PREECLAMPS"
p2.p <- my_ggcoxplot(
  fit.cxs.prec[[ep]], exp.cxs.prec,
  title = "", labels = my_labels,
  ylim = c(0, 0.13), conf.int = FALSE, legend = "none",
  xlab = "Years", ylab = ""
)[["plot"]]


# Right column of the combined figure.
plot_preclprs <- grid.arrange(
  p1.p, p2.p,
  ncol = 1,
  top = text_grob("\n          Preeclampsia PRS\n", size = 15, face = "bold", lineheight = 0.5)
)

```


<br>

**Combined plot**

```{r, combined plot, fig.show = 'hide'}

# Both columns side by side, written as PNG and as EPS (cairo device).
plot_prs <- grid.arrange(plot_htprs, plot_preclprs, ncol = 2)

ggsave(
  filename = str_glue("{fig_path}/cx.both.prs.preg.png"),
  plot = plot_prs, height = 7, width = 7, dpi = 300
)
ggsave(
  filename = str_glue("{fig_path}/cx.both.prs.preg.eps"),
  plot = plot_prs, device = cairo_ps, height = 7, width = 7, dpi = 300
)

```

![**Cox plots**](figs5/cx.both.prs.preg.png)


## C-statistics for pre-eclampsia

<br>

### Models

**Model 1: base model**

```{r model1 for c-index}

# Analysis set for the logistic models.
# All models of this section are fitted on the same complete cases: glm()
# silently drops rows with a missing predictor, which would make fitted()
# shorter than df.p2 (breaking roc() and the reclassification tables) and
# would let the compared models be fitted on different women. With no
# missing values the data set is unchanged.
df.p2 <- list.df.p$PREECLAMPS %>%
  filter(!is.na(INDEX_AGE)) %>%
  drop_na(PREECLAMPS, SBP_SCALED, PREC_SCALED, all_of(clin_covs))

clin_covs_f <- paste(clin_covs, collapse = " + ")

# Model 1: clinical covariates only.
my_formula1 <- as.formula(str_glue("PREECLAMPS ~ {clin_covs_f}"))
glm1 <- glm(my_formula1, data=df.p2, family=binomial())
glm1


# Choice of pseudo-R2: Veall and Zimmermann concluded that, from a set of six
# widely used measures, the one suggested by McKelvey and Zavoina had the
# closest correspondence to the ordinary least squares R2. The Aldrich-Nelson
# pseudo-R2 with the Veall-Zimmermann correction is the best approximation of
# the McKelvey-Zavoina pseudo-R2. The Efron, Aldrich-Nelson, McFadden and
# Nagelkerke approaches severely underestimate the "true R2".

R2.1 <- PseudoR2(glm1, which="McKelveyZavoina")

glm1 %>%
  extract_glm_table() #%>%
#  knitr::kable() %>%
#  kable_classic() %>%
#  row_spec(0,bold=TRUE)


```

<br>

**Model 2: pre-eclampsia PRS + covariates**


```{r model2 for c-index}

# Model 2: pre-eclampsia PRS on top of the clinical covariates.
# (Unused table filter: "_SCALED|PREG|HYP|OBES|DIAB|RENF")
my_formula2 <- as.formula(
  str_glue("PREECLAMPS ~ PREC_SCALED + {clin_covs_f}")
)
glm2 <- glm(my_formula2, data=df.p2, family=binomial())
glm2

R2.2 <- PseudoR2(glm2, which = "McKelveyZavoina")

# Coefficient table (kable styling currently switched off)
glm2 %>%
  extract_glm_table()


```

<br>

**Model 3: pre-eclampsia PRS + SBP PRS + covariates**


```{r model3 for c-index}


# Model 3: both polygenic scores on top of the clinical covariates.
my_formula3 <- as.formula(
  str_glue("PREECLAMPS ~ SBP_SCALED + PREC_SCALED + {clin_covs_f}")
)
glm3 <- glm(my_formula3, data=df.p2, family=binomial())
glm3

R2.3 <- PseudoR2(glm3, which = "McKelveyZavoina")

# Coefficient table (kable styling currently switched off)
glm3 %>%
  extract_glm_table()

# Pseudo-R2 of models 1-3 as a one-column table, rounded to three decimals.
R2 <- round(
  rename(bind_rows(R2.1, R2.2, R2.3), PseudoR2 = McKelveyZavoina),
  3
)

```

<br>

<details><summary>**Test models: Only SBP PRS, Only PREC PRS, SBP PRS + clinical covariates**</summary>


```{r model test}


# Formulas of the four test models: each score alone, and each score
# together with the clinical covariates.
my_formula4 <- as.formula(str_glue("PREECLAMPS ~ SBP_SCALED"))
my_formula5 <- as.formula(str_glue("PREECLAMPS ~ PREC_SCALED"))
my_formula6 <- as.formula(str_glue("PREECLAMPS ~ SBP_SCALED + {clin_covs_f}"))
my_formula7 <- as.formula(str_glue("PREECLAMPS ~ PREC_SCALED + {clin_covs_f}"))

# Fit, print and tabulate each model in turn.
glm_onlysbp <- glm(my_formula4, data=df.p2, family=binomial())
print(glm_onlysbp)
onlysbp <- extract_glm_table(glm_onlysbp)

glm_onlyprec <- glm(my_formula5, data=df.p2, family=binomial())
print(glm_onlyprec)
onlyprec <- extract_glm_table(glm_onlyprec)

glm_sbpclin <- glm(my_formula6, data=df.p2, family=binomial())
print(glm_sbpclin)
sbpclin <- extract_glm_table(glm_sbpclin)

glm_precclin <- glm(my_formula7, data=df.p2, family=binomial())
print(glm_precclin)
precclin <- extract_glm_table(glm_precclin)

# Coefficient tables: SBP PRS models first, then pre-eclampsia PRS models.
for (glm_table in list(onlysbp, sbpclin, onlyprec, precclin)) {
  print(glm_table)
}

```

</details>

<br>

### C index


**C-index**

For a glm, the C-index corresponds to the AUC, whereas for a Cox model a distinct 'Harrell's C-index' would be used. We use the AUC.


```{r, c-index}

# ROC curves of models 1-3, built from the fitted probabilities.
roc.base <- roc(PREECLAMPS ~ fitted(glm1), data = df.p2)
roc.prs1 <- roc(PREECLAMPS ~ fitted(glm2), data = df.p2)
roc.prs2 <- roc(PREECLAMPS ~ fitted(glm3), data = df.p2)

# Optional visual check:
#junk <- plot(roc.base,         lty = 1, col = 1, add = F, legacy.axes = T)
#junk <- plot(roc.prs1,         lty = 2, col = 2, add = T)
#junk <- plot(roc.prs2,         lty = 3, col = 3, add = T)
#legend(0.4, 0.2, lty = 1:2, col = 1:2, legend = c("base","prs1", "prs2"), bty = "n")


# Each model is compared with the previous one; the base model gets a row of
# its own on top with its AUC rounded to four decimals.
cindex_steps <- rbind(
  my.roc.test(roc.prs1, roc.base, "+ Preeclampsia PRS"),
  my.roc.test(roc.prs2, roc.prs1, "+ Blood Pressure PRS")
)
cindex.table <- add_row(
  cindex_steps,
  Name = "Base model",
  `C-index` = round(roc.base$auc, 4),
  .before = 1
)

```


**NRI & IDI**

We use 8% cutoff for categorical NRI.

<details><summary>**NRI**</summary>

```{r class.source = 'fold-show', nri & idi}

# Optional visual check of the fitted probabilities by outcome:
#par(mfrow=c(1,3))
#boxplot(glm1$fitted.values ~ df.p.noNA$PREECLAMPS)
#boxplot(glm2$fitted.values ~ df.p.noNA$PREECLAMPS)
#boxplot(glm2$fitted.values ~ df.p.noNA$PREECLAMPS)
#par(mfrow=c(1,1))

# Plain data frame with the outcome in column 1 (referenced via cOutcome = 1).
df.p.tmp <- as.data.frame(
  select(df.p2, PREECLAMPS, SBP_SCALED, PREC_SCALED, all_of(clin_covs))
)

# Risk categories split at 8 %.
nri_cutoff <- c(0, 0.08, 1)

# Step 1: base model -> + pre-eclampsia PRS
nri.1 <- my.reclassification(
  data = df.p.tmp, cOutcome = 1,
  predrisk1 = fitted(glm1),
  predrisk2 = fitted(glm2),
  cutoff = nri_cutoff
)
# Step 2: + pre-eclampsia PRS -> + blood pressure PRS
nri.2 <- my.reclassification(
  data = df.p.tmp, cOutcome = 1,
  predrisk1 = fitted(glm2),
  predrisk2 = fitted(glm3),
  cutoff = nri_cutoff
)


# One row per step, plus an empty "Base model" row on top.
nri_steps <- rbind(
  my.reclas.wrapper(nri.1, "\\+ Preeclampsia PRS"),
  my.reclas.wrapper(nri.2, "\\+ Blood Pressure PRS")
)
nri.table <- add_row(nri_steps, Name = "Base model", .before = 1)

```

</details>

**The table is created**

```{r, cindex, nri and idi tables}

# Show missing cells as blanks in kable output.
options(knitr.kable.NA = '')

# C-index, NRI/IDI (without its duplicate Name column) and pseudo-R2
# side by side.
comparison_table <- cbind(cindex.table, select(nri.table, -Name), R2)

knitr::kable(comparison_table) %>%
  kable_classic(font = 11) %>%
  row_spec(0, bold = TRUE) %>%
  column_spec(c(1, 3, 5, 9), width_min = "4cm") %>%
  column_spec(c(7, 8), width = "2.5cm") %>%
  column_spec(c(2, 4, 6, 10, 11), width_min = "1.5cm") %>%
  scroll_box(width = "100%")

```