-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathFinal_GWAS_all_QC_preprocessing_script.R
More file actions
151 lines (110 loc) · 5.77 KB
/
Copy pathFinal_GWAS_all_QC_preprocessing_script.R
File metadata and controls
151 lines (110 loc) · 5.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#*********************************************************************************
# GWAS QC1-4 :
# R 4.3.1
# by Puneet 27.05.2025
#*********************************************************************************
################################################################################################
# Before using this script, comment the configuration part in individual scripts (ex. GWAS QC1)
################################################################################################
# Required installations (system): Samtools, BCFtools, HTSlib
# URLs:
# https://www.htslib.org/download/
# https://gist.github.com/adefelicibus/f6fd06df1b4bb104ceeaccdd7325b856
## Required R packages
# install.packages(c("tidyverse","rstudioapi","data.table","plinkQC","getopt","ggplot2","GGally","dplyr","purrr","ggpubr"))
# Load necessary libraries
library(readxl)
library(rstudioapi)
library(tidyverse)
library(data.table)
library(plinkQC)
library(ggplot2)
library(ggpubr)
library(GGally)
library(dplyr)
library(purrr)
library(glue)
library(stringr)
#*********************************************************************************
# GWAS QC1 : Basic QC after data merging and filtering [GWAS_QC1.R]
#*********************************************************************************
#========================= CONFIGURATION =============================#
# Tool and data paths
TOOLS_DIR <- "/media/icaro/T7/CRC_Genetic_Data/Tools"
PROJECT_DIR <- "/media/icaro/T7"
QC_DIR <- "PD_PRS_GWAS_QC/COF/Data_QC"
DATA_DIR <- "PD_PRS_GWAS_QC/COF"
# Sample name (prefix of binary PLINK files)
DATA_NAME <- "COF_Data"
#=====================================================================#
source(file.path(PROJECT_DIR, "CRC_Genetic_Data/Codes/Puneet/Final_GWAS_QC_scripts/GWAS_QC1.R"))
#***********************************************************************************************
# GWAS QC2 : Using 1000 genome data to match allele frequency & check population stratification [GWAS_QC2_1000G_generic.R]
#***********************************************************************************************
#========================= CONFIGURATION =============================#
# Tool and data paths
TOOLS_DIR <- "/media/icaro/T7/CRC_Genetic_Data/Tools"
PROJECT_DIR <- "/media/icaro/T7"
QC_DIR <- "PD_PRS_GWAS_QC/COF/Data_QC"
DATA_DIR <- "PD_PRS_GWAS_QC/COF"
PATH_TO_GENOME1000 <- "/media/icaro/T7/CRC_Genetic_Data/common_files/1000genome"
# Sample name (prefix of binary PLINK files)
DATA_NAME <- "COF_Data"
#=====================================================================#
source(file.path(PROJECT_DIR, "CRC_Genetic_Data/Codes/Puneet/Final_GWAS_QC_scripts/GWAS_QC2_1000G.R"))
#*********************************************************************************
# GWAS QC3 : Preimputation QC and VCF Check [GWAS_QC3_preimputation.R]
#*********************************************************************************
#========================= CONFIGURATION =============================#
TOOLS_DIR <- "/media/icaro/T7/CRC_Genetic_Data/Tools"
PROJECT_DIR <- "/media/icaro/T7"
DATA_DIR <- "/media/icaro/T7/PD_PRS_GWAS_QC/COF"
COMMON_FILES <- "/media/icaro/T7/CRC_Genetic_Data/common_files"
INPUT_PREFIX <- "final_QC2_common_flipped_ref_outlier"
REF_LEGEND_FILE <- file.path(COMMON_FILES, "1000genome/1000GP_Phase3_combined.legend")
REF_FASTA <- file.path(COMMON_FILES, "1000genome/human_g1k_v37.fasta")
CHECKVCF_SCRIPT <- file.path(TOOLS_DIR, "checkVCF.py")
# Derived paths
VCF_DIR <- file.path(DATA_DIR, "Data_vcf_check")
PREIMP_DIR <- file.path(VCF_DIR, "preimputation_qc")
IMPUTE_INPUT_DIR <- file.path(PREIMP_DIR, "Input_imputation")
PLINK_EXEC <- file.path(TOOLS_DIR, "plink1.9")
#=====================================================================#
source(file.path(PROJECT_DIR, "CRC_Genetic_Data/Codes/Puneet/Final_GWAS_QC_scripts/GWAS_QC3_preimputation.R"))
#*********************************************************************************
# GWAS : Run Imputation on the Michigan Server
#*********************************************************************************
# Upload your chr wise vcf.gz files to the Michigan imputation server https://imputationserver.sph.umich.edu/.
# Further guidance at https://imputationserver.readthedocs.io/en/latest/.
# These are the settings used:
# Ref panel: HRC r1.1 2016
# rsq filter: OFF (instead 0.3 can be used but then remove "--exclude-if-info 'R2 < 0.3' " in the QC4 script)
# Phasing: Eagle v2.4
# Population: EUR
# Mode: Quality control and imputation
# Download the data into Data_Imputed folder
#*********************************************************************************
# GWAS QC4 : Post imputation QC
#*********************************************************************************
# Use bash terminal to unzip Michigan output files
#***************************************************************
# Use 7zip (as recommended) [sudo apt install p7zip-full p7zip-rar]
# cd to the folder "/media/icaro/T7/PD_PRS_GWAS_QC/COF/Data_Imputed"
# unzip Michigan output files using the password received from Michigan server in Email
# Password : XChk02xP}?bYRi
#
# for chr in $(seq 1 22)
# do
# 7z e chr_$chr.zip
# done
#========================= CONFIGURATION =============================#
TOOLS_DIR <- "/media/icaro/T7/CRC_Genetic_Data/Tools"
PROJECT_DIR <- "/media/icaro/T7"
DATA_DIR <- "/media/icaro/T7/PD_PRS_GWAS_QC/COF"
COMMON_FILES <- "/media/icaro/T7/CRC_Genetic_Data/common_files"
DATASET_NAME <- "COF" # Change for each dataset
VCF_DIR <- file.path(DATA_DIR, "Data_Imputed")
OUTPUT_DIR <- file.path(DATA_DIR, "Post_imputation_QC")
HRC_VCF <- file.path(COMMON_FILES, "HRC.r1-1.GRCh37.wgs.mac5.sites.vcf.gz")
#=====================================================================#
source(file.path(PROJECT_DIR, "CRC_Genetic_Data/Codes/Puneet/Final_GWAS_QC_scripts/GWAS_QC4_Post_imputation.R"))