1+ # !/usr/bin/env Rscript
2+ array <- Sys.getenv(" SLURM_ARRAY_TASK_ID" )
3+ id.job <- as.numeric(array )
4+
5+ # Packages
6+ library(BEDMatrix )
7+ suppressMessages(library(data.table ))
8+ suppressMessages(library(ddpcr ))
9+ suppressMessages(library(dplyr ))
10+ library(optparse )
11+
12+ # Options
13+ option_list <- list (
14+ make_option(" --path.ref" , type = " character" , default = FALSE , action = " store" , help = " Path of the reference panel plus the prefixes"
15+ ),
16+ make_option(" --trait" , type = " character" , default = FALSE , action = " store" , help = " Name of the trait"
17+ ),
18+ make_option(" --path.out" , type = " character" , default = FALSE , action = " store" , help = " Path of the output file plus the prefixes"
19+ )
20+ )
21+
22+ opt <- parse_args(OptionParser(option_list = option_list ))
23+
24+ path.ref <- opt $ path.ref
25+ trait <- opt $ trait
26+ path.out <- opt $ path.out
27+
28+ # User-defined functions
29+ # ACAT
30+ ACAT <- function (Pvals , Weights = NULL ) {
31+ if (sum(is.na(Pvals )) > 0 ) {
32+ stop(" Cannot have NAs in the p-values!" )
33+ }
34+ if ((sum(Pvals < 0 ) + sum(Pvals > 1 )) > 0 ){
35+ stop(" P-values must be between 0 and 1!" )
36+ }
37+ is.zero <- (sum(Pvals == 0 ) > = 1 )
38+ is.one <- (sum(Pvals == 1 ) > = 1 )
39+ if (is.zero && is.one ) {
40+ return (- 1 )
41+ }
42+ if (is.zero ) {
43+ return (0 )
44+ }
45+ if (is.one ) {
46+ return (1 )
47+ }
48+
49+ if (is.null(Weights )) {
50+ Weights <- rep(1 / length(Pvals ), length(Pvals ))
51+ } else if (length(Weights ) != length(Pvals )) {
52+ stop(" The length of weights should be the same as that of the p-values" )
53+ } else if (sum(Weights < 0 ) > 0 ){
54+ stop(" All the weights must be positive!" )
55+ } else {
56+ Weights <- Weights / sum(Weights )
57+ }
58+
59+ is.small <- (Pvals < 1e-16 )
60+ if (sum(is.small ) == 0 ){
61+ cct.stat <- sum(Weights * tan((0.5 - Pvals ) * pi ))
62+ } else {
63+ cct.stat <- sum((Weights [is.small ] / Pvals [is.small ]) / pi )
64+ cct.stat <- cct.stat + sum(Weights [! is.small ] * tan((0.5 - Pvals [! is.small ]) * pi ))
65+ }
66+
67+ if (cct.stat > 1e15 ){
68+ pval <- (1 / cct.stat ) / pi
69+ } else {
70+ pval <- pcauchy(cct.stat , lower.tail = F )
71+ }
72+
73+ return (pval )
74+ }
75+
76+ # PatchUp
77+ PatchUp <- function (M ) {
78+ M <- apply(M , 2 , function (x ) {
79+ x [is.na(x )] <- mean(x , na.rm = TRUE )
80+ return (x )
81+ })
82+
83+ return (M )
84+ }
85+
86+ # Options for test runs
87+ test <- FALSE
88+ if (test ) {
89+ path.ref <- " /gpfs/home/zz17/resource/1000-genome/sequence/1000G.EUR.ALLSNP.QC.CHR"
90+ trait <- " COVID-B2-V5"
91+ path.out <- " /gpfs/research/chongwu/zichenzhang/output/test-run/SUMMIT/"
92+ }
93+
94+ # Object to store the output
95+ out <- data.frame (
96+ gene_symbol = character (),
97+ gene_id = character (),
98+ chromosome = numeric (),
99+ model_best = character (),
100+ r2_test = numeric (),
101+ p_SCAD = numeric (),
102+ p_ElNet = numeric (),
103+ p_LASSO = numeric (),
104+ p_MCP = numeric (),
105+ p_MNet = numeric (),
106+ z_SCAD = numeric (),
107+ z_ElNet = numeric (),
108+ z_LASSO = numeric (),
109+ z_MCP = numeric (),
110+ z_MNet = numeric (),
111+ p_ACAT = numeric (),
112+ gene_pos = numeric (),
113+ runtime = numeric (),
114+ stringsAsFactors = FALSE
115+ )
116+
117+ # Total number of jobs = 16884
118+ files <- dir(" /gpfs/home/zz17/output/output_pos/" )
119+
120+ # Main iteration
121+ for (gene.index in (1 + (id.job - 1 ) * 100 ): (min(16884 , id.job * 100 ))) {
122+ print(gene.index )
123+
124+ # Start tracking runtime
125+ time.start <- proc.time()[3 ]
126+
127+ # The vector to store all the updates in this iteration
128+ update <- rep(NA , 18 )
129+
130+ # Load weight
131+ load(paste0(" /gpfs/home/zz17/output/output_pos/" , files [gene.index ]))
132+ update [5 ] <- max(out.r2 [1 , ], na.rm = TRUE )
133+ if (update [5 ] < = 0.005 ) {
134+ next
135+ }
136+
137+ # Get the chromosome
138+ chr <- snps $ SNPChr [1 ]
139+
140+ # Read the summary statistics file
141+ if (file.exists(paste0(" /gpfs/research/chongwu/zichenzhang/summary-statistics-old/" , trait , " -" , chr , " .sumstats" ))) {
142+ ss <- paste0(" /gpfs/research/chongwu/zichenzhang/summary-statistics-old/" , trait , " -" , chr , " .sumstats" ) %> % fread() %> % as.data.frame()
143+ } else {
144+ next
145+ }
146+ names(ss ) <- colnames(ss ) %> % tolower()
147+ ss [" ID" ] <- paste0(ss $ chr , " _" , ss $ pos )
148+
149+ # Load the reference panel
150+ quiet(
151+ bim.ref <- as.data.frame(fread(paste0(path.ref , chr , " .bim" )))
152+ )
153+ quiet(
154+ genotype.ref <- BEDMatrix(paste0(path.ref , chr ), simple_names = TRUE )
155+ )
156+
157+ names(bim.ref )[c(2 , 5 , 6 )] <- c(" SNP" ," a1" , " a2" )
158+
159+ # Find the best model
160+ model.best <- which.max(out.r2 [1 , ])
161+
162+ # Create new identifier for reference panel
163+ bim.ref [" ID" ] <- paste0(bim.ref $ V1 , " _" , bim.ref $ V4 )
164+
165+ # Find the common snps in all three data sets
166+ list.common <- intersect(bim.ref $ ID , snps $ ID ) %> % intersect(. , ss $ ID )
167+
168+ # Skip if no common snps found
169+ if (length(list.common ) == 0 ) {
170+ next
171+ }
172+
173+ # Trim genotype.ref
174+ genotype.temp <- genotype.ref [, bim.ref $ ID %in% list.common ]
175+ bim.temp <- bim.ref [bim.ref $ ID %in% list.common , ]
176+
177+ # Fix the NAs in reference panel
178+ if (sum(is.na(genotype.temp )) != 0 ) {
179+ genotype.temp <- PatchUp(genotype.temp )
180+ }
181+
182+ # Trim ss
183+ ss.temp <- ss [ss $ ID %in% list.common , ]
184+
185+ # Trim snps and wgt.matrix
186+ index.temp <- snps $ ID %in% list.common
187+
188+ snps <- snps [index.temp , ]
189+ out.weight <- out.weight [index.temp , ]
190+ if (" numeric" %in% class(out.weight )) {
191+ out.weight <- t(as.matrix(out.weight ))
192+ }
193+
194+ rm(index.temp )
195+
196+ # New
197+ # Re-arrange every data sets
198+ m.1 <- match(ss.temp $ ID , bim.temp $ ID )
199+ m.2 <- match(ss.temp $ ID , snps $ ID )
200+
201+ bim.temp <- bim.temp [m.1 , ]
202+ genotype.temp <- genotype.temp [, m.1 ]
203+
204+ snps <- snps [m.2 , ]
205+ out.weight <- out.weight [m.2 , ]
206+
207+ # Align the mismatched alleles
208+ problem.1 <- ss.temp $ a1 != bim.temp $ a1
209+ genotype.temp [, problem.1 ] <- 2 - genotype.temp [, problem.1 ]
210+
211+ problem.2 <- ss.temp $ a1 != snps $ a1
212+ # flipped <- snps$SNP[problem.2]
213+ out.weight [problem.2 , ] <- - 1 * out.weight [problem.2 , ]
214+
215+ # # Old
216+ # # Re-arrange every data sets
217+ # m.1 <- match(bim.temp$ID, ss.temp$ID)
218+ # m.2 <- match(bim.temp$ID, snps$ID)
219+
220+ # ss.temp <- ss.temp[m.1, ]
221+ # snps <- snps[m.2, ]
222+ # out.weight <- out.weight[m.2, ]
223+
224+ # # Align the mismatched alleles
225+ # problem <- ss.temp$a1 != snps$a1
226+ # ss.temp$a1 <- snps$a1
227+ # ss.temp$a2 <- snps$a2
228+ # ss.temp$beta[problem] <- -1 * ss.temp$beta[problem]
229+ # ss.temp$z[problem] <- -1 * ss.temp$z[problem]
230+
231+ # Compute LD matrix
232+ genotype.temp <- scale(genotype.temp )
233+ matrix.LD <- t(genotype.temp ) %*% genotype.temp / (nrow(genotype.temp ) - 1 )
234+
235+ # Catch: When there is only one row in wgt.matrix
236+ if (" numeric" %in% class(out.weight )) {
237+ out.weight <- out.weight %> % as.matrix() %> % t() %> % as.data.frame()
238+ }
239+
240+ # Catch: Over-fitting
241+ for (qwe in 1 : 5 ) {
242+ if (max(abs(out.weight [, qwe ])) > = 10 ) {
243+ out.weight [, qwe ] <- 0
244+ }
245+ }
246+
247+ # Iteration by method
248+ for (w in 1 : ncol(out.weight )) {
249+ # Settings
250+ weights <- out.weight [, w ]
251+
252+ # Skip if weight is a zero vector
253+ if (sum(weights ) == 0 ) {
254+ update [5 + w ] <- NA
255+ next
256+ }
257+
258+ # Keep the non-zero components of weights vector
259+ keep <- (weights != 0 )
260+ print(sum(keep ))
261+ weights <- weights [keep ]
262+ # print(sum(names(weights) %in% flipped))
263+
264+ # Compute TWAS z-score, r2, and p-value
265+ z.twas <- as.numeric(weights %*% ss.temp $ z [keep ])
266+ r2.twas <- as.numeric(weights %*% matrix.LD [keep , keep ] %*% weights )
267+
268+ update [5 + w ] <- 2 * (pnorm(abs(z.twas / sqrt(r2.twas )), lower.tail = F ))
269+ update [10 + w ] <- z.twas
270+ }
271+
272+ # ACAT
273+ check.na <- ! is.na(update [6 : 10 ])
274+ check.sign <- out.r2 [1 , ] > 0
275+
276+ check.final <- check.na & check.sign
277+
278+ if (sum(check.final ) == 0 ) {
279+ update [16 ] <- NA
280+ } else {
281+ update [16 ] <- ACAT(update [6 : 10 ][check.final ], out.r2 [1 , check.final ] / sum(out.r2 [1 , check.final ]))
282+ }
283+
284+ # Stop tracking runtime
285+ time.end <- proc.time()[3 ]
286+
287+ # ################
288+ # Output format #
289+ # ################
290+ # 1. Gene symbol
291+ # 2. ENSG id
292+ # 3. Chromosome
293+ # 4. Best model
294+ # 5. Best model's R^2 on testing data (Skipped)
295+ # 6-10. P-value of TWAS from weights constructed by (SCAD, ElNet, LASSO, MCP, and MNet)
296+ # 11-15. Z-score of TWAS from weights constructed by (SCAD, ElNet, LASSO, MCP, and MNet)
297+ # 16. P-value from ACAT
298+ # 17. Gene position
299+ # 18. Runtime
300+
301+ # Update
302+ update [1 ] <- snps $ GeneSymbol [1 ]
303+ update [2 ] <- snps $ Gene [1 ]
304+ update [3 ] <- chr
305+ update [4 ] <- names(model.best )
306+ update [17 ] <- snps $ GenePos [1 ]
307+ update [18 ] <- time.end - time.start
308+
309+ out [nrow(out ) + 1 , ] <- update
310+ }
311+
312+ # Write the result
313+ dir.create(paste0(path.out , trait ))
314+ write.table(out , file = paste0(path.out , trait , " /" , trait , " -" , id.job ), row.names = FALSE , quote = FALSE )
0 commit comments