Skip to content

Commit f658587

Browse files
Add files via upload
1 parent 1cc48f5 commit f658587

File tree

1 file changed

+314
-0
lines changed

1 file changed

+314
-0
lines changed

step2_pos.R

Lines changed: 314 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,314 @@
1+
#!/usr/bin/env Rscript
2+
array <- Sys.getenv("SLURM_ARRAY_TASK_ID")
3+
id.job <- as.numeric(array)
4+
5+
# Packages
6+
library(BEDMatrix)
7+
suppressMessages(library(data.table))
8+
suppressMessages(library(ddpcr))
9+
suppressMessages(library(dplyr))
10+
library(optparse)
11+
12+
# Options
13+
option_list <- list(
14+
make_option("--path.ref", type = "character", default = FALSE, action = "store", help = "Path of the reference panel plus the prefixes"
15+
),
16+
make_option("--trait" , type = "character", default = FALSE, action = "store", help = "Name of the trait"
17+
),
18+
make_option("--path.out", type = "character", default = FALSE, action = "store", help = "Path of the output file plus the prefixes"
19+
)
20+
)
21+
22+
opt <- parse_args(OptionParser(option_list = option_list))
23+
24+
path.ref <- opt$path.ref
25+
trait <- opt$trait
26+
path.out <- opt$path.out
27+
28+
# User-defined functions
29+
# ACAT
30+
ACAT <- function(Pvals, Weights = NULL) {
31+
if (sum(is.na(Pvals)) > 0) {
32+
stop("Cannot have NAs in the p-values!")
33+
}
34+
if ((sum(Pvals < 0) + sum(Pvals > 1)) > 0){
35+
stop("P-values must be between 0 and 1!")
36+
}
37+
is.zero <- (sum(Pvals == 0) >= 1)
38+
is.one <- (sum(Pvals == 1) >= 1)
39+
if (is.zero && is.one) {
40+
return(-1)
41+
}
42+
if (is.zero) {
43+
return(0)
44+
}
45+
if (is.one) {
46+
return(1)
47+
}
48+
49+
if (is.null(Weights)) {
50+
Weights <- rep(1 / length(Pvals), length(Pvals))
51+
} else if (length(Weights) != length(Pvals)) {
52+
stop("The length of weights should be the same as that of the p-values")
53+
} else if (sum(Weights < 0) > 0){
54+
stop("All the weights must be positive!")
55+
} else {
56+
Weights <- Weights / sum(Weights)
57+
}
58+
59+
is.small <- (Pvals < 1e-16)
60+
if (sum(is.small) == 0){
61+
cct.stat <- sum(Weights * tan((0.5 - Pvals) * pi))
62+
} else {
63+
cct.stat <- sum((Weights[is.small] / Pvals[is.small]) / pi)
64+
cct.stat <- cct.stat + sum(Weights[!is.small] * tan((0.5 - Pvals[!is.small]) * pi))
65+
}
66+
67+
if (cct.stat > 1e15){
68+
pval <- (1 / cct.stat) / pi
69+
} else {
70+
pval <- pcauchy(cct.stat, lower.tail = F)
71+
}
72+
73+
return(pval)
74+
}
75+
76+
# PatchUp
77+
PatchUp <- function(M) {
78+
M <- apply(M, 2, function(x) {
79+
x[is.na(x)] <- mean(x, na.rm = TRUE)
80+
return(x)
81+
})
82+
83+
return(M)
84+
}
85+
86+
# Options for test runs
87+
test <- FALSE
88+
if (test) {
89+
path.ref <- "/gpfs/home/zz17/resource/1000-genome/sequence/1000G.EUR.ALLSNP.QC.CHR"
90+
trait <- "COVID-B2-V5"
91+
path.out <- "/gpfs/research/chongwu/zichenzhang/output/test-run/SUMMIT/"
92+
}
93+
94+
# Object to store the output
95+
out <- data.frame(
96+
gene_symbol = character(),
97+
gene_id = character(),
98+
chromosome = numeric(),
99+
model_best = character(),
100+
r2_test = numeric(),
101+
p_SCAD = numeric(),
102+
p_ElNet = numeric(),
103+
p_LASSO = numeric(),
104+
p_MCP = numeric(),
105+
p_MNet = numeric(),
106+
z_SCAD = numeric(),
107+
z_ElNet = numeric(),
108+
z_LASSO = numeric(),
109+
z_MCP = numeric(),
110+
z_MNet = numeric(),
111+
p_ACAT = numeric(),
112+
gene_pos = numeric(),
113+
runtime = numeric(),
114+
stringsAsFactors = FALSE
115+
)
116+
117+
# Total number of jobs = 16884
118+
files <- dir("/gpfs/home/zz17/output/output_pos/")
119+
120+
# Main iteration
121+
for (gene.index in (1 + (id.job - 1) * 100):(min(16884, id.job * 100))) {
122+
print(gene.index)
123+
124+
# Start tracking runtime
125+
time.start <- proc.time()[3]
126+
127+
# The vector to store all the updates in this iteration
128+
update <- rep(NA, 18)
129+
130+
# Load weight
131+
load(paste0("/gpfs/home/zz17/output/output_pos/", files[gene.index]))
132+
update[5] <- max(out.r2[1, ], na.rm = TRUE)
133+
if (update[5] <= 0.005) {
134+
next
135+
}
136+
137+
# Get the chromosome
138+
chr <- snps$SNPChr[1]
139+
140+
# Read the summary statistics file
141+
if (file.exists(paste0("/gpfs/research/chongwu/zichenzhang/summary-statistics-old/", trait, "-", chr, ".sumstats"))) {
142+
ss <- paste0("/gpfs/research/chongwu/zichenzhang/summary-statistics-old/", trait, "-", chr, ".sumstats") %>% fread() %>% as.data.frame()
143+
} else {
144+
next
145+
}
146+
names(ss) <- colnames(ss) %>% tolower()
147+
ss["ID"] <- paste0(ss$chr, "_", ss$pos)
148+
149+
# Load the reference panel
150+
quiet(
151+
bim.ref <- as.data.frame(fread(paste0(path.ref, chr, ".bim")))
152+
)
153+
quiet(
154+
genotype.ref <- BEDMatrix(paste0(path.ref, chr), simple_names = TRUE)
155+
)
156+
157+
names(bim.ref)[c(2, 5, 6)] <- c("SNP" ,"a1", "a2")
158+
159+
# Find the best model
160+
model.best <- which.max(out.r2[1, ])
161+
162+
# Create new identifier for reference panel
163+
bim.ref["ID"] <- paste0(bim.ref$V1, "_", bim.ref$V4)
164+
165+
# Find the common snps in all three data sets
166+
list.common <- intersect(bim.ref$ID, snps$ID) %>% intersect(., ss$ID)
167+
168+
# Skip if no common snps found
169+
if (length(list.common) == 0) {
170+
next
171+
}
172+
173+
# Trim genotype.ref
174+
genotype.temp <- genotype.ref[, bim.ref$ID %in% list.common]
175+
bim.temp <- bim.ref[bim.ref$ID %in% list.common, ]
176+
177+
# Fix the NAs in reference panel
178+
if (sum(is.na(genotype.temp)) != 0) {
179+
genotype.temp <- PatchUp(genotype.temp)
180+
}
181+
182+
# Trim ss
183+
ss.temp <- ss[ss$ID %in% list.common, ]
184+
185+
# Trim snps and wgt.matrix
186+
index.temp <- snps$ID %in% list.common
187+
188+
snps <- snps[index.temp, ]
189+
out.weight <- out.weight[index.temp, ]
190+
if ("numeric" %in% class(out.weight)) {
191+
out.weight <- t(as.matrix(out.weight))
192+
}
193+
194+
rm(index.temp)
195+
196+
# New
197+
# Re-arrange every data sets
198+
m.1 <- match(ss.temp$ID, bim.temp$ID)
199+
m.2 <- match(ss.temp$ID, snps$ID)
200+
201+
bim.temp <- bim.temp[m.1, ]
202+
genotype.temp <- genotype.temp[, m.1]
203+
204+
snps <- snps[m.2, ]
205+
out.weight <- out.weight[m.2, ]
206+
207+
# Align the mismatched alleles
208+
problem.1 <- ss.temp$a1 != bim.temp$a1
209+
genotype.temp[, problem.1] <- 2 - genotype.temp[, problem.1]
210+
211+
problem.2 <- ss.temp$a1 != snps$a1
212+
# flipped <- snps$SNP[problem.2]
213+
out.weight[problem.2, ] <- -1 * out.weight[problem.2, ]
214+
215+
# # Old
216+
# # Re-arrange every data sets
217+
# m.1 <- match(bim.temp$ID, ss.temp$ID)
218+
# m.2 <- match(bim.temp$ID, snps$ID)
219+
220+
# ss.temp <- ss.temp[m.1, ]
221+
# snps <- snps[m.2, ]
222+
# out.weight <- out.weight[m.2, ]
223+
224+
# # Align the mismatched alleles
225+
# problem <- ss.temp$a1 != snps$a1
226+
# ss.temp$a1 <- snps$a1
227+
# ss.temp$a2 <- snps$a2
228+
# ss.temp$beta[problem] <- -1 * ss.temp$beta[problem]
229+
# ss.temp$z[problem] <- -1 * ss.temp$z[problem]
230+
231+
# Compute LD matrix
232+
genotype.temp <- scale(genotype.temp)
233+
matrix.LD <- t(genotype.temp) %*% genotype.temp / (nrow(genotype.temp) - 1)
234+
235+
# Catch: When there is only one row in wgt.matrix
236+
if ("numeric" %in% class(out.weight)) {
237+
out.weight <- out.weight %>% as.matrix() %>% t() %>% as.data.frame()
238+
}
239+
240+
# Catch: Over-fitting
241+
for (qwe in 1:5) {
242+
if (max(abs(out.weight[, qwe])) >= 10) {
243+
out.weight[, qwe] <- 0
244+
}
245+
}
246+
247+
# Iteration by method
248+
for (w in 1:ncol(out.weight)) {
249+
# Settings
250+
weights <- out.weight[, w]
251+
252+
# Skip if weight is a zero vector
253+
if (sum(weights) == 0) {
254+
update[5 + w] <- NA
255+
next
256+
}
257+
258+
# Keep the non-zero components of weights vector
259+
keep <- (weights != 0)
260+
print(sum(keep))
261+
weights <- weights[keep]
262+
#print(sum(names(weights) %in% flipped))
263+
264+
# Compute TWAS z-score, r2, and p-value
265+
z.twas <- as.numeric(weights %*% ss.temp$z[keep])
266+
r2.twas <- as.numeric(weights %*% matrix.LD[keep, keep] %*% weights)
267+
268+
update[5 + w] <- 2 * (pnorm(abs(z.twas / sqrt(r2.twas)), lower.tail = F))
269+
update[10 + w] <- z.twas
270+
}
271+
272+
# ACAT
273+
check.na <- !is.na(update[6:10])
274+
check.sign <- out.r2[1, ] > 0
275+
276+
check.final <- check.na & check.sign
277+
278+
if (sum(check.final) == 0) {
279+
update[16] <- NA
280+
} else {
281+
update[16] <- ACAT(update[6:10][check.final], out.r2[1, check.final] / sum(out.r2[1, check.final]))
282+
}
283+
284+
# Stop tracking runtime
285+
time.end <- proc.time()[3]
286+
287+
#################
288+
# Output format #
289+
#################
290+
# 1. Gene symbol
291+
# 2. ENSG id
292+
# 3. Chromosome
293+
# 4. Best model
294+
# 5. Best model's R^2 on testing data (Skipped)
295+
# 6-10. P-value of TWAS from weights constructed by (SCAD, ElNet, LASSO, MCP, and MNet)
296+
# 11-15. Z-score of TWAS from weights constructed by (SCAD, ElNet, LASSO, MCP, and MNet)
297+
# 16. P-value from ACAT
298+
# 17. Gene position
299+
# 18. Runtime
300+
301+
# Update
302+
update[1] <- snps$GeneSymbol[1]
303+
update[2] <- snps$Gene[1]
304+
update[3] <- chr
305+
update[4] <- names(model.best)
306+
update[17] <- snps$GenePos[1]
307+
update[18] <- time.end - time.start
308+
309+
out[nrow(out) + 1, ] <- update
310+
}
311+
312+
# Write the result
313+
dir.create(paste0(path.out, trait))
314+
write.table(out, file = paste0(path.out, trait, "/", trait, "-", id.job), row.names = FALSE, quote = FALSE)

0 commit comments

Comments
 (0)