Case study 3: Scalable multi-trait polygenic analysis with cross-cohort meta-analysis

PolyGenius enables users to go furher than small-scale targeted PGS analysis, enabling the joint generation, computation, and association of thousands of PGS models, and to combine cohort-level results through meta-analysis.

Show setup code

library(dplyr)
library(data.table)
library(ggplot2)
library(patchwork)
library(ComplexHeatmap)

library(PolyGenius)

Generating 2500 PRS models

We begin with selecting a random subset of 2500 UK Biobank GWAS available in the IEU OpenGWAS repository. Then, using a single generate$models command we generate all models. This code was executed on a machine with 8 cores, of which the PolyGenius execution engine used 7, allowing single-core tasks (such as GWAS retrieval or clumping) to run in parallel. We then used the PolyGeniusData filter.models function to retain only models with at least 5 variants, resulting in a final set of 1176 models.

library(ieugwasr)
set.seed(7)

# Randomly select 2500 UK biobank GWASs from the IEU OpenGWAS
gwas.ids <- gwasinfo() %>% 
  filter(grepl("UKB", note, ignore.case=TRUE)) %>%
  slice_sample(n = 2500) %>%
  pull(id)

# Execute PolyGenius model generation for all 2500 GWASs (retrieve GWAS summary statistics and create models)
models <- generate$models(
  sources = generate$sources$opengwas(gwas.ids),
  algorithms = generate$algorithms$ClumpingThresholding(pval = 5e-8, reference.panel = "EUR"),
)
liftover(models, "GRCh38")

# Filter models with few variants before following with downstream cohort analyses
models <- models$filter.models(nrow(variants) >= 5)
models

Applying models to ROSMAP data

Next, we apply the PolyGenius workflow to the ROSMAP cohort study, starting with calculating PRS values across all models. Using the compute$scores function we computed a 892-by-1176 observations-by-models matrix of values.

# Create a PolyGeniusData object coupling models to genotype+phenotypes
data <- PolyGeniusData(
  name = "ROSMAP",
  models = models,
  genotypes = GenotypeInfo(path = "resources/cohorts/ROSMAP/genotypes", format = "pfile", build = "GRCh38", name = "ROSMAP"),
  obsm = list(phenotypes = fread("resources/cohorts/ROSMAP/phenotypes/phenotypes.csv") %>%
    mutate(demented = factor(demented, levels = 0:1)))
)
data <- data[!is.na(demented), ]

# Compute scores
data$scores$X <- compute$scores(data)

We can now use this high dimensional representation to explore similarities between models. The compute$similarity$mod function provides both SNP based and PRS-value based options, which we can then visualize using visualize$data$similarity$mod

data$modp$corr <- compute$similarity$mod(data, X, method = "score.pearson")

p <- visualize$data$similarity$mod(data, corr, 
  center.zero = TRUE, symmetrize.range = TRUE, 
  column_title = "PRS-PRS Similarity", name = "Pearson correlation",
  show_row_names = FALSE, show_column_names = FALSE, border = TRUE, use_raster = TRUE)
p

, characterized population structure by projecting samples onto the 1000 Genomes reference panel (to ensure a shared principal component space), and tested their associations with dementia diagnosis status.

# Create a PolyGeniusData object coupling models to genotype+phenotypes
data <- PolyGeniusData(
  name = "ROSMAP",
  models = models,
  genotypes = GenotypeInfo(path = "resources/cohorts/ROSMAP/genotypes", format = "pfile", build = "GRCh38", name = "ROSMAP"),
  obsm = list(phenotypes = fread("resources/cohorts/ROSMAP/phenotypes/phenotypes.csv"))
)
data <- data[!is.na(demented), ]

# Compute scores and population structure
data$scores$X <- compute$scores(data)
data$scores$X.scaled <- scale(data$scores$X)
data$obsm$PCA.EUR <- compute$populationStructure(data, reference.panel = "EUR")

# Run glm(demented ~ PRS + covariates) associations across all models
data$modu$associations <- associate$regression(
  data,
  outcome = demented,
  scores.layer = X.scaled,
  covariates = c(sex, age.death, PCA.EUR))
saveRDS(data$modu$associations, "resources/cohorts/ROSMAP/associations.summary.statistics.rds")

Because genetic and phenotypic data for each cohort were stored in isolated computational environments, all analyses were conducted locally within each cohort. Only summary statistics from the association analyses were exported and subsequently meta-analyzed.

# Create a PolyGeniusData object coupling models to genotype+phenotypes
data <- PolyGeniusData(
  name = "LASA",
  models = models,
  genotypes = GenotypeInfo(path = "resources/cohorts/LASA/genotypes", format = "pfile", build = "GRCh38", name = "ROSMAP"),
  obsm = list(phenotypes = fread("resources/cohorts/LASA/phenotypes/phenotypes.csv"))
)
data <- data[!is.na(demented), ]

# Compute scores and population structure
data$scores$X <- compute$scores(data)
data$scores$X.scaled <- scale(data$scores$X)
data$obsm$PCA.EUR <- compute$populationStructure(data, reference.panel = "EUR")

# Run glm(demented ~ PRS + covariates) associations across all models
data$modu$associations <- associate$regression(
  data,
  outcome = demented,
  scores.layer = X.scaled,
  covariates = c(sex, age.death, PCA.EUR))
saveRDS(data$modu$associations, "resources/cohorts/LASA/associations.summary.statistics.rds")

# Create a PolyGeniusData object coupling models to genotype+phenotypes
data <- PolyGeniusData(
  name = "ADC",
  models = models,
  genotypes = GenotypeInfo(path = "resources/cohorts/ADC/genotypes", format = "pfile", build = "GRCh38", name = "ROSMAP"),
  obsm = list(phenotypes = fread("resources/cohorts/ADC/phenotypes/phenotypes.csv"))
)
data <- data[!is.na(demented), ]

# Compute scores and population structure
data$scores$X <- compute$scores(data)
data$scores$X.scaled <- scale(data$scores$X)
data$obsm$PCA.EUR <- compute$populationStructure(data, reference.panel = "EUR")

# Run glm(demented ~ PRS + covariates) associations across all models
data$modu$associations <- associate$regression(
  data,
  outcome = demented,
  scores.layer = X.scaled,
  covariates = c(sex, age.death, PCA.EUR))
saveRDS(data$modu$associations, "resources/cohorts/ADC/associations.summary.statistics.rds")

data <- PolyGeniusData(
  name = "ERGO",
  models = models,
  genotypes = c(
    GenotypeInfo(path = "resources/cohorts/ADC/genotypes/RSI", format = "pfile", build = "GRCh38", name = "RS-I"),
    GenotypeInfo(path = "resources/cohorts/ADC/genotypes/RSII", format = "pfile", build = "GRCh38", name = "RS-II"),
    GenotypeInfo(path = "resources/cohorts/ADC/genotypes/RSIII", format = "pfile", build = "GRCh38", name = "RS-III")
  ),
  obsm = list(phenotypes = fread("resources/cohorts/ADC/phenotypes/phenotypes.csv"))
)
data <- data[!is.na(demented), ]

# Compute scores and population structure
data$scores$X <- compute$scores(data)
data$scores$X.scaled <- scale(data$scores$X)
data$obsm$PCA.EUR <- compute$populationStructure(data, reference.panel = "EUR")

# Run glm(demented ~ PRS + covariates) associations across all models
data$modu$associations <- associate$regression(
  data,
  outcome = demented,
  scores.layer = X.scaled,
  covariates = c(sex, age.death, PCA.EUR))
saveRDS(data$modu$associations, "resources/cohorts/ADC/associations.summary.statistics.rds")