library(PolyGenius)
library(data.table)
library(dplyr)
# Fetch the EUR reference panel from the retriever, then register it so that
# algorithms can refer to it by name.
eur <- dataRetriever$referencePanels$get(name = "EUR")
referencePanels$add(
  name = eur$name,
  description = eur$description,
  genotype.info = eur,
  # NOTE(review): presumably replaces an existing registration with the same
  # name, making this chunk safe to re-run -- confirm.
  overwrite = TRUE
)

# First End-to-End Analysis (Local, GRCh37)
This chapter runs one complete workflow with minimal moving parts:
- one local GWAS summary-statistics table on GRCh37,
- one CT model (`pval = 5e-8`),
- one `PolyGeniusData` object using GRCh37 genotypes,
- `compute$scores` + `compute$populationStructure`,
- one regression with simulated phenotype,
- score-distribution and PCA embedding plots.
All chunks below are shown as template code (eval = FALSE) so you can adapt paths and IDs locally.
## 1) Setup and register one reference panel
Clumping and thresholding (CT) requires a registered reference panel. For a quick first run, fetch and register EUR once:
## 2) Define one local GWAS source (GRCh37)
generate$sources$local(...) expects:
- `gwas`: named list of GWAS tables
- `metadata`: data frame with at least `id` and `build`
# Read the local GWAS summary statistics (tab-separated, GRCh37 coordinates).
gwas_df <- fread("data/gwas/AD_local_grch37.tsv")

# Columns that generation typically needs:
#   chr, position, ea, nea, beta, pval
# Optional columns:
#   eaf, n_eff, beta_se

# Named list of GWAS tables; the name doubles as the GWAS identifier.
gwas_list <- list(AD_local = gwas_df)

# One metadata row per GWAS table. `id` here equals the name used in
# `gwas_list` (presumably the two must match -- confirm).
gwas_meta <- data.frame(
  id = "AD_local",
  build = "GRCh37",
  trait = "Alzheimer's disease",
  stringsAsFactors = FALSE
)
# Bundle the named GWAS list and its metadata table into one local source.
source_local <- generate$sources$local(
  gwas = gwas_list,
  metadata = gwas_meta
)

## 3) Generate one CT model (pval = 5e-8)
# Configure a single clumping-and-thresholding (CT) algorithm.
alg_ct <- generate$algorithms$ClumpingThresholding(
  # P-value threshold for this CT model.
  pval = 5e-8,
  # Refers to a reference panel by name; CT needs it to be registered.
  reference.panel = "EUR",
  # NOTE(review): presumably the LD r^2 cutoff and the clumping window in
  # kilobases -- confirm against the PolyGenius documentation.
  clump.r2 = 0.1,
  clump.kb = 250
)
# Generate the model from the local GWAS source and the CT algorithm.
model <- generate$models(
  sources = source_local,
  algorithms = alg_ct
)
# Print the generated model for inspection.
model

## 4) Build PolyGeniusData with GRCh37 genotypes
For this first tutorial we reuse GRCh37 genotype data from the retriever.
# Retrieve the "EUR_pfile_19" genotype data set from the retriever.
geno <- dataRetriever$genotypes$get(name = "EUR_pfile_19")

# Combine the genotypes and the generated model into one PolyGeniusData object.
data <- PolyGeniusData(
  name = "TutorialFirstRun",
  genotypes = geno,
  models = model
)
# Print the object for inspection.
data

## 5) Compute PRS scores
# Compute PRS scores and store them in the `X` scores layer.
data$scores$X <- compute$scores(
  data,
  # NOTE(review): presumably excludes variants with minor allele frequency
  # below 1% -- confirm against the PolyGenius documentation.
  minor.allele.freq.threshold = 0.01
)

## 6) Compute population structure (PCA)
# Compute population structure with 10 principal components and store the
# result under the `PCA` key of `obsm`.
data$obsm$PCA <- compute$populationStructure(
  data,
  npcs = 10
)

## 7) Add a simulated phenotype and run one association
# Fix the RNG seed so the simulated phenotype is the same on every run.
set.seed(20260222)

# Number of observations in the data object.
n <- length(data$obs_names)

# Simulated phenotype table with one row per observation name:
#   age         - normal draw (mean 70, sd 8), rounded to one decimal
#   sex         - "Female"/"Male" sampled with replacement, stored as a factor
#   case_status - 0/1 draw with success probability 0.35
pheno <- data.frame(
  obs_names = data$obs_names,
  age = round(rnorm(n, mean = 70, sd = 8), 1),
  sex = factor(sample(c("Female", "Male"), n, replace = TRUE)),
  case_status = rbinom(n, size = 1, prob = 0.35),
  stringsAsFactors = FALSE
)

# Use the observation names as row names too, then attach the table to `obsm`.
row.names(pheno) <- pheno[["obs_names"]]
data$obsm$phenotypes <- pheno
# Run one regression with `case_status` as the outcome and `age` and `sex` as
# covariates, using the `X` scores layer. Column and layer names are passed
# unquoted; `everything()` comes from dplyr.
# NOTE(review): presumably tidyselect semantics, so `everything()` selects
# every available score as a predictor -- confirm.
assoc <- associate$regression(
  data = data,
  outcomes = case_status,
  predictors = everything(),
  covariates = c(age, sex),
  scores.layer = X
)
# Print the association result for inspection.
assoc

## 8) Visualize score distribution
# Density plot of scores from the `X` layer, grouped by `sex`.
# NOTE(review): `models = 1` presumably selects the first model -- confirm.
p_scores <- visualize$data$scores$distribution(
  data,
  models = 1,
  scores.layer = X,
  group.by = sex,
  type = "density"
)
# Print the plot.
p_scores

## 9) Visualize embedding (PC1 vs PC2)
# Plot the first two dimensions of the `PCA` embedding, grouped by `sex`.
p_pca <- visualize$data$embedding$obs(
  data,
  key = PCA,
  dims = c(1, 2),
  group.by = sex
)
# Print the plot.
p_pca

## 10) Visualize association summary
# Heatmap of the association results: predictors as rows, outcomes as columns,
# cells coloured by `estimate`.
p_assoc <- visualize$associations$heatmap(
  assoc,
  rows.by = predictor,
  columns.by = outcome,
  color.by = estimate
)
# Print the plot.
p_assoc

## What comes next
Chapter 4 expands this same flow to multiple GWAS sources (local, GWAS Catalog, OpenGWAS), including multiple IDs per source and mixed-source generation in one run.