First End-to-End Analysis (Local, GRCh37)

This chapter runs one complete workflow with minimal moving parts:

  1. one local GWAS summary-statistics table on GRCh37,
  2. one clumping-and-thresholding (CT) model (pval = 5e-8),
  3. one PolyGeniusData object using GRCh37 genotypes,
  4. compute$scores + compute$populationStructure,
  5. one regression with simulated phenotype,
  6. score-distribution, PCA embedding, and association heatmap plots.

All chunks below are shown as template code (eval = FALSE) so you can adapt paths and IDs locally.

1) Setup and register one reference panel

CT requires a registered reference panel. For a quick first run, fetch and register EUR once:

library(PolyGenius)
library(data.table)
library(dplyr)

# Fetch the reference-panel entry named "EUR" from the data retriever.
eur <- dataRetriever$referencePanels$get(name = "EUR")

# Register the panel, reusing the name and description carried by the fetched
# object and passing the object itself as genotype.info.
# NOTE(review): overwrite = TRUE presumably replaces an already-registered
# panel of the same name, which would make this chunk safe to re-run --
# confirm against the referencePanels$add documentation.
referencePanels$add(
  name = eur$name,
  description = eur$description,
  genotype.info = eur,
  overwrite = TRUE
)

2) Define one local GWAS source (GRCh37)

generate$sources$local(...) expects:

  • gwas: named list of GWAS tables
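  • metadata: data frame with at least id and build

The example below reads one table, wraps it in a named list, and describes it with a one-row metadata frame: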
# Read the local summary-statistics table with data.table::fread().
# The path is a template value: point it at your own GRCh37 file.
gwas_df <- fread("data/gwas/AD_local_grch37.tsv")

# Required columns for generation are typically:
# chr, position, ea, nea, beta, pval (optional: eaf, n_eff, beta_se)

# Named list of GWAS tables. The element name "AD_local" is the same string
# that is used as the metadata id below.
gwas_list <- list(AD_local = gwas_df)

# One metadata row describing the table: id and build, plus a free-text trait
# label. stringsAsFactors = FALSE keeps the columns as character vectors.
gwas_meta <- data.frame(
  id = "AD_local",
  build = "GRCh37",
  trait = "Alzheimer's disease",
  stringsAsFactors = FALSE
)

# Combine the tables and their metadata into a local source object, which is
# later passed to generate$models() as `sources`.
source_local <- generate$sources$local(
  gwas = gwas_list,
  metadata = gwas_meta
)

3) Generate one CT model (pval = 5e-8)

# Clumping-and-thresholding specification: p-value threshold 5e-8, clumping
# parameters clump.r2 = 0.1 and clump.kb = 250, reference panel "EUR".
# NOTE(review): "EUR" is presumably looked up among the registered reference
# panels, so it must have been registered beforehand -- confirm.
alg_ct <- generate$algorithms$ClumpingThresholding(
  pval = 5e-8,
  reference.panel = "EUR",
  clump.r2 = 0.1,
  clump.kb = 250
)

# Generate the model from the local source using the CT specification.
model <- generate$models(
  sources = source_local,
  algorithms = alg_ct
)

# Print the generated model object for inspection.
model

4) Build PolyGeniusData with GRCh37 genotypes

For this first tutorial we reuse GRCh37 genotype data from the retriever.

# Fetch the genotype dataset named "EUR_pfile_19" from the data retriever.
# NOTE(review): the "_19" suffix presumably denotes hg19/GRCh37, matching the
# build declared for the GWAS source -- confirm.
geno <- dataRetriever$genotypes$get(name = "EUR_pfile_19")

# Bundle the genotypes and the generated model into a single PolyGeniusData
# object named "TutorialFirstRun"; later chunks operate on this object.
data <- PolyGeniusData(
  name = "TutorialFirstRun",
  genotypes = geno,
  models = model
)

# Print the object for inspection.
data

5) Compute PRS scores

# Compute scores from `data` and store them as scores layer "X", the layer
# that the regression and distribution-plot chunks select via scores.layer = X.
# NOTE(review): minor.allele.freq.threshold = 0.01 presumably filters out
# variants with minor allele frequency below 1% -- confirm against the
# compute$scores documentation.
data$scores$X <- compute$scores(
  data,
  minor.allele.freq.threshold = 0.01
)

6) Compute population structure (PCA)

# Compute population structure with npcs = 10 and store the result under
# data$obsm$PCA, the key the embedding plot later refers to as `PCA`.
# NOTE(review): npcs is presumably the number of principal components kept --
# confirm against the compute$populationStructure documentation.
data$obsm$PCA <- compute$populationStructure(
  data,
  npcs = 10
)

7) Add a simulated phenotype and run one association

# Fix the RNG seed so the simulated phenotype is reproducible across runs.
set.seed(20260222)
# One simulated record per observation name in `data`.
n <- length(data$obs_names)

# Simulated phenotype table:
#   age         - Normal(mean 70, sd 8), rounded to one decimal
#   sex         - "Female"/"Male" sampled with replacement, stored as a factor
#   case_status - Bernoulli draw with success probability 0.35
# All values are random and drawn independently of the scores, so any
# association found below is noise; this only exercises the workflow.
pheno <- data.frame(
  obs_names = data$obs_names,
  age = round(rnorm(n, mean = 70, sd = 8), 1),
  sex = factor(sample(c("Female", "Male"), n, replace = TRUE)),
  case_status = rbinom(n, size = 1, prob = 0.35),
  stringsAsFactors = FALSE
)
# Index the rows by observation name as well as keeping the obs_names column.
rownames(pheno) <- pheno$obs_names

# Attach the phenotype table to the data object.
# NOTE(review): the regression presumably resolves case_status, age and sex
# from this table -- confirm.
data$obsm$phenotypes <- pheno

# One regression: outcome case_status, covariates age and sex, scores taken
# from layer X. Column and layer names are passed as bare symbols.
# NOTE(review): predictors = everything() presumably selects every available
# score as a predictor (tidyselect-style) -- confirm.
assoc <- associate$regression(
  data = data,
  outcomes = case_status,
  predictors = everything(),
  covariates = c(age, sex),
  scores.layer = X
)

# Print the association results.
assoc

8) Visualize score distribution

# Density plot of the scores in layer X, grouped by the simulated `sex`
# column. Layer and grouping column are passed as bare symbols.
# NOTE(review): models = 1 presumably selects the first (here: only) model
# by position -- confirm.
p_scores <- visualize$data$scores$distribution(
  data,
  models = 1,
  scores.layer = X,
  group.by = sex,
  type = "density"
)

# Display the plot.
p_scores

9) Visualize embedding (PC1 vs PC2)

# Plot dimensions 1 and 2 of the embedding stored under the key `PCA`
# (written to data$obsm$PCA earlier), grouped by the simulated `sex` column.
p_pca <- visualize$data$embedding$obs(
  data,
  key = PCA,
  dims = c(1, 2),
  group.by = sex
)

# Display the plot.
p_pca

10) Visualize association summary

# Heatmap of the regression results: predictors on rows, outcomes on columns,
# cells coloured by `estimate`. With one outcome and one model this is a very
# small grid; the layout becomes more useful once more are added.
# NOTE(review): predictor/outcome/estimate are presumably column names of the
# association result -- confirm.
p_assoc <- visualize$associations$heatmap(
  assoc,
  rows.by = predictor,
  columns.by = outcome,
  color.by = estimate
)

# Display the plot.
p_assoc

What comes next

Chapter 4 expands this same flow to multiple GWAS sources (local, GWAS Catalog, OpenGWAS), including multiple IDs per source and mixed-source generation in one run.