::: {.callout-note}
See the online [documentation](https://pgsc-calc.readthedocs.io/en/latest/output.html#report) for additional
explanation of the terms and data presented in this report.
:::
```{r setup, echo=FALSE, warning=FALSE, message=FALSE}
library(jsonlite)
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
```
```{r setup_logs, echo=FALSE}
log_paths <- list.files(pattern = params$log_pattern, full.names = TRUE)
read_log <- function(path) {
log <- read.csv(path, stringsAsFactors = FALSE)
return(
log %>%
mutate(
is_multiallelic = factor(is_multiallelic, levels = c("false", "true")),
ambiguous = factor(ambiguous, levels = c("false", "true"))
) %>%
rename(sampleset = dataset)
)
}
log_df <- Reduce(dplyr::bind_rows, lapply(log_paths, read_log))
log_df$sampleset <- gsub("_", " ", log_df$sampleset) # replace underscores so long sampleset names can wrap (avoids page breaking issues)
```
# Pipeline command
```{bash}
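# pretty-print the pipeline command: wrap at 80 columns, prefix a shell prompt, and add continuation backslashes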
cat command.txt | fold -w 80 -s | awk -F ' ' 'NR==1 { print "$", $0} NR>1 { print " " $0}' | sed 's/$/\\/' | sed '$ s/.$//'
```
# Scoring file metadata
## Scoring file summary
```{r load_scorefiles}
json_scorefiles <- read_json(params$log_scorefiles, simplifyVector=TRUE)
link_traits <- function(trait_efo, mapped) {
  # link each mapped trait to its trait page (trait URL assumed; EFO ID in .x, label in .y)
  if (length(trait_efo) == 0) {
    return("")
  } else {
    return(purrr::map2_chr(trait_efo, mapped, ~ stringr::str_glue('<a href="https://www.pgscatalog.org/trait/{.x}">{.y}</a>')))
  }
}
extract_traits <- function(x) {
  # note: reads json_scorefiles from the chunk environment rather than the argument
  trait_efo <- purrr::map(json_scorefiles, ~ extract_chr_handle_null(.x, "trait_efo"))
  mapped <- purrr::map(json_scorefiles, ~ extract_chr_handle_null(.x, "trait_mapped"))
  trait_display <- purrr::map2(trait_efo, mapped, link_traits)
  mapped_trait_links <- purrr::map_chr(trait_display, ~ paste(.x, collapse = "<br>"))
  reported_traits <- purrr::map(json_scorefiles, ~ extract_chr_handle_null(.x, "trait_reported"))
  purrr::map2(reported_traits, mapped_trait_links, ~ {
    stringr::str_glue("Reported trait: {.x}<br>Mapped trait(s): {.y}")
  })
}
extract_chr_handle_null <- function(x, field) {
return(replace(x[[field]], is.null(x[[field]]), ""))
}
link_pgscatalog <- function(id, link_type) {
  # link an accession to its PGS Catalog page (link_type: "score" or "publication")
  if (id != "") {
    return(stringr::str_glue('<a href="https://www.pgscatalog.org/{link_type}/{id}">{id}</a>'))
  } else {
    return(id)
  }
}
add_note <- function(id, note) {
  if (id != "") {
    return(stringr::str_glue("{id}<br>{note}"))
  } else {
    return(id)
  }
}
annotate_genome_build <- function(original_build, harmonised_build) {
  return(stringr::str_glue("Original build: {original_build}<br>Harmonised build: {harmonised_build}"))
}
tibble::tibble(json = json_scorefiles) %>%
# extract fields from json list
mutate(pgs_id = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "pgs_id")),
pgs_name = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "pgs_name")),
pgp_id = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "pgp_id")),
citation = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "citation")),
# trait_efo = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "trait_efo")),
# trait_reported = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "trait_reported")),
# trait_mapped = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "trait_mapped")),
trait_display = extract_traits(.),
genome_build = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "genome_build")),
harmonised_build = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "HmPOS_build")),
n_variants = purrr::map_chr(json, ~ as.character(.x$variants_number)),
accession = stringr::str_replace_all(names(json), "_", " ")
) %>%
# add links to pgs catalog identifiers
mutate(pgs_id = purrr::map_chr(pgs_id, ~ link_pgscatalog(.x, "score")),
pgp_id = purrr::map_chr(pgp_id, ~ link_pgscatalog(.x, "publication"))) %>%
# add notes
mutate(pgp_id = purrr::map2_chr(pgp_id, citation, ~ add_note(.x, .y)),
pgs_id = purrr::map2_chr(pgs_id, pgs_name, ~ add_note(.x, .y)),
genome_build = purrr::map2_chr(genome_build, harmonised_build, ~ annotate_genome_build(.x, .y))) %>%
# pick columns
select(accession, pgs_id, pgp_id, trait_display, n_variants, genome_build) -> scorefile_metadata
```
:::{.column-body-outset}
```{r, echo=FALSE}
DT::datatable(
scorefile_metadata,
rownames = FALSE,
escape = FALSE,
colnames = c(
"Scoring file" = "accession",
"Polygenic Score ID" = "pgs_id",
"Publication" = "pgp_id",
"Traits" = "trait_display",
"Number of variants" = "n_variants",
"Genome build" = "genome_build"
),
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('csv'))
)
```
:::
# Variant matching
## Parameters
```{bash}
cat params.txt
```
```{asis, echo = params$run_ancestry}
## Reference matching summary
```
```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry}
intersect_stats_files <- list.files(pattern = "intersect_counts")
intersect_stats <- lapply(intersect_stats_files, function(x) scan(x, sep = "\n", what=integer()))
n_target <- purrr::map_int(intersect_stats, ~ .x[[1]])
n_ref <- purrr::map_int(intersect_stats, ~ .x[[2]])
n_match <- purrr::map_int(intersect_stats, ~ .x[[3]])
data.frame(reference = params$reference_panel_name, n_target = n_target, n_match = n_match, n_ref = n_ref) %>%
group_by(reference) %>%
summarise(n_target = sum(n_target), n_ref = sum(n_ref), n_match = sum(n_match)) %>%
mutate("% matched" = round(n_match / n_ref * 100, 2)) %>%
rename("n target" = n_target, "n (matched)" = n_match, "N variants in panel" = n_ref) %>%
DT::datatable()
```
## Summary
```{r setup_matches}
log_df %>%
mutate(match_status = forcats::fct_collapse(match_status, matched = "matched", other_level = "unmatched")) %>%
group_by(sampleset, accession, match_status, score_pass) %>%
count(wt = count) %>%
group_by(sampleset, accession) %>%
mutate(percent = round(n / sum(n) * 100, 1), n_variants = sum(n)) %>%
arrange(accession, desc(percent)) %>%
tidyr::pivot_wider(names_from = match_status, values_from = c(n, percent)) %>%
replace(is.na(.), 0) -> compat
```
```{r match_table}
if (!"n_unmatched" %in% colnames(compat)) {
# handle missing column when every PGS matches perfectly (i.e. no unmatched or excluded variants)
compat <- compat %>%
mutate(n_unmatched = 0)
}
compat %>%
select(sampleset, accession, n_variants, score_pass, percent_matched,
n_matched, n_unmatched) %>%
mutate(score_pass = as.logical(score_pass)) %>%
DT::datatable(rownames = FALSE,
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('csv')),
colnames = c(
"Sampleset" = "sampleset",
"Scoring file" = "accession",
"Number of variants" = "n_variants",
"Passed matching" = "score_pass",
"Match %" = "percent_matched",
"Total matched" = "n_matched",
"Total unmatched" = "n_unmatched"
)) %>%
DT::formatStyle('Scoring file',
valueColumns = 'Passed matching',
backgroundColor = DT::styleEqual(c(FALSE, TRUE), c('#c2a5cf', '#a6dba0')))
```
## Detailed log
:::{.column-body-outset}
```{r match_table_detailed, echo = FALSE, warning=FALSE}
if(params$run_ancestry == TRUE){
# include match_IDs in the group by to account for variants that are missing from the reference panel intersection
log_df %>%
group_by(sampleset, accession) %>%
count(match_status, match_IDs, ambiguous, is_multiallelic, match_flipped, duplicate_best_match, duplicate_ID, wt = count) %>%
rename(is_ambiguous = ambiguous) %>%
mutate(percent = round(n / sum(n) * 100, 2),
match_status = forcats::fct_relevel(match_status, "matched", "excluded", "unmatched")) %>%
arrange(accession, match_status) %>%
mutate(accession = stringr::str_replace_all(accession, "_", " ")) %>%
DT::datatable(rownames=FALSE,
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('csv')),
colnames = c(
"Sampleset" = "sampleset",
"Scoring file" = "accession",
"Match type" = "match_status",
"Variant in reference panel" = "match_IDs",
"Multiple potential matches" = "duplicate_best_match",
"Duplicated matched variants" = "duplicate_ID",
"Ambiguous" = "is_ambiguous",
"Multiallelic" = "is_multiallelic",
"Matches strand flip" = "match_flipped",
"%" = "percent"
))
} else{
log_df %>%
group_by(sampleset, accession) %>%
count(match_status, ambiguous, is_multiallelic, match_flipped, duplicate_best_match, duplicate_ID, wt = count) %>%
rename(is_ambiguous = ambiguous) %>%
mutate(percent = round(n / sum(n) * 100, 2),
match_status = forcats::fct_relevel(match_status, "matched", "excluded", "unmatched")) %>%
arrange(accession, match_status) %>%
mutate(accession = stringr::str_replace_all(accession, "_", " ")) %>%
DT::datatable(rownames=FALSE,
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('csv')),
colnames = c(
"Sampleset" = "sampleset",
"Scoring file" = "accession",
"Match type" = "match_status",
"Multiple potential matches" = "duplicate_best_match",
"Duplicated matched variants" = "duplicate_ID",
"Ambiguous" = "is_ambiguous",
"Multiallelic" = "is_multiallelic",
"Matches strand flip" = "match_flipped",
"%" = "percent"
))
}
```
:::
```{asis, echo = params$run_ancestry}
# Genetic Ancestry
```
```{r colour_palette, echo = FALSE, eval=params$run_ancestry}
# source: https://github.com/PGScatalog/PGS_Catalog/blob/master/catalog/static/catalog/pgs.scss#L2493-L2520
# $ancestry_colours
if (params$reference_panel_name == '1000G') {
thousand_genomes_colours <- c("#FFD900", "#E41A1C", "#B15928", "#4DAF4A",
"#377EB8", "#00CED1", "#984EA3", "#A6CEE3",
"#FF7F00", "#BBB", "#999")
names(thousand_genomes_colours) <- c("AFR", "AMR", "ASN", "EAS",
"EUR", "GME", "SAS", "MAE",
"MAO", "NR", "OTH")
current_population_palette <- scale_colour_manual(name = "Populations", values = thousand_genomes_colours)
} else if (params$reference_panel_name == 'HGDP+1kGP') {
gnomAD_pop_colours <- c("#97519d", "#e42523", "#f67e1e", "#48b24b",
"#3280bb", "#a65528", "#9a9c9b")
names(gnomAD_pop_colours) <- c("AFR", "AMR", "CSA", "EAS",
"EUR", "MID", "OCE")
current_population_palette <- scale_colour_manual(name = "Populations", values = gnomAD_pop_colours)
} else {
current_population_palette <- scale_colour_brewer(palette="Set3")
}
```
```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry}
popsim <- readr::read_tsv(gsub("_pgs.", "_popsimilarity.", params$score_path, fixed = TRUE))
head(popsim)
new_label_target <- paste(params$sampleset, '(Most Similar Population)')
new_label_ref <- paste0('reference (', params$reference_panel_name, ' populations)')
popsim$slabel <- new_label_ref
popsim[popsim$sampleset == params$sampleset, 'slabel'] <- new_label_target
map_shapes <- c(1, 16)
map_shapes <- setNames(map_shapes, c(new_label_target, new_label_ref))
# Placeholder figure: needs better legend labelling
for(pc in seq.int(1,5,2)){
pcY = paste('PC', pc, sep='')
pcX = paste('PC', pc+1, sep='')
if (pcX %in% colnames(popsim)){
p_pca <- ggplot(popsim[popsim$REFERENCE == TRUE,], aes(x=!!sym(pcX), y=!!sym(pcY))) + geom_point(aes(colour=SuperPop, shape=slabel), alpha=0.25)
p_pca <- p_pca + geom_point(data=popsim[popsim$REFERENCE != TRUE,], aes(color=MostSimilarPop, shape=slabel))
p_pca <- p_pca + theme_bw() + current_population_palette + scale_shape_manual(values=map_shapes, name='sampleset')
print(p_pca)
}
}
```
```{asis, echo = params$run_ancestry}
## Population similarity summary
```
```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry}
popsim %>%
filter(sampleset == "reference") %>%
group_by(sampleset) %>%
count(`Most similar population` = SuperPop ) %>%
mutate(percent = round(n / sum(n) * 100, 2)) -> ref_count
popsim %>%
filter(sampleset != "reference") %>%
group_by(sampleset) %>%
count(`Most similar population` = MostSimilarPop) %>%
mutate(percent = round(n / sum(n) * 100, 2)) %>%
bind_rows(ref_count) %>%
mutate(count = stringr::str_glue("{n} ({percent}%)")) %>%
select(-n, -percent) %>%
tidyr::pivot_wider(names_from = sampleset, values_from = count) -> pop_summary
write.csv(pop_summary, "pop_summary.csv", quote=FALSE, row.names = FALSE)
pop_summary %>%
DT::datatable()
```
# Scores
```{r, echo = FALSE, message = FALSE, eval=!params$run_ancestry}
# problem: aggregated_scores.txt.gz has a different structure to the ancestry outputs
# solution: pivot the table in the report, but fix in pgscatalog_utils in a future release
# problem: big scores can take a lot of memory
# use data.table as a temporary solution to pivoting datasets
scores <- data.table::fread(params$score_path)
n_scores <- sum(grepl("_SUM$", colnames(scores)))
n_samples <- nrow(scores)
id_cols <- c("sampleset", "IID")
melted_scores <- data.table::melt(scores, id.vars=id_cols, measure.vars=patterns(SUM="_SUM$", DENOM="DENOM", AVG="_AVG$"), variable.name = "PGS")
# annoying fix for scores getting converted to a factor level integer
# fixed in data.table 1.14.9 but not released yet
score_names <- sub("_.*", "", colnames(scores)[grepl("_SUM$", colnames(scores))])
setattr(melted_scores$PGS, "levels", score_names)
data.table::fwrite(melted_scores, params$score_path, sep="\t", compress="gzip")
```
```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry}
scores <- readr::read_tsv(params$score_path)
n_scores <- length(unique(scores$PGS))
n_samples <- length(unique(scores$IID))
print(n_samples)
```
```{asis, echo = any(table(scores$sampleset) < 50)}
::: {.callout-important title="Warning: small sampleset size (n < 50) detected"}
* plink2 uses allele frequency data to [mean impute](https://www.cog-genomics.org/plink/2.0/score) the dosages of missing genotypes
* `pgsc_calc` currently disables mean-imputation in these small samplesets to keep the calculated PGS as consistent with the genotype data as possible
* With a small sample size, the resulting score sums may be inconsistent between samples if their genotype missingness differs
* The average (`[scorename]_AVG`) may be more applicable, as it averages the score over the genotypes present in each sample (see the sketch after this callout)

In the future, mean-imputation will be supported in small samplesets using allele frequencies from ancestry-matched reference panels (e.g. 1000 Genomes) to ensure consistent calculation of score sums.
:::
```
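For reference, the `_AVG` column corresponds to the score sum divided by the allele-count denominator (`DENOM`), so it is less sensitive to differing amounts of missingness between samples than the raw `_SUM`. A minimal sketch of that relationship, not evaluated here (`PGS000001` stands in for one of your accessions, and the columns refer to the wide aggregated table):

```{r, eval=FALSE}
# not evaluated: illustrates how the _AVG column relates to _SUM and DENOM in the
# aggregated (wide) score table; PGS000001 is a hypothetical accession
scores %>%
  dplyr::mutate(avg_check = PGS000001_SUM / DENOM) %>%  # should match PGS000001_AVG
  dplyr::select(sampleset, IID, PGS000001_SUM, DENOM, PGS000001_AVG, avg_check)
```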
```{asis, echo = (nrow(compat) == n_scores)}
:::{.callout-tip title="Success"}
* All requested scores were calculated successfully
:::
```
```{asis, echo = (nrow(compat) != n_scores)}
:::{.callout-caution}
* Some requested scores could not be calculated
* Please check the variant matching summary table to understand why
:::
```
`r n_scores` scores for `r n_samples` samples processed.
### Score data
#### Score extract
::: {.callout-note}
Below is a summary of the aggregated scores, which may be useful for debugging. See the [plink2 documentation](https://www.cog-genomics.org/plink/2.0/formats#sscore) for an explanation of the column names.
:::
```{r, echo = FALSE}
scores %>%
tibble::as_tibble(.)
```
#### Density plot(s)
::: {.callout-note}
The summary density plots show at most six scoring files.
:::
```{r density_ancestry, echo=FALSE, message=FALSE, warning=FALSE, eval=params$run_ancestry}
# Select which PGS to plot
uscores <- unique(scores$PGS)
uscores_plot <- uscores[1:min(length(uscores), 6)] # plot max 6 PGS
# Plot multiple adjustment methods at once per PGS
for(current_pgs in uscores_plot){
long_scores <- scores %>% select(!percentile_MostSimilarPop) %>% filter(PGS == current_pgs) %>% gather(Method, score, -sampleset, -IID, -PGS)
long_scores %>%
ggplot(aes(x = score, fill = sampleset)) +
geom_density(alpha = 0.3) +
facet_wrap(~Method, scales = "free", nrow=1) +
theme_bw() +
labs(x = 'PGS', y = "Density", title = paste(current_pgs, '(adjusted distributions)')) -> p_dist
print(p_dist)
}
```
```{r, echo = FALSE, message=FALSE, warning=FALSE, eval=!params$run_ancestry}
scores %>%
ungroup() %>%
select(IID, sampleset, ends_with("SUM")) %>%
group_by(IID, sampleset) %>%
tidyr::pivot_longer(cols = -group_cols()) %>%
ungroup() %>%
filter(name != "DENOM_SUM") %>%
mutate(name = ifelse(
stringr::str_detect(name, "^PGS"),
stringr::str_extract(name, "^PGS[0-9]{6}"),
name)) %>%
group_by(sampleset, name) -> long_scores
group_keys(long_scores) %>%
slice(1:6) -> score_keys
long_scores %>%
inner_join(score_keys) %>%
ggplot(aes(x = value, fill = sampleset)) +
geom_density(alpha = 0.3) +
facet_wrap(~name, ncol = 2, scales = "free") +
theme_bw() +
labs(x = "PGS (SUM)", y = "Density", title = "PGS Distribution(s)")
```
### Get all scores
All scores can be found in the results directory, at:
```{r, eval=params$run_ancestry, echo=FALSE}
stringr::str_glue("{params$sampleset}/score/{params$sampleset}_pgs.txt.gz")
```
```{r, eval=!params$run_ancestry, echo=FALSE}
stringr::str_glue("{params$sampleset}/score/aggregated_scores.txt.gz")
```
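To work with these scores outside the report, the file can be read directly in R. A minimal sketch, not evaluated here (adjust the path to the file listed above; the column layout differs between runs with and without ancestry adjustment):

```{r, eval=FALSE}
# not evaluated: read the aggregated scores from the results directory for
# downstream analysis; adjust the path to match the file listed above
pgs <- readr::read_tsv("aggregated_scores.txt.gz")
dplyr::glimpse(pgs)
```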
# Citations
::: {.callout-important}
For scores from the PGS Catalog, please remember to cite the original publications from which they came (these are listed in the metadata table above).
:::
> PGS Catalog Calculator (in development). PGS Catalog Team. `https://github.com/PGScatalog/pgsc_calc`
> Lambert et al. (2021) The Polygenic Score Catalog as an open database for reproducibility and systematic evaluation. Nature Genetics. 53:420–425 doi:10.1038/s41588-021-00783-5.