::: {.callout-note}
See the online [documentation](https://pgsc-calc.readthedocs.io/en/latest/output.html#report) for additional explanation of the terms and data presented in this report.
:::

```{r setup, echo=FALSE, warning=FALSE, message=FALSE}
library(jsonlite)
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
```

```{r setup_logs, echo=FALSE}
log_paths <- list.files(pattern = params$log_pattern, full.names = TRUE)

read_log <- function(path) {
  log <- read.csv(path, stringsAsFactors = FALSE)
  return(
    log %>%
      mutate(
        is_multiallelic = factor(is_multiallelic, levels = c("false", "true")),
        ambiguous = factor(ambiguous, levels = c("false", "true"))
      ) %>%
      rename(sampleset = dataset)
  )
}

log_df <- Reduce(dplyr::bind_rows, lapply(log_paths, read_log))
log_df$sampleset <- gsub("_", " ", log_df$sampleset) # underscores cause page breaking issues
```

# Pipeline command

```{bash}
# pretty-print the saved pipeline command: wrap at 80 characters and add line continuations
cat command.txt | fold -w 80 -s | awk -F ' ' 'NR==1 { print "$", $0} NR>1 { print " " $0}' | sed 's/$/\\/' | sed '$ s/.$//'
```

# Scoring file metadata

## Scoring file summary

```{r load_scorefiles}
json_scorefiles <- read_json(params$log_scorefiles, simplifyVector = TRUE)

link_traits <- function(trait_efo, mapped) {
  if (length(trait_efo) == 0) {
    return("")
  } else {
    return(purrr::map2_chr(trait_efo, mapped, ~ stringr::str_glue('{.y}')))
  }
}

extract_traits <- function(x) {
  trait_efo <- purrr::map(json_scorefiles, ~ extract_chr_handle_null(.x, "trait_efo"))
  mapped <- purrr::map(json_scorefiles, ~ extract_chr_handle_null(.x, "trait_mapped"))
  trait_display <- purrr::map2(trait_efo, mapped, link_traits)
  mapped_trait_links <- purrr::map_chr(trait_display, ~ paste(.x, collapse = "<br />"))
  reported_traits <- purrr::map(json_scorefiles, ~ extract_chr_handle_null(.x, "trait_reported"))
  purrr::map2(reported_traits, mapped_trait_links, ~ {
    stringr::str_glue("Reported trait: {.x} <br /> Mapped trait(s): {.y}")
  })
}

extract_chr_handle_null <- function(x, field) {
  return(replace(x[[field]], is.null(x[[field]]), ""))
}

link_pgscatalog <- function(id, link_type) {
  if (id != "") {
    return(stringr::str_glue('<a href="https://www.pgscatalog.org/{link_type}/{id}">{id}</a>'))
  } else {
    return(id)
  }
}

add_note <- function(id, note) {
  if (id != "") {
    return(stringr::str_glue("{id} <br /> {note}"))
  } else {
    return(id)
  }
}

annotate_genome_build <- function(original_build, harmonised_build) {
  return(stringr::str_glue("Original build: {original_build} <br /> Harmonised build: {harmonised_build}"))
}
tibble::tibble(json = json_scorefiles) %>%
  # extract fields from json list
  mutate(pgs_id = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "pgs_id")),
         pgs_name = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "pgs_name")),
         pgp_id = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "pgp_id")),
         citation = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "citation")),
         # trait_efo = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "trait_efo")),
         # trait_reported = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "trait_reported")),
         # trait_mapped = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "trait_mapped")),
         trait_display = extract_traits(.),
         genome_build = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "genome_build")),
         harmonised_build = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "HmPOS_build")),
         n_variants = purrr::map_chr(json, ~ .x$variants_number),
         accession = stringr::str_replace_all(names(json), "_", " ")) %>%
  # add links to pgs catalog identifiers
  mutate(pgs_id = purrr::map_chr(pgs_id, ~ link_pgscatalog(.x, "score")),
         pgp_id = purrr::map_chr(pgp_id, ~ link_pgscatalog(.x, "publication"))) %>%
  # add notes
  mutate(pgp_id = purrr::map2_chr(pgp_id, citation, ~ add_note(.x, .y)),
         pgs_id = purrr::map2_chr(pgs_id, pgs_name, ~ add_note(.x, .y)),
         genome_build = purrr::map2_chr(genome_build, harmonised_build, ~ annotate_genome_build(.x, .y))) %>%
  # pick columns
  select(accession, pgs_id, pgp_id, trait_display, n_variants, genome_build) -> scorefile_metadata
```

:::{.column-body-outset}

```{r, echo=FALSE}
DT::datatable(
  scorefile_metadata,
  rownames = FALSE,
  escape = FALSE,
  colnames = c(
    "Scoring file" = "accession",
    "Polygenic Score ID" = "pgs_id",
    "Publication" = "pgp_id",
    "Traits" = "trait_display",
    "Number of variants" = "n_variants",
    "Genome build" = "genome_build"
  ),
  extensions = 'Buttons',
  options = list(dom = 'Bfrtip', buttons = c('csv'))
)
```

:::

# Variant matching

## Parameters

```{bash}
cat params.txt
```

```{asis, echo = params$run_ancestry}
## Reference matching summary
```

```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry}
intersect_stats_files <- list.files(pattern = "intersect_counts*")
intersect_stats <- lapply(intersect_stats_files, function(x) scan(x, sep = "\n", what = integer()))

n_target <- purrr::map_int(intersect_stats, ~ .x[[1]])
n_ref <- purrr::map_int(intersect_stats, ~ .x[[2]])
n_match <- purrr::map_int(intersect_stats, ~ .x[[3]])

data.frame(reference = params$reference_panel_name,
           n_target = n_target,
           n_match = n_match,
           n_ref = n_ref) %>%
  group_by(reference) %>%
  summarise(n_target = sum(n_target), n_ref = sum(n_ref), n_match = sum(n_match)) %>%
  mutate("% matched" = round(n_match / n_ref * 100, 2)) %>%
  rename("n target" = n_target, "n (matched)" = n_match, "N variants in panel" = n_ref) %>%
  DT::datatable()
```

## Summary

```{r setup_matches}
log_df %>%
  mutate(match_status = forcats::fct_collapse(match_status, matched = "matched", other_level = "unmatched")) %>%
  group_by(sampleset, accession, match_status, score_pass) %>%
  count(wt = count) %>%
  group_by(sampleset, accession) %>%
  mutate(percent = round(n / sum(n) * 100, 1),
         n_variants = sum(n)) %>%
  arrange(accession, desc(percent)) %>%
  tidyr::pivot_wider(names_from = match_status, values_from = c(n, percent)) %>%
  replace(is.na(.), 0) -> compat
```
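
Note that the summary table below collapses `excluded` variants into the `unmatched` group (the detailed log further down keeps the three match types separate). The sketch below illustrates that aggregation with made-up counts; it is for explanation only and is not evaluated when the report renders.

```r
# Illustration only (not evaluated): how per-score match percentages are derived
# from the match log counts. The values below are invented; the real log is read
# in the setup_logs chunk above.
library(dplyr)

toy_log <- tibble::tibble(
  sampleset    = "example",
  accession    = "PGS000001",
  match_status = c("matched", "excluded", "unmatched"),
  count        = c(950, 30, 20)
)

toy_log %>%
  # "excluded" and "unmatched" are both collapsed into "unmatched"
  mutate(match_status = forcats::fct_collapse(match_status,
                                              matched = "matched",
                                              other_level = "unmatched")) %>%
  count(sampleset, accession, match_status, wt = count) %>%
  group_by(sampleset, accession) %>%
  mutate(percent = round(n / sum(n) * 100, 1))  # 95% matched, 5% unmatched
```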

```{r match_table}
if (!"n_unmatched" %in% colnames(compat)) {
  # handle the missing column if all PGS match perfectly (e.g. no unmatched or excluded variants)
  compat <- compat %>% mutate(n_unmatched = 0)
}

compat %>%
  select(sampleset, accession, n_variants, score_pass, percent_matched, n_matched, n_unmatched) %>%
  mutate(score_pass = as.logical(score_pass)) %>%
  DT::datatable(rownames = FALSE,
                extensions = 'Buttons',
                options = list(dom = 'Bfrtip', buttons = c('csv')),
                colnames = c(
                  "Sampleset" = "sampleset",
                  "Scoring file" = "accession",
                  "Number of variants" = "n_variants",
                  "Passed matching" = "score_pass",
                  "Match %" = "percent_matched",
                  "Total matched" = "n_matched",
                  "Total unmatched" = "n_unmatched"
                )) %>%
  DT::formatStyle('Scoring file',
                  valueColumns = 'Passed matching',
                  backgroundColor = DT::styleEqual(c(FALSE, TRUE), c('#c2a5cf', '#a6dba0')))
```

## Detailed log

:::{.column-body-outset}

```{r match_table_detailed, echo = FALSE, warning=FALSE}
if (params$run_ancestry == TRUE) {
  # Include match_IDs in the groupby to account for overlap with the reference panel
  log_df %>%
    group_by(sampleset, accession) %>%
    count(match_status, match_IDs, ambiguous, is_multiallelic, match_flipped,
          duplicate_best_match, duplicate_ID, wt = count) %>%
    rename(is_ambiguous = ambiguous) %>%
    mutate(percent = round(n / sum(n) * 100, 2),
           match_status = forcats::fct_relevel(match_status, "matched", "excluded", "unmatched")) %>%
    arrange(accession, match_status) %>%
    mutate(accession = stringr::str_replace_all(accession, "_", " ")) %>%
    DT::datatable(rownames = FALSE,
                  extensions = 'Buttons',
                  options = list(dom = 'Bfrtip', buttons = c('csv')),
                  colnames = c(
                    "Sampleset" = "sampleset",
                    "Scoring file" = "accession",
                    "Match type" = "match_status",
                    "Variant in reference panel" = "match_IDs",
                    "Multiple potential matches" = "duplicate_best_match",
                    "Duplicated matched variants" = "duplicate_ID",
                    "Ambiguous" = "is_ambiguous",
                    "Multiallelic" = "is_multiallelic",
                    "Matches strand flip" = "match_flipped",
                    "%" = "percent"
                  ))
} else {
  log_df %>%
    group_by(sampleset, accession) %>%
    count(match_status, ambiguous, is_multiallelic, match_flipped,
          duplicate_best_match, duplicate_ID, wt = count) %>%
    rename(is_ambiguous = ambiguous) %>%
    mutate(percent = round(n / sum(n) * 100, 2),
           match_status = forcats::fct_relevel(match_status, "matched", "excluded", "unmatched")) %>%
    arrange(accession, match_status) %>%
    mutate(accession = stringr::str_replace_all(accession, "_", " ")) %>%
    DT::datatable(rownames = FALSE,
                  extensions = 'Buttons',
                  options = list(dom = 'Bfrtip', buttons = c('csv')),
                  colnames = c(
                    "Sampleset" = "sampleset",
                    "Scoring file" = "accession",
                    "Match type" = "match_status",
                    "Multiple potential matches" = "duplicate_best_match",
                    "Duplicated matched variants" = "duplicate_ID",
                    "Ambiguous" = "is_ambiguous",
                    "Multiallelic" = "is_multiallelic",
                    "Matches strand flip" = "match_flipped",
                    "%" = "percent"
                  ))
}
```

:::

```{asis, echo = params$run_ancestry}
# Genetic Ancestry
```

```{r colour_palette, echo = FALSE, eval=params$run_ancestry}
# source: https://github.com/PGScatalog/PGS_Catalog/blob/master/catalog/static/catalog/pgs.scss#L2493-L2520
# $ancestry_colours
if (params$reference_panel_name == '1000G') {
  thousand_genomes_colours <- c("#FFD900", "#E41A1C", "#B15928", "#4DAF4A", "#377EB8",
                                "#00CED1", "#984EA3", "#A6CEE3", "#FF7F00", "#BBB", "#999")
  names(thousand_genomes_colours) <- c("AFR", "AMR", "ASN", "EAS", "EUR", "GME", "SAS",
                                       "MAE", "MAO", "NR", "OTH")
  current_population_palette <- scale_colour_manual(name = "Populations", values = thousand_genomes_colours)
} else if (params$reference_panel_name == 'HGDP+1kGP') {
  gnomAD_pop_colours <- c("#97519d", "#e42523", "#f67e1e", "#48b24b", "#3280bb", "#a65528", "#9a9c9b")
  names(gnomAD_pop_colours) <- c("AFR", "AMR", "CSA", "EAS", "EUR", "MID", "OCE")
  current_population_palette <- scale_colour_manual(name = "Populations", values = gnomAD_pop_colours)
} else {
  current_population_palette <- scale_colour_brewer(palette = "Set3")
}
```

```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry}
popsim <- readr::read_tsv(gsub("_pgs.", "_popsimilarity.", params$score_path))
head(popsim)

new_label_target <- paste(params$sampleset, '(Most Similar Population)')
new_label_ref <- paste0('reference (', params$reference_panel_name, ' populations)')
popsim$slabel <- new_label_ref
popsim[popsim$sampleset == params$sampleset, 'slabel'] <- new_label_target

map_shapes <- c(1, 16)
map_shapes <- setNames(map_shapes, c(new_label_target, new_label_ref))

# Placeholder figure: needs better legend labelling
for (pc in seq.int(1, 5, 2)) {
  pcY <- paste('PC', pc, sep = '')
  pcX <- paste('PC', pc + 1, sep = '')
  if (pcX %in% colnames(popsim)) {
    p_pca <- ggplot(popsim[popsim$REFERENCE == TRUE, ], aes(x = !!sym(pcX), y = !!sym(pcY))) +
      geom_point(aes(colour = SuperPop, shape = slabel), alpha = 0.25)
    p_pca <- p_pca + geom_point(data = popsim[popsim$REFERENCE != TRUE, ],
                                aes(color = MostSimilarPop, shape = slabel))
    p_pca <- p_pca + theme_bw() + current_population_palette +
      scale_shape_manual(values = map_shapes, name = 'sampleset')
    print(p_pca)
  }
}
```

```{asis, echo = params$run_ancestry}
## Population similarity summary
```

```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry}
popsim %>%
  filter(sampleset == "reference") %>%
  group_by(sampleset) %>%
  count(`Most similar population` = SuperPop) %>%
  mutate(percent = round(n / sum(n) * 100, 2)) -> ref_count

popsim %>%
  filter(sampleset != "reference") %>%
  group_by(sampleset) %>%
  count(`Most similar population` = MostSimilarPop) %>%
  mutate(percent = round(n / sum(n) * 100, 2)) %>%
  bind_rows(ref_count) %>%
  mutate(count = stringr::str_glue("{n} ({percent}%)")) %>%
  select(-n, -percent) %>%
  tidyr::pivot_wider(names_from = sampleset, values_from = count) -> pop_summary

write.csv(pop_summary, "pop_summary.csv", quote = FALSE, row.names = FALSE)

pop_summary %>%
  DT::datatable()
```

# Scores

```{r, echo = FALSE, message = FALSE, eval=!params$run_ancestry}
# problem: aggregated_scores.txt.gz has a different structure to the ancestry outputs
# solution: pivot the table in the report, but fix in pgscatalog_utils in a future release
# problem: big scores can take a lot of memory
# use data.table as a temporary solution to pivoting datasets
scores <- data.table::fread(params$score_path)
n_scores <- sum(grepl("*_SUM$", colnames(scores)))
n_samples <- nrow(scores)

id_cols <- c("sampleset", "IID")
melted_scores <- data.table::melt(scores, id.vars = id_cols,
                                  measure.vars = patterns(SUM = "*_SUM$", DENOM = "DENOM", AVG = "*_AVG$"),
                                  variable.name = "PGS")

# annoying fix for scores getting converted to a factor level integer
# fixed in data.table 1.14.9 but not released yet
score_names <- sub("_.*", "", colnames(scores)[grepl("_SUM$", colnames(scores))])
setattr(melted_scores$PGS, "levels", score_names)

data.table::fwrite(melted_scores, params$score_path, sep = "\t", compress = "gzip")
```

```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry}
scores <- readr::read_tsv(params$score_path)
n_scores <- length(unique(scores$PGS))
n_samples <- length(unique(scores$IID))
print(n_samples)
```

```{asis, echo = any(table(scores$sampleset) < 50)}
::: {.callout-important title="Warning: small sampleset size (n < 50) detected"}
* plink2 uses allele frequency data to [mean impute](https://www.cog-genomics.org/plink/2.0/score) the dosages of missing genotypes
* `pgsc_calc` currently disables mean imputation in these small samplesets to make sure the calculated PGS is as consistent with the genotype data as possible
* With a small sample size, the resulting score sums may be inconsistent between samples
* The average (`[scorename]_AVG`) may be more applicable, as it averages the weights over all genotypes that are present (see the short sketch below)

In the future, mean imputation will be supported in small samplesets using ancestry-matched reference samplesets (e.g. 1000 Genomes) to ensure consistent calculation of score sums.
:::
```

```{asis, echo = (nrow(compat) == n_scores)}
:::{.callout-tip title="Success"}
* All requested scores were calculated successfully
:::
```

```{asis, echo = (nrow(compat) != n_scores)}
:::{.callout-caution}
* Some requested scores could not be calculated
* Please check the variant matching summary table to understand why
:::
```

`r n_scores` scores for `r n_samples` samples processed.

### Score data

#### Score extract

::: {.callout-note}
Below is a summary of the aggregated scores, which might be useful for debugging. See the [plink2 documentation](https://www.cog-genomics.org/plink/2.0/formats#sscore) for an explanation of the column names.
:::

```{r, echo = FALSE}
scores %>%
  tibble::as_tibble(.)
```

#### Density plot(s)

::: {.callout-note}
The summary density plots show up to six scoring files.
:::

```{r density_ancestry, echo=FALSE, message=FALSE, warning=FALSE, eval=params$run_ancestry}
# Select which PGS to plot
uscores <- unique(scores$PGS)
uscores_plot <- uscores[1:min(length(uscores), 6)] # plot a maximum of 6 PGS

# Plot multiple adjustment methods at once per PGS
for (current_pgs in uscores_plot) {
  long_scores <- scores %>%
    select(!percentile_MostSimilarPop) %>%
    filter(PGS == current_pgs) %>%
    gather(Method, score, -sampleset, -IID, -PGS)

  long_scores %>%
    ggplot(aes(x = score, fill = sampleset)) +
    geom_density(alpha = 0.3) +
    facet_wrap(~Method, scales = "free", nrow = 1) +
    theme_bw() +
    labs(x = 'PGS', y = "Density", title = paste(current_pgs, '(adjusted distributions)')) -> p_dist

  print(p_dist)
}
```

```{r, echo = FALSE, message=FALSE, warning=FALSE, eval=!params$run_ancestry}
scores %>%
  ungroup() %>%
  select(IID, sampleset, ends_with("SUM")) %>%
  group_by(IID, sampleset) %>%
  tidyr::pivot_longer(cols = -group_cols()) %>%
  ungroup() %>%
  filter(name != "DENOM_SUM") %>%
  mutate(name = ifelse(stringr::str_detect(name, "^PGS"),
                       stringr::str_extract(name, "^PGS[0-9]{6}"),
                       name)) %>%
  group_by(sampleset, name) -> long_scores

group_keys(long_scores) %>%
  slice(1:6) -> score_keys

long_scores %>%
  inner_join(score_keys) %>%
  ggplot(aes(x = value, fill = sampleset)) +
  geom_density(alpha = 0.3) +
  facet_wrap(~name, ncol = 2, scales = "free") +
  theme_bw() +
  labs(x = "PGS (SUM)", y = "Density", title = "PGS Distribution(s)")
```

### Get all scores

All scores can be found in the results directory, at:

```{r, eval=params$run_ancestry, echo=FALSE}
stringr::str_glue("{params$sampleset}/score/{params$sampleset}_pgs.txt.gz")
```

```{r, eval=!params$run_ancestry, echo=FALSE}
stringr::str_glue("{params$sampleset}/score/aggregated_scores.txt.gz")
```
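
The file above is a tab-separated, gzip-compressed table, so it can be read directly for downstream analysis. A minimal sketch is shown below; the path is a placeholder for the location printed above, and the chunk is not evaluated when the report renders.

```r
# Illustration only (not evaluated): load the aggregated scores for downstream work.
# Replace the placeholder path with the file location printed above.
library(readr)
library(dplyr)

scores_path <- "score/aggregated_scores.txt.gz"  # placeholder path
scores <- read_tsv(scores_path)                  # readr handles the gzip compression
glimpse(scores)
```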

# Citations

::: {.callout-important}
For scores from the PGS Catalog, please remember to cite the original publications from which they came (these are listed in the metadata table).
:::

> PGS Catalog Calculator (in development). PGS Catalog Team. `https://github.com/PGScatalog/pgsc_calc`

> Lambert et al. (2021) The Polygenic Score Catalog as an open database for reproducibility and systematic evaluation. Nature Genetics. 53:420–425. doi:10.1038/s41588-021-00783-5