PGS Catalog Calculator (pgsc_calc) report

Author

PGS Catalog team

Published

January 30, 2024

Note

See the online documentation for additional explanation of the terms and data presented in this report.

Pipeline command

Code
cat command.txt | fold -w 80 -s | awk -F ' ' 'NR==1 { print "$", $0} NR>1 { print "    " $0}' | sed 's/$/\\/' | sed '$ s/.$//' 
$ nextflow run pgscatalog/pgsc_calc -profile test,singularity -c \
    /usr/local/Cluster-Apps/ceuadmin/pgsc_calc/tests/b.config

Scoring file metadata

Scoring file summary

Code
json_scorefiles <- read_json(params$log_scorefiles, simplifyVector=TRUE) 

link_traits <- function(trait_efo, mapped) {
  if (length(trait_efo) == 0) {
    return("")
  } else {
    return(purrr::map2_chr(trait_efo, mapped, ~ stringr::str_glue('<a href="http://www.ebi.ac.uk/efo/{.x}">{.y}</a>')))
  }
}

extract_traits <- function(x) {
  trait_efo <- purrr::map(json_scorefiles, ~ extract_chr_handle_null(.x, "trait_efo"))
  mapped <- purrr::map(json_scorefiles, ~ extract_chr_handle_null(.x, "trait_mapped"))
  trait_display <- purrr::map2(trait_efo, mapped, link_traits)
  mapped_trait_links <- purrr::map_chr(trait_display, ~ paste(.x, collapse = "<br />"))
  reported_traits <- purrr::map(json_scorefiles, ~ extract_chr_handle_null(.x, "trait_reported"))
  purrr::map2(reported_traits, mapped_trait_links, ~ {
    stringr::str_glue("<u>Reported trait:</u> {.x} <br /> <u>Mapped trait(s):</u> {.y}")
  })
}

extract_chr_handle_null <- function(x, field) {
  return(replace(x[[field]], is.null(x[[field]]), ""))
}

link_pgscatalog <- function(id, link_type) {
  if (id != "") {
    return(stringr::str_glue('<a href="https://www.pgscatalog.org/{link_type}/{id}">{id}</a>'))
  } else {
    return(id)
  }
}

add_note <- function(id, note) {
  if (id != "") {
    return(stringr::str_glue("{id} <br /> <small>{note}</small>"))
  } else {
    return(id)
  }
}

annotate_genome_build <- function(original_build, harmonised_build) {
  return(stringr::str_glue("<u>Original build:</u> {original_build} <br /> <u>Harmonised build:</u> {harmonised_build}"))
}

tibble::tibble(json = json_scorefiles) %>%
  # extract fields from json list
  mutate(pgs_id = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "pgs_id")),
         pgs_name = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "pgs_name")),
         pgp_id = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "pgp_id")),
         citation = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "citation")),
         # trait_efo = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "trait_efo")),
         # trait_reported = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "trait_reported")),
         # trait_mapped = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "trait_mapped")),
         trait_display = extract_traits(.),
         genome_build = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "genome_build")),
         harmonised_build = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "HmPOS_build")),
         n_variants = purrr::map_chr(json, ~ .x$variants_number),
         accession = stringr::str_replace_all(names(json), "_", " ")
         ) %>%
  # add links to pgs catalog identifiers
  mutate(pgs_id = purrr::map_chr(pgs_id, ~ link_pgscatalog(.x, "score")),
         pgp_id = purrr::map_chr(pgp_id, ~ link_pgscatalog(.x, "publication"))) %>%
  # add notes
  mutate(pgp_id = purrr::map2_chr(pgp_id, citation, ~ add_note(.x, .y)),
         pgs_id = purrr::map2_chr(pgs_id, pgs_name, ~ add_note(.x, .y)),
         genome_build = purrr::map2_chr(genome_build, harmonised_build, ~ annotate_genome_build(.x, .y))) %>% 
  # pick columns
  select(accession, pgs_id, pgp_id, trait_display, n_variants, genome_build) -> scorefile_metadata

Variant matching

Parameters

Code
cat params.txt
keep_multiallelic: false
keep_ambiguous   : false
min_overlap      : 0.75

Summary

Code
log_df %>%
  mutate(match_status = forcats::fct_collapse(match_status, matched = "matched", other_level = "unmatched")) %>%
  group_by(sampleset, accession, match_status, score_pass) %>%
  count(wt = count) %>%
  group_by(sampleset, accession) %>%
  mutate(percent = round(n / sum(n) * 100, 1), n_variants = sum(n)) %>%
  arrange(accession, desc(percent)) %>%
  tidyr::pivot_wider(names_from = match_status, values_from = c(n, percent)) %>%
  replace(is.na(.), 0) -> compat
Code
if (!"n_unmatched" %in% colnames(compat)) {
  # handle missing column if all PGS matches perfectly (e.g. no unmatched or excluded variants)
  compat <- compat %>%
    mutate(n_unmatched = 0) 
}

compat %>%
  select(sampleset, accession, n_variants, score_pass, percent_matched,
         n_matched, n_unmatched) %>%
  mutate(score_pass = as.logical(score_pass)) %>%
  DT::datatable(rownames = FALSE,
                extensions = 'Buttons',
    options = list(dom = 'Bfrtip',
                   buttons = c('csv')),
    colnames = c(
      "Sampleset" = "sampleset",
      "Scoring file" = "accession",
      "Number of variants" = "n_variants",
      "Passed matching" = "score_pass",
      "Match %" = "percent_matched",
      "Total matched" = "n_matched",
      "Total unmatched" = "n_unmatched"
    )) %>%
  DT::formatStyle('Scoring file', 
                  valueColumns = 'Passed matching',
                  backgroundColor = DT::styleEqual(c(FALSE, TRUE), c('#c2a5cf', '#a6dba0')))

Detailed log

Scores

Success
  • All requested scores were calculated successfully

1 scores for 2504 samples processed.

Score data

Score extract

Note

Below is a summary of the aggregated scores, which might be useful for debugging. See here for an explanation of plink2 column names

# A tibble: 2,504 × 5
   sampleset IID     DENOM PGS001229_22_SUM PGS001229_22_AVG
   <chr>     <chr>   <int>            <dbl>            <dbl>
 1 cineca    HG00096  1564            0.545         0.000348
 2 cineca    HG00097  1564            0.674         0.000431
 3 cineca    HG00099  1564            0.637         0.000407
 4 cineca    HG00100  1564            0.864         0.000552
 5 cineca    HG00101  1564            0.280         0.000179
 6 cineca    HG00102  1564            0.528         0.000338
 7 cineca    HG00103  1564            0.350         0.000224
 8 cineca    HG00105  1564            0.516         0.000330
 9 cineca    HG00106  1564            0.246         0.000157
10 cineca    HG00107  1564            0.624         0.000399
# ℹ 2,494 more rows

Density plot(s)

Note

The summary density plots show up to six scoring files

Get all scores

All scores can be found in the results directory, at:

cineca/score/aggregated_scores.txt.gz

Citations

Important

For scores from the PGS Catalog, please remember to cite the original publications from which they came (these are listed in the metadata table.)

PGS Catalog Calculator (in development). PGS Catalog Team. https://github.com/PGScatalog/pgsc_calc

Lambert et al. (2021) The Polygenic Score Catalog as an open database for reproducibility and systematic evaluation. Nature Genetics. 53:420–425 doi:10.1038/s41588-021-00783-5.