PCA projection

Official page: https://github.com/covid19-hg/pca_projection.

git clone https://github.com/covid19-hg/pca_projection
cd pca_projection
 

setup

The required steps according to the documentation can be summarised as follows,

# Pre-computed PCA loadings, reference allele frequencies and scores
wget https://storage.googleapis.com/covid19-hg-public/pca_projection/hgdp_tgp_pca_covid19hgi_snps_loadings.GRCh37.plink.tsv
wget https://storage.googleapis.com/covid19-hg-public/pca_projection/hgdp_tgp_pca_covid19hgi_snps_loadings.GRCh37.plink.afreq
wget https://storage.googleapis.com/covid19-hg-public/pca_projection/hgdp_tgp_pca_covid19hgi_snps_loadings.GRCh38.plink.tsv
wget https://storage.googleapis.com/covid19-hg-public/pca_projection/hgdp_tgp_pca_covid19hgi_snps_loadings.GRCh38.plink.afreq
wget https://storage.googleapis.com/covid19-hg-public/pca_projection/hgdp_tgp_pca_covid19hgi_snps_loadings.rsid.plink.tsv
wget https://storage.googleapis.com/covid19-hg-public/pca_projection/hgdp_tgp_pca_covid19hgi_snps_loadings.rsid.plink.afreq
wget https://storage.googleapis.com/covid19-hg-public/pca_projection/hgdp_tgp_pca_covid19hgi_snps_scores.txt.gz

# imputed PLINK2 dosage files

cut -f1 [path to the pre-computed loadings file] | tail -n +2 > variants.extract

## .pgen/.pvar/.psam
plink2 \
  --pfile [path to your per-chromosome pfile] \
  --extract variants.extract \
  --make-pfile \
  --out [per-chromosome output name]
ls [the previous per-chromosome output prefix].*.pgen | sed -e ‘s/.pgen//' > merge-list.txt
plink2 --pmerge-list merge-list.txt --out [all-chromosome output name]

## .bed/.bim/.fam

plink \
  --bfile [path to your per-chromosome pfile] \
  --extract variants.extract \
  --make-bed \
  --out [per-chromosome output name]
ls [outname].*.bed | sed -e ‘s/.bed//' > merge-list.txt
plink --merge-list merge-list.txt --out [all-chromosome output name]

## .bgen/.sample (in doubt)

bgenix \
  -g [path to your per-chromosome bgen] \
  -incl-rsids variant.extract \
  > [per-chromosome output name].bgen
cat-bgen \
-g [path to your per-chromosome bgen 1] \
   [path to your per-chromosome bgen 2] \
...
   [path to your per-chromosome bgen 22] \
-og [all-chromosome outname]
plink2 \
  --bgen [path to all-chromosome bgen] [REF/ALT mode] \
  --make-pfile \
  --out [output pfile name]

## .vcf

bcftools view -Oz \
  -i "ID = @variants.extract" \
  [path to your per-chromosome vcf file] \
  > [per-chromosome outname>.vcf.gz]
bcftools concat -Oz [per-chromosome vcf files] > [all-chromosome outname].vcf.gz
plink2 \
  --vcf [all-chromosome outname].vcf.gz \
  dosage=[dosage field name] \
  --make-pfile \
  --out [outname]

# project and plot PC

plink2 \
  ${input_command} \
  --score ${PCA_LOADINGS} \
  center \
  cols=-scoreavgs,+scoresums \
  list-variants \
  header-read \
  --score-col-nums 3-22 \
  --read-freq ${PCA_AF} \
  --out ${OUTNAME}

Rscript -e "install.packages(c("data.table", "hexbin", "optparse", "patchwork", "R.utils", "tidyverse"))"
Rscript plot_projected_pc.R \
  --sscore [path to .sscore output] \
  --phenotype-file [path to phenotype file] \
  --phenotype-col [phenotype column name]
  --covariate-file [path to covariate file] \
  --pc-prefix [prefix of PC columns] \
  --pc-num [number of PCs used in GWAS] \
  --ancestry [ancestry code: AFR, AMR, EAS, EUR, MID, or SAS] \
  --study [your study name] \
  --out [output name prefix]

# --ancestry-file [path to ancestry file] \
# --ancestry-col [ancestry column name]
# --reference-score-file
 

Implementation

Our first application was for the Host Genetics Initiative contribution, which involved SNPid such as chr:pos:ref:alt.

This is detailed below by section.

#!/usr/bin/bash

export TMPDIR=${HPC_WORK}/work
# genotype
export autosomes=~/rds/post_qc_data/interval/imputed/uk10k_1000g_b37/
# location of PCs
export ref=~/rds/post_qc_data/interval/reference_files/genetic/reference_files_genotyped_imputed/
# HGI working directory
export prefix=~/rds/rds-asb38-ceu-restricted/projects/covid/HGI
export dir=20201201-ANA_C2_V2

export PCA_projection=pca_projection
export PCA_loadings=hgdp_tgp_pca_covid19hgi_snps_loadings.GRCh37.plink.tsv
export PCA_af=hgdp_tgp_pca_covid19hgi_snps_loadings.GRCh37.plink.afreq
export sscore=hgdp_tgp_pca_covid19hgi_snps_scores.txt.gz

module load plink/2.00-alpha ceuadmin/stata

function extract_data()
{
  cut -f1 ${PCA_projection}/${PCA_loadings} | tail -n +2 > variants.extract
  (
    cat variants.extract
    awk '{split($1,a,":");print "chr"a[1]":"a[2]"_"a[4]"_"a[3]}' variants.extract
  ) > variants.extract2

  cp ${prefix}/${dir}/output/INTERVAL-*.bgen.bgi ${TMPDIR}
  seq 22 | \
  parallel -C' ' --env prefix --env dir '
    ln -sf ${prefix}/${dir}/output/INTERVAL-{}.bgen ${TMPDIR}/INTERVAL-{}.bgen
    python update_bgi.py --bgi ${TMPDIR}/INTERVAL-{}.bgen.bgi
    bgenix -g ${TMPDIR}/INTERVAL-{}.bgen -incl-rsids variants.extract2 > ${prefix}/work/INTERVAL-{}.bgen
  '
}

function twist()
{
  cat-bgen -g $(echo ${prefix}/work/INTERVAL-{1..22}.bgen) -og INTERVAL.bgen -clobber
  bgenix -g INTERVAL.bgen -index -clobber
  export csvfile=INTERVAL.csv
  python update_bgi.py --bgi INTERVAL.bgen.bgi
  plink2 --bgen INTERVAL.bgen ref-first --make-pfile --out INTERVAL
  cp INTERVAL.p??? ${prefix}/work
  (
    head -1 INTERVAL.pvar
    paste <(sed '1d' INTERVAL.pvar | cut -f1,2) \
          <(sed '1d' INTERVAL.csv | cut -d, -f3) | \
    paste - <(sed '1d' INTERVAL.pvar | cut -f4,5)
  ) > ${prefix}/work/INTERVAL.pvar
}

function project_pc()
{
#!/bin/bash

  set -eu
################################################################################
# Please fill in the below variables
################################################################################
# Metadata
  STUDY_NAME="INTERVAL"
  ANALYST_LAST_NAME="ZHAO"
  DATE="$(date +'%Y%m%d')"
  OUTNAME="${prefix}/work/${STUDY_NAME}.${ANALYST_LAST_NAME}.${DATE}"
################################################################################
# Location of downloaded input files
  PCA_LOADINGS="${PCA_projection}/${PCA_loadings}"
  PCA_AF="${PCA_projection}/${PCA_af}"
################################################################################
# Location of imputed genotype files
# [Recommended]
# PLINK 2 binary format: a prefix (with directories) of .pgen/.pvar/.psam files
  PFILE="${prefix}/work/INTERVAL"
# [Acceptable]
# PLINK 1 binary format: a prefix of .bed/.bim/.fam files
  BFILE=""
################################################################################

  function error_exit() {
    echo "${1:-"Unknown Error"}" 1>&2
    exit 1
  }

# Input checks
  if [[ -z "${STUDY_NAME}" ]]; then
    error_exit "Please specify \$STUDY_NAME."
  fi

  if [[ -z "${ANALYST_LAST_NAME}" ]]; then
    error_exit "Please specify \$ANALYST_LAST_NAME."
  fi

  if [[ -z "${PCA_LOADINGS}" ]]; then
    error_exit "Please specify \$PCA_LOADINGS."
  fi

  if [[ -z "${PCA_AF}" ]]; then
    error_exit "Please specify \$PCA_AF."
  fi

  if [[ -n "${PFILE}" ]]; then
    input_command="--pfile ${PFILE}"
  elif [[ -n "${BFILE}" ]]; then
    input_command="--bfile ${BFILE}"
  else
    error_exit "Either \$PFILE or \$BFILE should be specified"
  fi

# Run plink2 --score
  plink2 \
    ${input_command} \
    --score ${PCA_LOADINGS} \
            variance-standardize \
            cols=-scoreavgs,+scoresums \
            list-variants \
            header-read \
    --score-col-nums 3-12 \
    --read-freq ${PCA_AF} \
    --out ${OUTNAME}

# The score file does not have FID (=0)
  awk '
  {
    if (NR==1) $1="#FID IID"
    else $1=0" "$1
    print
  }' ${OUTNAME}.sscore > ${prefix}/work/snpid.sscore
  ln -sf ${OUTNAME}.sscore.vars ${prefix}/work/snpid.sscore.vars

# Our PCs are PC_# rather than PC#
# The phenotype and covariate file missed FID (=0) column
  awk '
  {
    if (NR==1)
    {
      $1="FID I"$1
      gsub(/PC_/,"PC",$0)
    }
    else $1=0" " $1
  };1' ${dir}/work/INTERVAL-covid.txt | \
  tr ' ' '\t' > ${prefix}/work/snpid.txt
  cut -f1-3 ${prefix}/work/snpid.txt > ${prefix}/work/snpid.pheno
  cut -f3 --complement ${prefix}/work/snpid.txt > ${prefix}/work/snpid.covars

  stata -b do ethnic.do

  Rscript ${PCA_projection}/plot_projected_pc.R \
        --sscore ${prefix}/work/snpid.sscore \
        --phenotype-file ${prefix}/work/snpid.pheno \
        --phenotype-col SARS_CoV \
        --covariate-file ${prefix}/work/snpid.covars \
        --pc-prefix PC \
        --pc-num 20 \
        --ancestry-file ethnic.txt \
        --ancestry-col ethnic \
        --study ${STUDY_NAME} \
        --out ${OUTNAME}
}

extract_data;twist;project_pc
 

update_bgi.py

The magic update_bgi.py is as follows

import argparse
import os
import pandas as pd
import sqlite3

# mapping chromosome string (incluing 01-09, 23, and X) and correct string
CHROM_MAPPING_STR = dict([(str(i), str(i)) for i in range(1, 23)] + [('0' + str(i), str(i)) for i in range(1, 10)] +
                         [('X', 'X')])
csvfile=os.environ['csvfile']

def main(args):
    conn = sqlite3.connect(args.bgi)
    c = conn.cursor()
    df = pd.read_sql("select * from Variant", conn)
    print(df,flush=True)
    df.rsid = df.apply(lambda x: "{}:{}:{}:{}".format(CHROM_MAPPING_STR[x[0]], x[1], x[4], x[5]), axis=1)
    print(df,flush=True)
    df.to_sql("Variant", conn, if_exists="replace", index=False)
    if csvfile != '':
       df.to_csv(csvfile,index=False)
    conn.close()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--bgi', type=str, required=True)
    args = parser.parse_args()
    main(args)
 

bgi.sql

One may wonder the information regarding .bgen.bgi at all, which can be viewed as follows,

sqlite3 INTERVAL.bgen.bgi < bgi.sql

while bgi.sql has the following lines

.tables
.separator "\t"
.header on
.output metadata.txt
select * from metadata;
.output Variant.txt
select * from Variant;

ethnic.do

The Stata program generates files containing FID, IID and ethnicity

gzuse work/INTERVAL-omics-covid.dta.gz
sort identifier
gzsave INTERVAL-omics-covid, replace
insheet using 20201201/INTERVALdata_01DEC2020.csv, case clear
sort identifier
gzmerge identifier using INTERVAL-omics-covid.dta.gz
tabulate ethnicPulse SARS_CoV, all
gen str20 ethnic=ethnicPulse
replace ethnic="EUR" if inlist(ethnicPulse,"Eng/W/Scot/NI/Brit","White Irish")==1
replace ethnic="EAS" if inlist(ethnicPulse,"Asian- Bangladeshi","Asian- Indian","Asian- Pakistani","Chinese")==1
replace ethnic="MID" if ethnicPulse=="Arab"
gen ethnic_NA=ethnic
replace ethnic_NA="NA" if inlist(ethnic,"EUR","EAS","MID")==0
tab ethnic SARS_CoV, all
tab ethnic_NA SARS_CoV, all
gen FID=0
rename ID IID
outsheet FID IID ethnic using ethnic.txt if IID!=., noquote replace
outsheet FID IID ethnic_NA using ethnic_NA.txt if IID!=., noquote replace
rm INTERVAL-omics-covid.dta.gz
 

Our second application was the Caprion pilot analysis, which was simplified through use of RSid.

#!/usr/bin/bash

export autosomes=~/rds/post_qc_data/interval/imputed/uk10k_1000g_b37/
export ref=~/rds/post_qc_data/interval/reference_files/genetic/reference_files_genotyped_imputed/
export caprion=${HOME}/Caprion/pilot
export TMPDIR=${HPC_WORK}/work
export work=${caprion}/work
export HGI=~/COVID-19/HGI
export PCA_projection=${HGI}/pca_projection
export PCA_loadings=${PCA_projection}/hgdp_tgp_pca_covid19hgi_snps_loadings.rsid.plink.tsv
export PCA_af=${PCA_projection}/hgdp_tgp_pca_covid19hgi_snps_loadings.rsid.plink.afreq
export sscore=${PCA_projection}/hgdp_tgp_pca_covid19hgi_snps_scores.txt.gz

module load plink/2.00-alpha ceuadmin/stata

cut -f1 ${PCA_loadings} | tail -n +2 > ${work}/variants.extract

function pca_project()
# PCA projection
{
export dir=${1}
export phase=${2}
bgenix -g ${caprion}/${dir}/caprion.01.bgen -incl-rsids ${work}/variants.extract > ${work}/${phase}.bgen
bgenix -g ${work}/${phase}.bgen -index -clobber

sqlite3 ${work}/${phase}.bgen.bgi <<END
.tables
.separator "\t"
.header on
.output ${work}/metadata.txt
select * from metadata;
.output ${work}/Variant.txt
select * from Variant;
END

plink2 --bgen ${work}/${phase}.bgen ref-first --make-pfile --out ${work}/${phase}

set -eu

plink2 \
    --pfile ${work}/${phase} \
    --score ${PCA_loadings} \
            variance-standardize \
            cols=-scoreavgs,+scoresums \
            list-variants \
            header-read \
    --score-col-nums 3-12 \
    --read-freq ${PCA_af} \
    --out ${work}/${phase}

awk '
  {
    if (NR==1) $1="#FID IID"
    else $1=0" "$1
    print
  }' ${work}/${phase}.sscore > ${work}/${phase}-rsid.sscore
ln -sf ${work}/${phase}.sscore.vars ${work}/${phase}-rsid.sscore.vars

cat <(echo FID IID dummy | tr ' ' '\t') \
    <(sed '1d' ${work}/${phase}.psam | awk -v OFS='\t' '{print 0,$1,0}') > ${work}/${phase}-rsid.pheno

export ethnic_file=${HGI}/ethnic.txt
cat <(head -1  ${ethnic_file}) \
    <(cut -f1 ${work}/${phase}.psam | grep -f - ${ethnic_file}) > ${work}/${phase}-ethnic.txt
export covar_file=${HGI}/work/snpid.covars
cat <(head -1  ${covar_file}) \
    <(cut -f1 ${work}/${phase}.psam | grep -f - ${covar_file}) > ${work}/${phase}-covars.txt

Rscript ${PCA_projection}/plot_projected_pc.R \
        --sscore ${work}/${phase}-rsid.sscore \
        --phenotype-file ${work}/${phase}-rsid.pheno \
        --phenotype-col dummy \
        --covariate-file ${work}/${phase}-covars.txt \
        --pc-prefix PC \
        --pc-num 20 \
        --ancestry-file ${work}/${phase}-ethnic.txt \
        --ancestry-col ethnic \
        --study phase1 \
        --out ${work}/${phase}
}

pca_project data phase1
pca_project data2 phase2
pca_project data3 phase3