# Snakefile-pilot-prep

import os
import math
import gzip
from pathlib import Path


# Workflow-wide settings. Rules below read keys such as betas_rds,
# array_manifest, genotype_expression and chunk_size from this config file.
configfile: "config.yml"
# Global container image directive for the rules in this workflow.
singularity: "methy-qtl.sif"

# Memory increment (used as mem_mb) for the retry-scaled resource lambdas:
# rules request mem_step_size + mem_step_size * attempt.
mem_step_size = 6400


# Run process-methy-rds.R on the betas RDS named by config["betas_rds"].
# The script receives the input path and the output path as its two arguments.
rule invnorm_betas:
    threads: 1
    resources:
        mem_mb = 128000
    input:
        config["betas_rds"]
    output:
        "phenotypes/betas.sorted_samples.invnorm.tsv.gz"
    shell:
        """
        set -eu

        process-methy-rds.R {input} {output}
        """

# Build a bgzipped, tabix-indexed BED file of betas for one chromosome.
#
# Inputs:
#   vcf            - per-chromosome BCF; only its sample list is used here
#   betas          - RDS file handed to subset_rds_to_tsv.R
#   array_manifest - gzipped, tab-separated manifest; its header line must hash
#                    to the expected md5, since columns are addressed by position
# Output: the BED file; a matching .tbi index is written next to it.
rule bed_file:
    input:
        vcf = "genotypes/{chrom}.pass_variants.bcf", #config["genotype_expression"],
        betas = config["betas_rds"],
        array_manifest = config["array_manifest"]
    output:
        "bed_files/betas.{chrom}.bed.gz"
    threads: 1
    resources:
        mem_mb = 64000
    shell:
        """
        # Errors are tracked by hand through rc so the temp dir is always removed.
        set +e

        # Check the manifest header before any temp files exist, so the early
        # exit below leaves nothing behind.
        header_hash=$(zcat {input.array_manifest} | head -n1 | md5sum | cut -f1 -d' ')
        if [[ $header_hash != "c72b2e7275bdb062c33d895b2bb68a4a" ]]; then
          >&2 echo "Error: unexpected array manifest column names (hashes: c72b2e7275bdb062c33d895b2bb68a4a vs $header_hash)"
          exit 1
        fi 

        set -uo pipefail
        tmp_dir=`mktemp -d`
        tmp_out=$tmp_dir/$(basename {output})

        bcftools query -l {input.vcf} > $tmp_dir/genotype_ids.txt
        zcat {input.array_manifest} | awk -F'\t' '$1=="{wildcards.chrom}" {{print $9}}' | sort > $tmp_dir/probe_ids.txt

        # Join manifest positions (header row plus rows for this chromosome) onto
        # the subset betas, drop the join key, rename the leading columns, and
        # sort the data rows by start position. The manifest filter uses NR==1
        # (comparison); NR=1 would assign and let every row through.
        subset_rds_to_tsv.R {input.betas} $tmp_dir/probe_ids.txt $tmp_dir/genotype_ids.txt | join -t $'\t' --header <(zcat {input.array_manifest} | awk -F'\t' 'NR==1||$1=="{wildcards.chrom}" {{b=($10==0?$2+1:$2); print $9"\t"$1"\t"b"\t"b+1"\t"$9}}' | awk -F'\t' 'NR==1 {{print $0;next}} {{print $0 | "sort -k1,1"}}') /dev/stdin | cut -f2- | awk -F'\t' -v 'OFS=\t' 'NR==1 {{ $1="#chr"; $2="start"; $3="end"; $4="probe_id"; print $0;next}} {{print $0| "sort -k2,2 --numeric-sort"}}' | bgzip > $tmp_out && tabix -p bed $tmp_out
        rc=$?

        # Publish the results only when the whole pipeline and indexing succeeded.
        if [[ $rc == 0 ]]; then
          mv $tmp_out {output} &&
          mv $tmp_out.tbi {output}.tbi
          rc=$?
        fi

        rm -r $tmp_dir
        exit $rc
        """

#rule bed_file_old:
#    input:
#        rules.invnorm_betas_with_positions.output
#    output:
#        "bed_files/betas.sorted_samples.invnorm.ids_fixed.samples_filtered.{chrom}.bed.gz"
#    threads: 1
#    resources:
#        mem_mb = 6400
#    shell:
#
#        """
#        set -euo pipefail
#        (echo '#chr' start end probe_id $(zcat {input} | head -n1 | cut -f5-) | tr " " "\t"; zcat {input} | awk -F'\t' 'BEGIN {{OFS="\t"}} {{if ($1=="{wildcards.chrom}") {{$3=$2+1; print}}}}' | sort -k2,2 --numeric-sort) | bgzip > {output}
#        tabix {output}
#        """


# Subset one chromosome of the genotype VCF/BCF to PASS, polymorphic variants
# for the mapped samples, rename samples via the ID map, and write BCF plus
# plink files into genotypes/.
#
# Inputs:
#   vcf            - genotype file from config["genotype_expression"]
#   id_convert_map - tab-separated map; column 1 holds the source sample IDs
#                    (NA rows and a fixed list of NWD IDs are excluded) and
#                    column 2 the IDs kept after reheadering
# Output: the BCF. The index and the plink2 files sharing its prefix are moved
# into genotypes/ alongside it but are not declared as outputs.
rule genotype_subset:
    input:
        vcf = config["genotype_expression"],
        id_convert_map = config["sample_id_convert_map"]
    output:
        "genotypes/{chrom}.pass_variants.bcf"
    threads: 2
    resources:
        mem_mb = lambda wc, attempt:  mem_step_size + mem_step_size * attempt
    shell:
        """
        set -euo pipefail
        tmp_dir=`mktemp -d`
        # Remove the scratch directory however the script ends.
        trap 'rm -rf "$tmp_dir"' EXIT

        plink_prefix_path=${{tmp_dir}}/$(basename {output} .bcf)
        bcftools view -S <(cut -f1 {input.id_convert_map} | grep -v "^NA$" | grep -v "NWD359245\|NWD439884\|NWD550834\|NWD582434\|NWD609471\|NWD741494") {input.vcf} --include "FILTER='PASS'" -Ou \
          | bcftools view --include 'AC>0&&AC!=AN' -Ou \
          | bcftools annotate --set-id '%CHROM:%POS:%REF:%ALT' -Ou \
          | bcftools reheader --samples {input.id_convert_map} /dev/stdin \
          | bcftools view -S <(cut -f2 {input.id_convert_map} | sort -k2,2) --force-samples -Ob -o ${{plink_prefix_path}}.bcf
        bcftools index ${{plink_prefix_path}}.bcf

        #bcftools view --include '(AC/AN)>0.01' ${{plink_prefix_path}}.bcf -Ob -o ${{plink_prefix_path}}.min_af_0.01.bcf
        #bcftools index ${{plink_prefix_path}}.min_af_0.01.bcf

        # Use the thread count Snakemake reserved for this job.
        plink2 --make-bed --output-chr chrM --bcf ${{plink_prefix_path}}.bcf --threads {threads} --out $plink_prefix_path
        mv ${{plink_prefix_path}}* genotypes/
        """


# Target rule: build the genotype subset for chr1-chr22 and chrX.
# It requests the BCF that genotype_subset declares as its output; the plink
# .bed/.bim/.fam files are written by that same job but are not declared
# outputs, so requesting them directly cannot be resolved to a rule.
rule all_genotype_subsets:
    input:
        [rules.genotype_subset.output[0].format(chrom="chr" + c) for c in [str(i) for i in range(1, 23)] + ["X"]]


# Create a "null" genotype set for one chromosome by randomly reassigning the
# sample names of the subset BCF, then index it and convert it to plink files.
#
# Input:  the BCF produced by genotype_subset.
# Output: the reheadered BCF; the index and plink2 files share its prefix.
rule null_genotype_subset:
    input:
        rules.genotype_subset.output
    output:
        "genotypes_null/{chrom}.pass_variants.null.bcf"
    threads: 2
    resources:
        mem_mb = lambda wc, attempt:  mem_step_size + mem_step_size * attempt
    shell:
        """
        set -euo pipefail
        plink_prefix_path=$(dirname {output})/$(basename {output} .bcf)
        # The shuffled sample list is read from the input BCF itself.
        bcftools reheader --samples <(bcftools query -l {input} | sort --random-sort) {input} -o ${{plink_prefix_path}}.bcf
        bcftools index --force ${{plink_prefix_path}}.bcf
        # Use the thread count Snakemake reserved for this job.
        plink2 --make-bed --output-chr chrM --bcf ${{plink_prefix_path}}.bcf --threads {threads} --out $plink_prefix_path
        """


# Run vuptool on the per-chromosome betas BED and the genotype BCF.
#
# The method passed to vuptool is the part of the vup_method wildcard before
# the first underscore. vuptool's stdout is bgzipped and tabix-indexed as the
# BED output; the file given with -p becomes the missing-indicator output.
#
# Outputs:
#   bed - adjusted BED (a .tbi index is written next to it)
#   mim - missing-indicator table
rule vup_adjusted_bed_file:
    input:
        methy = rules.bed_file.output,
        vcf = rules.genotype_subset.output,
        array_manifest = config["array_manifest"]
    output:
        bed="bed_files/{vup_method}/betas.{chrom}.{vup_method}.bed.gz",
        mim="bed_files/{vup_method}/betas.{chrom}.{vup_method}.missing_indicators.tsv.gz"
    threads: 2
    resources:
        mem_mb = lambda wc, attempt:  mem_step_size + mem_step_size * attempt
    shell:
        """
        # Errors are tracked by hand through rc; errexit must be off, otherwise a
        # failing tabix or mv would end the script before the temp dir is removed.
        set +e
        set -uo pipefail
        tmp_dir=`mktemp -d`
        tmp_out=$tmp_dir/$(basename {output.bed})
        tmp_mim_out=$tmp_dir/$(basename {output.mim})

        method=$(echo {wildcards.vup_method} | cut -f1 -d'_')

        vuptool -m $method -p $tmp_mim_out {input.methy} {input.vcf} {input.array_manifest} | bgzip > $tmp_out &&
        tabix -p bed $tmp_out
        rc=$?

        # Publish results only after every step succeeded.
        if [[ $rc == 0 ]]; then
          mv $tmp_out {output.bed} && mv $tmp_out.tbi {output.bed}.tbi && mv $tmp_mim_out {output.mim}
          rc=$?
        fi

        rm -r $tmp_dir
        exit $rc
        """


# Run invnorm_bed.R with --mean-impute on the adjusted BED and publish the
# result as a bgzipped, tabix-indexed BED (a .tbi is written next to it).
rule invnorm_vup_adjusted_bed_file:
    input:
        rules.vup_adjusted_bed_file.output.bed
    output:
        "bed_files/{vup_method}/betas.{chrom}.{vup_method}.invnorm.bed.gz"
    threads: 2
    resources:
        mem_mb = lambda wc, attempt:  mem_step_size + mem_step_size * attempt
    shell:
        """
        # Errors are tracked by hand through rc; errexit must be off, otherwise a
        # failing tabix or mv would end the script before the temp dir is removed.
        set +e
        set -uo pipefail
        tmp_dir=`mktemp -d`
        tmp_out=$tmp_dir/$(basename {output})

        invnorm_bed.R {input} --mean-impute | bgzip > $tmp_out &&
        tabix -p bed $tmp_out
        rc=$?

        # Publish results only after every step succeeded.
        if [[ $rc == 0 ]]; then
          mv $tmp_out {output} && mv $tmp_out.tbi {output}.tbi
          rc=$?
        fi

        rm -r $tmp_dir
        exit $rc
        """


# Stream the chr1-chr22 invnorm BED files for one vup_method, in chromosome
# order, into generate-methy-pcs.R. Only the first file contributes its header
# line. The script is given the output prefix and is expected to write the
# .scores.tsv and .variance_explained.txt files declared below.
rule pcs:
    threads: 2
    resources:
        mem_mb = 64000
    params:
        prefix = "pcs/{vup_method}/beta_pcs.{vup_method}.invnorm"
    input:
        lambda wildcards: [
            rules.invnorm_vup_adjusted_bed_file.output[0].format(vup_method=wildcards.vup_method, chrom="chr%d" % autosome)
            for autosome in range(1, 23)
        ]
    output:
        "pcs/{vup_method}/beta_pcs.{vup_method}.invnorm.scores.tsv",
        "pcs/{vup_method}/beta_pcs.{vup_method}.invnorm.variance_explained.txt"
    shell:
        """
        set -euo pipefail
      
        start_line=1
        for f in {input}; do
          zcat $f | tail -n+$start_line
          start_line=2
        done | generate-methy-pcs.R {params.prefix}
        """


# Merge three per-sample tables on their first column: genotype PCs, the extra
# covariates from the config, and the methylation PC scores. Also writes a
# transposed copy of the merged table.
#
# Header lines are joined separately from the data rows so sorting never
# touches them. Every data stream fed to join is sorted on the key column,
# because join requires sorted input.
#
# Outputs:
#   [0] merged table, one row per sample
#   [1] the same table passed through transpose_tsv.R
rule covariates:
    input:
        extra_cov = config["extra_covariates"],
        geno_pcs = "/net/topmed11/working/lefaivej/topmed-methylation/copd_ltrc_toy_data/joint/pca.2540.toe_ids.tsv",
        methy_pcs = rules.pcs.output[0]
    output:
        "covariates/{vup_method}/cov_gpcs_mpcs.{vup_method}.tsv", "covariates/{vup_method}/cov_gpcs_mpcs.{vup_method}.transposed.tsv"
    threads: 1
    resources:
        mem_mb = mem_step_size
    shell:
        """
        set -euo pipefail

        # Intermediate file: the first output path minus its last two dot-separated parts, plus .tmp
        out1=$(echo {output[0]} | rev | cut -f3- -d'.' | rev).tmp
        # Do not leave the intermediate file behind, whether the job succeeds or fails.
        trap 'rm -f "$out1"' EXIT

        head -n1 {input.extra_cov} | join --header -t$'\t' <(head -n1 {input.geno_pcs}) /dev/stdin > $out1
        tail -n+2 {input.extra_cov} | sort -k1,1 | join -t$'\t' <(tail -n+2 {input.geno_pcs} | sort -k1,1) /dev/stdin >> $out1

        join --header -t$'\t' <(head -n1 $out1) <(head -n1 {input.methy_pcs}) > {output[0]}
        join -t$'\t' <(tail -n+2 $out1 | sort -k1,1) <(tail -n+2 {input.methy_pcs} | sort -k1,1) >> {output[0]}

        cat {output[0]} | transpose_tsv.R > {output[1]}
        """


# Render a PNG from the merged covariate table by calling plot-mpcs.R with the
# table path and the PNG path. The script path is relative to the working
# directory.
rule methy_pcs_plot:
    output:
        "plots/{vup_method}/methy_pcs.{vup_method}.png"
    input:
        rules.covariates.output[0]
    shell:
        """
        Rscript plot-mpcs.R {input} {output}
        """

# Bar-plot the first 50 values of the variance-explained file with gnuplot.
#
# The input path contains the vup_method wildcard, so the output must carry it
# too; otherwise Snakemake cannot work out which PCs file to read. The output
# layout follows methy_pcs_plot (plots/<vup_method>/...<vup_method>.png).
rule methy_pcs_variance_explained_plot:
    input:
        rules.pcs.output[1]
    output:
        "plots/{vup_method}/methy_pcs_variance_explained.{vup_method}.png"
    shell:
        """
        set -euo pipefail
        paste <(seq 1 50) <(head -n 50 {input}) | gnuplot --persist -e 'set term png size 1080,720; set xrange [0:51]; set xlabel "PC"; set ylabel "% Variance Explained"; set style fill solid noborder; set boxwidth 0.8; plot "-" u 1:2 w boxes notitle' > {output}
        """

# Build "null" covariate tables from the chr20 real and null genotype sets.
#
# Steps performed by the shell block:
#   1. diff the sample IDs (column 2) of the real chr20 .fam against column 1
#      of the covariate rows; under set -e a mismatch aborts the job.
#   2. paste the null .fam sample IDs next to covariate columns 2-11, sort by
#      ID, then paste covariate columns 12 onward (in their original row
#      order) to form output[0] under the original header line.
#   3. transpose output[0] into output[1].
#
# The .fam paths are derived from the BCF input paths (same prefix); they are
# not declared as inputs themselves.
# NOTE(review): this rule calls ./transpose_tsv.R from the working directory,
# while other rules in this file call transpose_tsv.R from PATH - confirm which
# one is intended.
rule null_covariates:
    input:
        vcf = rules.genotype_subset.output[0].format(chrom="chr20"),
        vcf_null = rules.null_genotype_subset.output[0].format(chrom="chr20"),
        cov = "covariates/joint_invnorm_loadings.ids_fixed.sex_study_pcs_1_100.recode.with_geno_pcs.tsv"
    output:
        "covariates_null/joint_invnorm_loadings.ids_fixed.sex_study_pcs_1_100.recode.with_geno_pcs.null2.tsv",
        "covariates_null/joint_invnorm_loadings.ids_fixed.sex_study_pcs_1_100.recode.with_geno_pcs.null2.transposed.tsv"
    threads: 2
    resources:
        # Memory request grows by mem_step_size with each retry attempt.
        mem_mb = lambda wc, attempt:  mem_step_size + mem_step_size * attempt
    shell:
        """
        set -euo pipefail

        plink_prefix=$(dirname {input.vcf})/$(basename {input.vcf} .bcf)
        diff <(cut -f2 ${{plink_prefix}}.fam) <(tail -n+2 {input.cov} | cut -f1)

        plink_prefix_null=$(dirname {input.vcf_null})/$(basename {input.vcf_null} .bcf)
        tmp_dir=covariates_null
        paste <(cut -f2 ${{plink_prefix_null}}.fam) <(tail -n+2 {input.cov} | cut -f 2-11) | sort -k1,1 > $tmp_dir/null_pcs.tsv
        (head -n1 {input.cov}; paste $tmp_dir/null_pcs.tsv <(tail -n+2 {input.cov} | cut -f12-)) > {output[0]}
        cat {output[0]} | ./transpose_tsv.R > {output[1]}        

        rm $tmp_dir/null_pcs.tsv 
        
        """


# Per-chromosome path templates; {chrom} is a Snakemake wildcard placeholder.
# chrom_genotype_file is used as the genotype_bim_file input of the QTL scan
# rules. NOTE(review): chrom_methy_file is not referenced by any rule visible
# in this file - confirm whether it is still needed.
chrom_methy_file="bed_files/betas.sorted_samples.invnorm.ids_fixed.samples_filtered.{chrom}.bed.gz"
chrom_genotype_file="genotypes/{chrom}.pass_variants.bim"


# Run tensorqtl on one chunk of probes from a chromosome's invnorm BED file.
#
# Wildcards used by the shell block:
#   chunk      - zero-based chunk index; rows are taken in blocks of
#                config[chunk_size] after the header line
#   mpcs       - added to 13 to give the number of leading lines kept from the
#                transposed covariates file
#   scan_type  - passed as tensorqtl --mode in the filter/regress branch
#   vup_method - "filter" or "regress": one tensorqtl run on the whole chunk;
#                anything else: one run per probe, with NA samples dropped from
#                the probe and the covariates restricted to the remaining
#                samples, then the per-probe results are concatenated
#
# Chunk 0 keeps its header line; every other chunk has the header stripped so
# that chunk outputs can be concatenated with cat.
#
# NOTE(review): basename strips the literal suffix .cis_qtl.tsv.gz, which only
# matches when scan_type is "cis"; for other scan types tmp_prefix keeps the
# full output file name - confirm this is intended.
# NOTE(review): the per-probe branch always runs tensorqtl with --mode cis, yet
# later looks for parquet output when scan_type is "cis_nominal" - verify.
# NOTE(review): rc is only assigned inside the branches; a chunk with no probes
# in the per-probe branch would leave it unset under set -u.
rule qtl_scan_chunk:
    input:
        genotype_bim_file=chrom_genotype_file,
        phenotype_bed_file=rules.invnorm_vup_adjusted_bed_file.output,
        covariates_file=rules.covariates.output[1] #"covariates/sex_gpcs_mpcs.transposed.tsv"
    output:
        "qtl/{scan_type}/{vup_method}/temp/permute.{vup_method}.{chrom}.chunk_{chunk}.mpcs_{mpcs}.{scan_type}_qtl.tsv.gz"
    # Also exported to OMP_NUM_THREADS / MKL_NUM_THREADS inside the shell block.
    threads: 100
    resources:
        # Memory request grows by mem_step_size with each retry attempt.
        mem_mb = lambda wc, attempt:  mem_step_size + mem_step_size * attempt
    shell:
        """
        #!/bin/bash

        export OMP_NUM_THREADS={threads}
        export MKL_NUM_THREADS={threads}
        
        set +e
        set -uo pipefail
        tmp_dir=`mktemp -d`
        tmp_prefix=${{tmp_dir}}/$(basename {output} .cis_qtl.tsv.gz)
        tmp_out=${{tmp_prefix}}.{wildcards.scan_type}_qtl.txt.gz
        chunk_bed_file=${{tmp_dir}}/betas.{wildcards.chrom}.{wildcards.chunk}.bed.gz
        single_probe_bed=${{tmp_dir}}/betas.{wildcards.chrom}.{wildcards.chunk}.single.bed.gz
        single_cov=${{tmp_dir}}/betas.{wildcards.chrom}.{wildcards.chunk}.single.cov.tsv
        cov_file="$tmp_dir/cov_file.tsv"
        head -n $(( 13 + {wildcards.mpcs})) {input.covariates_file} > $cov_file

        echo "Subsetting BED file ..." >&2
        (zcat {input.phenotype_bed_file} | head -n1; zcat {input.phenotype_bed_file} | tail -n+2 | tail -n+$(( 1 + {wildcards.chunk} * {config[chunk_size]} )) | head -n{config[chunk_size]}) | bgzip > $chunk_bed_file

        if [[ "{wildcards.vup_method}" == "filter" || "{wildcards.vup_method}" == "regress" ]]; then
          python3 -m tensorqtl $(dirname {input.genotype_bim_file})/$(basename {input.genotype_bim_file} .bim) $chunk_bed_file $tmp_prefix  --mode {wildcards.scan_type} --maf_threshold 0.001 --covariates $cov_file --window 1000000 --qvalue_lambda 0 --seed 2023 --permutations 1000 --invnorm
          rc=$?

          if [[ $rc == 0 && "{wildcards.scan_type}" == "cis_nominal" ]]; then
            parquet_to_tsv.py $tmp_prefix.*.parquet | gzip > $tmp_out
            rc=$?
          fi
        else # "{wildcards.vup_method}" == "mask"
          probe_idx_end=$(zcat $chunk_bed_file | wc -l)
          for probe_idx in `seq 2 $probe_idx_end`; do
            zcat $chunk_bed_file | transpose_tsv.R | cut -f1,$probe_idx | grep -v NA | transpose_tsv.R | bgzip > $single_probe_bed &&
            cat $cov_file | transpose_tsv.R | join --header -t $'\t' <(zcat $single_probe_bed | cut -f4- | head -n1 | tr "\t" "\n") /dev/stdin | transpose_tsv.R > $single_cov &&
            python3 -m tensorqtl $(dirname {input.genotype_bim_file})/$(basename {input.genotype_bim_file} .bim) $single_probe_bed ${{tmp_prefix}}.tmp_${{probe_idx}}  --mode cis --maf_threshold 0.001 --covariates $single_cov --window 1000000 --qvalue_lambda 0 --seed 2023 --permutations 1000 --invnorm
            rc=$?

            if [[ $rc == 0 && "{wildcards.scan_type}" == "cis_nominal" ]]; then
              parquet_to_tsv.py ${{tmp_prefix}}.tmp_${{probe_idx}}.*.parquet | gzip > ${{tmp_prefix}}.tmp_${{probe_idx}}.cis_nominal_qtl.txt.gz
              rc=$?
            fi

            [[ $rc != 0 ]] && break
          done

          if [[ $rc == 0 ]]; then
            for f in `ls -v ${{tmp_prefix}}.tmp_*.{wildcards.scan_type}_qtl.txt.gz`; do
              if [[ "$f" == "${{tmp_prefix}}.tmp_2.{wildcards.scan_type}_qtl.txt.gz" ]]; then
                zcat $f
              else
                zcat $f | tail -n+2
              fi
            done | gzip > $tmp_out
            rc=$?
          fi
        fi

        if [[ $rc == 0 && {wildcards.chunk} == 0 ]]; then
          mv $tmp_out {output}
          rc=$?
        elif [[ $rc == 0 ]]; then
          echo "Removing header from output and recompressing ..." >&2
          zcat $tmp_out | tail -n+2 | bgzip > ${{tmp_out}}.no_header && mv ${{tmp_out}}.no_header {output}
          rc=$?
        fi

        rm -r $tmp_dir
        exit $rc
        """


def num_chunks_for_chrom(input_file):
    """Return how many chunks are needed to cover a gzipped file's data lines.

    The first line of ``input_file`` is treated as a header and not counted.
    The remaining line count is divided by ``config["chunk_size"]`` and rounded
    up. A file that is empty or holds only a header yields 0.
    """
    with gzip.open(input_file) as handle:
        # Start at -1 so the header line does not count as a record.
        record_count = -1
        for _line in handle:
            record_count += 1
    if record_count <= 0:
        return 0
    return int(math.ceil(record_count / config["chunk_size"]))


# These two assignments repeat, with identical values, the path templates
# already assigned earlier in this file (before rule qtl_scan_chunk).
chrom_methy_file="bed_files/betas.sorted_samples.invnorm.ids_fixed.samples_filtered.{chrom}.bed.gz"
chrom_genotype_file="genotypes/{chrom}.pass_variants.bim"


# Concatenate the per-chunk QTL scan outputs of one chromosome into one file.
#
# The input function asks for every chunk produced by qtl_scan_chunk. The chunk
# count comes from num_chunks_for_chrom, which opens the chromosome's invnorm
# BED file, so that file must already exist when the input function runs.
# Chunk 0 carries the header and later chunks do not, so plain cat gives a
# single table made of concatenated gzip members.
#
# The concat wildcard may not be "temp": that directory holds the chunk files,
# and without the constraint this rule's pattern would match them as well.
rule concatenated_qtl_scan:
    input:
        lambda wc: [
            rules.qtl_scan_chunk.output[0].format(scan_type=wc.scan_type, vup_method=wc.vup_method, chunk=c, chrom=wc.chrom, mpcs=wc.mpcs)
            for c in range(0, num_chunks_for_chrom(rules.invnorm_vup_adjusted_bed_file.output[0].format(chrom=wc.chrom, vup_method=wc.vup_method)))
        ]
    output:
        "qtl/{scan_type}/{vup_method}/{concat}/permute.{vup_method}.{chrom}.mpcs_{mpcs}.{scan_type}_qtl.tsv.gz"
    wildcard_constraints:
        concat = "(?!temp/).+"
    threads: 2
    resources:
        mem_mb = lambda wc, attempt:  mem_step_size + mem_step_size * attempt
    shell:
        """
        cat {input} > {output}
        """

# Run tensorqtl on a whole chromosome's invnorm BED file.
#
# Wildcards used by the shell block:
#   mpcs       - added to 13 to give the number of leading lines kept from the
#                transposed covariates file
#   scan_type  - passed as tensorqtl --mode; for "cis_nominal" the parquet
#                output is converted to a gzipped TSV
#   vup_method - must be "filter", "regress" or "mask"; for "mask" the
#                missing-indicator file is passed via --paired_covariate
#
# params.mim_file is a plain path, not a tracked input, so this rule does not
# by itself trigger the job that creates the missing-indicator file.
rule qtl_scan_chrom:
    input:
        genotype_bim_file=chrom_genotype_file,
        phenotype_bed_file=rules.invnorm_vup_adjusted_bed_file.output,
        covariates_file=rules.covariates.output[1] #"covariates/regress_additive/cov_gpcs_mpcs.regress.transposed.tsv" #rules.covariates.output[1],
    params:
        mim_file=rules.vup_adjusted_bed_file.output.mim
    output:
        "qtl/{scan_type}/{vup_method}/qtl_scan.{vup_method}.{chrom}.mpcs_{mpcs}.{scan_type}_qtl.tsv.gz"
    threads: 24
    resources:
        mem_mb = lambda wc, attempt:  mem_step_size + mem_step_size * attempt
    shell:
        """
        #!/bin/bash

        export OMP_NUM_THREADS={threads}
        export MKL_NUM_THREADS={threads}
        
        # Errors are tracked by hand through rc so the temp dir is always removed.
        set +e
        set -uo pipefail
        tmp_dir=`mktemp -d`
        tmp_prefix=${{tmp_dir}}/$(basename {output} .cis_qtl.tsv.gz)
        tmp_out=${{tmp_prefix}}.{wildcards.scan_type}_qtl.txt.gz
        cov_file="$tmp_dir/cov_file.tsv"
        head -n $(( 13 + {wildcards.mpcs})) {input.covariates_file} > $cov_file

        # Assume failure until a scan has actually run.
        rc=1

        if [[ "{wildcards.vup_method}" == "filter" || "{wildcards.vup_method}" == "regress" || "{wildcards.vup_method}" == "mask" ]]; then
          paired_cov=""
          if [[ "{wildcards.vup_method}" == "mask" ]]; then
            paired_cov="--paired_covariate {params.mim_file}"
          fi
          python3 -m tensorqtl $(dirname {input.genotype_bim_file})/$(basename {input.genotype_bim_file} .bim) {input.phenotype_bed_file} $tmp_prefix  --mode {wildcards.scan_type} --maf_threshold 0.001 --covariates $cov_file --window 1000000 --qvalue_lambda 0 --seed 2023 --permutations 1000 --invnorm --fdr 0.05 $paired_cov
          rc=$?

          if [[ $rc == 0 && "{wildcards.scan_type}" == "cis_nominal" ]]; then
            parquet_to_tsv.py $tmp_prefix.*.parquet | gzip > $tmp_out
            rc=$?
          fi
        else
          echo "Error: unsupported vup_method {wildcards.vup_method}" >&2
        fi

        if [[ $rc == 0 ]]; then
          mv $tmp_out {output}
          rc=$?
        fi

        rm -r $tmp_dir
        exit $rc
        """


# Meta-analyse the LTRC and COPD cis_nominal scans of one chromosome with METAL.
#
# Each input table is reduced to five columns (2, 4, 7, 8, 9). On data rows
# column 2 is first rewritten as <column 1>_<column 2>, so every marker is a
# unique phenotype/variant pair. The METAL script addresses the columns by the
# header names variant_id, af, pval_nominal, slope and slope_se.
#
# METAL names its result <prefix>1<suffix>, so OUTFILE "meta_results .tsv"
# yields meta_results1.tsv and meta_results1.tsv.info. The table is gzipped
# into the declared output and the .info file is copied next to it.
rule meta_analysis:
    input:
        ["ltrc-fix/qtl/cis_nominal/{vup_method}/qtl_scan.{vup_method}.{chrom}.mpcs_20.cis_nominal_qtl.tsv.gz","copd-fix/qtl/cis_nominal/{vup_method}/qtl_scan.{vup_method}.{chrom}.mpcs_20.cis_nominal_qtl.tsv.gz"]
    output:
        "meta-fix/ltrc_copd_meta.{vup_method}.{chrom}.tsv.gz"
    shell:
        """
        #!/bin/bash
        # Errors are tracked by hand through rc so the temp dir is always removed.
        set +e
        set -uo pipefail
        tmp_dir=`mktemp -d`
        input1=$tmp_dir/input1.tsv.gz
        input2=$tmp_dir/input2.tsv.gz

        # Braces of the awk programs are doubled so Snakemake passes them through.
        zcat {input[0]} | awk -F'\t' '{{if(NR>1) {{$2=$1"_"$2}}; print $2"\t"$4"\t"$7"\t"$8"\t"$9}}' | gzip > $input1
        zcat {input[1]} | awk -F'\t' '{{if(NR>1) {{$2=$1"_"$2}}; print $2"\t"$4"\t"$7"\t"$8"\t"$9}}' | gzip > $input2

        (echo SCHEME STDERR
         echo EFFECT slope
         echo STDERR slope_se
         echo FREQ af
         echo PVAL pval_nominal
         echo MARKER variant_id
         echo PROCESS $input1
         echo PROCESS $input2
         echo OUTFILE $tmp_dir/meta_results .tsv
         echo ANALYZE HETEROGENEITY
         echo QUIT) > $tmp_dir/analysis.metal

        metal $tmp_dir/analysis.metal
        rc=$?

        if [[ $rc == 0 ]]; then
          gzip -c $tmp_dir/meta_results1.tsv > {output} &&
          cp $tmp_dir/meta_results1.tsv.info {output}.info
          rc=$?
        fi

        rm -r $tmp_dir
        exit $rc
        """