Closing an issue from Marc: hg38 mode, noPlot mode, reduce the number…

… of required females for PAR region calling
imgag · Oct 5, 2022 · 18972d6 · 18972d6
1 parent ca3841b
commit 18972d6
Show file tree

Hide file tree

Showing 3 changed files with 29 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -17,7 +17,7 @@ ClinCNV is a part of [MegSAP](https://github.com/imgag/megSAP) pipeline.
 
 ClinCNV detects CNVs in germline and somatic context in NGS data (targeted and whole-genome). We work in cohorts, so it makes sense to try `ClinCNV` if you have more than 10 samples (recommended amount - 40 since we estimate variances from the data). By "cohort" we mean samples sequenced with the same enrichment kit with approximately the same depth (ie 1x WGS and 30x WGS better be analysed in separate runs of ClinCNV). Of course it is better if your samples were sequenced within the same sequencing facility. 
 
-Currently we work with human genomes `hg19` and `hg38` only. **For `hg38` you need to replace `cytobands.txt` with the file `cytobandsHG38.txt`.** For mouse genome or any other diploid organism you have to replace *cytobands.txt* with the corresponding file. ClinCNV can work with small panels (hundreds of regions), but GC-correction can not be performed accurately for samples sequenced with such panels.
+Currently we work with human genomes `hg19` and `hg38` only. **For `hg38` you need to turn on `--hg38` flag, by default it is hg19!** For mouse genome or any other diploid organism you have to replace *cytobands.txt* with the corresponding file. ClinCNV can work with small panels (hundreds of regions), but GC-correction can not be performed accurately for samples sequenced with such panels.
 
 NOTE: Folder `PCAWG` was used for CNVs detection in PanCancer Analysis of Whole Genomes cohort and is *research* only version. It is located here for historical reasons. Feel free to remove it.
 

diff --git a/clinCNV.R b/clinCNV.R
@@ -172,6 +172,12 @@ option_list = list(
   make_option("--onlyTumor", action="store_true", default=F, 
               help="if tumor only calling is to be performed"),  
 
+  make_option("--noPlot", action="store_true", default=F, 
+              help="Do not perform additional plotting"), 
+
+  make_option("--hg38", action="store_true", default=F, 
+              help="Work with hg38 cytobands switch"),  
+
   make_option(c("-d","--debug"), action="store_true", default=FALSE, help="Print debugging information while running.")
 ); 
 
@@ -184,7 +190,6 @@ print(paste("We run script located in folder" , opt$folderWithScript, ". Please,
 
 
 
-
 if (is.null(opt$normal) | is.null(opt$bed)) {
   print("You need to specify file with normal coverages and bed file path at least. Here is the help:")
   print_help(opt_parser)
@@ -458,7 +463,11 @@ if (!is.null(opt$tumorSample)) {
 }
 
 removeCentromeres = T
-lstOfChromBorders <- getCytobands("cytobands.txt", removeCentromeres)
+if (opt$hg38) {
+  lstOfChromBorders <- getCytobands("cytobandsHG38.txt", removeCentromeres)
+} else {
+  lstOfChromBorders <- getCytobands("cytobands.txt", removeCentromeres)
+}
 left_borders <- lstOfChromBorders[[1]]
 right_borders <- lstOfChromBorders[[2]]
 ends_of_chroms <- lstOfChromBorders[[3]]
@@ -639,9 +648,16 @@ if (length(samplesToFilterOut) > 0) {
 
 print(paste("We start to cluster your data (you will find a plot if clustering is possible in your output directory)", opt$out, Sys.time()))
 if (is.null(opt$clusterProvided)) {
-  clusteringList <- returnClustering2(as.numeric(opt$minimumNumOfElemsInCluster))
-  clustering = clusteringList[[1]]
-  outliersByClusteringCohort = clusteringList[[2]]
+  if (opt$noPlot) {
+    clustering = rep("0", ncol(normal))
+    names(clustering) = colnames(normal)
+    print(clustering)
+    outliersByClusteringCohort = c()
+  } else {
+    clusteringList <- returnClustering2(as.numeric(opt$minimumNumOfElemsInCluster))
+    clustering = clusteringList[[1]]
+    outliersByClusteringCohort = clusteringList[[2]]
+  }
 } else {
   clusteringList <- read.table(opt$clusterProvided, stringsAsFactors = F)
   clustering = clusteringList[,2]

diff --git a/germline/helpersGermline.R b/germline/helpersGermline.R
@@ -732,7 +732,10 @@ calculateLocationAndScale <- function(bedFile, coverage, genderOfSamples, autoso
     print(chrom)
     coveragesToDealWith = coverage[which(bedFile[,1] == chrom),]
     if (chrom == "chrX") {
-      if (length(which(genderOfSamples == "F")) > 0.5 * length(which(genderOfSamples == "M"))) {
+      if (length(which(genderOfSamples == "F")) > 0.4 * length(which(genderOfSamples == "M"))) {
+        print("The number of females is too few for accurate PAR estimation, we use males for chrX which leads to wrong PAR calling.")
+      }
+      if (length(which(genderOfSamples == "F")) > 0.4 * length(which(genderOfSamples == "M"))) {
         whichSamplesUsed = which(genderOfSamples == "F")
       } else {
         whichSamplesUsed = which(genderOfSamples == "M")
@@ -751,7 +754,7 @@ calculateLocationAndScale <- function(bedFile, coverage, genderOfSamples, autoso
     } else {
       medians <- apply(coveragesToDealWith[,whichSamplesUsed], 1, EstimateModeSimpleCov)
     }
-    if (chrom == "chrX" & length(which(genderOfSamples == "F")) <= 0.5 * length(which(genderOfSamples == "M"))) {
+    if (chrom == "chrX" & length(which(genderOfSamples == "F")) <= 0.4 * length(which(genderOfSamples == "M"))) {
       medians = sqrt(2) * medians
     }
     if (chrom == "chrY" & length(which(genderOfSamples == "M")) > 2) {
@@ -760,15 +763,15 @@ calculateLocationAndScale <- function(bedFile, coverage, genderOfSamples, autoso
     coverage.normalised[which(bedFile[,1] == chrom),] =  sweep(coverage[which(bedFile[,1] == chrom),], 1, medians + 10**-40, FUN="/")
     mediansResult[which(bedFile[,1] == chrom)] = medians
   }
-
+  
   coverage.normalised = coverage.normalised - 1
   sdsOfGermlineSamplesTmp = apply(coverage.normalised[autosomes,], 2, Qn)
   sdsResults = c()
   for (chrom in unique(bedFile[,1])) {
     whichSamplesUsed = 1:ncol(coverage)
     coveragesToDealWith = coverage.normalised[which(bedFile[,1] == chrom),]
     if (chrom == "chrX") {
-      if (length(which(genderOfSamples == "F")) > 0.5 * length(which(genderOfSamples == "M"))) {
+      if (length(which(genderOfSamples == "F")) > 0.3 * length(which(genderOfSamples == "M"))) {
         whichSamplesUsed = which(genderOfSamples == "F")
       } else {
         whichSamplesUsed = which(genderOfSamples == "M")