Skip to content

Commit

Permalink
Closing an issue from Marc: hg38 mode, noPlot mode, reduce the number…
Browse files Browse the repository at this point in the history
… of required females for PAR region calling
  • Loading branch information
AHDemiG1 authored and AHDemiG1 committed Oct 5, 2022
1 parent ca3841b commit 18972d6
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 10 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ ClinCNV is a part of [MegSAP](https://github.com/imgag/megSAP) pipeline.

ClinCNV detects CNVs in germline and somatic context in NGS data (targeted and whole-genome). We work in cohorts, so it makes sense to try `ClinCNV` if you have more than 10 samples (recommended amount - 40 since we estimate variances from the data). By "cohort" we mean samples sequenced with the same enrichment kit with approximately the same depth (ie 1x WGS and 30x WGS better be analysed in separate runs of ClinCNV). Of course it is better if your samples were sequenced within the same sequencing facility.

Currently we work with human genomes `hg19` and `hg38` only. **For `hg38` you need to replace `cytobands.txt` with the file `cytobandsHG38.txt`.** For mouse genome or any other diploid organism you have to replace *cytobands.txt* with the corresponding file. ClinCNV can work with small panels (hundreds of regions), but GC-correction can not be performed accurately for samples sequenced with such panels.
Currently we work with human genomes `hg19` and `hg38` only. **For `hg38` you need to turn on `--hg38` flag, by default it is hg19!** For mouse genome or any other diploid organism you have to replace *cytobands.txt* with the corresponding file. ClinCNV can work with small panels (hundreds of regions), but GC-correction can not be performed accurately for samples sequenced with such panels.

NOTE: Folder `PCAWG` was used for CNVs detection in PanCancer Analysis of Whole Genomes cohort and is *research* only version. It is located here for historical reasons. Feel free to remove it.

Expand Down
26 changes: 21 additions & 5 deletions clinCNV.R
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,12 @@ option_list = list(
make_option("--onlyTumor", action="store_true", default=F,
help="if tumor only calling is to be performed"),

make_option("--noPlot", action="store_true", default=F,
help="Do not perform additional plotting"),

make_option("--hg38", action="store_true", default=F,
help="Work with hg38 cytobands switch"),

make_option(c("-d","--debug"), action="store_true", default=FALSE, help="Print debugging information while running.")
);

Expand All @@ -184,7 +190,6 @@ print(paste("We run script located in folder" , opt$folderWithScript, ". Please,




if (is.null(opt$normal) | is.null(opt$bed)) {
print("You need to specify file with normal coverages and bed file path at least. Here is the help:")
print_help(opt_parser)
Expand Down Expand Up @@ -458,7 +463,11 @@ if (!is.null(opt$tumorSample)) {
}

removeCentromeres = T
lstOfChromBorders <- getCytobands("cytobands.txt", removeCentromeres)
if (opt$hg38) {
lstOfChromBorders <- getCytobands("cytobandsHG38.txt", removeCentromeres)
} else {
lstOfChromBorders <- getCytobands("cytobands.txt", removeCentromeres)
}
left_borders <- lstOfChromBorders[[1]]
right_borders <- lstOfChromBorders[[2]]
ends_of_chroms <- lstOfChromBorders[[3]]
Expand Down Expand Up @@ -639,9 +648,16 @@ if (length(samplesToFilterOut) > 0) {

print(paste("We start to cluster your data (you will find a plot if clustering is possible in your output directory)", opt$out, Sys.time()))
if (is.null(opt$clusterProvided)) {
clusteringList <- returnClustering2(as.numeric(opt$minimumNumOfElemsInCluster))
clustering = clusteringList[[1]]
outliersByClusteringCohort = clusteringList[[2]]
if (opt$noPlot) {
clustering = rep("0", ncol(normal))
names(clustering) = colnames(normal)
print(clustering)
outliersByClusteringCohort = c()
} else {
clusteringList <- returnClustering2(as.numeric(opt$minimumNumOfElemsInCluster))
clustering = clusteringList[[1]]
outliersByClusteringCohort = clusteringList[[2]]
}
} else {
clusteringList <- read.table(opt$clusterProvided, stringsAsFactors = F)
clustering = clusteringList[,2]
Expand Down
11 changes: 7 additions & 4 deletions germline/helpersGermline.R
Original file line number Diff line number Diff line change
Expand Up @@ -732,7 +732,10 @@ calculateLocationAndScale <- function(bedFile, coverage, genderOfSamples, autoso
print(chrom)
coveragesToDealWith = coverage[which(bedFile[,1] == chrom),]
if (chrom == "chrX") {
if (length(which(genderOfSamples == "F")) > 0.5 * length(which(genderOfSamples == "M"))) {
if (length(which(genderOfSamples == "F")) > 0.4 * length(which(genderOfSamples == "M"))) {
print("The number of females is too few for accurate PAR estimation, we use males for chrX which leads to wrong PAR calling.")
}
if (length(which(genderOfSamples == "F")) > 0.4 * length(which(genderOfSamples == "M"))) {
whichSamplesUsed = which(genderOfSamples == "F")
} else {
whichSamplesUsed = which(genderOfSamples == "M")
Expand All @@ -751,7 +754,7 @@ calculateLocationAndScale <- function(bedFile, coverage, genderOfSamples, autoso
} else {
medians <- apply(coveragesToDealWith[,whichSamplesUsed], 1, EstimateModeSimpleCov)
}
if (chrom == "chrX" & length(which(genderOfSamples == "F")) <= 0.5 * length(which(genderOfSamples == "M"))) {
if (chrom == "chrX" & length(which(genderOfSamples == "F")) <= 0.4 * length(which(genderOfSamples == "M"))) {
medians = sqrt(2) * medians
}
if (chrom == "chrY" & length(which(genderOfSamples == "M")) > 2) {
Expand All @@ -760,15 +763,15 @@ calculateLocationAndScale <- function(bedFile, coverage, genderOfSamples, autoso
coverage.normalised[which(bedFile[,1] == chrom),] = sweep(coverage[which(bedFile[,1] == chrom),], 1, medians + 10**-40, FUN="/")
mediansResult[which(bedFile[,1] == chrom)] = medians
}

coverage.normalised = coverage.normalised - 1
sdsOfGermlineSamplesTmp = apply(coverage.normalised[autosomes,], 2, Qn)
sdsResults = c()
for (chrom in unique(bedFile[,1])) {
whichSamplesUsed = 1:ncol(coverage)
coveragesToDealWith = coverage.normalised[which(bedFile[,1] == chrom),]
if (chrom == "chrX") {
if (length(which(genderOfSamples == "F")) > 0.5 * length(which(genderOfSamples == "M"))) {
if (length(which(genderOfSamples == "F")) > 0.3 * length(which(genderOfSamples == "M"))) {
whichSamplesUsed = which(genderOfSamples == "F")
} else {
whichSamplesUsed = which(genderOfSamples == "M")
Expand Down

0 comments on commit 18972d6

Please sign in to comment.