From 3637b6febcb8a7622f8c88874efbb6be6635b26a Mon Sep 17 00:00:00 2001 From: GermanDemidov Date: Mon, 26 Nov 2018 11:15:59 +0100 Subject: [PATCH] Allowed chromosomes replaced with chromosome arms --- somatic/bafSegmentation.R | 68 +++++++++++++++++---------- somatic/cytobands.txt | 96 +++++++++++++++++++-------------------- somatic/firstStep.R | 21 ++++----- somatic/generalHelpers.R | 28 +++++++++++- 4 files changed, 128 insertions(+), 85 deletions(-) diff --git a/somatic/bafSegmentation.R b/somatic/bafSegmentation.R index 3ca6ab6..0006cfa 100644 --- a/somatic/bafSegmentation.R +++ b/somatic/bafSegmentation.R @@ -111,7 +111,7 @@ returnBAlleleFreqs <- function(healthySampleName, tumorSampleName, folderBAF, be } -determineAllowedChroms <- function(healthySample, tumorSample, healthySampleName, tumorSampleName, folderBAF) { +determineAllowedChroms <- function(healthySample, tumorSample, healthySampleName, tumorSampleName, folderBAF, leftBorders, rightBorders, endsOfChroms) { vecOfPvalues <- rep(0, nrow(tumorSample)) for (i in 1:nrow(tumorSample)) { refAleleTum <- (round(as.numeric(tumorSample[i,5]) * as.numeric(tumorSample[i,6]))) @@ -134,37 +134,58 @@ determineAllowedChroms <- function(healthySample, tumorSample, healthySampleName thresholdOfNonNormalVariant = 0.01 # pvalue, if lower = BAF is different in normal and tumor chroms = paste0("chr", 1:22) chroms = c(chroms, "chrX") - numberOfSNVs = rep(0, length(chroms)) - evaluated = rep(0, length(chroms)) + numberOfSNVs = rep(0, 2 * length(left_borders)) + evaluated = rep(0, 2 * length(left_borders)) + namesOfChromArms <- rep(0, 2 * length(left_borders)) + plotLabels <- rep(0, 2 * length(left_borders)) counter = 1 - for (chrom in chroms) { - rowsFromChrom = which(healthySample[,1] == chrom) - if (length(rowsFromChrom) > 1) { - evaluationOfChorm = (length(which(pvalues[rowsFromChrom] < thresholdOfNonNormalVariant))) / (length(rowsFromChrom)) - } else { - evaluationOfChorm = 1.0 + for (l in 1:length(left_borders)) { + chrom = names(left_borders)[l] + start = left_borders[[l]] + end = right_borders[[l]] + endOfChrom = endsOfChroms[[l]] + for (k in 1:2) { + if (k == 1) { + rowsFromChrom = which(healthySample[,1] == chrom & healthySample[,2] <= as.numeric(start) ) + namesOfChromArms[counter] = paste(chrom, 0, start, sep="-") + plotLabels[counter] = paste(chrom, "left #", length(rowsFromChrom), "SNVs") + } else { + rowsFromChrom = which(healthySample[,1] == chrom & healthySample[,2] >= as.numeric(end) ) + namesOfChromArms[counter] = paste(chrom, end, endOfChrom, sep="-") + plotLabels[counter] = paste(chrom, "right #", length(rowsFromChrom), "SNVs") + } + + if (length(rowsFromChrom) > 5) { + evaluationOfChorm = (length(which(pvalues[rowsFromChrom] < thresholdOfNonNormalVariant))) / (length(rowsFromChrom)) + } else { + evaluationOfChorm = 1.0 + } + + evaluated[counter] = evaluationOfChorm + numberOfSNVs[counter] = length(rowsFromChrom) + + counter = counter + 1 } - - evaluated[counter] = evaluationOfChorm - numberOfSNVs[counter] = length(rowsFromChrom) - - counter = counter + 1 } - indicesOfAllowedChroms = which(evaluated < max(sort(evaluated)[3], 0.1)) - colVec <- rep("red", length(chroms)) - indicesOfAllowedButNotBestChroms = which(evaluated > 0.1 & evaluated < sort(evaluated)[3]) + indicesOfAllowedChroms = which(evaluated < max(sort(evaluated)[3], 0.075)) + colVec <- rep("red", length(evaluated)) + indicesOfAllowedButNotBestChroms = which(evaluated > 0.075 & evaluated < sort(evaluated)[5]) colVec[indicesOfAllowedChroms] = "darkgreen" colVec[indicesOfAllowedButNotBestChroms] = "darkorange" - names(evaluated) = paste(chroms, "\n", numberOfSNVs) + names(evaluated) = namesOfChromArms + allowedChroms = names(evaluated)[indicesOfAllowedChroms] + subDir = paste0(tumorSampleName, "_", healthySampleName) dir.create(file.path(folderBAF, "result", subDir)) setwd(file.path(folderBAF, "result", subDir)) - png(paste0(tumorSampleName, "_", healthySampleName, ".png"), width=1200, height=600) - barplot(evaluated, col=colVec, main=paste(tumorSampleName, healthySampleName), ylim=c(0,1)) + png(paste0(tumorSampleName, "_", healthySampleName, ".png"), width=1400, height=600) + op <- par(mar=c(11,4,4,2)) + x <- barplot(evaluated, col=colVec, main=paste(tumorSampleName, healthySampleName), ylim=c(0,1), xaxt="n") + text(x-2.5, par("usr")[3] - 0.15, labels = plotLabels, srt = 45, pos = 1, xpd = TRUE) + par(mar=c(5.1, 4.1, 4.1, 2.1), mgp=c(3, 1, 0), las=0) dev.off() - allowedChroms = chroms[indicesOfAllowedChroms] trackFilename = paste0(tumorSampleName, "_", healthySampleName, ".igv") @@ -187,7 +208,7 @@ determineAllowedChroms <- function(healthySample, tumorSample, healthySampleName } -returnAllowedChromsBaf <- function(pairs, normalCov, tumorCov, inputFolderBAF, bedFileForFiltering) { +returnAllowedChromsBaf <- function(pairs, normalCov, tumorCov, inputFolderBAF, bedFileForFiltering, leftBorders, rightBorders, endsOfChroms) { allowedChromsBaf <- list() bAlleleFreqsAllSamples <- list() for (i in 1:ncol(normalCov)) { @@ -200,7 +221,8 @@ returnAllowedChromsBaf <- function(pairs, normalCov, tumorCov, inputFolderBAF, b if (!is.null(bAlleleFreqs[[1]])) { allowedChromsBaf[[paste(colnames(tumorCov)[sampleName2], colnames(normalCov)[i], sep="-")]] = determineAllowedChroms(bAlleleFreqs[[1]], bAlleleFreqs[[2]], colnames(normalCov)[i], colnames(tumorCov)[sampleName2], - inputFolderBAF) + inputFolderBAF, + leftBorders, rightBorders, endsOfChroms) bAlleleFreqsAllSamples[[paste(colnames(tumorCov)[sampleName2], colnames(normalCov)[i], sep="-")]] = bAlleleFreqs } else { print("BAF did not work well") diff --git a/somatic/cytobands.txt b/somatic/cytobands.txt index d5c4b44..f84c14b 100644 --- a/somatic/cytobands.txt +++ b/somatic/cytobands.txt @@ -1,48 +1,48 @@ -chr1 121500000 125000000 p11.1 acen -chr1 125000000 128900000 q11 acen -chr10 38000000 40200000 p11.1 acen -chr10 40200000 42300000 q11.1 acen -chr11 51600000 53700000 p11.11 acen -chr11 53700000 55700000 q11 acen -chr12 33300000 35800000 p11.1 acen -chr12 35800000 38200000 q11 acen -chr13 16300000 17900000 p11.1 acen -chr13 17900000 19500000 q11 acen -chr14 16100000 17600000 p11.1 acen -chr14 17600000 19100000 q11.1 acen -chr15 15800000 19000000 p11.1 acen -chr15 19000000 20700000 q11.1 acen -chr16 34600000 36600000 p11.1 acen -chr16 36600000 38600000 q11.1 acen -chr17 22200000 24000000 p11.1 acen -chr17 24000000 25800000 q11.1 acen -chr18 15400000 17200000 p11.1 acen -chr18 17200000 19000000 q11.1 acen -chr19 24400000 26500000 p11 acen -chr19 26500000 28600000 q11 acen -chr2 90500000 93300000 p11.1 acen -chr2 93300000 96800000 q11.1 acen -chr20 25600000 27500000 p11.1 acen -chr20 27500000 29400000 q11.1 acen -chr21 10900000 13200000 p11.1 acen -chr21 13200000 14300000 q11.1 acen -chr22 12200000 14700000 p11.1 acen -chr22 14700000 17900000 q11.1 acen -chr3 87900000 91000000 p11.1 acen -chr3 91000000 93900000 q11.1 acen -chr4 48200000 50400000 p11 acen -chr4 50400000 52700000 q11 acen -chr5 46100000 48400000 p11 acen -chr5 48400000 50700000 q11.1 acen -chr6 58700000 61000000 p11.1 acen -chr6 61000000 63300000 q11.1 acen -chr7 58000000 59900000 p11.1 acen -chr7 59900000 61700000 q11.1 acen -chr8 43100000 45600000 p11.1 acen -chr8 45600000 48100000 q11.1 acen -chr9 47300000 49000000 p11.1 acen -chr9 49000000 50700000 q11 acen -chrX 58100000 60600000 p11.1 acen -chrX 60600000 63000000 q11.1 acen -chrY 11600000 12500000 p11.1 acen -chrY 12500000 13400000 q11.1 acen +chr1 121500000 125000000 p11.1 249250621 +chr1 125000000 128900000 q11 249250621 +chr10 38000000 40200000 p11.1 135534747 +chr10 40200000 42300000 q11.1 135534747 +chr11 51600000 53700000 p11.11 135006516 +chr11 53700000 55700000 q11 135006516 +chr12 33300000 35800000 p11.1 133851895 +chr12 35800000 38200000 q11 133851895 +chr13 16300000 17900000 p11.1 115169878 +chr13 17900000 19500000 q11 115169878 +chr14 16100000 17600000 p11.1 107349540 +chr14 17600000 19100000 q11.1 107349540 +chr15 15800000 19000000 p11.1 102531392 +chr15 19000000 20700000 q11.1 102531392 +chr16 34600000 36600000 p11.1 90354753 +chr16 36600000 38600000 q11.1 90354753 +chr17 22200000 24000000 p11.1 81195210 +chr17 24000000 25800000 q11.1 81195210 +chr18 15400000 17200000 p11.1 78077248 +chr18 17200000 19000000 q11.1 78077248 +chr19 24400000 26500000 p11 59128983 +chr19 26500000 28600000 q11 59128983 +chr2 90500000 93300000 p11.1 243199373 +chr2 93300000 96800000 q11.1 243199373 +chr20 25600000 27500000 p11.1 63025520 +chr20 27500000 29400000 q11.1 63025520 +chr21 10900000 13200000 p11.1 48129895 +chr21 13200000 14300000 q11.1 48129895 +chr22 12200000 14700000 p11.1 51304566 +chr22 14700000 17900000 q11.1 51304566 +chr3 87900000 91000000 p11.1 198022430 +chr3 91000000 93900000 q11.1 198022430 +chr4 48200000 50400000 p11 191154276 +chr4 50400000 52700000 q11 191154276 +chr5 46100000 48400000 p11 180915260 +chr5 48400000 50700000 q11.1 180915260 +chr6 58700000 61000000 p11.1 171115067 +chr6 61000000 63300000 q11.1 171115067 +chr7 58000000 59900000 p11.1 159138663 +chr7 59900000 61700000 q11.1 159138663 +chr8 43100000 45600000 p11.1 146364022 +chr8 45600000 48100000 q11.1 146364022 +chr9 47300000 49000000 p11.1 141213431 +chr9 49000000 50700000 q11 141213431 +chrX 58100000 60600000 p11.1 155270560 +chrX 60600000 63000000 q11.1 155270560 +chrY 11600000 12500000 p11.1 59373566 +chrY 12500000 13400000 q11.1 59373566 diff --git a/somatic/firstStep.R b/somatic/firstStep.R index 9515f46..7fff673 100644 --- a/somatic/firstStep.R +++ b/somatic/firstStep.R @@ -222,6 +222,11 @@ if (!is.null(opt$tumorSample)) { stopifnot(!opt$tumorSample %in% pairs[coordOfNormalInPairs,1]) } +lstOfChromBorders <- getCytobands("cytobands.txt") +left_borders <- lstOfChromBorders[[1]] +right_borders <- lstOfChromBorders[[2]] +ends_of_chroms <- lstOfChromBorders[[3]] + if (frameworkDataTypes == "covdepthBAF") { setwd(opt$folderWithScript) source("bafSegmentation.R",local=TRUE) @@ -234,7 +239,7 @@ if (frameworkDataTypes == "covdepthBAF") { } else { pairsForBAF = pairs } - listOfValues <- returnAllowedChromsBaf(pairsForBAF, normal, tumor, opt$bafFolder, bedFile) + listOfValues <- returnAllowedChromsBaf(pairsForBAF, normal, tumor, opt$bafFolder, bedFile, left_borders, right_borders, ends_of_chroms) allowedChromsBaf <- listOfValues[[1]] bAlleleFreqsAllSamples <- listOfValues[[2]] } @@ -347,7 +352,7 @@ sdsOfGermlineSamples <- apply(coverage.normalised[autosomes,], 2, determineSDsOf #locationsShiftedLogFoldChanges <- sweep(matrixOfLogFold, 1, locations) -listOfVarianceAndMultiplicator <- esimtateVarianceFromSampleNoise(sdsOfGermlineSamples, 30000) +listOfVarianceAndMultiplicator <- esimtateVarianceFromSampleNoise(sdsOfGermlineSamples, 100000) esimtatedVarianceFromSampleNoise <- listOfVarianceAndMultiplicator[[2]] multiplicator <- listOfVarianceAndMultiplicator[[1]] @@ -359,17 +364,7 @@ cn_states <- 0:20 -cytobands <- read.table("cytobands.txt",stringsAsFactors = F, header = F, sep="\t", row.names=NULL) -left_borders <- vector(mode="list", length=nrow(cytobands)/2) -right_borders <- vector(mode="list", length=nrow(cytobands)/2) -odd_numbers <- seq(from=1, to=nrow(cytobands), by=2) -even_numbers <- seq(from=2, to=nrow(cytobands), by=2) -names(left_borders) = as.vector(t(cytobands[,1]))[odd_numbers] -names(right_borders) = as.vector(t(cytobands[,1]))[even_numbers] -for (i in 1:length(odd_numbers)) { - left_borders[[i]] = cytobands[odd_numbers[i], 2] - right_borders[[i]] = cytobands[even_numbers[i], 3] -} + diff --git a/somatic/generalHelpers.R b/somatic/generalHelpers.R index e13c401..0d92fce 100644 --- a/somatic/generalHelpers.R +++ b/somatic/generalHelpers.R @@ -27,6 +27,23 @@ return_norm_likelik <- function(x) { return(vect_of_norm_likeliks[x]) } +getCytobands <- function(fileName) { + cytobands <- read.table(fileName,stringsAsFactors = F, header = F, sep="\t", row.names=NULL) + left_borders <- vector(mode="list", length=nrow(cytobands)/2) + right_borders <- vector(mode="list", length=nrow(cytobands)/2) + ends_of_chroms <- vector(mode="list", length=nrow(cytobands)/2) + odd_numbers <- seq(from=1, to=nrow(cytobands), by=2) + even_numbers <- seq(from=2, to=nrow(cytobands), by=2) + names(left_borders) = as.vector(t(cytobands[,1]))[odd_numbers] + names(right_borders) = as.vector(t(cytobands[,1]))[even_numbers] + names(ends_of_chroms) = as.vector(t(cytobands[,1]))[even_numbers] + for (i in 1:length(odd_numbers)) { + left_borders[[i]] = cytobands[odd_numbers[i], 2] + right_borders[[i]] = cytobands[even_numbers[i], 3] + ends_of_chroms[[i]] = cytobands[even_numbers[i], 5] + } + return(list(left_borders, right_borders, ends_of_chroms)) +} EstimateModeSimple <- function(x) { @@ -112,7 +129,16 @@ gc_and_sample_size_normalise <- function(info, coverages, averageCoverage=T, all tumorName = colnames(coverages)[j] position <- which(startsWith(names(allowedChroms), prefix=tumorName)) if (length(position) == 1) { - allowedChromosomesAutosomesOnly = which(!info[,1] %in% c("chrX", "chrY") & info[,1] %in% allowedChroms[[position]]) + allowedChromosomesAutosomesOnly = c() + for (allowedArm in allowedChroms[[position]]) { + splittedValue <- strsplit(allowedArm, "-") + chrom = splittedValue[[1]][1] + startOfArm = as.numeric(splittedValue[[1]][2]) + endOfArm = as.numeric(splittedValue[[1]][3]) + allowedChromosomesAutosomesOnly = union(allowedChromosomesAutosomesOnly, which(info[,1] == chrom & + info[,2] >= startOfArm & + info[,3] <= endOfArm)) + } } else { allowedChromosomesAutosomesOnly = which(!info[,1] %in% c("chrX", "chrY")) }